├── .gitignore
├── LICENSE
├── README.md
├── api.js
├── index.js
├── nginx_config_sample
│   └── example.com.conf
└── package.json

/.gitignore:
--------------------------------------------------------------------------------
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Zied Hosni

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Good news everyone! Page Replica is Available as a Web App!

If you want to avoid the hassle of setting up your own pre-rendering tool, check out [Page Replica](https://page-replica.com). Manage and re-render your pages effortlessly!

### Key Features
- Free to use for up to 5,000 requests per month
- Unlimited sites
- API access

### Need Assistance?
If you have any questions or need support, we're here to help! Join our [GitHub Discussion](https://github.com/html5-ninja/page-replica/discussions/3) to get in touch with us.

---

# Page Replica (free tool)

Page Replica is a versatile web scraping and caching tool built with Node.js, Express, and Puppeteer. It prerenders web app pages (React, Angular, Vue, ...) so they can be served via Nginx for SEO or other purposes.

The tool lets you scrape individual web pages or entire sitemaps through an API, optionally removing JavaScript and caching the resulting HTML.

It also includes a sample Nginx configuration that routes regular users to your app and search engine bots to the cached HTML.


## Installation

1. **Clone the Repository:**

   ```bash
   git clone https://github.com/html5-ninja/page-replica.git
   cd page-replica
   ```

2. **Install Dependencies:**

   ```bash
   npm install
   ```

3. **Settings:**
   - `index.js`:

     ```js
     const CONFIG = {
       baseUrl: "https://example.com",
       removeJS: true,
       addBaseURL: true,
       cacheFolder: "path_to_cache_folder",
     };
     ```

   - `api.js`: set the port for your API (default 8080).

4. **Start the API:**

   ```bash
   npm start
   ```

## Usage

When you scrape a page or a sitemap, a prerendered copy of each page is stored in the cache folder.

### Scraping Individual Pages

To scrape a single page, make a GET request to `/page` with the `url` query parameter:

```bash
curl "http://localhost:8080/page?url=https://example.com"
```

### Scraping Sitemaps

To scrape every page listed in a sitemap, make a GET request to `/sitemap` with the `url` query parameter:

```bash
curl "http://localhost:8080/sitemap?url=https://example.com/sitemap.xml"
```
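In both cases the prerendered HTML is written to disk under `cacheFolder`, in a directory that mirrors the page URL with the protocol stripped. As a rough illustration (the `/var/cache/page-replica` path is just an example value for `cacheFolder`):

```bash
curl "http://localhost:8080/page?url=https://example.com/about"
# The prerendered copy ends up at:
#   /var/cache/page-replica/example.com/about/index.html
```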
## Serve the Cached Pages to Bots with Nginx (My Recipe)

In this setup, the cached pages are served by Nginx. You can adapt the configuration to your own needs and server.

The Nginx configuration in `nginx_config_sample/example.com.conf` splits traffic by user agent: regular users are proxied to the main application server, while search engine bots are routed to a dedicated server block that serves the cached HTML.

Review `nginx_config_sample/example.com.conf` to see exactly how it works.
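A quick way to check the split is to compare a normal request with one that spoofs a bot user agent. This is only a sketch; it assumes `example.com` resolves to the server running this Nginx configuration:

```bash
# Regular visitors are proxied to the app on port 8080
curl -I http://example.com/

# A Googlebot user agent matches the bot map in the sample config,
# so Nginx serves the cached HTML from the block listening on port 8090
curl -I -A "Googlebot/2.1 (+http://www.google.com/bot.html)" http://example.com/
```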
## Contribution

We welcome contributions! If you have ideas for new features or server/cloud configurations that could enhance this tool, feel free to:

- Open an issue to discuss your ideas.
- Fork the repository and make your changes.
- Submit a pull request with a clear description of your changes.

### Feature Requests and Suggestions

If you have any feature requests or suggestions for server/cloud configurations beyond Nginx, please open an issue to start a discussion.

## Folder Structure

- `nginx_config_sample`: a sample Nginx configuration that redirects bot traffic to the cached content server.
- `api.js`: an Express application that handles the web scraping requests.
- `index.js`: the core web scraping logic, built on Puppeteer.
- `package.json`: Node.js project configuration.

--------------------------------------------------------------------------------
/api.js:
--------------------------------------------------------------------------------
const process = require("process");
const express = require("express");
const Sitemapper = require("sitemapper");
const scrap = require("./index").scrap;

// Set the maximum number of listeners to unlimited to prevent warning messages
process.setMaxListeners(0);

// Create an instance of Express
const app = express();
// Define the port for the Express app
const port = 8080;

// Start the Express app and listen on the specified port
app.listen(port, () => {
  console.log(`Example app listening at http://localhost:${port}`);
});

/**
 * Endpoint to scrape a single page.
 * @name GET /page
 * @function
 * @memberof app
 * @param {string} url - The URL of the page to be scraped.
 * @returns {void}
 */
app.get("/page", (req, res) => {
  // Extract the URL from the query parameters
  const url = req.query.url;
  // Call the scrap function to scrape the specified page
  scrap(url);
  // Send a response without any content
  res.send();
});

/**
 * Endpoint to scrape pages from a sitemap.
 * @name GET /sitemap
 * @function
 * @memberof app
 * @param {string} url - The URL of the sitemap to be scraped.
 * @returns {void}
 */
app.get("/sitemap", (req, res) => {
  // Create a new instance of Sitemapper
  const sitemap = new Sitemapper();
  // Fetch the sitemap from the specified URL
  sitemap.fetch(req.query.url).then(function ({ sites }) {
    // Extract the list of URLs from the fetched sitemap
    const urls = sites;
    // Scrape one URL every 3000 milliseconds (3 seconds) to avoid hammering the target site
    const interval = setInterval(() => {
      const url = urls.shift();
      if (!url) {
        // If there are no more URLs, clear the interval
        clearInterval(interval);
        return;
      }
      // Call the scrap function to scrape the current URL
      scrap(url);
    }, 3000);
  });
  // Send a response without any content
  res.send();
});

--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
const puppeteer = require("puppeteer");
const fs = require("fs");
const path = require("path");

/**
 * Configuration settings for the web scraper.
 * @typedef {Object} Config
 * @property {string} baseUrl - The base URL used for creating absolute URLs.
 * @property {boolean} removeJS - Whether to remove JavaScript code from the scraped HTML.
 * @property {boolean} addBaseURL - Whether to add a base URL to the head of the HTML.
 * @property {string} cacheFolder - The folder for caching scraped HTML content.
 */

/**
 * Configuration object with settings.
 * @type {Config}
 */
const CONFIG = {
  baseUrl: "https://example.com",
  removeJS: true,
  addBaseURL: true,
  cacheFolder: "path_to_cache_folder",
};

/**
 * Function to create necessary folders based on the provided directory path.
 * @param {string} directory - The directory path to create folders for.
 */
const createFolders = (directory) => {
  const folders = directory.split(path.sep);
  folders.shift();
  let currentPath = CONFIG.cacheFolder;
  folders.forEach((folder) => {
    currentPath = path.join(currentPath, folder);
    if (!fs.existsSync(currentPath)) {
      fs.mkdirSync(currentPath);
    }
  });
};

/**
 * Main scraping function.
 * @param {string} pathUrl - The URL to scrape.
 */
const scrap = async (pathUrl) => {
  try {
    // Launch Puppeteer browser
    const browser = await puppeteer.launch({
      headless: "new",
      args: ["--no-sandbox", "--disable-setuid-sandbox"],
    });
    // Create a new page in the browser
    const page = await browser.newPage();
    // Navigate to the specified URL and wait until the page is fully loaded
    await page.goto(pathUrl, { waitUntil: "networkidle2" });
    // Get the outer HTML of the entire document
    let html = await page.evaluate(() => document.documentElement.outerHTML);

    // Remove JavaScript code from the HTML if configured to do so
    if (CONFIG.removeJS) {
      html = html.replace(
        /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi,
        "",
      );
    }

    // Add a <base> tag to the head if configured to do so,
    // so relative URLs in the cached copy resolve against the original site
    if (CONFIG.addBaseURL) {
      html = html.replace(/<head>/gi, `<head><base href="${CONFIG.baseUrl}">`);
    }

    // Create necessary folders for caching based on the URL
    createFolders(pathUrl);
    // Generate a cache path by removing the protocol (http/https) from the URL
    const cachePath = pathUrl.replace(/(^\w+:|^)\/\//, "");
    // Write the HTML content to a file in the cache folder
    fs.writeFileSync(`${CONFIG.cacheFolder}/${cachePath}/index.html`, html);

    // Close the Puppeteer browser
    await browser.close();
  } catch (error) {
    // Log any errors that occur during the scraping process
    console.error(error);
  }
};

// Export the scraping function for external use
exports.scrap = scrap;
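The scraper can also be driven without the Express API, since `index.js` exports `scrap`. A minimal sketch, assuming `CONFIG.cacheFolder` has been pointed at an existing directory:

```bash
node -e 'require("./index").scrap("https://example.com/about")'
```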
--------------------------------------------------------------------------------
/nginx_config_sample/example.com.conf:
--------------------------------------------------------------------------------
# Map user agents to identify search engines
map $http_user_agent $search_engines {
    "~bingbot" 1;
    "~BingPreview" 1;
    "~Googlebot" 1;
    # Add more search engines as needed
}

# Map user agents to identify social networks
map $http_user_agent $social_networks {
    "~*facebook" 1;
    "~*twitter" 1;
    # Add more social networks as needed
}

# Combine search engines and social networks to determine whether the user agent is a bot
map $search_engines$social_networks $is_bot {
    "" "";
    default 1;
}

# Main server block
server {
    listen *:80;
    server_name example.com www.example.com;
    access_log /var/log/nginx/example.com.access.log;
    error_log /var/log/nginx/example.com.error.log;

    location / {
        # If the request comes from a bot, serve it from the cache server block below
        if ($is_bot) {
            proxy_pass http://127.0.0.1:8090;
        }

        # Otherwise forward the request to the main application,
        # which in this case runs on port 8080
        proxy_pass http://127.0.0.1:8080;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header X-Forwarded-For $remote_addr;
    }
}

# Secondary server block for handling requests identified as bots
# (this is what bots will see)
server {
    listen 8090;
    server_name 127.0.0.1;

    access_log /var/log/nginx/pagereplica.access.log;
    error_log /var/log/nginx/pagereplica.error.log;

    # Path to your cache folder
    root /usr/share/nginx/html/cache/example.com;
    index index.html;
}
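One way to activate the sample config is to copy it into your Nginx configuration directory, then validate and reload. The exact paths and service manager vary by distribution; this is only a sketch:

```bash
sudo cp nginx_config_sample/example.com.conf /etc/nginx/conf.d/
sudo nginx -t && sudo systemctl reload nginx
```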
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "pagereplica",
  "version": "1.0.0",
  "description": "",
  "main": "api.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "node api.js"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "express": "^4.18.2",
    "puppeteer": "^21.6.1",
    "sitemapper": "^3.2.8"
  }
}
--------------------------------------------------------------------------------