├── .gitignore
├── LICENSE
├── README.md
├── api.js
├── index.js
├── nginx_config_sample
│   └── example.com.conf
└── package.json

/.gitignore:
--------------------------------------------------------------------------------
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Zied Hosni

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Good news everyone! Page Replica is Available as a Web App!

If you want to avoid the hassle of setting up your own pre-rendering tool, check out [Page Replica](https://page-replica.com). Manage and re-render your pages effortlessly!

### Key Features
- Free to use for up to 5,000 requests per month
- Unlimited sites
- API access

### Need Assistance?
If you have any questions or need support, we're here to help! Join our [GitHub Discussion](https://github.com/html5-ninja/page-replica/discussions/3) to get in touch with us.

---

# Page Replica (free tool)

Page Replica is a versatile web scraping and caching tool built with Node.js, Express, and Puppeteer. It prerenders web app pages (React, Angular, Vue, ...) so they can be served via Nginx for SEO or other purposes.

The tool lets you scrape individual web pages or entire sitemaps through an API, optionally removing JavaScript and caching the resulting HTML.

It also includes a sample Nginx configuration that routes regular users to your app and search engine bots to the cached HTML.


## Installation

1. **Clone the Repository:**

   ```bash
   git clone https://github.com/html5-ninja/page-replica.git
   cd page-replica
   ```

2. **Install Dependencies:**

   ```bash
   npm install
   ```

3. **Settings:**
   - `index.js`:

     ```js
     const CONFIG = {
       baseUrl: "https://example.com",
       removeJS: true,
       addBaseURL: true,
       cacheFolder: "path_to_cache_folder",
     };
     ```

   - `api.js`: set the port for your API (default 8080).

4. **Start the API:**

   ```bash
   npm start
   ```

## Usage

When you scrape a page or a sitemap, a prerendered copy of each page is stored in the cache folder.

### Scraping Individual Pages

To scrape a single page, make a GET request to `/page` with the `url` query parameter:

```bash
curl "http://localhost:8080/page?url=https://example.com"
```

### Scraping Sitemaps

To scrape every page listed in a sitemap, make a GET request to `/sitemap` with the `url` query parameter:

```bash
curl "http://localhost:8080/sitemap?url=https://example.com/sitemap.xml"
```
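In both cases the prerendered HTML is written to disk under `cacheFolder`, in a directory that mirrors the page URL with the protocol stripped. As a rough illustration (the `/var/cache/page-replica` path is just an example value for `cacheFolder`):

```bash
curl "http://localhost:8080/page?url=https://example.com/about"
# The prerendered copy ends up at:
#   /var/cache/page-replica/example.com/about/index.html
```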
## Serve the Cached Pages to Bots with Nginx (My Recipe)

In this setup, the cached pages are served by Nginx. You can adapt the configuration to your own needs and server.

The Nginx configuration in `nginx_config_sample/example.com.conf` splits traffic by user agent: regular users are proxied to the main application server, while search engine bots are routed to a dedicated server block that serves the cached HTML.

Review `nginx_config_sample/example.com.conf` to see exactly how it works.
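A quick way to check the split is to compare a normal request with one that spoofs a bot user agent. This is only a sketch; it assumes `example.com` resolves to the server running this Nginx configuration:

```bash
# Regular visitors are proxied to the app on port 8080
curl -I http://example.com/

# A Googlebot user agent matches the bot map in the sample config,
# so Nginx serves the cached HTML from the block listening on port 8090
curl -I -A "Googlebot/2.1 (+http://www.google.com/bot.html)" http://example.com/
```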
## Contribution

We welcome contributions! If you have ideas for new features or server/cloud configurations that could enhance this tool, feel free to:

- Open an issue to discuss your ideas.
- Fork the repository and make your changes.
- Submit a pull request with a clear description of your changes.

### Feature Requests and Suggestions

If you have any feature requests or suggestions for server/cloud configurations beyond Nginx, please open an issue to start a discussion.

## Folder Structure

- `nginx_config_sample`: a sample Nginx configuration that redirects bot traffic to the cached content server.
- `api.js`: an Express application that handles the web scraping requests.
- `index.js`: the core web scraping logic, built on Puppeteer.
- `package.json`: Node.js project configuration.

--------------------------------------------------------------------------------
/api.js:
--------------------------------------------------------------------------------
const process = require("process");
const express = require("express");
const Sitemapper = require("sitemapper");
const scrap = require("./index").scrap;

// Set the maximum number of listeners to unlimited to prevent warning messages
process.setMaxListeners(0);

// Create an instance of Express
const app = express();
// Define the port for the Express app
const port = 8080;

// Start the Express app and listen on the specified port
app.listen(port, () => {
  console.log(`Example app listening at http://localhost:${port}`);
});

/**
 * Endpoint to scrape a single page.
 * @name GET /page
 * @function
 * @memberof app
 * @param {string} url - The URL of the page to be scraped.
 * @returns {void}
 */
app.get("/page", (req, res) => {
  // Extract the URL from the query parameters
  const url = req.query.url;
  // Call the scrap function to scrape the specified page
  scrap(url);
  // Send a response without any content
  res.send();
});

/**
 * Endpoint to scrape pages from a sitemap.
 * @name GET /sitemap
 * @function
 * @memberof app
 * @param {string} url - The URL of the sitemap to be scraped.
 * @returns {void}
 */
app.get("/sitemap", (req, res) => {
  // Create a new instance of Sitemapper
  const sitemap = new Sitemapper();
  // Fetch the sitemap from the specified URL
  sitemap.fetch(req.query.url).then(function ({ sites }) {
    // Extract the list of URLs from the fetched sitemap
    const urls = sites;
    // Scrape one URL every 3000 milliseconds (3 seconds) to avoid hammering the target site
    const interval = setInterval(() => {
      const url = urls.shift();
      if (!url) {
        // If there are no more URLs, clear the interval
        clearInterval(interval);
        return;
      }
      // Call the scrap function to scrape the current URL
      scrap(url);
    }, 3000);
  });
  // Send a response without any content
  res.send();
});

--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
const puppeteer = require("puppeteer");
const fs = require("fs");
const path = require("path");

/**
 * Configuration settings for the web scraper.
 * @typedef {Object} Config
 * @property {string} baseUrl - The base URL used for creating absolute URLs.
 * @property {boolean} removeJS - Whether to remove JavaScript code from the scraped HTML.
 * @property {boolean} addBaseURL - Whether to add a base URL to the head of the HTML.
 * @property {string} cacheFolder - The folder for caching scraped HTML content.
 */

/**
 * Configuration object with settings.
 * @type {Config}
 */
const CONFIG = {
  baseUrl: "https://example.com",
  removeJS: true,
  addBaseURL: true,
  cacheFolder: "path_to_cache_folder",
};

/**
 * Function to create necessary folders based on the provided directory path.
 * @param {string} directory - The directory path to create folders for.
 */
const createFolders = (directory) => {
  const folders = directory.split(path.sep);
  folders.shift();
  let currentPath = CONFIG.cacheFolder;
  folders.forEach((folder) => {
    currentPath = path.join(currentPath, folder);
    if (!fs.existsSync(currentPath)) {
      fs.mkdirSync(currentPath);
    }
  });
};

/**
 * Main scraping function.
 * @param {string} pathUrl - The URL to scrape.
 */
const scrap = async (pathUrl) => {
  try {
    // Launch Puppeteer browser
    const browser = await puppeteer.launch({
      headless: "new",
      args: ["--no-sandbox", "--disable-setuid-sandbox"],
    });
    // Create a new page in the browser
    const page = await browser.newPage();
    // Navigate to the specified URL and wait until the page is fully loaded
    await page.goto(pathUrl, { waitUntil: "networkidle2" });
    // Get the outer HTML of the entire document
    let html = await page.evaluate(() => document.documentElement.outerHTML);

    // Remove JavaScript code from the HTML if configured to do so
    if (CONFIG.removeJS) {
      html = html.replace(
        /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi,
        "",
      );
    }

    // Add a <base> tag to the head if configured to do so,
    // so relative URLs in the cached copy resolve against the original site
    if (CONFIG.addBaseURL) {
      html = html.replace(/<head>/gi, `<head><base href="${CONFIG.baseUrl}">`);
    }

    // Create necessary folders for caching based on the URL
    createFolders(pathUrl);
    // Generate a cache path by removing the protocol (http/https) from the URL
    const cachePath = pathUrl.replace(/(^\w+:|^)\/\//, "");
    // Write the HTML content to a file in the cache folder
    fs.writeFileSync(`${CONFIG.cacheFolder}/${cachePath}/index.html`, html);

    // Close the Puppeteer browser
    await browser.close();
  } catch (error) {
    // Log any errors that occur during the scraping process
    console.error(error);
  }
};

// Export the scraping function for external use
exports.scrap = scrap;
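The scraper can also be driven without the Express API, since `index.js` exports `scrap`. A minimal sketch, assuming `CONFIG.cacheFolder` has been pointed at an existing directory:

```bash
node -e 'require("./index").scrap("https://example.com/about")'
```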
--------------------------------------------------------------------------------
/nginx_config_sample/example.com.conf:
--------------------------------------------------------------------------------
# Map user agents to identify search engines
map $http_user_agent $search_engines {
    "~bingbot" 1;
    "~BingPreview" 1;
    "~Googlebot" 1;
    # Add more search engines as needed
}

# Map user agents to identify social networks
map $http_user_agent $social_networks {
    "~*facebook" 1;
    "~*twitter" 1;
    # Add more social networks as needed
}

# Combine search engines and social networks to determine whether the user agent is a bot
map $search_engines$social_networks $is_bot {
    "" "";
    default 1;
}

# Main server block
server {
    listen *:80;
    server_name example.com www.example.com;
    access_log /var/log/nginx/example.com.access.log;
    error_log /var/log/nginx/example.com.error.log;

    location / {
        # If the request comes from a bot, serve it from the cache server block below
        if ($is_bot) {
            proxy_pass http://127.0.0.1:8090;
        }

        # Otherwise forward the request to the main application,
        # which in this case runs on port 8080
        proxy_pass http://127.0.0.1:8080;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header X-Forwarded-For $remote_addr;
    }
}

# Secondary server block for handling requests identified as bots
# (this is what bots will see)
server {
    listen 8090;
    server_name 127.0.0.1;

    access_log /var/log/nginx/pagereplica.access.log;
    error_log /var/log/nginx/pagereplica.error.log;

    # Path to your cache folder
    root /usr/share/nginx/html/cache/example.com;
    index index.html;
}
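One way to activate the sample config is to copy it into your Nginx configuration directory, then validate and reload. The exact paths and service manager vary by distribution; this is only a sketch:

```bash
sudo cp nginx_config_sample/example.com.conf /etc/nginx/conf.d/
sudo nginx -t && sudo systemctl reload nginx
```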
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "pagereplica",
  "version": "1.0.0",
  "description": "",
  "main": "api.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "node api.js"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "express": "^4.18.2",
    "puppeteer": "^21.6.1",
    "sitemapper": "^3.2.8"
  }
}
--------------------------------------------------------------------------------