├── compare └── .gitkeep ├── static └── .gitkeep ├── .gitignore ├── docs └── project-process-flow.png ├── babel.config.js ├── server.js ├── jest.config.js ├── .github ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── publish_staging.yml │ ├── deploy-without-scraping.yml │ └── publish.yml ├── scraper ├── config.mjs ├── sitemap.mjs ├── scrape.mjs ├── assetScraper.mjs └── htmlScraper.mjs ├── tests ├── ngrok.test.js ├── test.template.js └── setup.mjs ├── .vscode └── settings.json ├── package.json ├── LICENSE ├── helpers └── utils.mjs └── README.MD /compare/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /static/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | static/* 3 | !static/.gitkeep 4 | .env 5 | -------------------------------------------------------------------------------- /docs/project-process-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/offerzen/wombat/HEAD/docs/project-process-flow.png -------------------------------------------------------------------------------- /babel.config.js: -------------------------------------------------------------------------------- 1 | const presets = [ 2 | [ 3 | "@babel/preset-env", 4 | ] 5 | ]; 6 | 7 | module.exports = { presets }; 8 | -------------------------------------------------------------------------------- /server.js: -------------------------------------------------------------------------------- 1 | const express = require('express'); 2 | const app = express(); 3 | 4 | app.use(express.static('static')); 5 | 6 | app.listen(3333, () => console.log('Server 
started!')); 7 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | require('dotenv').config() 2 | 3 | module.exports = { 4 | testEnvironment: "node", 5 | globalSetup: '/tests/setup.mjs', 6 | moduleFileExtensions: ['js', 'mjs'] 7 | }; 8 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | - package-ecosystem: "npm" 8 | directory: "/" 9 | schedule: 10 | interval: "weekly" 11 | -------------------------------------------------------------------------------- /scraper/config.mjs: -------------------------------------------------------------------------------- 1 | import path from 'path'; 2 | import { parseSitemap } from './sitemap.mjs'; 3 | 4 | const site = process.env.SITE.split(',')[0].trim(); 5 | export const urls = await parseSitemap(site); 6 | 7 | const buildDirectory = 'compare'; 8 | export const baseDirectory = path.join(process.cwd(), buildDirectory); 9 | -------------------------------------------------------------------------------- /tests/ngrok.test.js: -------------------------------------------------------------------------------- 1 | const testIf = (condition) => condition ? test : test.skip; 2 | 3 | testIf(!global.site.match(/(webflow\.io)/ig)?.length).each(global.urls)('ngrok for %s', (url) => { 4 | // Match based on number so it's easy to see how many matches exist in a page 5 | const matches = global.html(url).match(/ngrok/ig)?.length ?? 
// Template: runs one assertion per scraped URL. `global.urls` and
// `global.html` are populated by `/tests/setup.mjs` before any test runs.
test.each(global.urls)('Something about URL: %s', (url) => {
  // `global.html(url)` returns the scraped HTML source for the page; any
  // source-level checks can be run against it here.
  const source = global.html(url);
  const matches = source.match(/foo/ig);

  // Jest expects — String.prototype.match returns null when nothing matches.
  expect(matches).toBe(null);
});
19 | Add a dropdown 20 | Content goes here. 21 |
22 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "yaml.schemas": { 3 | "file:///Users/jethroflanagan/.vscode/extensions/atlassian.atlascode-2.10.0/resources/schemas/pipelines-schema.json": "bitbucket-pipelines.yml", 4 | "https://json.schemastore.org/github-workflow.json": [ 5 | "file:///Users/jethroflanagan/Work/offerzen/webflow-platform/.github/workflows/publish_staging.yml", 6 | "file:///Users/jethroflanagan/Work/offerzen/webflow-platform/.github/workflows/deploy.yml", 7 | "file:///Users/jethroflanagan/Work/offerzen/webflow-platform/.github/workflows/publish.yml" 8 | ] 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /tests/setup.mjs: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import path from 'path'; 3 | import { baseDirectory, urls } from '../scraper/config.mjs'; 4 | 5 | export default async function (globalConfig, projectConfig) { 6 | 7 | global.site = process.env.SITE || ''; 8 | 9 | // These are taken from the sitemap 10 | // Filters out testing and staging pages so that only production url get tested 11 | global.urls = urls.filter((url) => !url.match(/(\/test\-)/i)?.length); 12 | 13 | // Synchronous reading of source code for any given url (read from `/compare`) 14 | global.html = (url) => { 15 | // Tidies up directory to not include https:// 16 | const httpsSlash = url.indexOf('//') + 2; 17 | const directory = url.slice(httpsSlash); 18 | 19 | const filePath = path.join(baseDirectory, directory, 'index.html'); 20 | 21 | return fs.readFileSync(filePath, 'utf-8'); 22 | }; 23 | }; 24 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
import Sitemapper from 'sitemapper';

/**
 * Fetch and parse a site's sitemap.xml.
 * @param {string} site Site URL or bare domain (protocol optional, trailing slash ok)
 * @returns {Promise<string[]>} Page URLs from the sitemap; empty array on failure
 */
export const parseSitemap = async (site) => {
  let urlWithoutTrailingSlash = site.replace(/\/$/, '');
  if (!urlWithoutTrailingSlash.match(/^http/)) {
    urlWithoutTrailingSlash = 'https://' + urlWithoutTrailingSlash;
  }
  const sitemap = new Sitemapper({
    url: `${urlWithoutTrailingSlash}/sitemap.xml`,
    timeout: 15000, // 15 seconds
  });

  try {
    const { sites, errors } = await sitemap.fetch();
    if (errors?.length) {
      console.error(errors);
    }

    // A webflow.io site publishes its sitemap with production URLs; rewrite
    // them to the webflow.io domain so the scraper targets the right host.
    // e.g. PRODUCTION_DOMAIN=test.com (leave out protocol)
    // Bug fix: guard against PRODUCTION_DOMAIN being unset — previously
    // `url.replace(undefined, …)` searched for the literal string "undefined".
    if (site.includes('webflow.io') && process.env.PRODUCTION_DOMAIN) {
      return sites.map((url) =>
        url.replace(process.env.PRODUCTION_DOMAIN, 'webflow.io')
      );
    }
    return sites;
  } catch (error) {
    console.error(error);
    // Bug fix: previously fell through and returned undefined, which crashed
    // callers that iterate the result (e.g. `urls.filter` in setup.mjs).
    return [];
  }
};
22 | -------------------------------------------------------------------------------- /scraper/scrape.mjs: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | 3 | import path from 'path'; 4 | import rimraf from 'rimraf'; 5 | import { baseDirectory, urls } from './config.mjs'; 6 | import { scrapePages } from './htmlScraper.mjs'; 7 | 8 | (async () => { 9 | await clearBaseDirectories(); 10 | scrapePages(); 11 | })(); 12 | 13 | async function clearBaseDirectories() { 14 | // clear existing content per sitemap base directories (for all possible values in sitemap) e.g. flow.offerzen.com 15 | // Does not clear assets 16 | try { 17 | const sitemapBaseDirectoriesHash = {}; 18 | urls.forEach(url => { 19 | const httpsSlash = url.indexOf('//') + 2; 20 | const firstSlash = url.indexOf('/', httpsSlash) + 1; 21 | const directory = url.slice(httpsSlash, firstSlash); 22 | 23 | return sitemapBaseDirectoriesHash[directory] = true; 24 | }); 25 | const sitemapBaseDirectories = Object.keys(sitemapBaseDirectoriesHash); 26 | 27 | for (let directory of sitemapBaseDirectories) { 28 | const directoryPath = path.join(baseDirectory, directory); 29 | rimraf.sync(directoryPath); 30 | } 31 | } 32 | catch (e) { 33 | console.error('Clearing directories failed', e); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /helpers/utils.mjs: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import path from 'path'; 3 | 4 | /** 5 | * Synchronously and recursively get directory contents 6 | * @param {string} dir Path to files (use `path`) 7 | * @param {object} object Defaults to empty object 8 | * @param {regex} object.includeOnly Use single regex to define what files 9 | * should be matched, others will be ignored 10 | */ 11 | export function getDirectoryContents(dir, { includeOnly } = {}) { 12 | var results = []; 13 | var list = 
/**
 * Resolve `src` against `base`, where `base` may itself be relative.
 * A relative base is resolved against a fake `resolve://` origin, which is
 * then stripped from the result so only path/query/fragment remain.
 */
export function resolveUrl(base, src) {
  const resolved = new URL(src, new URL(base, 'resolve://'));

  if (resolved.protocol !== 'resolve:') {
    // `base` was absolute, so the result is a complete URL.
    return resolved.toString();
  }

  // `base` was a relative URL: drop the fake origin.
  const { pathname, search, hash } = resolved;
  return pathname + search + hash;
}
/**
 * Gets assets from the content based if it's from the webflow cdn.
 * @param {object} object
 * @param {string} object.content html or css
 * @param {string} object.url base url to resolve relative asset paths against
 */
export function getAssetUrls({ content, url }) {
  // Only copy webflow cdn content e.g. assets.website-files.com/foo/bar.png
  // Example matches: https://regexr.com/6p22m
  // Captured without '" surrounding it otherwise it's difficult to deal with
  // capturing srcset (https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images)
  const cdnAssetPattern =
    /\bhttps?\:\/\/assets\.website\-files\.com\/[a-z0-9.\-_~!$&()*+;=:@% /]+\.([a-z]{2,4})\b/gi;

  const matches = content.match(cdnAssetPattern) ?? [];

  // De-duplicate (first-seen order preserved), then resolve each candidate
  // against the page URL.
  const uniqueMatches = [...new Set(matches)];

  return uniqueMatches.map((src) => resolveUrl(url, src));
}
// Decode percent-escapes so files are not saved with e.g. `%20` in their
// names (such files could not be served afterwards).
const cleanupAssetUrl = (url) => decodeURIComponent(url);

// Promise-based sleep: resolves after `delayInMs` milliseconds.
const delay = (delayInMs) =>
  new Promise((resolve) => setTimeout(resolve, delayInMs));
/**
 * Fetch a page and extract its normalised HTML plus referenced CDN assets.
 * @param {object} object
 * @param {string} object.url Page URL to fetch
 * @returns {Promise<{content: string, assetUrls: string[]}>}
 * @throws {Error} when the page cannot be fetched
 */
async function getPageContent({ url }) {
  let data = null;
  try {
    data = (await axios.get(url)).data;
  } catch (e) {
    console.error(`Axios failed for: ${url}`);
    // Bug fix: previously fell through with `data === null`, so
    // `cheerio.load(null)` threw an unrelated, confusing error below.
    throw new Error(`Could not fetch page: ${url}`, { cause: e });
  }

  // Re-serialise through cheerio so the markup is normalised.
  const $ = cheerio.load(data);

  const content = $.html();

  // Collect webflow CDN asset URLs referenced by the page, resolved
  // against the page URL.
  const assetUrls = getAssetUrls({ content, url });

  return { content, assetUrls };
}
const content = fs.readFileSync(assetPath, 'utf-8'); 105 | const assetUrls = getAssetUrls({ content, url }); 106 | 107 | return downloadAssets({ assetUrls }); 108 | } 109 | -------------------------------------------------------------------------------- /.github/workflows/publish_staging.yml: -------------------------------------------------------------------------------- 1 | name: Publish Staging 2 | 3 | on: 4 | workflow_dispatch: 5 | repository_dispatch: 6 | types: [publish_webflow] 7 | 8 | jobs: 9 | publish-target: 10 | runs-on: ubuntu-latest 11 | timeout-minutes: 1 12 | env: 13 | SITE: ${{ github.event.client_payload.site }} 14 | outputs: 15 | site: ${{ steps.regex-match.outputs.match }} 16 | steps: 17 | - uses: actions-ecosystem/action-regex-match@v2 18 | id: regex-match 19 | with: 20 | text: ${{ github.event.client_payload.site }} 21 | regex: '[a-z_\-]+\.webflow\.io' 22 | 23 | scrape: 24 | needs: [publish-target] 25 | timeout-minutes: 10 26 | if: ${{ needs.publish-target.outputs.site != '' }} 27 | runs-on: ubuntu-latest 28 | env: 29 | NODE_ENV: staging 30 | SITE: ${{ needs.publish-target.outputs.site }} 31 | CDN_URL: ${{ secrets.CDN_URL }} 32 | WEBFLOW_CDN_URL: https://assets.website-files.com/ 33 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | BRANCH_NAME: staging 35 | PREVIEW_DOMAIN: ${{ secrets.PREVIEW_DOMAIN }} 36 | 37 | strategy: 38 | matrix: 39 | node-version: [16.x] 40 | 41 | steps: 42 | - uses: actions/checkout@v4 43 | with: 44 | ref: master 45 | token: ${{ secrets.GITHUB_TOKEN }} 46 | 47 | - name: Reset branch 48 | run: | 49 | if [ "`git branch -r | egrep staging`" ] 50 | then 51 | git push origin --delete ${{ env.BRANCH_NAME }} 52 | fi 53 | git checkout -b ${{ env.BRANCH_NAME }} master 54 | git pull origin master 55 | 56 | - name: Use Node.js ${{ matrix.node-version }} 57 | uses: actions/setup-node@v4 58 | with: 59 | node-version: ${{ matrix.node-version }} 60 | # yarn caching 61 | - uses: actions/cache@v4 62 | with: 63 | path: '**/node_modules' 64 | 
key: ${{ runner.os }}-modules-${{ hashFiles('**/yarn.lock') }} 65 | 66 | - name: Install Dependencies 67 | run: | 68 | yarn 69 | 70 | - name: Run scraper 71 | run: | 72 | yarn scrape 73 | 74 | - name: Send slack notification 75 | if: failure() 76 | run: | 77 | curl -X POST -H "Content-type: application/json" --data '{"text": " Scraper failed for ${{ env.SITE }}"}' ${{ secrets.SLACK_HOOK }} 78 | 79 | # Without any changes it will not trigger deploys 80 | - name: fake 81 | run: | 82 | echo $(date '+%s') > test.txt 83 | 84 | - name: Commit and push 85 | uses: actions-js/push@v1.4 86 | with: 87 | github_token: ${{ secrets.GITHUB_TOKEN }} 88 | message: 'Automatic: scrape' 89 | branch: ${{ env.BRANCH_NAME }} 90 | force: true 91 | 92 | - name: Actually push 93 | run: | 94 | git push origin ${{ env.BRANCH_NAME }} 95 | 96 | - name: Run test 97 | id: test 98 | run: | 99 | SITE=${{ env.SITE }} yarn test 100 | 101 | - name: Send slack notification 102 | if: failure() 103 | run: | 104 | curl -X POST -H "Content-type: application/json" --data '{"text": " Tests failed for ${{ env.SITE }}: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}?check_suite_focus=true"}' ${{ secrets.SLACK_HOOK }} 105 | 106 | - name: Send slack notification 107 | if: success() 108 | run: | 109 | echo "SITE_PATH=$(echo "${{ env.SITE }}" | sed -e 's/https:\/\///g' -e 's:/*$::')" >> $GITHUB_ENV 110 | curl -X POST -H "Content-type: application/json" --data '{"text": "Tests successful for ${{ env.SITE }}"}' ${{ secrets.SLACK_HOOK }} 111 | 112 | - name: Send preview url notification 113 | if: success() 114 | run: | 115 | curl -X POST -H "Content-type: application/json" --data '{"text": "Preview Deployed: ${{ env.PREVIEW_DOMAIN }}${{ env.SITE_PATH }}"}' ${{ secrets.SLACK_HOOK }} 116 | -------------------------------------------------------------------------------- /.github/workflows/deploy-without-scraping.yml: 
-------------------------------------------------------------------------------- 1 | # This can be used to redeploy after rollback as it re-uses already scraped assets. It targets a specific site / project only 2 | name: Deploy without scraping 3 | 4 | on: 5 | workflow_dispatch: 6 | 7 | jobs: 8 | build: 9 | env: 10 | SITE: ${{ github.event.client_payload.site }} # Replace with actual webflow site url if this isn't triggered by webhook 11 | NODE_ENV: production 12 | CDN_URL: ${{ secrets.CDN_URL }} 13 | WEBFLOW_CDN_URL: https://assets.website-files.com/ 14 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 15 | PREVIEW_DOMAIN: ${{ secrets.CDN_URL }} 16 | 17 | runs-on: ubuntu-latest 18 | timeout-minutes: 7 19 | steps: 20 | 21 | - uses: actions/checkout@v4 22 | with: 23 | ref: master 24 | token: ${{ secrets.GITHUB_TOKEN }} 25 | 26 | - name: Use Node.js ${{ matrix.node-version }} 27 | uses: actions/setup-node@v4 28 | with: 29 | node-version: ${{ matrix.node-version }} 30 | 31 | # yarn caching 32 | - uses: actions/cache@v4 33 | with: 34 | path: '**/node_modules' 35 | key: ${{ runner.os }}-modules-${{ hashFiles('**/yarn.lock') }} 36 | 37 | - name: Install Dependencies 38 | run: | 39 | yarn build 40 | echo "SITE_PATH=$(echo "${{ env.SITE }}" | sed -e 's/https:\/\///g' -e 's:/*$::')" >> $GITHUB_ENV 41 | 42 | - name: Upload transformed sites 43 | uses: actions/upload-artifact@v4 44 | with: 45 | name: static-files 46 | path: static/ 47 | retention-days: 1 48 | 49 | - name: Send slack notification 50 | if: failure() 51 | run: | 52 | curl -X POST -H "Content-type: application/json" --data '{"text": " Production Deploy failed: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}?check_suite_focus=true"}' ${{ secrets.SLACK_HOOK }} 53 | 54 | 55 | - name: Send preview url notification 56 | if: success() 57 | run: | 58 | curl -X POST -H "Content-type: application/json" --data '{"text": "Production Deployed: ${{ env.PREVIEW_DOMAIN }}${{ env.SITE_PATH }}"}' ${{ 
secrets.SLACK_HOOK }} 59 | 60 | deploy-production: 61 | needs: [build] 62 | runs-on: ubuntu-latest 63 | timeout-minutes: 6 64 | 65 | steps: 66 | - name: Checkout 67 | uses: actions/checkout@v4 68 | 69 | - name: Set shared environment variables 70 | uses: offerzen/action-env-vars-from-ssm@v1 71 | with: 72 | path: '/shared/' 73 | env: 74 | AWS_ACCESS_KEY_ID: ${{ vars.ORG_AWS_ACCESS_KEY_ID_GHA }} 75 | AWS_SECRET_ACCESS_KEY: ${{ secrets.ORG_AWS_SECRET_ACCESS_KEY_GHA }} 76 | AWS_DEFAULT_REGION: ${{ secrets.ORG_AWS_REGION }} 77 | AWS_ROLE_ARN: ${{ vars.ORG_AWS_ROLE_ARN_GHA_PRODUCTION }} 78 | 79 | - name: Configure AWS Credentials 80 | uses: aws-actions/configure-aws-credentials@v4 81 | with: 82 | aws-access-key-id: ${{ vars.ORG_AWS_ACCESS_KEY_ID_GHA }} 83 | aws-secret-access-key: ${{ secrets.ORG_AWS_SECRET_ACCESS_KEY_GHA }} 84 | aws-region: ${{ secrets.ORG_AWS_REGION }} 85 | role-to-assume: ${{ vars.ORG_AWS_ROLE_ARN_GHA_PRODUCTION }} 86 | role-duration-seconds: 900 87 | 88 | - name: Download transformed sites 89 | uses: actions/download-artifact@v4 90 | with: 91 | name: static-files 92 | path: static/ 93 | 94 | - name: Upload build to S3 bucket 95 | env: 96 | S3_DEPLOY_PATH: "s3://$S3_BUCKET_NAME_CDN/${{ github.event.repository.id }}" 97 | run: | 98 | echo "Uploading non-HTML files to ${{ env.S3_DEPLOY_PATH }}" 99 | aws s3 sync --no-progress --exclude *.html static/ ${{ env.S3_DEPLOY_PATH }} 100 | echo 101 | echo "Uploading HTML files to S3 with 'cache-control:no-cache' header..." 102 | aws s3 sync --no-progress --include *.html --content-type "text/html;charset=utf-8" --cache-control no-cache static/ ${{ env.S3_DEPLOY_PATH }} 103 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # Wombat 2 | 3 | Webflow Combat 4 | 5 | ## Purpose 6 | 7 | Scrapes pages from Webflow whenever a project is published. 
8 | 9 | - Run tests on the source of scraped pages 10 | - Transform source code with Core Web Vitals optimisations 11 | - Serve all pages on a different platform so any Webflow publish will only go live if it passes all tests and transformations 12 | 13 | ## Why 14 | 15 | Webflow has significant issues that reduce the safety of any given publish action. 16 | 17 | **Webflow problems** 18 | Ticks are where this project solves the issue: 19 | 20 | - [x] Changes are very obfuscated and not clear what is being done (or undone if reverting) 21 | - [x] Changes made while developing can accidentally be left in the code and published 22 | - [x] Bad page construction for core web vitals that cannot be changed as they are editable in the Webflow designer 23 | - [x] Single CSS file for entire project (all pages share one large file) 24 | - [x] Unable to see what changes will go live without manually reviewing an entire project _before_ clicking publish 25 | - [x] Clear list of changes each publish 26 | - [x] Reverting changes 27 | - [ ] When publishing all project pages will go live and it cannot be "locked" to a given page or set of changes 28 | - [ ] CSS edited on one side of the site can affect the rest of the site 29 | 30 | ## Project architecture 31 | 32 | ### Process flow 33 | 34 | ![Process flow](./docs/project-process-flow.png) 35 | 36 | #### Setup a Webflow project 37 | 38 | Add a webhook to a Webflow project on `publish`. Zapier is one option, there are many others. 39 | 40 | #### When a Webflow project is published 41 | 42 | - Webflow is published, fires the Webhook 43 | - `domains` are extracted and first domain is included in `client_payload.site` for GitHub workflow. 
Any domain can be used, all point to the same place 44 | - `POST` to [Wombat GitHub actions](https://api.github.com/repos/offerzen/wombat/dispatches) as `publish_webflow` 45 | - Fires the `.github/workflows/publish.yml` workflow 46 | - Webflow project specified in `client_payload.site` is scraped 47 | - All files are placed in `/compare` 48 | - Files are placed in a root folder based on name from `client_payload.site` 49 | - Folders are created based on directory path and pages are added as `index.html` files 50 | - PR is created so raw (without transformations) Webflow files can be compared to the last publish 51 | - Source code tests (`yarn test`) are run against `/compare` files. This uses Jest (all added to `/tests`) 52 | - Any successes or failures for this workflow will be reported to Slack _#gp-webflow-platform_ 53 | 54 | #### Build and deploy 55 | 56 | To ensure only tested files are deployed, all files are taken from `/compare` and not rescraped. 57 | 58 | - Delete everything in `/static` to ensure we don't serve old content 59 | - Copy everything from `/compare` to `/static`. 60 | - `/compare` content is always added to git but `/static` is not, so that there are no extra untracked files in version control 61 | - Run optimisation transformations (`/core-web-vitals/index.mjs`) on html source and recopy files to `/static`. 62 | - Possibility: trigger end-to-end tests in staging (outside of project scope) 63 | - Serve 64 | 65 | ### File architecture 66 | 67 | Core components are marked by `[component]`. 68 | 69 | ```bash 70 | ├── babel.config.js # 71 | │ 72 | ├── [build] `Component: Builder` 73 | │ │ Building the output for deployment. This will be run as a final step before deployment. 74 | │ │ Files are modified and copied from `/compare` to `/static`. 
75 | │ ├── assets # Operate on any assets 76 | │ │ └── rewriteAssetPaths.mjs # All paths to Webflow's CDN are rewritten to use our CDN 77 | │ ├── coreWebVitals # Optimisation transformations plugins 78 | │ │ ├── css # Optimisation plugins for css 79 | │ │ │ └── uncss.mjs # Plugin: Makes CSS per-page instead of per-project and strips out unnecessary rules 80 | │ │ ├── html # Optimisation plugins for html 81 | │ │ │ └── typekit.mjs # Makes Typekit async 82 | │ │ └── index.mjs # 83 | │ └── index.mjs # Run by `yarn build`. Copies from /compare to /static and runs all plugins 84 | │ 85 | ├── compare # Scraped files from Webflow without transformations 86 | │ └── ... 87 | │ 88 | ├── docs # Images for this readme 89 | │ └── ... 90 | │ 91 | ├── helpers # Shared tools 92 | │ └── utils.mjs # Read files from directory, pipe 93 | │ 94 | ├── jest.config.js # For jest source code tests 95 | ├── scrape-all.sh # Scrapes all files for each Webflow project to populate /compare if needed 96 | │ 97 | ├── [scraper] `Component: Scraper` 98 | │ │ Get all the pages and assets from Webflow. Downloads files to `/compare`. 99 | │ ├── assetScraper.mjs # Handle scraping and downloading assets e.g. images, css, svg files 100 | │ ├── config.mjs # Global vars for current site 101 | │ ├── htmlScraper.mjs # Handle scraping HTML 102 | │ ├── scrape.mjs # Scrape Webflow based on sitemap 103 | │ └── sitemap.mjs # Get sitemap from Webflow as targets 104 | │ 105 | ├── static # Scraped files from Webflow with optimisations for serving 106 | │ └── ... 107 | │ 108 | └── [tests] `Component: Tester` 109 | │ Tests to run on scraped content 110 | │ Use Jest, and see the example `test.template.js` to learn more. 111 | ├── ngrok.test.js # Checks for presence of ngrok 112 | ├── setup.mjs # Allows tests to query source with `global.html(...)` 113 | └── test.template.js # Example test 114 | ``` 115 | 116 | _To regenerate above, install `tree` e.g. 
`brew install tree` and run `yarn doc-tree` and cleanup output._ 117 | 118 | ## Local development 119 | 120 | Add a .env file with the following: 121 | 122 | ```bash 123 | SITE= # e.g https://offerzen.webflow.io/ 124 | CDN_URL= # e.g. https://offerzen.github.io/assets/ 125 | ``` 126 | 127 | Setup the project: 128 | 129 | ```bash 130 | $ yarn 131 | ``` 132 | 133 | Scrape just one project: 134 | 135 | ```bash 136 | $ yarn scrape 137 | ``` 138 | 139 | Run tests on scraped files: 140 | 141 | ```bash 142 | $ yarn test 143 | ``` 144 | 145 | Run build-step optimisations/transformations: 146 | 147 | ```bash 148 | $ yarn build 149 | ``` 150 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | workflow_dispatch: 5 | repository_dispatch: 6 | types: [publish_webflow] 7 | 8 | jobs: 9 | publish-target: 10 | runs-on: ubuntu-latest 11 | timeout-minutes: 1 12 | env: 13 | SITE: ${{ github.event.client_payload.site }} 14 | outputs: 15 | site: ${{ steps.regex-match.outputs.match }} 16 | steps: 17 | - uses: actions-ecosystem/action-regex-match@v2 18 | id: regex-match 19 | with: 20 | text: ${{ github.event.client_payload.site }} 21 | regex: '[a-z_\-]+\.offerzen\.com' 22 | 23 | scrape: 24 | needs: [publish-target] 25 | if: ${{ needs.publish-target.outputs.site != '' }} 26 | runs-on: ubuntu-latest 27 | timeout-minutes: 10 28 | env: 29 | NODE_ENV: test 30 | SITE: ${{ needs.publish-target.outputs.site }} 31 | CDN_URL: ${{ secrets.CDN_URL }} 32 | WEBFLOW_CDN_URL: https://assets.website-files.com/ 33 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | BRANCH_NAME: test 35 | 36 | strategy: 37 | matrix: 38 | node-version: [16.x] 39 | 40 | steps: 41 | - uses: actions/checkout@v4 42 | with: 43 | ref: master 44 | token: ${{ secrets.GITHUB_TOKEN }} 45 | 46 | - name: Use Node.js ${{ matrix.node-version }} 47 | uses: 
actions/setup-node@v4 48 | with: 49 | node-version: ${{ matrix.node-version }} 50 | # yarn caching 51 | - uses: actions/cache@v4 52 | with: 53 | path: '**/node_modules' 54 | key: ${{ runner.os }}-modules-${{ hashFiles('**/yarn.lock') }} 55 | 56 | - name: Install Dependencies 57 | run: | 58 | yarn 59 | 60 | - name: Run scraper 61 | run: | 62 | yarn scrape 63 | 64 | - name: Send slack notification 65 | if: failure() 66 | run: | 67 | curl -X POST -H "Content-type: application/json" --data '{"text": " Scraper failed for ${{ env.SITE }}"}' ${{ secrets.SLACK_HOOK }} 68 | 69 | - name: Setup branch 70 | id: branch 71 | run: | 72 | echo "SITE_PATH=$(echo "${{ env.SITE }}" | sed -e 's/https:\/\///g' -e 's:/*$::')" >> $GITHUB_ENV 73 | echo "BRANCH_NAME=automatic/$(date '+%s')" >> $GITHUB_ENV 74 | 75 | - name: Create branch 76 | run: | 77 | git checkout -b ${{ env.BRANCH_NAME }} 78 | 79 | - name: Commit and push 80 | uses: actions-js/push@v1.4 81 | with: 82 | github_token: ${{ secrets.GITHUB_TOKEN }} 83 | message: 'Automatic: scrape' 84 | branch: ${{ env.BRANCH_NAME }} 85 | 86 | - name: Actually push 87 | run: | 88 | git push -u origin ${{ env.BRANCH_NAME }} 89 | 90 | - name: Create Pull Request 91 | id: cpr 92 | run: | 93 | echo "PULL_REQUEST_URL=$(gh pr create --base master --title 'Webflow publish for ${{ env.SITE_PATH }}' --body 'Automatic: Webflow publish')" >> $GITHUB_ENV 94 | 95 | - name: Run test 96 | id: test 97 | run: | 98 | SITE=${{ env.SITE }} yarn test 99 | 100 | - name: Send slack notification 101 | if: failure() 102 | run: | 103 | curl -X POST -H "Content-type: application/json" --data '{"text": " Tests failed for ${{ env.SITE }}: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}?check_suite_focus=true"}' ${{ secrets.SLACK_HOOK }} 104 | 105 | - name: Send slack notification 106 | if: success() 107 | run: | 108 | curl -X POST -H "Content-type: application/json" --data '{"text": "Tests successful for ${{ env.SITE_PATH }}"}' ${{ 
secrets.SLACK_HOOK }} 109 | 110 | - name: Auto merge PR 111 | id: automerge 112 | run: | 113 | gh pr merge ${{ env.PULL_REQUEST_URL }} --delete-branch --auto --merge 114 | 115 | - name: feedback 116 | run: | 117 | curl -X POST -H "Content-type: application/json" --data '{"text": "Pull request: ${{ env.PULL_REQUEST_URL }}"}' ${{ secrets.SLACK_HOOK }} 118 | 119 | build: 120 | needs: [publish-target,scrape] 121 | env: 122 | SITE: ${{ needs.publish-target.outputs.site }} 123 | NODE_ENV: production 124 | CDN_URL: ${{ secrets.CDN_URL }} 125 | WEBFLOW_CDN_URL: https://assets.website-files.com/ 126 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 127 | PREVIEW_DOMAIN: ${{ secrets.PREVIEW_DOMAIN }} 128 | 129 | runs-on: ubuntu-latest 130 | timeout-minutes: 20 131 | steps: 132 | 133 | - uses: actions/checkout@v4 134 | with: 135 | ref: master 136 | token: ${{ secrets.GITHUB_TOKEN }} 137 | 138 | - name: Use Node.js ${{ matrix.node-version }} 139 | uses: actions/setup-node@v4 140 | with: 141 | node-version: ${{ matrix.node-version }} 142 | 143 | # yarn caching 144 | - uses: actions/cache@v4 145 | with: 146 | path: '**/node_modules' 147 | key: ${{ runner.os }}-modules-${{ hashFiles('**/yarn.lock') }} 148 | 149 | - name: Install Dependencies 150 | run: | 151 | yarn build 152 | echo "SITE_PATH=$(echo "${{ env.SITE }}" | sed -e 's/https:\/\///g' -e 's:/*$::')" >> $GITHUB_ENV 153 | 154 | - name: Upload transformed sites 155 | uses: actions/upload-artifact@v4 156 | with: 157 | name: static-files 158 | path: static/ 159 | retention-days: 1 160 | 161 | - name: Send slack notification 162 | if: failure() 163 | run: | 164 | curl -X POST -H "Content-type: application/json" --data '{"text": " Production Deploy failed: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}?check_suite_focus=true"}' ${{ secrets.SLACK_HOOK }} 165 | 166 | 167 | - name: Send preview url notification 168 | if: success() 169 | run: | 170 | curl -X POST -H "Content-type: application/json" --data '{"text": 
"Production Deployed: ${{ env.PREVIEW_DOMAIN }}${{ env.SITE_PATH }}"}' ${{ secrets.SLACK_HOOK }} 171 | 172 | deploy-staging: 173 | needs: [build] 174 | runs-on: ubuntu-latest 175 | timeout-minutes: 15 176 | 177 | steps: 178 | - name: Checkout 179 | uses: actions/checkout@v4 180 | 181 | - name: Set shared environment variables 182 | uses: offerzen/action-env-vars-from-ssm@v1 183 | with: 184 | path: '/shared/' 185 | env: 186 | AWS_ACCESS_KEY_ID: ${{ vars.ORG_AWS_ACCESS_KEY_ID_GHA }} 187 | AWS_SECRET_ACCESS_KEY: ${{ secrets.ORG_AWS_SECRET_ACCESS_KEY_GHA }} 188 | AWS_DEFAULT_REGION: ${{ secrets.ORG_AWS_REGION }} 189 | AWS_ROLE_ARN: ${{ vars.ORG_AWS_ROLE_ARN_GHA_STAGING }} 190 | 191 | - name: Configure AWS Credentials 192 | uses: aws-actions/configure-aws-credentials@v4 193 | with: 194 | aws-access-key-id: ${{ vars.ORG_AWS_ACCESS_KEY_ID_GHA }} 195 | aws-secret-access-key: ${{ secrets.ORG_AWS_SECRET_ACCESS_KEY_GHA }} 196 | aws-region: ${{ secrets.ORG_AWS_REGION }} 197 | role-to-assume: ${{ vars.ORG_AWS_ROLE_ARN_GHA_STAGING }} 198 | role-duration-seconds: 900 199 | 200 | - name: Download transformed sites 201 | uses: actions/download-artifact@v4 202 | with: 203 | name: static-files 204 | path: static/ 205 | 206 | - name: Upload build to S3 bucket 207 | env: 208 | S3_DEPLOY_PATH: "s3://$S3_BUCKET_NAME_CDN/${{ github.event.repository.id }}" 209 | run: | 210 | echo "Uploading non-HTML files to ${{ env.S3_DEPLOY_PATH }}" 211 | aws s3 sync --no-progress --exclude *.html static/ ${{ env.S3_DEPLOY_PATH }} 212 | echo 213 | echo "Uploading HTML files to S3 with 'cache-control:no-cache' header..." 
214 | aws s3 sync --no-progress --include *.html --content-type "text/html;charset=utf-8" --cache-control no-cache static/ ${{ env.S3_DEPLOY_PATH }} 215 | 216 | deploy-production: 217 | needs: [build] 218 | runs-on: ubuntu-latest 219 | timeout-minutes: 15 220 | 221 | steps: 222 | - name: Checkout 223 | uses: actions/checkout@v4 224 | 225 | - name: Set shared environment variables 226 | uses: offerzen/action-env-vars-from-ssm@v1 227 | with: 228 | path: '/shared/' 229 | env: 230 | AWS_ACCESS_KEY_ID: ${{ vars.ORG_AWS_ACCESS_KEY_ID_GHA }} 231 | AWS_SECRET_ACCESS_KEY: ${{ secrets.ORG_AWS_SECRET_ACCESS_KEY_GHA }} 232 | AWS_DEFAULT_REGION: ${{ secrets.ORG_AWS_REGION }} 233 | AWS_ROLE_ARN: ${{ vars.ORG_AWS_ROLE_ARN_GHA_PRODUCTION }} 234 | 235 | - name: Configure AWS Credentials 236 | uses: aws-actions/configure-aws-credentials@v4 237 | with: 238 | aws-access-key-id: ${{ vars.ORG_AWS_ACCESS_KEY_ID_GHA }} 239 | aws-secret-access-key: ${{ secrets.ORG_AWS_SECRET_ACCESS_KEY_GHA }} 240 | aws-region: ${{ secrets.ORG_AWS_REGION }} 241 | role-to-assume: ${{ vars.ORG_AWS_ROLE_ARN_GHA_PRODUCTION }} 242 | role-duration-seconds: 900 243 | 244 | - name: Download transformed sites 245 | uses: actions/download-artifact@v4 246 | with: 247 | name: static-files 248 | path: static/ 249 | 250 | - name: Upload build to S3 bucket 251 | env: 252 | S3_DEPLOY_PATH: "s3://$S3_BUCKET_NAME_CDN/${{ github.event.repository.id }}" 253 | run: | 254 | echo "Uploading non-HTML files to ${{ env.S3_DEPLOY_PATH }}" 255 | aws s3 sync --no-progress --exclude *.html static/ ${{ env.S3_DEPLOY_PATH }} 256 | echo 257 | echo "Uploading HTML files to S3 with 'cache-control:no-cache' header..." 258 | aws s3 sync --no-progress --include *.html --content-type "text/html;charset=utf-8" --cache-control no-cache static/ ${{ env.S3_DEPLOY_PATH }} 259 | --------------------------------------------------------------------------------