├── .dockerignore
├── .editorconfig
├── .env.example
├── .eslintrc.js
├── .gitignore
├── .prettierignore
├── .prettierrc
├── Dockerfile
├── LICENSE
├── README.md
├── bin
│   └── deploy_container.sh
├── docker_entrypoint.sh
├── lib
│   ├── constants
│   │   ├── args.js
│   │   ├── env.js
│   │   └── index.js
│   ├── crawler.js
│   ├── index.js
│   ├── logger.js
│   ├── run-script.js
│   └── utils
│       ├── async-for-each.js
│       ├── debugger.js
│       ├── from-image-url.js
│       ├── promise-retry.js
│       ├── remove-specials-and-spaces.js
│       └── sentry.js
├── package.json
├── resources
│   ├── conditions.yml
│   ├── fargate.yml
│   ├── iam.yml
│   ├── outputs.yml
│   ├── parameters.yml
│   ├── sns.yml
│   └── state-machines
│       └── crawl.yml
├── scripts
│   ├── screenshot.js
│   └── utils
│       ├── evaluate.js
│       └── wait-and-retry-until.js
├── serverless.yml
└── yarn.lock
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | bin/
2 | .vscode/
3 | node_modules/
4 | .serverless
5 | cache/
6 | debug/
7 | logs/
8 | test/
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 | 
3 | [*]
4 | indent_style = tab
5 | indent_size = 2
6 | tab_width = 2
7 | end_of_line = lf
8 | charset = utf-8
9 | trim_trailing_whitespace = true
10 | insert_final_newline = true
11 | 
12 | [*.yml]
13 | indent_style = space
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | SENTRY_DSN=""
2 | 
3 | AWS_ACCESS_ID=""
4 | AWS_SECRET_KEY=""
5 | 
6 | NOTIFY_EMAIL=""
7 | 
8 | S3_BUCKET_NAME=""
--------------------------------------------------------------------------------
/.eslintrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | 	env: {
3 | 		node: true
4 | 	},
5 | 	globals: {
6 | 		// For use in page.evaluate()
7 | 		document: "readonly",
8 | 		window: "readonly"
9 | 	},
10 | 	extends: [
11 | 		"eslint:recommended",
12 | 		"plugin:node/recommended",
13 | 		"prettier",
14 | 		"plugin:import/errors",
15 | 		"plugin:import/warnings"
16 | 	],
17 | 	plugins: [],
18 | 	parserOptions: {
19 | 		ecmaVersion: 2018,
20 | 		sourceType: "module",
21 | 		allowImportExportEverywhere: true
22 | 	},
23 | 	settings: {
24 | 		"import/resolver": {
25 | 			alias: {
26 | 				extensions: [".js", ".jsx", ".json"]
27 | 			}
28 | 		}
29 | 	},
30 | 	rules: {
31 | 		"no-console": "off",
32 | 		"no-shadow": "off",
33 | 		"no-unused-vars": ["error", { ignoreRestSiblings: true }],
34 | 		// See: https://github.com/benmosher/eslint-plugin-import/issues/496
35 | 		// https://stackoverflow.com/questions/44939304/eslint-should-be-listed-in-the-projects-dependencies-not-devdependencies
36 | 		"import/no-extraneous-dependencies": ["error", { devDependencies: true }],
37 | 		"import/prefer-default-export": 0,
38 | 		"no-template-curly-in-string": 0,
39 | 		"no-underscore-dangle": 0,
40 | 		"class-methods-use-this": 0,
41 | 		"no-param-reassign": 0,
42 | 		"no-await-in-loop": 0,
43 | 		"no-plusplus": 0,
44 | 		"import/no-dynamic-require": 0
45 | 	}
46 | };
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # dependencies
2 | /node_modules
3 | jspm_packages
4 | /flow-typed
5 | /.pnp
6 | .pnp.js
7 | 
8 | # testing
9 | /.test_temp
10 | /coverage
11 | 
12 | # misc
13 | .DS_Store
14 | .env.*
15 | !.env.example
16 | package.json.lerna_backup
17 | 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | 22 | # serverless 23 | .serverless 24 | 25 | # output 26 | /logs 27 | /debug 28 | cache/* 29 | !cache/.gitkeep 30 | output/* 31 | !output/.gitkeep 32 | screenshot.jpg 33 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | build 2 | dist 3 | coverage 4 | node_modules 5 | vendor 6 | .yarn 7 | .next 8 | out 9 | logs 10 | debug 11 | cache -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "trailingComma": "none", 3 | "tabWidth": 2, 4 | "useTabs": true, 5 | "semi": true, 6 | "overrides": [ 7 | { 8 | "files": "*.yml", 9 | "options": { 10 | "useTabs": false 11 | } 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:14-slim 2 | RUN apt-get update 3 | 4 | ENV HOME_DIR /usr/src/app 5 | ENV DISABLE_TOR_PROXY false 6 | 7 | # for https 8 | RUN apt-get install -yyq ca-certificates 9 | 10 | # install libraries 11 | RUN apt-get install -yyq libappindicator1 libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 12 | 13 | # tools 14 | RUN apt-get install -yyq gconf-service lsb-release wget xdg-utils 15 | 16 | # and fonts 17 | RUN apt-get install -yyq fonts-liberation 18 | 19 | # OS dependencies for image manipulation 20 | RUN apt-get install -yyq build-essential libcairo2-dev libpango1.0-dev libjpeg-dev libgif-dev librsvg2-dev libvips libvips-dev 21 | 22 | # Install Tor Proxy dependencies 23 | RUN apt-get install -yyq apt-transport-https curl 24 | RUN echo "deb https://deb.torproject.org/torproject.org/ $(lsb_release -cs) main" > /etc/apt/sources.list.d/tor.list 25 | RUN curl https://deb.torproject.org/torproject.org/A3C4F0F979CAA22CDBA8F512EE8CBC9E886DDD89.asc | gpg --import 26 | RUN gpg --export A3C4F0F979CAA22CDBA8F512EE8CBC9E886DDD89 | apt-key add - 27 | RUN apt update 28 | RUN apt install -yyq tor tor-geoipdb torsocks deb.torproject.org-keyring; \ 29 | tor --version 30 | 31 | RUN mkdir -p $HOME_DIR 32 | 33 | WORKDIR $HOME_DIR 34 | 35 | # Add our package.json and install *before* adding our application files to 36 | # optimize build performance 37 | ADD package.json $HOME_DIR 38 | ADD yarn.lock $HOME_DIR 39 | 40 | # install the necessary packages 41 | RUN npm_config_build_from_source=true yarn install --unsafe-perm --save-exact --production 42 | COPY . 
$HOME_DIR
43 | RUN yarn clean
44 | 
45 | RUN chmod +x ./docker_entrypoint.sh
46 | 
47 | ENTRYPOINT ["./docker_entrypoint.sh"]
48 | CMD ["clean"]
49 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 Ryan Soury
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Serverless Web Crawler
2 | 
3 | A Serverless Web Crawler that can execute for an indefinite amount of time. Perfect for crawling jobs that last longer than a minute and only need to be executed once or twice a month.
4 | 
5 | This boilerplate library can be used to deploy a completely serverless workflow to AWS that allows for multi-step web crawling.
6 | It runs a web crawler in a Docker Container that is managed by AWS Fargate.
7 | The AWS Fargate process is triggered by an AWS Step Functions Workflow.
8 | This allows you to extend the workflow to prepare data for the Web Crawler, or to manipulate the data it produces.
9 | 
10 | AWS Step Functions serves as a really good initiator for Fargate processes, as executions can be triggered by a schedule or an HTTP Request.
11 | AWS Step Functions can also trigger Notifications via SNS when processes fail or complete.
12 | AWS Step Functions is also serverless by default, requiring no compute resources until it's executed.
13 | 
14 | ## Getting Started
15 | 
16 | ### Set up your environment file
17 | 
18 | 1. Copy `.env.example` to `.env.development` or `.env.production`, depending on which environment you're configuring.
19 | 2. Add your values to the environment dotenv file.
20 | 
21 | #### Environment Variables vs Argument Parameters
22 | 
23 | It's important to distinguish between Environment and Argument parameters.
24 | Environment variables should configure how the crawler interfaces with its environment. This includes where it transmits data, which email is notified via SNS, which AWS Credentials to use, etc.
25 | Argument parameters should configure how the crawler operates. These are settings that directly change the way the crawler runs. This includes modifying how many concurrent browsers/requests are executed, whether Tor is used, which storage mechanism to use, etc.
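
For example, a single run might use both kinds of configuration. The variable names below come from `.env.example` and the flags from `./lib/constants/args.js`, but the values are illustrative only:

```shell
# .env.production (environment: where data is sent, who is notified)
S3_BUCKET_NAME="my-crawl-bucket"
NOTIFY_EMAIL="me@example.com"

# Arguments (operation: what to crawl and how)
yarn start -r screenshot -p url=https://www.webdoodle.com.au/ --concurrency 2 --storage s3
```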
26 | 
27 | ### Local
28 | 
29 | In Development:
30 | 
31 | ```shell
32 | yarn dev -r screenshot -p url=https://www.webdoodle.com.au/ --with-ui
33 | ```
34 | 
35 | In Production:
36 | 
37 | ```shell
38 | yarn start -r screenshot -p url=https://www.webdoodle.com.au/ --with-ui
39 | ```
40 | 
41 | ### Docker
42 | 
43 | The Docker Image will only work for a production environment. Be sure to configure your `.env.production` dotenv file before building your Docker Image.
44 | 
45 | Build the Docker Image:
46 | 
47 | ```shell
48 | docker build -t serverless-web-crawl:latest .
49 | ```
50 | 
51 | Run the Docker Container locally:
52 | 
53 | ```shell
54 | docker run --rm -it serverless-web-crawl:latest start -r screenshot -p url=https://www.webdoodle.com.au/ -s s3
55 | ```
56 | 
57 | #### AWS ECR Settings
58 | 
59 | To configure the settings used to push the Docker Image to AWS ECR, please see `./bin/deploy_container.sh`.
60 | 
61 | ### Passing Parameters
62 | 
63 | To learn what parameters can be passed to the crawler, please see `./lib/constants`.
64 | Environment variables are defined in `./lib/constants/env.js`.
65 | Argument parameters are defined in `./lib/constants/args.js`.
66 | 
67 | ## Features
68 | 
69 | This repository is full of features to simplify getting started with a Serverless Web Crawl.
70 | 
71 | - Puppeteer Concurrency using `puppeteer-cluster`
72 | - Puppeteer Addons using `puppeteer-extra`
73 | - User Agent Masking
74 | - Optionally use Tor Proxy (to really avoid detection)
75 | 
76 | ## Building and Deploying Docker Image
77 | 
78 | Use the script provided: `./bin/deploy_container.sh`.
79 | 
80 | ## Developing a Crawl Script
81 | 
82 | Refer to the example Crawl Script that takes a Screenshot of the URL provided through a parameter: `./scripts/screenshot.js`.
83 | Create new scripts by adding a new file to the `./scripts/` folder, then exporting an `async function(){}` (a minimal sketch of such a script appears at the end of this document).
84 | 
85 | ## Deploy to AWS
86 | 
87 | ```shell
88 | yarn deploy
89 | ```
90 | 
91 | You can execute the deployed API (if you've used an HTTP event to trigger the state machine) like so:
92 | 
93 | ```shell
94 | curl https://5jh0zty1c3.execute-api.ap-southeast-2.amazonaws.com/prod/ -X POST -d '{"command": ["start", "--run", "screenshot", "--params", "url=https://www.webdoodle.com.au/", "--concurrency", "1", "--storage", "s3"]}'
95 | ```
96 | 
97 | ## Example State Machine Definition
98 | 
99 | ```yaml
100 | id: Crawl
101 | events:
102 |   - http:
103 |       path: "/"
104 |       method: "POST"
105 |   # - schedule:
106 |   #     rate: rate(24 hours)
107 |   #     enabled:
108 |   #       # ${self:custom.scheduleEnabled.${opt:stage, self:provider.stage}, false}
109 |   #       false
110 |   #     input:
111 |   #       executionId.$: $$.Execution.Id
112 |   #       executionName.$: $$.Execution.Name
113 | notifications:
114 |   ABORTED:
115 |     - sns: !Ref WebCrawlNotificationsTopic
116 |   FAILED:
117 |     - sns: !Ref WebCrawlNotificationsTopic
118 |   TIMED_OUT:
119 |     - sns: !Ref WebCrawlNotificationsTopic
120 |   SUCCEEDED:
121 |     - sns: !Ref WebCrawlNotificationsTopic
122 | role:
123 |   Fn::GetAtt: [StateMachinePassRole, Arn]
124 | definition:
125 |   Comment: "Serverless Web Crawl"
126 |   StartAt: PullProducts
127 |   States:
128 |     PullProducts:
129 |       Type: Task
130 |       Resource: "arn:aws:states:::ecs:runTask.sync"
131 |       Parameters:
132 |         LaunchType: "FARGATE"
133 |         Cluster: "#{ECSCluster}"
134 |         TaskDefinition: "#{FargateTaskDefinition}"
135 |         NetworkConfiguration:
136 |           AwsvpcConfiguration:
137 |             Subnets:
138 |               - "#{PublicSubnetOne}"
139 |               - "#{PublicSubnetTwo}"
140 |             AssignPublicIp: ENABLED
141 |         Overrides:
142 |           ContainerOverrides:
143 |             - 
Name: "#{ServiceName}" 144 | Command: 145 | - start 146 | - --run 147 | - products 148 | - --cloud 149 | - --concurrency 150 | - "6" 151 | - --tor 152 | Environment: 153 | - Name: EXECUTION_ID 154 | Value.$: $$.Execution.Id 155 | - Name: EXECUTION_NAME 156 | Value.$: $$.Execution.Name 157 | Next: Check 158 | Check: 159 | Type: Task 160 | Resource: 161 | Fn::GetAtt: [checkCrawl, Arn] 162 | Parameters: 163 | executionId.$: "$$.Execution.Id" 164 | executionName.$: "$$.Execution.Name" 165 | storeId.$: "$$.Execution.Name" 166 | ResultPath: "$.check" 167 | Next: DetermineActionOnProducts 168 | FailedToPullProducts: 169 | Type: Fail 170 | Cause: "No products pulled." 171 | DetermineActionOnProducts: 172 | Type: Choice 173 | Choices: 174 | - Variable: "$.check" 175 | NumericEquals: 0 176 | Next: FailedToPullProducts 177 | Default: Enrich 178 | Enrich: 179 | Type: Parallel 180 | Branches: 181 | - StartAt: ProductAttributes 182 | States: 183 | ProductAttributes: 184 | Type: Task 185 | Resource: "arn:aws:states:::ecs:runTask.sync" 186 | Parameters: 187 | LaunchType: "FARGATE" 188 | Cluster: "#{ECSCluster}" 189 | TaskDefinition: "#{FargateTaskDefinition}" 190 | NetworkConfiguration: 191 | AwsvpcConfiguration: 192 | Subnets: 193 | - "#{PublicSubnetOne}" 194 | - "#{PublicSubnetTwo}" 195 | AssignPublicIp: ENABLED 196 | Overrides: 197 | ContainerOverrides: 198 | - Name: "#{ServiceName}" 199 | Command: 200 | - start 201 | - --run 202 | - product/attributes 203 | - --cloud 204 | - --concurrency 205 | - "6" 206 | - --tor 207 | Environment: 208 | - Name: EXECUTION_ID 209 | Value.$: $$.Execution.Id 210 | - Name: EXECUTION_NAME 211 | Value.$: $$.Execution.Name 212 | End: true 213 | - StartAt: ProductPages 214 | States: 215 | ProductPages: 216 | Type: Task 217 | Resource: "arn:aws:states:::ecs:runTask.sync" 218 | Parameters: 219 | LaunchType: "FARGATE" 220 | Cluster: "#{ECSCluster}" 221 | TaskDefinition: "#{FargateTaskDefinition}" 222 | NetworkConfiguration: 223 | AwsvpcConfiguration: 224 | Subnets: 225 | - "#{PublicSubnetOne}" 226 | - "#{PublicSubnetTwo}" 227 | AssignPublicIp: ENABLED 228 | Overrides: 229 | ContainerOverrides: 230 | - Name: "#{ServiceName}" 231 | Command: 232 | - start 233 | - --run 234 | - product/pages 235 | - --cloud 236 | - --concurrency 237 | - "6" 238 | - --tor 239 | Environment: 240 | - Name: EXECUTION_ID 241 | Value.$: $$.Execution.Id 242 | - Name: EXECUTION_NAME 243 | Value.$: $$.Execution.Name 244 | Next: ProductSearchResults 245 | ProductSearchResults: 246 | Type: Task 247 | Resource: "arn:aws:states:::ecs:runTask.sync" 248 | Parameters: 249 | LaunchType: "FARGATE" 250 | Cluster: "#{ECSCluster}" 251 | TaskDefinition: "#{FargateTaskDefinition}" 252 | NetworkConfiguration: 253 | AwsvpcConfiguration: 254 | Subnets: 255 | - "#{PublicSubnetOne}" 256 | - "#{PublicSubnetTwo}" 257 | AssignPublicIp: ENABLED 258 | Overrides: 259 | ContainerOverrides: 260 | - Name: "#{ServiceName}" 261 | Command: 262 | - start 263 | - --run 264 | - product/search-results 265 | - --cloud 266 | - --concurrency 267 | - "6" 268 | Environment: 269 | - Name: DISABLE_TOR_PROXY 270 | Value: "true" 271 | - Name: EXECUTION_ID 272 | Value.$: $$.Execution.Id 273 | - Name: EXECUTION_NAME 274 | Value.$: $$.Execution.Name 275 | End: true 276 | Next: Merge 277 | Merge: 278 | Type: Task 279 | Resource: "arn:aws:states:::ecs:runTask.sync" 280 | Parameters: 281 | LaunchType: "FARGATE" 282 | Cluster: "#{ECSCluster}" 283 | TaskDefinition: "#{FargateTaskDefinition}" 284 | NetworkConfiguration: 285 | AwsvpcConfiguration: 286 | 
Subnets:
287 |               - "#{PublicSubnetOne}"
288 |               - "#{PublicSubnetTwo}"
289 |             AssignPublicIp: ENABLED
290 |         Overrides:
291 |           ContainerOverrides:
292 |             - Name: "#{ServiceName}"
293 |               Command:
294 |                 - start
295 |                 - --run
296 |                 - merge
297 |                 - --cloud
298 |               Environment:
299 |                 - Name: DISABLE_TOR_PROXY
300 |                   Value: "true"
301 |                 - Name: EXECUTION_ID
302 |                   Value.$: $$.Execution.Id
303 |                 - Name: EXECUTION_NAME
304 |                   Value.$: $$.Execution.Name
305 |           Cpu: 256
306 |           Memory: 512
307 |       Retry:
308 |         - ErrorEquals:
309 |             - "States.TaskFailed"
310 |           IntervalSeconds: 60
311 |           MaxAttempts: 3
312 |           BackoffRate: 5
313 |       Next: PrepareData
314 |     PrepareData:
315 |       Type: Task
316 |       Resource: arn:aws:states:::states:startExecution
317 |       Parameters:
318 |         Input:
319 |           parentExecutionId.$: "$$.Execution.Id"
320 |           parentExecutionName.$: "$$.Execution.Name"
321 |         StateMachineArn: ${self:resources.Outputs.InventorySyncDataPrepare.Value}
322 |       End: true
323 | ```
--------------------------------------------------------------------------------
/bin/deploy_container.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # This script is used to build and push the docker image to an ECR repository
4 | 
5 | AWS_PROFILE=default
6 | REGION=ap-southeast-2
7 | REPO_NAME=serverless-web-crawl
8 | TAG=latest
9 | 
10 | # A way to pass in options.
11 | while getopts p:r:n:t: option
12 | do
13 | case "${option}"
14 | in
15 | p) AWS_PROFILE=${OPTARG};;
16 | r) REGION=${OPTARG};;
17 | n) REPO_NAME=${OPTARG};;
18 | t) TAG=${OPTARG};;
19 | esac
20 | done
21 | 
22 | echo "Profile: $AWS_PROFILE"
23 | echo "Region: $REGION"
24 | echo "Repository: $REPO_NAME:$TAG"
25 | 
26 | # Get the account number associated with the current IAM credentials
27 | ACCOUNT_ID=$(aws sts get-caller-identity --profile $AWS_PROFILE --query Account --output text)
28 | 
29 | if [ $? -ne 0 ]
30 | then
31 | exit 255
32 | fi
33 | 
34 | 
35 | # If the repository doesn't exist in ECR, create it.
36 | echo 'Checking repo existence...'
37 | aws ecr describe-repositories --region $REGION --profile $AWS_PROFILE --repository-names "${REPO_NAME}" > /dev/null 2>&1
38 | 
39 | if [ $? -ne 0 ]
40 | then
41 | echo "Repo $REPO_NAME doesn't exist, trying to create a new one"
42 | aws ecr create-repository --region $REGION --profile $AWS_PROFILE --repository-name "${REPO_NAME}" > /dev/null
43 | fi
44 | 
45 | echo 'Logging in to ECR Repository...'
46 | 
47 | AWS_ECR_PASSWORD=$(aws ecr get-login-password --region $REGION --profile $AWS_PROFILE)
48 | docker login --username AWS --password $AWS_ECR_PASSWORD $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com
49 | 
50 | echo 'Building and pushing docker image to ECR repository...'
51 | docker build -t $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME:$TAG . 
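# Note: the push below is skipped when the script's first positional argument is "true".
# This appears to act as a build-only switch, e.g. `./bin/deploy_container.sh true` builds without pushing.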
52 | 
53 | if [ "$1" != "true" ]; then
54 | docker push $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME:$TAG
55 | fi
56 | 
57 | echo 'Docker image publish completed'
58 | 
--------------------------------------------------------------------------------
/docker_entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [ "${DISABLE_TOR_PROXY}" != "true" ]; then # Start Tor unless explicitly disabled
4 | /etc/init.d/tor start
5 | fi
6 | 
7 | /usr/local/bin/yarn "$@"
8 | 
--------------------------------------------------------------------------------
/lib/constants/args.js:
--------------------------------------------------------------------------------
1 | const { Command } = require("commander");
2 | const isNumber = require("is-number");
3 | const { isTest, isLambda } = require("./env");
4 | 
5 | const program = new Command();
6 | let options = {};
7 | 
8 | const primitiveTyping = val => {
9 | 	if (val === "true") {
10 | 		return true;
11 | 	}
12 | 	if (val === "false") {
13 | 		return false;
14 | 	}
15 | 	if (isNumber(val)) {
16 | 		return parseFloat(val); // parseFloat preserves decimals; parseInt would truncate values like "1.5"
17 | 	}
18 | 	return val;
19 | };
20 | 
21 | if (!isTest && !isLambda) {
22 | 	program
23 | 		.requiredOption(
24 | 			"-r, --run <script>",
25 | 			"The script name to run. ie. 'screenshot'."
26 | 		)
27 | 		.option(
28 | 			"-p, --params <params>",
29 | 			"Parameters can be passed to the script in a comma-separated key=value format. Array values are | separated. ie. key=value1|value2|value3. Wrap values with spaces in quotes. ie. categories='gift cards'",
30 | 			value => {
31 | 				const keyvals = value.split(",");
32 | 				const params = {};
33 | 				keyvals.forEach(keyval => {
34 | 					const [key, val] = keyval.split("=");
35 | 					let v = primitiveTyping(val);
36 | 					if (val.indexOf("|") > -1 && v === val) {
37 | 						v = val.split("|").map(arrVal => primitiveTyping(arrVal));
38 | 					}
39 | 					params[key] = v;
40 | 				});
41 | 				return params;
42 | 			}
43 | 		)
44 | 		.option(
45 | 			"-n, --concurrency <number>",
46 | 			"Set the number of concurrent Chrome instances. Defaults to 2.",
47 | 			value => parseInt(value, 10)
48 | 		)
49 | 		.option("-w, --with-ui", "Run with the Chrome UI. ie. Non-headless mode.")
50 | 		.option("-m, --monitor", "Monitor the status of the puppeteer cluster.")
51 | 		.option(
52 | 			"-sd, --same-domain-delay <delay>",
53 | 			"Time in milliseconds to wait between each request to the same domain."
54 | 		)
55 | 		.option(
56 | 			"-s, --storage <storage>",
57 | 			"By default, storage is 'local'. Valid values are 'local' and 's3'."
58 | 		)
59 | 		.option("-t, --tor", "Use Tor Proxy Server to anonymously crawl.")
60 | 		.option(
61 | 			"-tp, --tor-proxy-port <port>",
62 | 			"Tor Proxy SOCKS5 Port to pass to Chrome. Defaults to 9050."
63 | 		)
64 | 		.option(
65 | 			"-l, --log-level <level>",
66 | 			"Set the log level. ie. 'fatal', 'error', 'warn', 'info', 'debug', 'trace' or 'silent'."
67 | 		)
68 | 		.option(
69 | 			"-d, --debug <options>",
70 | 			"Run in debug mode. All key data points will be stored for review. Does not respect target sites. 
Do not run in production.", 71 | value => { 72 | return value.split(","); 73 | } 74 | ) 75 | .parse(process.argv); 76 | 77 | options = program.opts(); 78 | } 79 | 80 | if (options.debug) { 81 | console.log(options); 82 | } 83 | 84 | let useLocalStorage = true; 85 | let useS3Storage = false; 86 | if (options.storage === "s3") { 87 | useLocalStorage = false; 88 | useS3Storage = true; 89 | } 90 | 91 | module.exports = { 92 | ...options, 93 | params: options.params || {}, 94 | sameDomainDelay: options.sameDomainDelay || 1000, 95 | concurrency: options.concurrency || 2, 96 | torProxyPort: options.torProxyPort || 9050, 97 | useLocalStorage, 98 | useS3Storage 99 | }; 100 | -------------------------------------------------------------------------------- /lib/constants/env.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const envalid = require("envalid"); 3 | 4 | const { str } = envalid; 5 | 6 | const env = envalid.cleanEnv( 7 | process.env, 8 | { 9 | AWS_LAMBDA_FUNCTION_NAME: str({ default: "" }), 10 | EXECUTION_ID: str({ default: "" }), 11 | EXECUTION_NAME: str({ default: "" }), 12 | EXECUTION_STATE_NAME: str({ default: "" }), 13 | S3_BUCKET_NAME: str({ default: "" }), 14 | SENTRY_DSN: str({ default: "" }), 15 | AWS_ACCESS_ID: str({ default: "" }), 16 | AWS_SECRET_KEY: str({ default: "" }) 17 | }, 18 | { 19 | dotEnvPath: path.resolve( 20 | __dirname, 21 | `../../.env.${process.env.NODE_ENV || "development"}` 22 | ) 23 | } 24 | ); 25 | 26 | module.exports.isProd = env.isProduction; 27 | module.exports.isTest = env.isTest; 28 | module.exports.functionName = env.AWS_LAMBDA_FUNCTION_NAME; 29 | module.exports.isLambda = !!env.AWS_LAMBDA_FUNCTION_NAME; 30 | module.exports.executionId = env.EXECUTION_ID; 31 | module.exports.executionName = env.EXECUTION_NAME; 32 | module.exports.executionStateName = env.EXECUTION_STATE_NAME; 33 | module.exports.s3BucketName = env.S3_BUCKET_NAME; 34 | module.exports.sentryDSN = env.SENTRY_DSN; 35 | module.exports.awsCredentials = 36 | !!env.AWS_ACCESS_ID && !!env.AWS_SECRET_KEY 37 | ? 
{ 38 | accessKeyId: env.AWS_ACCESS_ID, 39 | secretAccessKey: env.AWS_SECRET_KEY 40 | } 41 | : {}; 42 | -------------------------------------------------------------------------------- /lib/constants/index.js: -------------------------------------------------------------------------------- 1 | const programArgs = require("./args"); 2 | const envArgs = require("./env"); 3 | 4 | module.exports = { 5 | ...programArgs, 6 | ...envArgs 7 | }; 8 | -------------------------------------------------------------------------------- /lib/crawler.js: -------------------------------------------------------------------------------- 1 | const vanillaPuppeteer = require("puppeteer"); 2 | const { addExtra } = require("puppeteer-extra"); 3 | const { Cluster } = require("puppeteer-cluster"); 4 | const Adblocker = require("puppeteer-extra-plugin-adblocker"); 5 | const Stealth = require("puppeteer-extra-plugin-stealth"); 6 | const AnonymizeUA = require("puppeteer-extra-plugin-anonymize-ua"); 7 | const UserDataDir = require("puppeteer-extra-plugin-user-data-dir"); 8 | const isUndefined = require("lodash/isUndefined"); 9 | const isEmpty = require("is-empty"); 10 | const treekill = require("tree-kill"); 11 | const get = require("lodash/get"); 12 | const UserAgent = require("user-agents"); 13 | const ono = require("ono"); 14 | 15 | const { withModule } = require("./logger"); 16 | const { 17 | withUi, 18 | monitor, 19 | concurrency, 20 | tor, 21 | torProxyPort, 22 | sameDomainDelay 23 | } = require("./constants"); 24 | 25 | const logger = withModule("crawler"); 26 | let headless = true; 27 | if (!isUndefined(withUi)) { 28 | headless = !withUi; 29 | } 30 | 31 | // Setup puppeteer plugins 32 | const puppeteer = addExtra(vanillaPuppeteer); 33 | puppeteer.use(Adblocker()); 34 | puppeteer.use(Stealth()); 35 | puppeteer.use( 36 | AnonymizeUA({ 37 | makeWindows: false, 38 | stripHeadless: false, 39 | customFn() { 40 | const ua = new UserAgent(); 41 | return ua.toString(); 42 | } 43 | }) 44 | ); 45 | puppeteer.use(UserDataDir()); // Manages temp store and clean at launch/close for user data dir. 46 | 47 | // Outside of the exported function to ensure a singleton. ie. cannot launch multiple clusters per execution. 48 | const crawlerInstancePromise = Cluster.launch({ 49 | concurrency: Cluster.CONCURRENCY_CONTEXT, 50 | maxConcurrency: concurrency, 51 | timeout: 10800000, // 3 hours 52 | monitor: !isUndefined(monitor), 53 | puppeteer, 54 | puppeteerOptions: { 55 | headless, 56 | ignoreHTTPSErrors: true, 57 | defaultViewport: { 58 | width: 1280, 59 | height: 900 60 | }, 61 | dumpio: false, 62 | args: [ 63 | "--disable-setuid-sandbox", 64 | "--disable-dev-shm-usage", 65 | "--no-sandbox" 66 | ].concat(tor ? 
[`--proxy-server=socks5://127.0.0.1:${torProxyPort}`] : []) 67 | }, 68 | retryLimit: 3, 69 | retryDelay: 3000, 70 | sameDomainDelay 71 | }); 72 | 73 | let instance; 74 | 75 | const getCrawlerInstance = () => instance; 76 | 77 | const destroyCrawler = async () => { 78 | if (!isEmpty(instance)) { 79 | // Gracefully close 80 | await instance.close(); 81 | 82 | if (typeof get(instance, "browser.browser.process") === "function") { 83 | const { pid } = instance.browser.browser.process(); 84 | if (!isUndefined(pid)) { 85 | treekill(pid, "SIGKILL"); 86 | } 87 | } 88 | } 89 | }; 90 | 91 | process.on("SIGTERM", () => destroyCrawler()); 92 | process.on("exit", () => destroyCrawler()); 93 | 94 | async function Crawler() { 95 | if (!isEmpty(instance)) { 96 | return instance; 97 | } 98 | try { 99 | instance = await crawlerInstancePromise; 100 | // Event handler to be called in case of problems 101 | instance.on("taskerror", (err, data) => { 102 | const args = [`Error crawling:`, err.message, err.stack]; 103 | if (!isEmpty(data)) { 104 | args.push(typeof data === "object" ? JSON.stringify(data) : data); 105 | } 106 | logger.error(...args); 107 | }); 108 | 109 | if (tor) { 110 | logger.info(`Establishing Tor Proxy connection: ${torProxyPort}`); 111 | let isOperational = false; 112 | instance.queue( 113 | "https://check.torproject.org/", 114 | async ({ page, data: url }) => { 115 | await Promise.all([page.waitForNavigation(), page.goto(url)]); 116 | const isUsingTor = await page.$eval("body", el => 117 | el.innerHTML.includes( 118 | "Congratulations. This browser is configured to use Tor" 119 | ) 120 | ); 121 | isOperational = isUsingTor; 122 | } 123 | ); 124 | await instance.idle(); 125 | if (isOperational) { 126 | logger.info(`Browser is using Tor successfully.`); 127 | } else { 128 | throw new Error(`Browser failed to use Tor.`); 129 | } 130 | } 131 | 132 | return instance; 133 | } catch (e) { 134 | logger.error(e); 135 | throw ono(e, "Cannot initiate Crawler"); 136 | } 137 | } 138 | 139 | module.exports.Crawler = Crawler; 140 | module.exports.getCrawlerInstance = getCrawlerInstance; 141 | module.exports.destroyCrawler = destroyCrawler; 142 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Entry point for the library that manages the process of running the Puppeteer script(s). 
3 | */ 4 | 5 | const path = require("path"); 6 | const fs = require("fs"); 7 | const isUndefined = require("lodash/isUndefined"); 8 | const { run: runScriptName, executionName } = require("./constants"); 9 | const logger = require("./logger"); 10 | const runScript = require("./run-script"); 11 | 12 | logger.info(`Job Tasked: ${runScriptName}`); 13 | logger.info(`Execution: ${executionName || "test"}`); 14 | 15 | if (isUndefined(runScriptName)) { 16 | throw new Error("Script name is undefined."); 17 | } 18 | 19 | const scriptPath = path.resolve(__dirname, `../scripts/${runScriptName}.js`); 20 | 21 | if (!fs.existsSync(scriptPath)) { 22 | throw new Error("Invalid script name."); 23 | } 24 | 25 | runScript(scriptPath); 26 | -------------------------------------------------------------------------------- /lib/logger.js: -------------------------------------------------------------------------------- 1 | const Pino = require("pino"); 2 | const { isProd, logLevel } = require("./constants"); 3 | 4 | const logger = Pino({ 5 | level: logLevel || "info", 6 | prettyPrint: !isProd 7 | }); 8 | 9 | module.exports = logger; 10 | 11 | module.exports.withModule = module => logger.child({ module }); 12 | -------------------------------------------------------------------------------- /lib/run-script.js: -------------------------------------------------------------------------------- 1 | const logger = require("./logger"); 2 | const Sentry = require("./utils/sentry"); 3 | const { params } = require("./constants"); 4 | 5 | async function runScript(scriptPath) { 6 | console.time("Finished in"); 7 | const die = async signalNumber => { 8 | console.timeEnd("Finished in"); 9 | // Exit fires an even in crawler that will destroy remaining crawler zombie processes. 10 | process.exit(signalNumber); // eslint-disable-line 11 | }; 12 | try { 13 | const scriptFn = require(scriptPath); 14 | await scriptFn(params); 15 | die(0); 16 | } catch (e) { 17 | Sentry.captureException(e); 18 | logger.error(e.message, e.stack); 19 | Sentry.flush(2500) 20 | .catch(flushErr => { 21 | logger.error(flushErr.message, flushErr.stack); 22 | }) 23 | .finally(() => { 24 | die(1); 25 | }); 26 | } 27 | } 28 | 29 | module.exports = runScript; 30 | -------------------------------------------------------------------------------- /lib/utils/async-for-each.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This Util is great if you're iterating over S3 Objects and need to run some browser related functions as per data in each S3 Object. 3 | */ 4 | 5 | async function asyncForEach(array, callback, untilFn = () => false) { 6 | for (let index = 0; index < array.length; index++) { 7 | await callback(array[index], index, array); 8 | if (untilFn(array[index], index, array)) { 9 | break; 10 | } 11 | } 12 | } 13 | 14 | module.exports = asyncForEach; 15 | -------------------------------------------------------------------------------- /lib/utils/debugger.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Our own debugger that accepts objects, and writes to console/file if a --debug flag is passed. 
3 | */ 4 | 5 | const util = require("util"); 6 | const path = require("path"); 7 | const { writeJsonSync, emptyDirSync } = require("fs-extra"); 8 | const isUndefined = require("lodash/isUndefined"); 9 | const filenamify = require("filenamify"); 10 | const debugLog = require("debug-logfmt"); 11 | const { isTest, debug: debugArg } = require("../constants"); 12 | 13 | let dir; 14 | let debug = debugArg; 15 | if (isTest) { 16 | debug = ["file"]; 17 | } 18 | const enabled = !isUndefined(debug); 19 | 20 | if (enabled) { 21 | console.log(`Log id is: ${process.pid}`); 22 | dir = path.resolve( 23 | __dirname, 24 | `../../debug/${isTest ? "test" : "logs"}/${Date.now()}-${process.pid}` 25 | ); 26 | emptyDirSync(dir); 27 | } 28 | 29 | const DebugLogger = debugLog; 30 | 31 | const getDebugPath = additionalPath => { 32 | if (!enabled) { 33 | throw new Error("Debug not set. Cannot return Debug path."); 34 | } 35 | return path.resolve(dir, additionalPath); 36 | }; 37 | 38 | const debugScreenshot = (page, name, options = {}) => { 39 | if (!enabled) { 40 | return null; 41 | } 42 | 43 | const baseOptions = { 44 | path: path.resolve(dir, filenamify(name, { replacement: "-" })), 45 | type: "jpeg", 46 | quality: 100 47 | }; 48 | 49 | return page.screenshot({ 50 | ...baseOptions, 51 | ...options 52 | }); 53 | }; 54 | 55 | const Debugger = (data, filename = "debug.log") => { 56 | if (!enabled) { 57 | return null; 58 | } 59 | 60 | if (typeof data === "function") { 61 | const fn = data; 62 | return fn(dir); 63 | } 64 | 65 | // Sanitize file name. 66 | filename = filenamify(filename, { replacement: "-" }); 67 | 68 | let toConsole = true; 69 | let toFile = true; 70 | 71 | if (Array.isArray(debug)) { 72 | if (!debug.includes("console")) { 73 | toConsole = false; 74 | } 75 | if (!debug.includes("file")) { 76 | toFile = false; 77 | } 78 | } else if (typeof debug === "string") { 79 | switch (debug) { 80 | case "file": 81 | toConsole = false; 82 | toFile = true; 83 | break; 84 | case "console": 85 | toConsole = true; 86 | toFile = false; 87 | break; 88 | default: 89 | break; 90 | } 91 | } 92 | 93 | if (toConsole) { 94 | console.log(util.inspect(data, false, null, true)); 95 | } 96 | 97 | if (toFile) { 98 | writeJsonSync( 99 | path.resolve(dir, filename), 100 | data, 101 | { 102 | spaces: "\t" 103 | }, 104 | err => { 105 | if (err) console.error(err); 106 | } 107 | ); 108 | } 109 | }; 110 | 111 | module.exports.Debugger = Debugger; 112 | module.exports.DebugLogger = DebugLogger; 113 | module.exports.getDebugPath = getDebugPath; 114 | module.exports.debugScreenshot = debugScreenshot; 115 | -------------------------------------------------------------------------------- /lib/utils/from-image-url.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple util to download images. 3 | */ 4 | 5 | const axios = require("axios"); 6 | const axiosRetry = require("axios-retry"); 7 | 8 | axiosRetry(axios, { retries: 3 }); 9 | 10 | const fromImageUrl = (url, useBase64 = true) => 11 | axios 12 | .get(url, { 13 | responseType: "arraybuffer" 14 | }) 15 | .then(response => 16 | useBase64 17 | ? 
`data:${response.headers["content-type"]};base64,${Buffer.from(
18 | 						response.data,
19 | 						"binary"
20 | 				  ).toString("base64")}`
21 | 				: response.data
22 | 		);
23 | 
24 | module.exports = fromImageUrl;
--------------------------------------------------------------------------------
/lib/utils/promise-retry.js:
--------------------------------------------------------------------------------
1 | const promiseRetry = async (requestFn, config = {}) => {
2 | 	const { retries = 3, onRetry = () => {} } = config;
3 | 	let count = 0;
4 | 	let isWorking = false;
5 | 	let resp;
6 | 	while (isWorking === false) {
7 | 		try {
8 | 			resp = await requestFn();
9 | 			isWorking = true;
10 | 		} catch (e) {
11 | 			count++;
12 | 			if (count >= retries) {
13 | 				throw e;
14 | 			} else {
15 | 				onRetry(count, e);
16 | 			}
17 | 		}
18 | 	}
19 | 	return resp;
20 | };
21 | 
22 | module.exports = promiseRetry;
--------------------------------------------------------------------------------
/lib/utils/remove-specials-and-spaces.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * Removes special characters and spaces
3 |  *
4 |  * @param {string} str
5 |  * @return {string} str with special characters and spaces stripped
6 |  */
7 | const removeSpecialsAndSpaces = str => str.replace(/[^A-Z0-9]/gi, "");
8 | 
9 | module.exports = removeSpecialsAndSpaces;
--------------------------------------------------------------------------------
/lib/utils/sentry.js:
--------------------------------------------------------------------------------
1 | const Sentry = require("@sentry/node");
2 | const { sentryDSN } = require("../constants");
3 | const logger = require("../logger");
4 | 
5 | if (sentryDSN) {
6 | 	Sentry.init({
7 | 		dsn: sentryDSN,
8 | 		environment: process.env.NODE_ENV || "development"
9 | 	});
10 | } else {
11 | 	logger.warn(`Sentry DSN doesn't exist. 
Errors are not being tracked.`); 12 | } 13 | 14 | module.exports = Sentry; 15 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "serverless-web-crawler", 3 | "version": "0.0.1", 4 | "description": "Serverless Web Crawler", 5 | "keywords": [ 6 | "data", 7 | "sync", 8 | "inventory", 9 | "products", 10 | "woocommerce", 11 | "serverless" 12 | ], 13 | "author": "Ryan Soury ", 14 | "license": "MIT", 15 | "main": "lib/scripts/index.js", 16 | "directories": { 17 | "lib": "lib", 18 | "test": "test" 19 | }, 20 | "files": [ 21 | "lib" 22 | ], 23 | "repository": { 24 | "type": "git", 25 | "url": "git@github.com:rsoury/serverless-web-crawler.git" 26 | }, 27 | "engines": { 28 | "node": ">= 12.13.1" 29 | }, 30 | "bin": { 31 | "deploy-container": "./bin/deploy_container.sh" 32 | }, 33 | "scripts": { 34 | "start": "cross-env NODE_ENV=production node ./lib/index", 35 | "dev": "cross-env NODE_ENV=development node --trace-warnings ./lib/index", 36 | "clean": "run-p clean:*", 37 | "clean:debug": "del ./debug", 38 | "deploy": "sls deploy -s prod --env production", 39 | "format": "prettier \"lib/**/*.{js,jsx,ts,tsx,json,css,scss,md}\" --write", 40 | "lint": "eslint -c ./.eslintrc.js \"{lib,test}/**/*.{js,jsx,ts,tsx}\"", 41 | "test": "echo \"Add unit tests\"" 42 | }, 43 | "dependencies": { 44 | "@sentry/node": "^5.10.2", 45 | "aws-sdk": "^2.936.0", 46 | "axios": "^0.19.0", 47 | "axios-retry": "^3.1.2", 48 | "commander": "^4.0.1", 49 | "cross-env": "^6.0.3", 50 | "debug-logfmt": "^1.0.4", 51 | "del-cli": "^3.0.1", 52 | "envalid": "^6.0.0", 53 | "filenamify": "^4.1.0", 54 | "fs-extra": "^8.1.0", 55 | "is-empty": "^1.2.0", 56 | "is-number": "^7.0.0", 57 | "lodash": "^4.17.15", 58 | "npm-run-all": "^4.1.5", 59 | "ono": "^7.1.3", 60 | "pino": "^5.14.0", 61 | "puppeteer": "^2.0.0", 62 | "puppeteer-cluster": "^0.18.0", 63 | "puppeteer-extra": "^3.1.7", 64 | "puppeteer-extra-plugin-adblocker": "^2.11.1", 65 | "puppeteer-extra-plugin-anonymize-ua": "^2.2.6", 66 | "puppeteer-extra-plugin-stealth": "^2.4.5", 67 | "puppeteer-extra-plugin-user-data-dir": "^2.2.2", 68 | "tree-kill": "^1.2.2", 69 | "url-parse": "^1.4.7", 70 | "user-agents": "^1.0.505" 71 | }, 72 | "devDependencies": { 73 | "eslint": "^7.29.0", 74 | "eslint-config-prettier": "^8.3.0", 75 | "eslint-import-resolver-alias": "^1.1.2", 76 | "eslint-plugin-import": "^2.23.4", 77 | "eslint-plugin-node": "^11.1.0", 78 | "husky": "^3.1.0", 79 | "lint-staged": "^9.5.0", 80 | "pino-pretty": "^3.5.0", 81 | "prettier": "^1.19.1", 82 | "serverless": "^1.59.1", 83 | "serverless-dotenv-plugin": "^3.9.0", 84 | "serverless-offline": "^5.12.1", 85 | "serverless-plugin-common-excludes": "^3.0.0", 86 | "serverless-plugin-ifelse": "^1.0.7", 87 | "serverless-plugin-include-dependencies": "github:rsoury/serverless-plugin-include-dependencies", 88 | "serverless-pseudo-parameters": "^2.5.0", 89 | "serverless-step-functions": "^2.12.0" 90 | }, 91 | "husky": { 92 | "hooks": { 93 | "pre-commit": "lint-staged" 94 | } 95 | }, 96 | "lint-staged": { 97 | "{lib,test}/**/*.js": [ 98 | "yarn lint --fix", 99 | "git add" 100 | ], 101 | "{lib,test}/**/*.{js,jsx,ts,tsx,json,css,scss,sass,md}": [ 102 | "yarn format", 103 | "git add" 104 | ] 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /resources/conditions.yml: -------------------------------------------------------------------------------- 1 | 
Conditions: 2 | HasCustomRole: !Not [!Equals [!Ref "Role", ""]] 3 | -------------------------------------------------------------------------------- /resources/fargate.yml: -------------------------------------------------------------------------------- 1 | Resources: 2 | # VPC in which containers will be networked. 3 | # It has two public subnets 4 | # We distribute the subnets across the first two available subnets 5 | # for the region, for high availability. 6 | # Important Note: Public IPs required for direct outbound traffic 7 | VPC: 8 | Type: AWS::EC2::VPC 9 | Properties: 10 | EnableDnsSupport: true 11 | EnableDnsHostnames: true 12 | CidrBlock: "172.15.0.0/16" 13 | 14 | # Two public subnets, where containers can have public IP addresses 15 | PublicSubnetOne: 16 | Type: AWS::EC2::Subnet 17 | Properties: 18 | AvailabilityZone: 19 | Fn::Select: 20 | - 0 21 | - Fn::GetAZs: { Ref: "AWS::Region" } 22 | VpcId: !Ref "VPC" 23 | CidrBlock: "172.15.0.0/24" 24 | MapPublicIpOnLaunch: true 25 | PublicSubnetTwo: 26 | Type: AWS::EC2::Subnet 27 | Properties: 28 | AvailabilityZone: 29 | Fn::Select: 30 | - 1 31 | - Fn::GetAZs: { Ref: "AWS::Region" } 32 | VpcId: !Ref "VPC" 33 | CidrBlock: "172.15.1.0/24" 34 | MapPublicIpOnLaunch: true 35 | 36 | # Setup networking resources for the public subnets. Containers 37 | # in the public subnets have public IP addresses and the routing table 38 | # sends network traffic via the internet gateway. 39 | InternetGateway: 40 | Type: AWS::EC2::InternetGateway 41 | GatewayAttachement: 42 | Type: AWS::EC2::VPCGatewayAttachment 43 | Properties: 44 | VpcId: !Ref "VPC" 45 | InternetGatewayId: !Ref "InternetGateway" 46 | PublicRouteTable: 47 | Type: AWS::EC2::RouteTable 48 | Properties: 49 | VpcId: !Ref "VPC" 50 | PublicRoute: 51 | Type: AWS::EC2::Route 52 | DependsOn: 53 | - GatewayAttachement 54 | Properties: 55 | RouteTableId: !Ref "PublicRouteTable" 56 | DestinationCidrBlock: "0.0.0.0/0" 57 | GatewayId: !Ref "InternetGateway" 58 | PublicSubnetOneRouteTableAssociation: 59 | Type: AWS::EC2::SubnetRouteTableAssociation 60 | Properties: 61 | SubnetId: !Ref PublicSubnetOne 62 | RouteTableId: !Ref PublicRouteTable 63 | PublicSubnetTwoRouteTableAssociation: 64 | Type: AWS::EC2::SubnetRouteTableAssociation 65 | Properties: 66 | SubnetId: !Ref PublicSubnetTwo 67 | RouteTableId: !Ref PublicRouteTable 68 | 69 | # ECS Resources 70 | ECSCluster: 71 | Type: AWS::ECS::Cluster 72 | 73 | # A security group for the containers we will run in Fargate. 74 | # Two rules, allowing network traffic from a public facing load 75 | # balancer and from other members of the security group. 76 | # 77 | # Remove any of the following ingress rules that are not needed. 78 | # If you want to make direct requests to a container using its 79 | # public IP address you'll need to add a security group rule 80 | # to allow traffic from all IP addresses. 
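  # A minimal sketch of such a rule, left commented out. The logical ID below is
  # hypothetical, and IpProtocol -1 means all protocols; enable only if you need
  # direct public access to the containers:
  # EcsSecurityGroupIngressFromAnywhere:
  #   Type: AWS::EC2::SecurityGroupIngress
  #   Properties:
  #     Description: Ingress from any IP address
  #     GroupId: !Ref "FargateContainerSecurityGroup"
  #     IpProtocol: -1
  #     CidrIp: "0.0.0.0/0"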
81 | FargateContainerSecurityGroup: 82 | Type: AWS::EC2::SecurityGroup 83 | Properties: 84 | GroupDescription: Access to the Fargate containers 85 | VpcId: !Ref "VPC" 86 | EcsSecurityGroupIngressFromSelf: 87 | Type: AWS::EC2::SecurityGroupIngress 88 | Properties: 89 | Description: Ingress from other containers in the same security group 90 | GroupId: !Ref "FargateContainerSecurityGroup" 91 | IpProtocol: -1 92 | SourceSecurityGroupId: !Ref "FargateContainerSecurityGroup" 93 | # This is an IAM role which authorizes ECS to manage resources on your 94 | # account on your behalf, such as updating your load balancer with the 95 | # details of where your containers are, so that traffic can reach your 96 | # containers. 97 | ECSRole: 98 | Type: AWS::IAM::Role 99 | Properties: 100 | AssumeRolePolicyDocument: 101 | Statement: 102 | - Effect: Allow 103 | Principal: 104 | Service: [ecs.amazonaws.com] 105 | Action: ["sts:AssumeRole"] 106 | Path: / 107 | Policies: 108 | - PolicyName: ecs-service 109 | PolicyDocument: 110 | Statement: 111 | - Effect: Allow 112 | Action: 113 | # Rules which allow ECS to attach network interfaces to instances 114 | # on your behalf in order for awsvpc networking mode to work right 115 | - "ec2:AttachNetworkInterface" 116 | - "ec2:CreateNetworkInterface" 117 | - "ec2:CreateNetworkInterfacePermission" 118 | - "ec2:DeleteNetworkInterface" 119 | - "ec2:DeleteNetworkInterfacePermission" 120 | - "ec2:Describe*" 121 | - "ec2:DetachNetworkInterface" 122 | Resource: "*" 123 | 124 | # This is a role which is used by the ECS tasks themselves. 125 | ECSTaskExecutionRole: 126 | Type: AWS::IAM::Role 127 | Properties: 128 | AssumeRolePolicyDocument: 129 | Statement: 130 | - Effect: Allow 131 | Principal: 132 | Service: [ecs-tasks.amazonaws.com] 133 | Action: ["sts:AssumeRole"] 134 | Path: / 135 | Policies: 136 | - PolicyName: AmazonECSTaskExecutionRolePolicy 137 | PolicyDocument: 138 | Statement: 139 | - Effect: Allow 140 | Action: 141 | # Allow the ECS Tasks to download images from ECR 142 | - "ecr:GetAuthorizationToken" 143 | - "ecr:BatchCheckLayerAvailability" 144 | - "ecr:GetDownloadUrlForLayer" 145 | - "ecr:BatchGetImage" 146 | 147 | # Allow the ECS tasks to upload logs to CloudWatch 148 | - "logs:CreateLogGroup" 149 | - "logs:CreateLogStream" 150 | - "logs:PutLogEvents" 151 | - "logs:DescribeLogStreams" 152 | Resource: "*" 153 | 154 | # The task definition. This is a simple metadata description of what 155 | # container to run, and what resource requirements it has. 156 | FargateTaskDefinition: 157 | Type: AWS::ECS::TaskDefinition 158 | Properties: 159 | Family: !Ref "ServiceName" 160 | Cpu: !Ref "ContainerCpu" 161 | Memory: !Ref "ContainerMemory" 162 | NetworkMode: awsvpc 163 | RequiresCompatibilities: 164 | - FARGATE 165 | ExecutionRoleArn: !Ref ECSTaskExecutionRole 166 | TaskRoleArn: 167 | Fn::If: 168 | - "HasCustomRole" 169 | - !Ref "Role" 170 | - !Ref "AWS::NoValue" 171 | ContainerDefinitions: 172 | - Name: !Ref "ServiceName" 173 | Cpu: !Ref "ContainerCpu" 174 | Memory: !Ref "ContainerMemory" 175 | Image: "#{AWS::AccountId}.dkr.ecr.#{AWS::Region}.amazonaws.com/#{ImageUrl}" 176 | PortMappings: 177 | - ContainerPort: !Ref "ContainerPort" 178 | LogConfiguration: 179 | LogDriver: awslogs 180 | Options: 181 | awslogs-create-group: true 182 | awslogs-group: "/fargate/service/#{ServiceName}" 183 | awslogs-region: "#{AWS::Region}" 184 | awslogs-stream-prefix: ecs 185 | 186 | # The service. 
The service is a resource which allows you to run multiple 187 | # copies of a type of task, and gather up their logs and metrics, as well 188 | # as monitor the number of running tasks and replace any that have crashed 189 | Service: 190 | Type: AWS::ECS::Service 191 | Properties: 192 | ServiceName: !Ref "ServiceName" 193 | Cluster: !Ref "ECSCluster" 194 | LaunchType: FARGATE 195 | DeploymentConfiguration: 196 | MaximumPercent: 200 197 | MinimumHealthyPercent: 75 198 | DesiredCount: !Ref "DesiredCount" 199 | NetworkConfiguration: 200 | AwsvpcConfiguration: 201 | AssignPublicIp: ENABLED 202 | SecurityGroups: 203 | - !Ref FargateContainerSecurityGroup 204 | Subnets: 205 | - !Ref PublicSubnetOne 206 | - !Ref PublicSubnetTwo 207 | TaskDefinition: !Ref "FargateTaskDefinition" 208 | -------------------------------------------------------------------------------- /resources/iam.yml: -------------------------------------------------------------------------------- 1 | Resources: 2 | StateMachinePassRole: 3 | Type: AWS::IAM::Role 4 | Properties: 5 | ManagedPolicyArns: 6 | - arn:aws:iam::aws:policy/AWSStepFunctionsFullAccess 7 | AssumeRolePolicyDocument: 8 | Statement: 9 | - Effect: Allow 10 | Principal: 11 | Service: ["states.#{AWS::Region}.amazonaws.com"] 12 | Action: ["sts:AssumeRole"] 13 | - Effect: Allow 14 | Principal: 15 | Service: "lambda.amazonaws.com" 16 | Action: "sts:AssumeRole" 17 | Path: / 18 | Policies: 19 | - PolicyName: AmazonECSPassRolePolicy 20 | PolicyDocument: 21 | Statement: 22 | - Effect: Allow 23 | Action: 24 | - "iam:PassRole" 25 | Resource: "*" 26 | - Effect: Allow 27 | Action: 28 | - "lambda:InvokeFunction" 29 | Resource: "*" 30 | - Effect: Allow 31 | Action: 32 | - "ecs:RunTask" 33 | - "ecs:StopTask" 34 | - "ecs:DescribeTasks" 35 | - "ecs:StartTelemetrySession" 36 | Resource: "*" 37 | - Effect: Allow 38 | Action: 39 | - "events:PutTargets" 40 | - "events:PutRule" 41 | - "events:DescribeRule" 42 | Resource: "arn:aws:events:#{AWS::Region}:#{AWS::AccountId}:rule/StepFunctionsGetEventsForECSTaskRule" 43 | - Effect: Allow 44 | Action: 45 | - "events:PutTargets" 46 | - "events:PutRule" 47 | - "events:DescribeRule" 48 | Resource: "arn:aws:events:#{AWS::Region}:#{AWS::AccountId}:rule/StepFunctionsGetEventsForStepFunctionsExecutionRule" 49 | -------------------------------------------------------------------------------- /resources/outputs.yml: -------------------------------------------------------------------------------- 1 | Outputs: 2 | WebCrawl: 3 | Description: "The ARN of the StateMachine WebCrawl" 4 | Value: 5 | Ref: CrawlStateMachine 6 | -------------------------------------------------------------------------------- /resources/parameters.yml: -------------------------------------------------------------------------------- 1 | Parameters: 2 | ServiceName: 3 | Type: String 4 | Default: ServerlessWebCrawl 5 | Description: A name for the service 6 | ImageUrl: 7 | Type: String 8 | Default: serverless-web-crawl:latest 9 | Description: 10 | The url of a docker image that contains the application process that will 11 | handle the traffic for this service 12 | ContainerPort: 13 | Type: Number 14 | Default: 80 15 | Description: What port number the application inside the docker container is binding to 16 | ContainerCpu: 17 | Type: Number 18 | Default: 1024 19 | Description: How much CPU to give the container. 
1024 is 1 CPU
20 |   ContainerMemory:
21 |     Type: Number
22 |     Default: 2048
23 |     Description: How much memory in megabytes to give the container
24 |   Path:
25 |     Type: String
26 |     Default: "*"
27 |     Description:
28 |       A path on the public load balancer that this service should be connected
29 |       to. Use * to send all load balancer traffic to this service.
30 |   Priority:
31 |     Type: Number
32 |     Default: 1
33 |     Description:
34 |       The priority for the routing rule added to the load balancer. This only
35 |       applies if you have multiple services which have been assigned to
36 |       different paths on the load balancer.
37 |   DesiredCount:
38 |     Type: Number
39 |     Default: 0
40 |     Description: How many copies of the service task to run
41 |   Role:
42 |     Type: String
43 |     Default: ""
44 |     Description:
45 |       (Optional) An IAM role to give the service's containers if the code within
46 |       needs to access other AWS resources like S3 buckets, DynamoDB tables, etc
47 | 
--------------------------------------------------------------------------------
/resources/sns.yml:
--------------------------------------------------------------------------------
1 | Resources:
2 |   WebCrawlNotificationsTopic:
3 |     Type: AWS::SNS::Topic
4 |     Properties:
5 |       DisplayName: "Serverless Web Crawler Notifications"
6 |       TopicName: ServerlessWebCrawlerNotifications
7 |       Subscription:
8 |         - Endpoint: "ryan@webdoodle.com.au"
9 |           Protocol: email
--------------------------------------------------------------------------------
/resources/state-machines/crawl.yml:
--------------------------------------------------------------------------------
1 | id: CrawlStateMachine
2 | events:
3 |   - http:
4 |       path: "/"
5 |       method: "POST"
6 |   # - schedule:
7 |   #     rate: rate(24 hours)
8 |   #     enabled:
9 |   #       # ${self:custom.scheduleEnabled.${opt:stage, self:provider.stage}, false}
10 |   #       false
11 |   #     input:
12 |   #       executionId.$: $$.Execution.Id
13 |   #       executionName.$: $$.Execution.Name
14 | notifications:
15 |   ABORTED:
16 |     - sns: !Ref WebCrawlNotificationsTopic
17 |   FAILED:
18 |     - sns: !Ref WebCrawlNotificationsTopic
19 |   TIMED_OUT:
20 |     - sns: !Ref WebCrawlNotificationsTopic
21 |   SUCCEEDED:
22 |     - sns: !Ref WebCrawlNotificationsTopic
23 | role:
24 |   Fn::GetAtt: [StateMachinePassRole, Arn]
25 | definition:
26 |   Comment: "Serverless Web Crawl"
27 |   StartAt: WebCrawl
28 |   States:
29 |     WebCrawl:
30 |       Type: Task
31 |       Resource: "arn:aws:states:::ecs:runTask.sync"
32 |       Parameters:
33 |         LaunchType: "FARGATE"
34 |         Cluster: "#{ECSCluster}"
35 |         TaskDefinition: "#{FargateTaskDefinition}"
36 |         NetworkConfiguration:
37 |           AwsvpcConfiguration:
38 |             Subnets:
39 |               - "#{PublicSubnetOne}"
40 |               - "#{PublicSubnetTwo}"
41 |             AssignPublicIp: ENABLED
42 |         Overrides:
43 |           ContainerOverrides:
44 |             - Name: "#{ServiceName}"
45 |               Command.$: $$.Execution.Input.command # For more information on the Context Object, see https://docs.aws.amazon.com/step-functions/latest/dg/input-output-contextobject.html
46 |               Environment:
47 |                 - Name: EXECUTION_ID
48 |                   Value.$: $$.Execution.Id
49 |                 - Name: EXECUTION_NAME
50 |                   Value.$: $$.Execution.Name
51 |       End: true
52 | 
--------------------------------------------------------------------------------
/scripts/screenshot.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * This is an example script to demonstrate the library. 
3 | */ 4 | 5 | const path = require("path"); 6 | const S3 = require("aws-sdk/clients/s3"); 7 | const fs = require("fs").promises; 8 | 9 | const { Crawler } = require("../lib/crawler"); 10 | const logger = require("../lib/logger"); 11 | const { 12 | useS3Storage, 13 | s3BucketName, 14 | awsCredentials 15 | } = require("../lib/constants"); 16 | 17 | const s3client = awsCredentials.accessKeyId 18 | ? new S3({ ...awsCredentials }) 19 | : new S3(); 20 | 21 | async function ScreenshotScript(params) { 22 | const crawler = await Crawler(); 23 | 24 | crawler.queue(params.url, async ({ page, data: url }) => { 25 | await page.goto(url); 26 | const screenshotData = await page.screenshot({ 27 | type: "jpeg", 28 | fullPage: true 29 | }); 30 | 31 | logger.info(`Screenshot taken of ${params.url}`); 32 | 33 | if (useS3Storage) { 34 | await s3client 35 | .upload({ 36 | Bucket: s3BucketName, 37 | Key: "serverless-web-crawler/screenshot.jpg", 38 | Body: screenshotData, 39 | ContentType: "image/jpeg", 40 | ACL: "public-read" 41 | }) 42 | .promise(); 43 | logger.info(`Saved screenshot to S3.`); 44 | } else { 45 | const outputFilePath = path.resolve(__dirname, "../screenshot.jpg"); 46 | try { 47 | await fs.writeFile(outputFilePath, screenshotData); 48 | logger.info("Saved screenshot to disk."); 49 | } catch (err) { 50 | logger.error("Failed to write screenshot to file", err); 51 | } 52 | } 53 | }); 54 | 55 | await crawler.idle(); 56 | } 57 | 58 | module.exports = ScreenshotScript; 59 | -------------------------------------------------------------------------------- /scripts/utils/evaluate.js: -------------------------------------------------------------------------------- 1 | const evaluate = (page, ...params) => browserFn => { 2 | const fnIndexes = []; 3 | params = params.map((param, i) => { 4 | if (typeof param === "function") { 5 | fnIndexes.push(i); 6 | return param.toString(); 7 | } 8 | return param; 9 | }); 10 | return page.evaluate( 11 | (fnIndexes, browserFnStr, ...params) => { 12 | for (let i = 0; i < fnIndexes.length; i++) { 13 | params[fnIndexes[i]] = new Function( 14 | " return (" + params[fnIndexes[i]] + ").apply(null, arguments)" 15 | ); 16 | } 17 | browserFn = new Function( 18 | " return (" + browserFnStr + ").apply(null, arguments)" 19 | ); 20 | return browserFn(...params); 21 | }, 22 | fnIndexes, 23 | browserFn.toString(), 24 | ...params 25 | ); 26 | }; 27 | 28 | module.exports = evaluate; 29 | -------------------------------------------------------------------------------- /scripts/utils/wait-and-retry-until.js: -------------------------------------------------------------------------------- 1 | import isUndefined from "lodash/isUndefined"; 2 | 3 | export default async function(page, selector, options = {}) { 4 | const { timeout = 10000, retries = 3, visible, hidden } = options; 5 | let isFound = false; 6 | let count = 0; 7 | while (!isFound) { 8 | try { 9 | const waitOptions = { 10 | timeout 11 | }; 12 | if (!isUndefined(visible)) { 13 | waitOptions.visible = visible; 14 | } 15 | if (!isUndefined(hidden)) { 16 | waitOptions.hidden = hidden; 17 | } 18 | await page.waitForSelector(selector, waitOptions); 19 | isFound = true; 20 | } catch (e) { 21 | await page.reload(); 22 | count++; 23 | if (count > retries - 1) { 24 | isFound = true; 25 | throw e; 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /serverless.yml: -------------------------------------------------------------------------------- 1 | # Fargate CloudFormation resources 
based on: https://github.com/ryfeus/stepfunctions2processing/blob/master/aws-fargate/serverless.yml 2 | # Fargate based Puppeteer based on: https://dev.to/hoangleitvn/building-serverless-web-crawler-with-puppeteer-on-aws-fargate-22k3 3 | 4 | service: serverless-web-crawler 5 | 6 | # You can pin your service to only deploy with a specific Serverless version 7 | # Check out our docs for more details 8 | frameworkVersion: ">=1.28.0 <2.0.0" 9 | 10 | provider: 11 | name: aws 12 | runtime: nodejs14.x 13 | region: ${opt:region, 'ap-southeast-2'} 14 | memorySize: 512 15 | timeout: 900 16 | stage: ${opt:stage, 'dev'} 17 | environment: 18 | NODE_ENV: 19 | ${self:custom.nodeEnv.${opt:stage, self:provider.stage}, 'development'} 20 | 21 | package: 22 | exclude: 23 | - bin/** 24 | - cache/** 25 | - debug/** 26 | - logs/** 27 | - test/** 28 | 29 | plugins: 30 | - serverless-plugin-include-dependencies 31 | - serverless-plugin-common-excludes 32 | - serverless-pseudo-parameters 33 | - serverless-step-functions 34 | - serverless-dotenv-plugin # Learn more - https://www.serverless.com/plugins/serverless-dotenv-plugin 35 | - serverless-plugin-ifelse # Learn more - https://www.serverless.com/plugins/serverless-plugin-ifelse 36 | - serverless-offline 37 | 38 | custom: 39 | nodeEnv: 40 | dev: development 41 | prod: production 42 | serverless-offline: 43 | port: 9000 44 | serverlessIfElse: 45 | - If: '"${env:S3_BUCKET_NAME}" != ""' 46 | Set: 47 | provider.iamRoleStatements: 48 | - Effect: "Allow" 49 | Action: 50 | - "s3:*Object" 51 | Resource: "arn:aws:s3:::${env:S3_BUCKET_NAME}*/*" 52 | 53 | # functions: 54 | 55 | stepFunctions: 56 | validate: true 57 | stateMachines: 58 | Crawl: ${file(resources/state-machines/crawl.yml)} 59 | 60 | resources: 61 | - ${file(resources/parameters.yml)} 62 | - ${file(resources/conditions.yml)} 63 | - ${file(resources/fargate.yml)} 64 | - ${file(resources/iam.yml)} 65 | - ${file(resources/sns.yml)} 66 | - ${file(resources/outputs.yml)} 67 | --------------------------------------------------------------------------------
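
As referenced in the README's "Developing a Crawl Script" section, here is a minimal sketch of a custom crawl script following the pattern of `./scripts/screenshot.js`: a file in `./scripts/` exporting an `async function(){}` that queues work on the shared crawler. The file name `./scripts/title.js` is hypothetical and does not exist in this repository:

```js
const { Crawler } = require("../lib/crawler");
const logger = require("../lib/logger");

// Hypothetical example: logs the <title> of the URL passed via `-p url=...`
async function TitleScript(params) {
	const crawler = await Crawler();

	crawler.queue(params.url, async ({ page, data: url }) => {
		await page.goto(url);
		const title = await page.title();
		logger.info(`Title of ${url}: ${title}`);
	});

	// Wait for all queued tasks to finish before the process exits
	await crawler.idle();
}

module.exports = TitleScript;
```

It would then be run with `yarn dev -r title -p url=https://www.webdoodle.com.au/`.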