├── .dockerignore
├── .editorconfig
├── .env.example
├── .eslintrc.js
├── .gitignore
├── .prettierignore
├── .prettierrc
├── Dockerfile
├── LICENSE
├── README.md
├── bin
│   └── deploy_container.sh
├── docker_entrypoint.sh
├── lib
│   ├── constants
│   │   ├── args.js
│   │   ├── env.js
│   │   └── index.js
│   ├── crawler.js
│   ├── index.js
│   ├── logger.js
│   ├── run-script.js
│   └── utils
│       ├── async-for-each.js
│       ├── debugger.js
│       ├── from-image-url.js
│       ├── promise-retry.js
│       ├── remove-specials-and-spaces.js
│       └── sentry.js
├── package.json
├── resources
│   ├── conditions.yml
│   ├── fargate.yml
│   ├── iam.yml
│   ├── outputs.yml
│   ├── parameters.yml
│   ├── sns.yml
│   └── state-machines
│       └── crawl.yml
├── scripts
│   ├── screenshot.js
│   └── utils
│       ├── evaluate.js
│       └── wait-and-retry-until.js
├── serverless.yml
└── yarn.lock
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | bin/
2 | .vscode/
3 | node_modules/
4 | .serverless
5 | cache/
6 | debug/
7 | logs/
8 | test/
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 | 
3 | [*]
4 | indent_style = tab
5 | indent_size = 2
6 | tab_width = 2
7 | end_of_line = lf
8 | charset = utf-8
9 | trim_trailing_whitespace = true
10 | insert_final_newline = true
11 | 
12 | [*.yml]
13 | indent_style = space
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | SENTRY_DSN=""
2 | 
3 | AWS_ACCESS_ID=""
4 | AWS_SECRET_KEY=""
5 | 
6 | NOTIFY_EMAIL=""
7 | 
8 | S3_BUCKET_NAME=""
--------------------------------------------------------------------------------
/.eslintrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | 	env: {
3 | 		node: true
4 | 	},
5 | 	globals: {
6 | 		// For use in page.evaluate()
7 | 		document: "readonly",
8 | 		window: "readonly"
9 | 	},
10 | 	extends: [
11 | 		"eslint:recommended",
12 | 		"plugin:node/recommended",
13 | 		"prettier",
14 | 		"plugin:import/errors",
15 | 		"plugin:import/warnings"
16 | 	],
17 | 	plugins: [],
18 | 	parserOptions: {
19 | 		ecmaVersion: 2018,
20 | 		sourceType: "module",
21 | 		allowImportExportEverywhere: true
22 | 	},
23 | 	settings: {
24 | 		"import/resolver": {
25 | 			alias: {
26 | 				extensions: [".js", ".jsx", ".json"]
27 | 			}
28 | 		}
29 | 	},
30 | 	rules: {
31 | 		"no-console": "off",
32 | 		"no-shadow": "off",
33 | 		"no-unused-vars": ["error", { ignoreRestSiblings: true }],
34 | 		// See: https://github.com/benmosher/eslint-plugin-import/issues/496
35 | 		// https://stackoverflow.com/questions/44939304/eslint-should-be-listed-in-the-projects-dependencies-not-devdependencies
36 | 		"import/no-extraneous-dependencies": ["error", { devDependencies: true }],
37 | 		"import/prefer-default-export": 0,
38 | 		"no-template-curly-in-string": 0,
39 | 		"no-underscore-dangle": 0,
40 | 		"class-methods-use-this": 0,
41 | 		"no-param-reassign": 0,
42 | 		"no-await-in-loop": 0,
43 | 		"no-plusplus": 0,
44 | 		"import/no-dynamic-require": 0
45 | 	}
46 | };
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # dependencies
2 | /node_modules
3 | jspm_packages
4 | /flow-typed
5 | /.pnp
6 | .pnp.js
7 | 
8 | # testing
9 | /.test_temp
10 | /coverage
11 | 
12 | # misc
13 | .DS_Store
14 | .env.*
15 | !.env.example
16 | package.json.lerna_backup
17 | 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | 22 | # serverless 23 | .serverless 24 | 25 | # output 26 | /logs 27 | /debug 28 | cache/* 29 | !cache/.gitkeep 30 | output/* 31 | !output/.gitkeep 32 | screenshot.jpg 33 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | build 2 | dist 3 | coverage 4 | node_modules 5 | vendor 6 | .yarn 7 | .next 8 | out 9 | logs 10 | debug 11 | cache -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "trailingComma": "none", 3 | "tabWidth": 2, 4 | "useTabs": true, 5 | "semi": true, 6 | "overrides": [ 7 | { 8 | "files": "*.yml", 9 | "options": { 10 | "useTabs": false 11 | } 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:14-slim 2 | RUN apt-get update 3 | 4 | ENV HOME_DIR /usr/src/app 5 | ENV DISABLE_TOR_PROXY false 6 | 7 | # for https 8 | RUN apt-get install -yyq ca-certificates 9 | 10 | # install libraries 11 | RUN apt-get install -yyq libappindicator1 libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 12 | 13 | # tools 14 | RUN apt-get install -yyq gconf-service lsb-release wget xdg-utils 15 | 16 | # and fonts 17 | RUN apt-get install -yyq fonts-liberation 18 | 19 | # OS dependencies for image manipulation 20 | RUN apt-get install -yyq build-essential libcairo2-dev libpango1.0-dev libjpeg-dev libgif-dev librsvg2-dev libvips libvips-dev 21 | 22 | # Install Tor Proxy dependencies 23 | RUN apt-get install -yyq apt-transport-https curl 24 | RUN echo "deb https://deb.torproject.org/torproject.org/ $(lsb_release -cs) main" > /etc/apt/sources.list.d/tor.list 25 | RUN curl https://deb.torproject.org/torproject.org/A3C4F0F979CAA22CDBA8F512EE8CBC9E886DDD89.asc | gpg --import 26 | RUN gpg --export A3C4F0F979CAA22CDBA8F512EE8CBC9E886DDD89 | apt-key add - 27 | RUN apt update 28 | RUN apt install -yyq tor tor-geoipdb torsocks deb.torproject.org-keyring; \ 29 | tor --version 30 | 31 | RUN mkdir -p $HOME_DIR 32 | 33 | WORKDIR $HOME_DIR 34 | 35 | # Add our package.json and install *before* adding our application files to 36 | # optimize build performance 37 | ADD package.json $HOME_DIR 38 | ADD yarn.lock $HOME_DIR 39 | 40 | # install the necessary packages 41 | RUN npm_config_build_from_source=true yarn install --unsafe-perm --save-exact --production 42 | COPY . 
$HOME_DIR
43 | RUN yarn clean
44 | 
45 | RUN chmod +x ./docker_entrypoint.sh
46 | 
47 | ENTRYPOINT ["./docker_entrypoint.sh"]
48 | CMD ["clean"]
49 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 Ryan Soury
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Serverless Web Crawler
2 | 
3 | A Serverless Web Crawler that can execute for an indefinite amount of time. Perfect for crawling jobs that last longer than a minute and only need to be executed once or twice a month.
4 | 
5 | This boilerplate library can be used to deploy a completely serverless workflow to AWS that allows for multi-step web crawling.
6 | It runs a web crawler in a Docker Container that is managed by AWS Fargate.
7 | The AWS Fargate process is triggered by an AWS Step Functions Workflow.
8 | This allows you to extend the workflow to prepare data for the Web Crawler, or to manipulate the data it produces.
9 | 
10 | AWS Step Functions serves as a really good initiator for Fargate processes, as executions can be triggered by a schedule or an HTTP Request.
11 | AWS Step Functions can also trigger Notifications via SNS when processes fail or complete.
12 | AWS Step Functions is also serverless by default, requiring no compute resources until it's executed.
13 | 
14 | ## Getting Started
15 | 
16 | ### Set up your environment file
17 | 
18 | 1. Copy `.env.example` to `.env.development` or `.env.production`, depending on which environment you're configuring.
19 | 2. Add your values to the environment dotenv file.
20 | 
21 | #### Environment Variables vs Argument Parameters
22 | 
23 | It's important to distinguish between Environment and Argument parameters.
24 | Environment variables should configure how the crawler interfaces with its environment. This includes where it transmits data, which email is notified via SNS, which AWS Credentials to use, etc.
25 | Argument parameters should configure how the crawler operates. These are settings that directly change the way the crawler runs. This includes modifying how many concurrent browsers/requests are executed, whether Tor is used, which storage mechanism to use, etc.
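
For example, a single run might use both kinds of configuration. The variable names below come from `.env.example` and the flags from `./lib/constants/args.js`, but the values are illustrative only:

```shell
# .env.production (environment: where data is sent, who is notified)
S3_BUCKET_NAME="my-crawl-bucket"
NOTIFY_EMAIL="me@example.com"

# Arguments (operation: what to crawl and how)
yarn start -r screenshot -p url=https://www.webdoodle.com.au/ --concurrency 2 --storage s3
```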
26 | 
27 | ### Local
28 | 
29 | In Development:
30 | 
31 | ```shell
32 | yarn dev -r screenshot -p url=https://www.webdoodle.com.au/ --with-ui
33 | ```
34 | 
35 | In Production:
36 | 
37 | ```shell
38 | yarn start -r screenshot -p url=https://www.webdoodle.com.au/ --with-ui
39 | ```
40 | 
41 | ### Docker
42 | 
43 | The Docker Image will only work for a production environment. Be sure to configure your `.env.production` dotenv file before building your Docker Image.
44 | 
45 | Build the Docker Image:
46 | 
47 | ```shell
48 | docker build -t serverless-web-crawl:latest .
49 | ```
50 | 
51 | Run the Docker Container locally:
52 | 
53 | ```shell
54 | docker run --rm -it serverless-web-crawl:latest start -r screenshot -p url=https://www.webdoodle.com.au/ -s s3
55 | ```
56 | 
57 | #### AWS ECR Settings
58 | 
59 | To configure the settings used to push the Docker Image to AWS ECR, please see `./bin/deploy_container.sh`.
60 | 
61 | ### Passing Parameters
62 | 
63 | To learn what parameters can be passed to the crawler, please see `./lib/constants`.
64 | Environment variables are defined in `./lib/constants/env.js`.
65 | Argument parameters are defined in `./lib/constants/args.js`.
66 | 
67 | ## Features
68 | 
69 | This repository is full of features to simplify getting started with a Serverless Web Crawl.
70 | 
71 | - Puppeteer Concurrency using `puppeteer-cluster`
72 | - Puppeteer Addons using `puppeteer-extra`
73 | - User Agent Masking
74 | - Optionally use Tor Proxy (to really avoid detection)
75 | 
76 | ## Building and Deploying Docker Image
77 | 
78 | Use the script provided: `./bin/deploy_container.sh`.
79 | 
80 | ## Developing a Crawl Script
81 | 
82 | Refer to the example Crawl Script that takes a Screenshot of the URL provided through a parameter: `./scripts/screenshot.js`.
83 | Create new scripts by adding a new file to the `./scripts/` folder, then exporting an `async function(){}` (a minimal sketch of such a script appears at the end of this document).
84 | 
85 | ## Deploy to AWS
86 | 
87 | ```shell
88 | yarn deploy
89 | ```
90 | 
91 | You can execute the deployed API (if you've used an HTTP event to trigger the state machine) like so:
92 | 
93 | ```shell
94 | curl https://5jh0zty1c3.execute-api.ap-southeast-2.amazonaws.com/prod/ -X POST -d '{"command": ["start", "--run", "screenshot", "--params", "url=https://www.webdoodle.com.au/", "--concurrency", "1", "--storage", "s3"]}'
95 | ```
96 | 
97 | ## Example State Machine Definition
98 | 
99 | ```yaml
100 | id: Crawl
101 | events:
102 |   - http:
103 |       path: "/"
104 |       method: "POST"
105 |   # - schedule:
106 |   #     rate: rate(24 hours)
107 |   #     enabled:
108 |   #       # ${self:custom.scheduleEnabled.${opt:stage, self:provider.stage}, false}
109 |   #       false
110 |   #     input:
111 |   #       executionId.$: $$.Execution.Id
112 |   #       executionName.$: $$.Execution.Name
113 | notifications:
114 |   ABORTED:
115 |     - sns: !Ref WebCrawlNotificationsTopic
116 |   FAILED:
117 |     - sns: !Ref WebCrawlNotificationsTopic
118 |   TIMED_OUT:
119 |     - sns: !Ref WebCrawlNotificationsTopic
120 |   SUCCEEDED:
121 |     - sns: !Ref WebCrawlNotificationsTopic
122 | role:
123 |   Fn::GetAtt: [StateMachinePassRole, Arn]
124 | definition:
125 |   Comment: "Serverless Web Crawl"
126 |   StartAt: PullProducts
127 |   States:
128 |     PullProducts:
129 |       Type: Task
130 |       Resource: "arn:aws:states:::ecs:runTask.sync"
131 |       Parameters:
132 |         LaunchType: "FARGATE"
133 |         Cluster: "#{ECSCluster}"
134 |         TaskDefinition: "#{FargateTaskDefinition}"
135 |         NetworkConfiguration:
136 |           AwsvpcConfiguration:
137 |             Subnets:
138 |               - "#{PublicSubnetOne}"
139 |               - "#{PublicSubnetTwo}"
140 |             AssignPublicIp: ENABLED
141 |         Overrides:
142 |           ContainerOverrides:
143 |             - 
Name: "#{ServiceName}" 144 | Command: 145 | - start 146 | - --run 147 | - products 148 | - --cloud 149 | - --concurrency 150 | - "6" 151 | - --tor 152 | Environment: 153 | - Name: EXECUTION_ID 154 | Value.$: $$.Execution.Id 155 | - Name: EXECUTION_NAME 156 | Value.$: $$.Execution.Name 157 | Next: Check 158 | Check: 159 | Type: Task 160 | Resource: 161 | Fn::GetAtt: [checkCrawl, Arn] 162 | Parameters: 163 | executionId.$: "$$.Execution.Id" 164 | executionName.$: "$$.Execution.Name" 165 | storeId.$: "$$.Execution.Name" 166 | ResultPath: "$.check" 167 | Next: DetermineActionOnProducts 168 | FailedToPullProducts: 169 | Type: Fail 170 | Cause: "No products pulled." 171 | DetermineActionOnProducts: 172 | Type: Choice 173 | Choices: 174 | - Variable: "$.check" 175 | NumericEquals: 0 176 | Next: FailedToPullProducts 177 | Default: Enrich 178 | Enrich: 179 | Type: Parallel 180 | Branches: 181 | - StartAt: ProductAttributes 182 | States: 183 | ProductAttributes: 184 | Type: Task 185 | Resource: "arn:aws:states:::ecs:runTask.sync" 186 | Parameters: 187 | LaunchType: "FARGATE" 188 | Cluster: "#{ECSCluster}" 189 | TaskDefinition: "#{FargateTaskDefinition}" 190 | NetworkConfiguration: 191 | AwsvpcConfiguration: 192 | Subnets: 193 | - "#{PublicSubnetOne}" 194 | - "#{PublicSubnetTwo}" 195 | AssignPublicIp: ENABLED 196 | Overrides: 197 | ContainerOverrides: 198 | - Name: "#{ServiceName}" 199 | Command: 200 | - start 201 | - --run 202 | - product/attributes 203 | - --cloud 204 | - --concurrency 205 | - "6" 206 | - --tor 207 | Environment: 208 | - Name: EXECUTION_ID 209 | Value.$: $$.Execution.Id 210 | - Name: EXECUTION_NAME 211 | Value.$: $$.Execution.Name 212 | End: true 213 | - StartAt: ProductPages 214 | States: 215 | ProductPages: 216 | Type: Task 217 | Resource: "arn:aws:states:::ecs:runTask.sync" 218 | Parameters: 219 | LaunchType: "FARGATE" 220 | Cluster: "#{ECSCluster}" 221 | TaskDefinition: "#{FargateTaskDefinition}" 222 | NetworkConfiguration: 223 | AwsvpcConfiguration: 224 | Subnets: 225 | - "#{PublicSubnetOne}" 226 | - "#{PublicSubnetTwo}" 227 | AssignPublicIp: ENABLED 228 | Overrides: 229 | ContainerOverrides: 230 | - Name: "#{ServiceName}" 231 | Command: 232 | - start 233 | - --run 234 | - product/pages 235 | - --cloud 236 | - --concurrency 237 | - "6" 238 | - --tor 239 | Environment: 240 | - Name: EXECUTION_ID 241 | Value.$: $$.Execution.Id 242 | - Name: EXECUTION_NAME 243 | Value.$: $$.Execution.Name 244 | Next: ProductSearchResults 245 | ProductSearchResults: 246 | Type: Task 247 | Resource: "arn:aws:states:::ecs:runTask.sync" 248 | Parameters: 249 | LaunchType: "FARGATE" 250 | Cluster: "#{ECSCluster}" 251 | TaskDefinition: "#{FargateTaskDefinition}" 252 | NetworkConfiguration: 253 | AwsvpcConfiguration: 254 | Subnets: 255 | - "#{PublicSubnetOne}" 256 | - "#{PublicSubnetTwo}" 257 | AssignPublicIp: ENABLED 258 | Overrides: 259 | ContainerOverrides: 260 | - Name: "#{ServiceName}" 261 | Command: 262 | - start 263 | - --run 264 | - product/search-results 265 | - --cloud 266 | - --concurrency 267 | - "6" 268 | Environment: 269 | - Name: DISABLE_TOR_PROXY 270 | Value: "true" 271 | - Name: EXECUTION_ID 272 | Value.$: $$.Execution.Id 273 | - Name: EXECUTION_NAME 274 | Value.$: $$.Execution.Name 275 | End: true 276 | Next: Merge 277 | Merge: 278 | Type: Task 279 | Resource: "arn:aws:states:::ecs:runTask.sync" 280 | Parameters: 281 | LaunchType: "FARGATE" 282 | Cluster: "#{ECSCluster}" 283 | TaskDefinition: "#{FargateTaskDefinition}" 284 | NetworkConfiguration: 285 | AwsvpcConfiguration: 286 | 
Subnets:
287 |               - "#{PublicSubnetOne}"
288 |               - "#{PublicSubnetTwo}"
289 |             AssignPublicIp: ENABLED
290 |         Overrides:
291 |           ContainerOverrides:
292 |             - Name: "#{ServiceName}"
293 |               Command:
294 |                 - start
295 |                 - --run
296 |                 - merge
297 |                 - --cloud
298 |               Environment:
299 |                 - Name: DISABLE_TOR_PROXY
300 |                   Value: "true"
301 |                 - Name: EXECUTION_ID
302 |                   Value.$: $$.Execution.Id
303 |                 - Name: EXECUTION_NAME
304 |                   Value.$: $$.Execution.Name
305 |           Cpu: 256
306 |           Memory: 512
307 |       Retry:
308 |         - ErrorEquals:
309 |             - "States.TaskFailed"
310 |           IntervalSeconds: 60
311 |           MaxAttempts: 3
312 |           BackoffRate: 5
313 |       Next: PrepareData
314 |     PrepareData:
315 |       Type: Task
316 |       Resource: arn:aws:states:::states:startExecution
317 |       Parameters:
318 |         Input:
319 |           parentExecutionId.$: "$$.Execution.Id"
320 |           parentExecutionName.$: "$$.Execution.Name"
321 |         StateMachineArn: ${self:resources.Outputs.InventorySyncDataPrepare.Value}
322 |       End: true
323 | ```
--------------------------------------------------------------------------------
/bin/deploy_container.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # This script is used to build and push the docker image to an ECR repository
4 | 
5 | AWS_PROFILE=default
6 | REGION=ap-southeast-2
7 | REPO_NAME=serverless-web-crawl
8 | TAG=latest
9 | 
10 | # A way to pass in options.
11 | while getopts p:r:n:t: option
12 | do
13 | case "${option}"
14 | in
15 | p) AWS_PROFILE=${OPTARG};;
16 | r) REGION=${OPTARG};;
17 | n) REPO_NAME=${OPTARG};;
18 | t) TAG=${OPTARG};;
19 | esac
20 | done
21 | 
22 | echo "Profile: $AWS_PROFILE"
23 | echo "Region: $REGION"
24 | echo "Repository: $REPO_NAME:$TAG"
25 | 
26 | # Get the account number associated with the current IAM credentials
27 | ACCOUNT_ID=$(aws sts get-caller-identity --profile $AWS_PROFILE --query Account --output text)
28 | 
29 | if [ $? -ne 0 ]
30 | then
31 | exit 255
32 | fi
33 | 
34 | 
35 | # If the repository doesn't exist in ECR, create it.
36 | echo 'Checking repo existence...'
37 | aws ecr describe-repositories --region $REGION --profile $AWS_PROFILE --repository-names "${REPO_NAME}" > /dev/null 2>&1
38 | 
39 | if [ $? -ne 0 ]
40 | then
41 | echo "Repo $REPO_NAME doesn't exist, trying to create a new one"
42 | aws ecr create-repository --region $REGION --profile $AWS_PROFILE --repository-name "${REPO_NAME}" > /dev/null
43 | fi
44 | 
45 | echo 'Logging in to ECR Repository...'
46 | 
47 | AWS_ECR_PASSWORD=$(aws ecr get-login-password --region $REGION --profile $AWS_PROFILE)
48 | docker login --username AWS --password $AWS_ECR_PASSWORD $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com
49 | 
50 | echo 'Building and pushing docker image to ECR repository...'
51 | docker build -t $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME:$TAG . 
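# Note: the push below is skipped when the script's first positional argument is "true".
# This appears to act as a build-only switch, e.g. `./bin/deploy_container.sh true` builds without pushing.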
52 | 
53 | if [ "$1" != "true" ]; then
54 | docker push $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME:$TAG
55 | fi
56 | 
57 | echo 'Docker image publish completed'
58 | 
--------------------------------------------------------------------------------
/docker_entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [ "${DISABLE_TOR_PROXY}" != "true" ]; then # Start Tor unless explicitly disabled
4 | /etc/init.d/tor start
5 | fi
6 | 
7 | /usr/local/bin/yarn "$@"
8 | 
--------------------------------------------------------------------------------
/lib/constants/args.js:
--------------------------------------------------------------------------------
1 | const { Command } = require("commander");
2 | const isNumber = require("is-number");
3 | const { isTest, isLambda } = require("./env");
4 | 
5 | const program = new Command();
6 | let options = {};
7 | 
8 | const primitiveTyping = val => {
9 | 	if (val === "true") {
10 | 		return true;
11 | 	}
12 | 	if (val === "false") {
13 | 		return false;
14 | 	}
15 | 	if (isNumber(val)) {
16 | 		return parseFloat(val); // parseFloat preserves decimals; parseInt would truncate values like "1.5"
17 | 	}
18 | 	return val;
19 | };
20 | 
21 | if (!isTest && !isLambda) {
22 | 	program
23 | 		.requiredOption(
24 | 			"-r, --run <script>",
25 | 			"The script name to run. ie. 'screenshot'."
26 | 		)
27 | 		.option(
28 | 			"-p, --params <params>",
29 | 			"Parameters can be passed to the script in a comma-separated key=value format. Array values are | separated. ie. key=value1|value2|value3. Wrap values with spaces in quotes. ie. categories='gift cards'",
30 | 			value => {
31 | 				const keyvals = value.split(",");
32 | 				const params = {};
33 | 				keyvals.forEach(keyval => {
34 | 					const [key, val] = keyval.split("=");
35 | 					let v = primitiveTyping(val);
36 | 					if (val.indexOf("|") > -1 && v === val) {
37 | 						v = val.split("|").map(arrVal => primitiveTyping(arrVal));
38 | 					}
39 | 					params[key] = v;
40 | 				});
41 | 				return params;
42 | 			}
43 | 		)
44 | 		.option(
45 | 			"-n, --concurrency <number>",
46 | 			"Set the number of concurrent Chrome instances. Defaults to 2.",
47 | 			value => parseInt(value, 10)
48 | 		)
49 | 		.option("-w, --with-ui", "Run with the Chrome UI. ie. Non-headless mode.")
50 | 		.option("-m, --monitor", "Monitor the status of the puppeteer cluster.")
51 | 		.option(
52 | 			"-sd, --same-domain-delay <delay>",
53 | 			"Time in milliseconds to wait between each request to the same domain."
54 | 		)
55 | 		.option(
56 | 			"-s, --storage <storage>",
57 | 			"By default, storage is 'local'. Valid values are 'local' and 's3'."
58 | 		)
59 | 		.option("-t, --tor", "Use Tor Proxy Server to anonymously crawl.")
60 | 		.option(
61 | 			"-tp, --tor-proxy-port <port>",
62 | 			"Tor Proxy SOCKS5 Port to pass to Chrome. Defaults to 9050."
63 | 		)
64 | 		.option(
65 | 			"-l, --log-level <level>",
66 | 			"Set the log level. ie. 'fatal', 'error', 'warn', 'info', 'debug', 'trace' or 'silent'."
67 | 		)
68 | 		.option(
69 | 			"-d, --debug <options>",
70 | 			"Run in debug mode. All key data points will be stored for review. Does not respect target sites. 
Do not run in production.", 71 | value => { 72 | return value.split(","); 73 | } 74 | ) 75 | .parse(process.argv); 76 | 77 | options = program.opts(); 78 | } 79 | 80 | if (options.debug) { 81 | console.log(options); 82 | } 83 | 84 | let useLocalStorage = true; 85 | let useS3Storage = false; 86 | if (options.storage === "s3") { 87 | useLocalStorage = false; 88 | useS3Storage = true; 89 | } 90 | 91 | module.exports = { 92 | ...options, 93 | params: options.params || {}, 94 | sameDomainDelay: options.sameDomainDelay || 1000, 95 | concurrency: options.concurrency || 2, 96 | torProxyPort: options.torProxyPort || 9050, 97 | useLocalStorage, 98 | useS3Storage 99 | }; 100 | -------------------------------------------------------------------------------- /lib/constants/env.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const envalid = require("envalid"); 3 | 4 | const { str } = envalid; 5 | 6 | const env = envalid.cleanEnv( 7 | process.env, 8 | { 9 | AWS_LAMBDA_FUNCTION_NAME: str({ default: "" }), 10 | EXECUTION_ID: str({ default: "" }), 11 | EXECUTION_NAME: str({ default: "" }), 12 | EXECUTION_STATE_NAME: str({ default: "" }), 13 | S3_BUCKET_NAME: str({ default: "" }), 14 | SENTRY_DSN: str({ default: "" }), 15 | AWS_ACCESS_ID: str({ default: "" }), 16 | AWS_SECRET_KEY: str({ default: "" }) 17 | }, 18 | { 19 | dotEnvPath: path.resolve( 20 | __dirname, 21 | `../../.env.${process.env.NODE_ENV || "development"}` 22 | ) 23 | } 24 | ); 25 | 26 | module.exports.isProd = env.isProduction; 27 | module.exports.isTest = env.isTest; 28 | module.exports.functionName = env.AWS_LAMBDA_FUNCTION_NAME; 29 | module.exports.isLambda = !!env.AWS_LAMBDA_FUNCTION_NAME; 30 | module.exports.executionId = env.EXECUTION_ID; 31 | module.exports.executionName = env.EXECUTION_NAME; 32 | module.exports.executionStateName = env.EXECUTION_STATE_NAME; 33 | module.exports.s3BucketName = env.S3_BUCKET_NAME; 34 | module.exports.sentryDSN = env.SENTRY_DSN; 35 | module.exports.awsCredentials = 36 | !!env.AWS_ACCESS_ID && !!env.AWS_SECRET_KEY 37 | ? 
{ 38 | accessKeyId: env.AWS_ACCESS_ID, 39 | secretAccessKey: env.AWS_SECRET_KEY 40 | } 41 | : {}; 42 | -------------------------------------------------------------------------------- /lib/constants/index.js: -------------------------------------------------------------------------------- 1 | const programArgs = require("./args"); 2 | const envArgs = require("./env"); 3 | 4 | module.exports = { 5 | ...programArgs, 6 | ...envArgs 7 | }; 8 | -------------------------------------------------------------------------------- /lib/crawler.js: -------------------------------------------------------------------------------- 1 | const vanillaPuppeteer = require("puppeteer"); 2 | const { addExtra } = require("puppeteer-extra"); 3 | const { Cluster } = require("puppeteer-cluster"); 4 | const Adblocker = require("puppeteer-extra-plugin-adblocker"); 5 | const Stealth = require("puppeteer-extra-plugin-stealth"); 6 | const AnonymizeUA = require("puppeteer-extra-plugin-anonymize-ua"); 7 | const UserDataDir = require("puppeteer-extra-plugin-user-data-dir"); 8 | const isUndefined = require("lodash/isUndefined"); 9 | const isEmpty = require("is-empty"); 10 | const treekill = require("tree-kill"); 11 | const get = require("lodash/get"); 12 | const UserAgent = require("user-agents"); 13 | const ono = require("ono"); 14 | 15 | const { withModule } = require("./logger"); 16 | const { 17 | withUi, 18 | monitor, 19 | concurrency, 20 | tor, 21 | torProxyPort, 22 | sameDomainDelay 23 | } = require("./constants"); 24 | 25 | const logger = withModule("crawler"); 26 | let headless = true; 27 | if (!isUndefined(withUi)) { 28 | headless = !withUi; 29 | } 30 | 31 | // Setup puppeteer plugins 32 | const puppeteer = addExtra(vanillaPuppeteer); 33 | puppeteer.use(Adblocker()); 34 | puppeteer.use(Stealth()); 35 | puppeteer.use( 36 | AnonymizeUA({ 37 | makeWindows: false, 38 | stripHeadless: false, 39 | customFn() { 40 | const ua = new UserAgent(); 41 | return ua.toString(); 42 | } 43 | }) 44 | ); 45 | puppeteer.use(UserDataDir()); // Manages temp store and clean at launch/close for user data dir. 46 | 47 | // Outside of the exported function to ensure a singleton. ie. cannot launch multiple clusters per execution. 48 | const crawlerInstancePromise = Cluster.launch({ 49 | concurrency: Cluster.CONCURRENCY_CONTEXT, 50 | maxConcurrency: concurrency, 51 | timeout: 10800000, // 3 hours 52 | monitor: !isUndefined(monitor), 53 | puppeteer, 54 | puppeteerOptions: { 55 | headless, 56 | ignoreHTTPSErrors: true, 57 | defaultViewport: { 58 | width: 1280, 59 | height: 900 60 | }, 61 | dumpio: false, 62 | args: [ 63 | "--disable-setuid-sandbox", 64 | "--disable-dev-shm-usage", 65 | "--no-sandbox" 66 | ].concat(tor ? 
[`--proxy-server=socks5://127.0.0.1:${torProxyPort}`] : []) 67 | }, 68 | retryLimit: 3, 69 | retryDelay: 3000, 70 | sameDomainDelay 71 | }); 72 | 73 | let instance; 74 | 75 | const getCrawlerInstance = () => instance; 76 | 77 | const destroyCrawler = async () => { 78 | if (!isEmpty(instance)) { 79 | // Gracefully close 80 | await instance.close(); 81 | 82 | if (typeof get(instance, "browser.browser.process") === "function") { 83 | const { pid } = instance.browser.browser.process(); 84 | if (!isUndefined(pid)) { 85 | treekill(pid, "SIGKILL"); 86 | } 87 | } 88 | } 89 | }; 90 | 91 | process.on("SIGTERM", () => destroyCrawler()); 92 | process.on("exit", () => destroyCrawler()); 93 | 94 | async function Crawler() { 95 | if (!isEmpty(instance)) { 96 | return instance; 97 | } 98 | try { 99 | instance = await crawlerInstancePromise; 100 | // Event handler to be called in case of problems 101 | instance.on("taskerror", (err, data) => { 102 | const args = [`Error crawling:`, err.message, err.stack]; 103 | if (!isEmpty(data)) { 104 | args.push(typeof data === "object" ? JSON.stringify(data) : data); 105 | } 106 | logger.error(...args); 107 | }); 108 | 109 | if (tor) { 110 | logger.info(`Establishing Tor Proxy connection: ${torProxyPort}`); 111 | let isOperational = false; 112 | instance.queue( 113 | "https://check.torproject.org/", 114 | async ({ page, data: url }) => { 115 | await Promise.all([page.waitForNavigation(), page.goto(url)]); 116 | const isUsingTor = await page.$eval("body", el => 117 | el.innerHTML.includes( 118 | "Congratulations. This browser is configured to use Tor" 119 | ) 120 | ); 121 | isOperational = isUsingTor; 122 | } 123 | ); 124 | await instance.idle(); 125 | if (isOperational) { 126 | logger.info(`Browser is using Tor successfully.`); 127 | } else { 128 | throw new Error(`Browser failed to use Tor.`); 129 | } 130 | } 131 | 132 | return instance; 133 | } catch (e) { 134 | logger.error(e); 135 | throw ono(e, "Cannot initiate Crawler"); 136 | } 137 | } 138 | 139 | module.exports.Crawler = Crawler; 140 | module.exports.getCrawlerInstance = getCrawlerInstance; 141 | module.exports.destroyCrawler = destroyCrawler; 142 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Entry point for the library that manages the process of running the Puppeteer script(s). 
3 | */ 4 | 5 | const path = require("path"); 6 | const fs = require("fs"); 7 | const isUndefined = require("lodash/isUndefined"); 8 | const { run: runScriptName, executionName } = require("./constants"); 9 | const logger = require("./logger"); 10 | const runScript = require("./run-script"); 11 | 12 | logger.info(`Job Tasked: ${runScriptName}`); 13 | logger.info(`Execution: ${executionName || "test"}`); 14 | 15 | if (isUndefined(runScriptName)) { 16 | throw new Error("Script name is undefined."); 17 | } 18 | 19 | const scriptPath = path.resolve(__dirname, `../scripts/${runScriptName}.js`); 20 | 21 | if (!fs.existsSync(scriptPath)) { 22 | throw new Error("Invalid script name."); 23 | } 24 | 25 | runScript(scriptPath); 26 | -------------------------------------------------------------------------------- /lib/logger.js: -------------------------------------------------------------------------------- 1 | const Pino = require("pino"); 2 | const { isProd, logLevel } = require("./constants"); 3 | 4 | const logger = Pino({ 5 | level: logLevel || "info", 6 | prettyPrint: !isProd 7 | }); 8 | 9 | module.exports = logger; 10 | 11 | module.exports.withModule = module => logger.child({ module }); 12 | -------------------------------------------------------------------------------- /lib/run-script.js: -------------------------------------------------------------------------------- 1 | const logger = require("./logger"); 2 | const Sentry = require("./utils/sentry"); 3 | const { params } = require("./constants"); 4 | 5 | async function runScript(scriptPath) { 6 | console.time("Finished in"); 7 | const die = async signalNumber => { 8 | console.timeEnd("Finished in"); 9 | // Exit fires an even in crawler that will destroy remaining crawler zombie processes. 10 | process.exit(signalNumber); // eslint-disable-line 11 | }; 12 | try { 13 | const scriptFn = require(scriptPath); 14 | await scriptFn(params); 15 | die(0); 16 | } catch (e) { 17 | Sentry.captureException(e); 18 | logger.error(e.message, e.stack); 19 | Sentry.flush(2500) 20 | .catch(flushErr => { 21 | logger.error(flushErr.message, flushErr.stack); 22 | }) 23 | .finally(() => { 24 | die(1); 25 | }); 26 | } 27 | } 28 | 29 | module.exports = runScript; 30 | -------------------------------------------------------------------------------- /lib/utils/async-for-each.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This Util is great if you're iterating over S3 Objects and need to run some browser related functions as per data in each S3 Object. 3 | */ 4 | 5 | async function asyncForEach(array, callback, untilFn = () => false) { 6 | for (let index = 0; index < array.length; index++) { 7 | await callback(array[index], index, array); 8 | if (untilFn(array[index], index, array)) { 9 | break; 10 | } 11 | } 12 | } 13 | 14 | module.exports = asyncForEach; 15 | -------------------------------------------------------------------------------- /lib/utils/debugger.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Our own debugger that accepts objects, and writes to console/file if a --debug flag is passed. 
3 | */ 4 | 5 | const util = require("util"); 6 | const path = require("path"); 7 | const { writeJsonSync, emptyDirSync } = require("fs-extra"); 8 | const isUndefined = require("lodash/isUndefined"); 9 | const filenamify = require("filenamify"); 10 | const debugLog = require("debug-logfmt"); 11 | const { isTest, debug: debugArg } = require("../constants"); 12 | 13 | let dir; 14 | let debug = debugArg; 15 | if (isTest) { 16 | debug = ["file"]; 17 | } 18 | const enabled = !isUndefined(debug); 19 | 20 | if (enabled) { 21 | console.log(`Log id is: ${process.pid}`); 22 | dir = path.resolve( 23 | __dirname, 24 | `../../debug/${isTest ? "test" : "logs"}/${Date.now()}-${process.pid}` 25 | ); 26 | emptyDirSync(dir); 27 | } 28 | 29 | const DebugLogger = debugLog; 30 | 31 | const getDebugPath = additionalPath => { 32 | if (!enabled) { 33 | throw new Error("Debug not set. Cannot return Debug path."); 34 | } 35 | return path.resolve(dir, additionalPath); 36 | }; 37 | 38 | const debugScreenshot = (page, name, options = {}) => { 39 | if (!enabled) { 40 | return null; 41 | } 42 | 43 | const baseOptions = { 44 | path: path.resolve(dir, filenamify(name, { replacement: "-" })), 45 | type: "jpeg", 46 | quality: 100 47 | }; 48 | 49 | return page.screenshot({ 50 | ...baseOptions, 51 | ...options 52 | }); 53 | }; 54 | 55 | const Debugger = (data, filename = "debug.log") => { 56 | if (!enabled) { 57 | return null; 58 | } 59 | 60 | if (typeof data === "function") { 61 | const fn = data; 62 | return fn(dir); 63 | } 64 | 65 | // Sanitize file name. 66 | filename = filenamify(filename, { replacement: "-" }); 67 | 68 | let toConsole = true; 69 | let toFile = true; 70 | 71 | if (Array.isArray(debug)) { 72 | if (!debug.includes("console")) { 73 | toConsole = false; 74 | } 75 | if (!debug.includes("file")) { 76 | toFile = false; 77 | } 78 | } else if (typeof debug === "string") { 79 | switch (debug) { 80 | case "file": 81 | toConsole = false; 82 | toFile = true; 83 | break; 84 | case "console": 85 | toConsole = true; 86 | toFile = false; 87 | break; 88 | default: 89 | break; 90 | } 91 | } 92 | 93 | if (toConsole) { 94 | console.log(util.inspect(data, false, null, true)); 95 | } 96 | 97 | if (toFile) { 98 | writeJsonSync( 99 | path.resolve(dir, filename), 100 | data, 101 | { 102 | spaces: "\t" 103 | }, 104 | err => { 105 | if (err) console.error(err); 106 | } 107 | ); 108 | } 109 | }; 110 | 111 | module.exports.Debugger = Debugger; 112 | module.exports.DebugLogger = DebugLogger; 113 | module.exports.getDebugPath = getDebugPath; 114 | module.exports.debugScreenshot = debugScreenshot; 115 | -------------------------------------------------------------------------------- /lib/utils/from-image-url.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple util to download images. 3 | */ 4 | 5 | const axios = require("axios"); 6 | const axiosRetry = require("axios-retry"); 7 | 8 | axiosRetry(axios, { retries: 3 }); 9 | 10 | const fromImageUrl = (url, useBase64 = true) => 11 | axios 12 | .get(url, { 13 | responseType: "arraybuffer" 14 | }) 15 | .then(response => 16 | useBase64 17 | ? 
`data:${response.headers["content-type"]};base64,${Buffer.from(
18 | 						response.data,
19 | 						"binary"
20 | 				  ).toString("base64")}`
21 | 				: response.data
22 | 		);
23 | 
24 | module.exports = fromImageUrl;
--------------------------------------------------------------------------------
/lib/utils/promise-retry.js:
--------------------------------------------------------------------------------
1 | const promiseRetry = async (requestFn, config = {}) => {
2 | 	const { retries = 3, onRetry = () => {} } = config;
3 | 	let count = 0;
4 | 	let isWorking = false;
5 | 	let resp;
6 | 	while (isWorking === false) {
7 | 		try {
8 | 			resp = await requestFn();
9 | 			isWorking = true;
10 | 		} catch (e) {
11 | 			count++;
12 | 			if (count >= retries) {
13 | 				throw e;
14 | 			} else {
15 | 				onRetry(count, e);
16 | 			}
17 | 		}
18 | 	}
19 | 	return resp;
20 | };
21 | 
22 | module.exports = promiseRetry;
--------------------------------------------------------------------------------
/lib/utils/remove-specials-and-spaces.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * Removes special characters and spaces
3 |  *
4 |  * @param {string} str
5 |  * @return {string} str with special characters and spaces stripped
6 |  */
7 | const removeSpecialsAndSpaces = str => str.replace(/[^A-Z0-9]/gi, "");
8 | 
9 | module.exports = removeSpecialsAndSpaces;
--------------------------------------------------------------------------------
/lib/utils/sentry.js:
--------------------------------------------------------------------------------
1 | const Sentry = require("@sentry/node");
2 | const { sentryDSN } = require("../constants");
3 | const logger = require("../logger");
4 | 
5 | if (sentryDSN) {
6 | 	Sentry.init({
7 | 		dsn: sentryDSN,
8 | 		environment: process.env.NODE_ENV || "development"
9 | 	});
10 | } else {
11 | 	logger.warn(`Sentry DSN doesn't exist. 
Errors are not being tracked.`); 12 | } 13 | 14 | module.exports = Sentry; 15 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "serverless-web-crawler", 3 | "version": "0.0.1", 4 | "description": "Serverless Web Crawler", 5 | "keywords": [ 6 | "data", 7 | "sync", 8 | "inventory", 9 | "products", 10 | "woocommerce", 11 | "serverless" 12 | ], 13 | "author": "Ryan Soury ", 14 | "license": "MIT", 15 | "main": "lib/scripts/index.js", 16 | "directories": { 17 | "lib": "lib", 18 | "test": "test" 19 | }, 20 | "files": [ 21 | "lib" 22 | ], 23 | "repository": { 24 | "type": "git", 25 | "url": "git@github.com:rsoury/serverless-web-crawler.git" 26 | }, 27 | "engines": { 28 | "node": ">= 12.13.1" 29 | }, 30 | "bin": { 31 | "deploy-container": "./bin/deploy_container.sh" 32 | }, 33 | "scripts": { 34 | "start": "cross-env NODE_ENV=production node ./lib/index", 35 | "dev": "cross-env NODE_ENV=development node --trace-warnings ./lib/index", 36 | "clean": "run-p clean:*", 37 | "clean:debug": "del ./debug", 38 | "deploy": "sls deploy -s prod --env production", 39 | "format": "prettier \"lib/**/*.{js,jsx,ts,tsx,json,css,scss,md}\" --write", 40 | "lint": "eslint -c ./.eslintrc.js \"{lib,test}/**/*.{js,jsx,ts,tsx}\"", 41 | "test": "echo \"Add unit tests\"" 42 | }, 43 | "dependencies": { 44 | "@sentry/node": "^5.10.2", 45 | "aws-sdk": "^2.936.0", 46 | "axios": "^0.19.0", 47 | "axios-retry": "^3.1.2", 48 | "commander": "^4.0.1", 49 | "cross-env": "^6.0.3", 50 | "debug-logfmt": "^1.0.4", 51 | "del-cli": "^3.0.1", 52 | "envalid": "^6.0.0", 53 | "filenamify": "^4.1.0", 54 | "fs-extra": "^8.1.0", 55 | "is-empty": "^1.2.0", 56 | "is-number": "^7.0.0", 57 | "lodash": "^4.17.15", 58 | "npm-run-all": "^4.1.5", 59 | "ono": "^7.1.3", 60 | "pino": "^5.14.0", 61 | "puppeteer": "^2.0.0", 62 | "puppeteer-cluster": "^0.18.0", 63 | "puppeteer-extra": "^3.1.7", 64 | "puppeteer-extra-plugin-adblocker": "^2.11.1", 65 | "puppeteer-extra-plugin-anonymize-ua": "^2.2.6", 66 | "puppeteer-extra-plugin-stealth": "^2.4.5", 67 | "puppeteer-extra-plugin-user-data-dir": "^2.2.2", 68 | "tree-kill": "^1.2.2", 69 | "url-parse": "^1.4.7", 70 | "user-agents": "^1.0.505" 71 | }, 72 | "devDependencies": { 73 | "eslint": "^7.29.0", 74 | "eslint-config-prettier": "^8.3.0", 75 | "eslint-import-resolver-alias": "^1.1.2", 76 | "eslint-plugin-import": "^2.23.4", 77 | "eslint-plugin-node": "^11.1.0", 78 | "husky": "^3.1.0", 79 | "lint-staged": "^9.5.0", 80 | "pino-pretty": "^3.5.0", 81 | "prettier": "^1.19.1", 82 | "serverless": "^1.59.1", 83 | "serverless-dotenv-plugin": "^3.9.0", 84 | "serverless-offline": "^5.12.1", 85 | "serverless-plugin-common-excludes": "^3.0.0", 86 | "serverless-plugin-ifelse": "^1.0.7", 87 | "serverless-plugin-include-dependencies": "github:rsoury/serverless-plugin-include-dependencies", 88 | "serverless-pseudo-parameters": "^2.5.0", 89 | "serverless-step-functions": "^2.12.0" 90 | }, 91 | "husky": { 92 | "hooks": { 93 | "pre-commit": "lint-staged" 94 | } 95 | }, 96 | "lint-staged": { 97 | "{lib,test}/**/*.js": [ 98 | "yarn lint --fix", 99 | "git add" 100 | ], 101 | "{lib,test}/**/*.{js,jsx,ts,tsx,json,css,scss,sass,md}": [ 102 | "yarn format", 103 | "git add" 104 | ] 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /resources/conditions.yml: -------------------------------------------------------------------------------- 1 | 
Conditions: 2 | HasCustomRole: !Not [!Equals [!Ref "Role", ""]] 3 | -------------------------------------------------------------------------------- /resources/fargate.yml: -------------------------------------------------------------------------------- 1 | Resources: 2 | # VPC in which containers will be networked. 3 | # It has two public subnets 4 | # We distribute the subnets across the first two available subnets 5 | # for the region, for high availability. 6 | # Important Note: Public IPs required for direct outbound traffic 7 | VPC: 8 | Type: AWS::EC2::VPC 9 | Properties: 10 | EnableDnsSupport: true 11 | EnableDnsHostnames: true 12 | CidrBlock: "172.15.0.0/16" 13 | 14 | # Two public subnets, where containers can have public IP addresses 15 | PublicSubnetOne: 16 | Type: AWS::EC2::Subnet 17 | Properties: 18 | AvailabilityZone: 19 | Fn::Select: 20 | - 0 21 | - Fn::GetAZs: { Ref: "AWS::Region" } 22 | VpcId: !Ref "VPC" 23 | CidrBlock: "172.15.0.0/24" 24 | MapPublicIpOnLaunch: true 25 | PublicSubnetTwo: 26 | Type: AWS::EC2::Subnet 27 | Properties: 28 | AvailabilityZone: 29 | Fn::Select: 30 | - 1 31 | - Fn::GetAZs: { Ref: "AWS::Region" } 32 | VpcId: !Ref "VPC" 33 | CidrBlock: "172.15.1.0/24" 34 | MapPublicIpOnLaunch: true 35 | 36 | # Setup networking resources for the public subnets. Containers 37 | # in the public subnets have public IP addresses and the routing table 38 | # sends network traffic via the internet gateway. 39 | InternetGateway: 40 | Type: AWS::EC2::InternetGateway 41 | GatewayAttachement: 42 | Type: AWS::EC2::VPCGatewayAttachment 43 | Properties: 44 | VpcId: !Ref "VPC" 45 | InternetGatewayId: !Ref "InternetGateway" 46 | PublicRouteTable: 47 | Type: AWS::EC2::RouteTable 48 | Properties: 49 | VpcId: !Ref "VPC" 50 | PublicRoute: 51 | Type: AWS::EC2::Route 52 | DependsOn: 53 | - GatewayAttachement 54 | Properties: 55 | RouteTableId: !Ref "PublicRouteTable" 56 | DestinationCidrBlock: "0.0.0.0/0" 57 | GatewayId: !Ref "InternetGateway" 58 | PublicSubnetOneRouteTableAssociation: 59 | Type: AWS::EC2::SubnetRouteTableAssociation 60 | Properties: 61 | SubnetId: !Ref PublicSubnetOne 62 | RouteTableId: !Ref PublicRouteTable 63 | PublicSubnetTwoRouteTableAssociation: 64 | Type: AWS::EC2::SubnetRouteTableAssociation 65 | Properties: 66 | SubnetId: !Ref PublicSubnetTwo 67 | RouteTableId: !Ref PublicRouteTable 68 | 69 | # ECS Resources 70 | ECSCluster: 71 | Type: AWS::ECS::Cluster 72 | 73 | # A security group for the containers we will run in Fargate. 74 | # Two rules, allowing network traffic from a public facing load 75 | # balancer and from other members of the security group. 76 | # 77 | # Remove any of the following ingress rules that are not needed. 78 | # If you want to make direct requests to a container using its 79 | # public IP address you'll need to add a security group rule 80 | # to allow traffic from all IP addresses. 
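  # A minimal sketch of such a rule, left commented out. The logical ID below is
  # hypothetical, and IpProtocol -1 means all protocols; enable only if you need
  # direct public access to the containers:
  # EcsSecurityGroupIngressFromAnywhere:
  #   Type: AWS::EC2::SecurityGroupIngress
  #   Properties:
  #     Description: Ingress from any IP address
  #     GroupId: !Ref "FargateContainerSecurityGroup"
  #     IpProtocol: -1
  #     CidrIp: "0.0.0.0/0"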
81 | FargateContainerSecurityGroup: 82 | Type: AWS::EC2::SecurityGroup 83 | Properties: 84 | GroupDescription: Access to the Fargate containers 85 | VpcId: !Ref "VPC" 86 | EcsSecurityGroupIngressFromSelf: 87 | Type: AWS::EC2::SecurityGroupIngress 88 | Properties: 89 | Description: Ingress from other containers in the same security group 90 | GroupId: !Ref "FargateContainerSecurityGroup" 91 | IpProtocol: -1 92 | SourceSecurityGroupId: !Ref "FargateContainerSecurityGroup" 93 | # This is an IAM role which authorizes ECS to manage resources on your 94 | # account on your behalf, such as updating your load balancer with the 95 | # details of where your containers are, so that traffic can reach your 96 | # containers. 97 | ECSRole: 98 | Type: AWS::IAM::Role 99 | Properties: 100 | AssumeRolePolicyDocument: 101 | Statement: 102 | - Effect: Allow 103 | Principal: 104 | Service: [ecs.amazonaws.com] 105 | Action: ["sts:AssumeRole"] 106 | Path: / 107 | Policies: 108 | - PolicyName: ecs-service 109 | PolicyDocument: 110 | Statement: 111 | - Effect: Allow 112 | Action: 113 | # Rules which allow ECS to attach network interfaces to instances 114 | # on your behalf in order for awsvpc networking mode to work right 115 | - "ec2:AttachNetworkInterface" 116 | - "ec2:CreateNetworkInterface" 117 | - "ec2:CreateNetworkInterfacePermission" 118 | - "ec2:DeleteNetworkInterface" 119 | - "ec2:DeleteNetworkInterfacePermission" 120 | - "ec2:Describe*" 121 | - "ec2:DetachNetworkInterface" 122 | Resource: "*" 123 | 124 | # This is a role which is used by the ECS tasks themselves. 125 | ECSTaskExecutionRole: 126 | Type: AWS::IAM::Role 127 | Properties: 128 | AssumeRolePolicyDocument: 129 | Statement: 130 | - Effect: Allow 131 | Principal: 132 | Service: [ecs-tasks.amazonaws.com] 133 | Action: ["sts:AssumeRole"] 134 | Path: / 135 | Policies: 136 | - PolicyName: AmazonECSTaskExecutionRolePolicy 137 | PolicyDocument: 138 | Statement: 139 | - Effect: Allow 140 | Action: 141 | # Allow the ECS Tasks to download images from ECR 142 | - "ecr:GetAuthorizationToken" 143 | - "ecr:BatchCheckLayerAvailability" 144 | - "ecr:GetDownloadUrlForLayer" 145 | - "ecr:BatchGetImage" 146 | 147 | # Allow the ECS tasks to upload logs to CloudWatch 148 | - "logs:CreateLogGroup" 149 | - "logs:CreateLogStream" 150 | - "logs:PutLogEvents" 151 | - "logs:DescribeLogStreams" 152 | Resource: "*" 153 | 154 | # The task definition. This is a simple metadata description of what 155 | # container to run, and what resource requirements it has. 156 | FargateTaskDefinition: 157 | Type: AWS::ECS::TaskDefinition 158 | Properties: 159 | Family: !Ref "ServiceName" 160 | Cpu: !Ref "ContainerCpu" 161 | Memory: !Ref "ContainerMemory" 162 | NetworkMode: awsvpc 163 | RequiresCompatibilities: 164 | - FARGATE 165 | ExecutionRoleArn: !Ref ECSTaskExecutionRole 166 | TaskRoleArn: 167 | Fn::If: 168 | - "HasCustomRole" 169 | - !Ref "Role" 170 | - !Ref "AWS::NoValue" 171 | ContainerDefinitions: 172 | - Name: !Ref "ServiceName" 173 | Cpu: !Ref "ContainerCpu" 174 | Memory: !Ref "ContainerMemory" 175 | Image: "#{AWS::AccountId}.dkr.ecr.#{AWS::Region}.amazonaws.com/#{ImageUrl}" 176 | PortMappings: 177 | - ContainerPort: !Ref "ContainerPort" 178 | LogConfiguration: 179 | LogDriver: awslogs 180 | Options: 181 | awslogs-create-group: true 182 | awslogs-group: "/fargate/service/#{ServiceName}" 183 | awslogs-region: "#{AWS::Region}" 184 | awslogs-stream-prefix: ecs 185 | 186 | # The service. 
The service is a resource which allows you to run multiple 187 | # copies of a type of task, and gather up their logs and metrics, as well 188 | # as monitor the number of running tasks and replace any that have crashed 189 | Service: 190 | Type: AWS::ECS::Service 191 | Properties: 192 | ServiceName: !Ref "ServiceName" 193 | Cluster: !Ref "ECSCluster" 194 | LaunchType: FARGATE 195 | DeploymentConfiguration: 196 | MaximumPercent: 200 197 | MinimumHealthyPercent: 75 198 | DesiredCount: !Ref "DesiredCount" 199 | NetworkConfiguration: 200 | AwsvpcConfiguration: 201 | AssignPublicIp: ENABLED 202 | SecurityGroups: 203 | - !Ref FargateContainerSecurityGroup 204 | Subnets: 205 | - !Ref PublicSubnetOne 206 | - !Ref PublicSubnetTwo 207 | TaskDefinition: !Ref "FargateTaskDefinition" 208 | -------------------------------------------------------------------------------- /resources/iam.yml: -------------------------------------------------------------------------------- 1 | Resources: 2 | StateMachinePassRole: 3 | Type: AWS::IAM::Role 4 | Properties: 5 | ManagedPolicyArns: 6 | - arn:aws:iam::aws:policy/AWSStepFunctionsFullAccess 7 | AssumeRolePolicyDocument: 8 | Statement: 9 | - Effect: Allow 10 | Principal: 11 | Service: ["states.#{AWS::Region}.amazonaws.com"] 12 | Action: ["sts:AssumeRole"] 13 | - Effect: Allow 14 | Principal: 15 | Service: "lambda.amazonaws.com" 16 | Action: "sts:AssumeRole" 17 | Path: / 18 | Policies: 19 | - PolicyName: AmazonECSPassRolePolicy 20 | PolicyDocument: 21 | Statement: 22 | - Effect: Allow 23 | Action: 24 | - "iam:PassRole" 25 | Resource: "*" 26 | - Effect: Allow 27 | Action: 28 | - "lambda:InvokeFunction" 29 | Resource: "*" 30 | - Effect: Allow 31 | Action: 32 | - "ecs:RunTask" 33 | - "ecs:StopTask" 34 | - "ecs:DescribeTasks" 35 | - "ecs:StartTelemetrySession" 36 | Resource: "*" 37 | - Effect: Allow 38 | Action: 39 | - "events:PutTargets" 40 | - "events:PutRule" 41 | - "events:DescribeRule" 42 | Resource: "arn:aws:events:#{AWS::Region}:#{AWS::AccountId}:rule/StepFunctionsGetEventsForECSTaskRule" 43 | - Effect: Allow 44 | Action: 45 | - "events:PutTargets" 46 | - "events:PutRule" 47 | - "events:DescribeRule" 48 | Resource: "arn:aws:events:#{AWS::Region}:#{AWS::AccountId}:rule/StepFunctionsGetEventsForStepFunctionsExecutionRule" 49 | -------------------------------------------------------------------------------- /resources/outputs.yml: -------------------------------------------------------------------------------- 1 | Outputs: 2 | WebCrawl: 3 | Description: "The ARN of the StateMachine WebCrawl" 4 | Value: 5 | Ref: CrawlStateMachine 6 | -------------------------------------------------------------------------------- /resources/parameters.yml: -------------------------------------------------------------------------------- 1 | Parameters: 2 | ServiceName: 3 | Type: String 4 | Default: ServerlessWebCrawl 5 | Description: A name for the service 6 | ImageUrl: 7 | Type: String 8 | Default: serverless-web-crawl:latest 9 | Description: 10 | The url of a docker image that contains the application process that will 11 | handle the traffic for this service 12 | ContainerPort: 13 | Type: Number 14 | Default: 80 15 | Description: What port number the application inside the docker container is binding to 16 | ContainerCpu: 17 | Type: Number 18 | Default: 1024 19 | Description: How much CPU to give the container. 
1024 is 1 CPU
20 |   ContainerMemory:
21 |     Type: Number
22 |     Default: 2048
23 |     Description: How much memory in megabytes to give the container
24 |   Path:
25 |     Type: String
26 |     Default: "*"
27 |     Description:
28 |       A path on the public load balancer that this service should be connected
29 |       to. Use * to send all load balancer traffic to this service.
30 |   Priority:
31 |     Type: Number
32 |     Default: 1
33 |     Description:
34 |       The priority for the routing rule added to the load balancer. This only
35 |       applies if you have multiple services which have been assigned to
36 |       different paths on the load balancer.
37 |   DesiredCount:
38 |     Type: Number
39 |     Default: 0
40 |     Description: How many copies of the service task to run
41 |   Role:
42 |     Type: String
43 |     Default: ""
44 |     Description:
45 |       (Optional) An IAM role to give the service's containers if the code within
46 |       needs to access other AWS resources like S3 buckets, DynamoDB tables, etc
47 | 
--------------------------------------------------------------------------------
/resources/sns.yml:
--------------------------------------------------------------------------------
1 | Resources:
2 |   WebCrawlNotificationsTopic:
3 |     Type: AWS::SNS::Topic
4 |     Properties:
5 |       DisplayName: "Serverless Web Crawler Notifications"
6 |       TopicName: ServerlessWebCrawlerNotifications
7 |       Subscription:
8 |         - Endpoint: "ryan@webdoodle.com.au"
9 |           Protocol: email
--------------------------------------------------------------------------------
/resources/state-machines/crawl.yml:
--------------------------------------------------------------------------------
1 | id: CrawlStateMachine
2 | events:
3 |   - http:
4 |       path: "/"
5 |       method: "POST"
6 |   # - schedule:
7 |   #     rate: rate(24 hours)
8 |   #     enabled:
9 |   #       # ${self:custom.scheduleEnabled.${opt:stage, self:provider.stage}, false}
10 |   #       false
11 |   #     input:
12 |   #       executionId.$: $$.Execution.Id
13 |   #       executionName.$: $$.Execution.Name
14 | notifications:
15 |   ABORTED:
16 |     - sns: !Ref WebCrawlNotificationsTopic
17 |   FAILED:
18 |     - sns: !Ref WebCrawlNotificationsTopic
19 |   TIMED_OUT:
20 |     - sns: !Ref WebCrawlNotificationsTopic
21 |   SUCCEEDED:
22 |     - sns: !Ref WebCrawlNotificationsTopic
23 | role:
24 |   Fn::GetAtt: [StateMachinePassRole, Arn]
25 | definition:
26 |   Comment: "Serverless Web Crawl"
27 |   StartAt: WebCrawl
28 |   States:
29 |     WebCrawl:
30 |       Type: Task
31 |       Resource: "arn:aws:states:::ecs:runTask.sync"
32 |       Parameters:
33 |         LaunchType: "FARGATE"
34 |         Cluster: "#{ECSCluster}"
35 |         TaskDefinition: "#{FargateTaskDefinition}"
36 |         NetworkConfiguration:
37 |           AwsvpcConfiguration:
38 |             Subnets:
39 |               - "#{PublicSubnetOne}"
40 |               - "#{PublicSubnetTwo}"
41 |             AssignPublicIp: ENABLED
42 |         Overrides:
43 |           ContainerOverrides:
44 |             - Name: "#{ServiceName}"
45 |               Command.$: $$.Execution.Input.command # For more information on the Context Object, see https://docs.aws.amazon.com/step-functions/latest/dg/input-output-contextobject.html
46 |               Environment:
47 |                 - Name: EXECUTION_ID
48 |                   Value.$: $$.Execution.Id
49 |                 - Name: EXECUTION_NAME
50 |                   Value.$: $$.Execution.Name
51 |       End: true
52 | 
--------------------------------------------------------------------------------
/scripts/screenshot.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * This is an example script to demonstrate the library. 
3 | */ 4 | 5 | const path = require("path"); 6 | const S3 = require("aws-sdk/clients/s3"); 7 | const fs = require("fs").promises; 8 | 9 | const { Crawler } = require("../lib/crawler"); 10 | const logger = require("../lib/logger"); 11 | const { 12 | useS3Storage, 13 | s3BucketName, 14 | awsCredentials 15 | } = require("../lib/constants"); 16 | 17 | const s3client = awsCredentials.accessKeyId 18 | ? new S3({ ...awsCredentials }) 19 | : new S3(); 20 | 21 | async function ScreenshotScript(params) { 22 | const crawler = await Crawler(); 23 | 24 | crawler.queue(params.url, async ({ page, data: url }) => { 25 | await page.goto(url); 26 | const screenshotData = await page.screenshot({ 27 | type: "jpeg", 28 | fullPage: true 29 | }); 30 | 31 | logger.info(`Screenshot taken of ${params.url}`); 32 | 33 | if (useS3Storage) { 34 | await s3client 35 | .upload({ 36 | Bucket: s3BucketName, 37 | Key: "serverless-web-crawler/screenshot.jpg", 38 | Body: screenshotData, 39 | ContentType: "image/jpeg", 40 | ACL: "public-read" 41 | }) 42 | .promise(); 43 | logger.info(`Saved screenshot to S3.`); 44 | } else { 45 | const outputFilePath = path.resolve(__dirname, "../screenshot.jpg"); 46 | try { 47 | await fs.writeFile(outputFilePath, screenshotData); 48 | logger.info("Saved screenshot to disk."); 49 | } catch (err) { 50 | logger.error("Failed to write screenshot to file", err); 51 | } 52 | } 53 | }); 54 | 55 | await crawler.idle(); 56 | } 57 | 58 | module.exports = ScreenshotScript; 59 | -------------------------------------------------------------------------------- /scripts/utils/evaluate.js: -------------------------------------------------------------------------------- 1 | const evaluate = (page, ...params) => browserFn => { 2 | const fnIndexes = []; 3 | params = params.map((param, i) => { 4 | if (typeof param === "function") { 5 | fnIndexes.push(i); 6 | return param.toString(); 7 | } 8 | return param; 9 | }); 10 | return page.evaluate( 11 | (fnIndexes, browserFnStr, ...params) => { 12 | for (let i = 0; i < fnIndexes.length; i++) { 13 | params[fnIndexes[i]] = new Function( 14 | " return (" + params[fnIndexes[i]] + ").apply(null, arguments)" 15 | ); 16 | } 17 | browserFn = new Function( 18 | " return (" + browserFnStr + ").apply(null, arguments)" 19 | ); 20 | return browserFn(...params); 21 | }, 22 | fnIndexes, 23 | browserFn.toString(), 24 | ...params 25 | ); 26 | }; 27 | 28 | module.exports = evaluate; 29 | -------------------------------------------------------------------------------- /scripts/utils/wait-and-retry-until.js: -------------------------------------------------------------------------------- 1 | import isUndefined from "lodash/isUndefined"; 2 | 3 | export default async function(page, selector, options = {}) { 4 | const { timeout = 10000, retries = 3, visible, hidden } = options; 5 | let isFound = false; 6 | let count = 0; 7 | while (!isFound) { 8 | try { 9 | const waitOptions = { 10 | timeout 11 | }; 12 | if (!isUndefined(visible)) { 13 | waitOptions.visible = visible; 14 | } 15 | if (!isUndefined(hidden)) { 16 | waitOptions.hidden = hidden; 17 | } 18 | await page.waitForSelector(selector, waitOptions); 19 | isFound = true; 20 | } catch (e) { 21 | await page.reload(); 22 | count++; 23 | if (count > retries - 1) { 24 | isFound = true; 25 | throw e; 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /serverless.yml: -------------------------------------------------------------------------------- 1 | # Fargate CloudFormation resources 
based on: https://github.com/ryfeus/stepfunctions2processing/blob/master/aws-fargate/serverless.yml 2 | # Fargate based Puppeteer based on: https://dev.to/hoangleitvn/building-serverless-web-crawler-with-puppeteer-on-aws-fargate-22k3 3 | 4 | service: serverless-web-crawler 5 | 6 | # You can pin your service to only deploy with a specific Serverless version 7 | # Check out our docs for more details 8 | frameworkVersion: ">=1.28.0 <2.0.0" 9 | 10 | provider: 11 | name: aws 12 | runtime: nodejs14.x 13 | region: ${opt:region, 'ap-southeast-2'} 14 | memorySize: 512 15 | timeout: 900 16 | stage: ${opt:stage, 'dev'} 17 | environment: 18 | NODE_ENV: 19 | ${self:custom.nodeEnv.${opt:stage, self:provider.stage}, 'development'} 20 | 21 | package: 22 | exclude: 23 | - bin/** 24 | - cache/** 25 | - debug/** 26 | - logs/** 27 | - test/** 28 | 29 | plugins: 30 | - serverless-plugin-include-dependencies 31 | - serverless-plugin-common-excludes 32 | - serverless-pseudo-parameters 33 | - serverless-step-functions 34 | - serverless-dotenv-plugin # Learn more - https://www.serverless.com/plugins/serverless-dotenv-plugin 35 | - serverless-plugin-ifelse # Learn more - https://www.serverless.com/plugins/serverless-plugin-ifelse 36 | - serverless-offline 37 | 38 | custom: 39 | nodeEnv: 40 | dev: development 41 | prod: production 42 | serverless-offline: 43 | port: 9000 44 | serverlessIfElse: 45 | - If: '"${env:S3_BUCKET_NAME}" != ""' 46 | Set: 47 | provider.iamRoleStatements: 48 | - Effect: "Allow" 49 | Action: 50 | - "s3:*Object" 51 | Resource: "arn:aws:s3:::${env:S3_BUCKET_NAME}*/*" 52 | 53 | # functions: 54 | 55 | stepFunctions: 56 | validate: true 57 | stateMachines: 58 | Crawl: ${file(resources/state-machines/crawl.yml)} 59 | 60 | resources: 61 | - ${file(resources/parameters.yml)} 62 | - ${file(resources/conditions.yml)} 63 | - ${file(resources/fargate.yml)} 64 | - ${file(resources/iam.yml)} 65 | - ${file(resources/sns.yml)} 66 | - ${file(resources/outputs.yml)} 67 | --------------------------------------------------------------------------------
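
As referenced in the README's "Developing a Crawl Script" section, here is a minimal sketch of a custom crawl script following the pattern of `./scripts/screenshot.js`: a file in `./scripts/` exporting an `async function(){}` that queues work on the shared crawler. The file name `./scripts/title.js` is hypothetical and does not exist in this repository:

```js
const { Crawler } = require("../lib/crawler");
const logger = require("../lib/logger");

// Hypothetical example: logs the <title> of the URL passed via `-p url=...`
async function TitleScript(params) {
	const crawler = await Crawler();

	crawler.queue(params.url, async ({ page, data: url }) => {
		await page.goto(url);
		const title = await page.title();
		logger.info(`Title of ${url}: ${title}`);
	});

	// Wait for all queued tasks to finish before the process exits
	await crawler.idle();
}

module.exports = TitleScript;
```

It would then be run with `yarn dev -r title -p url=https://www.webdoodle.com.au/`.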