├── Makefile ├── README.md ├── crawl.yaml ├── index.js └── run /Makefile: -------------------------------------------------------------------------------- 1 | OUT=out 2 | 3 | NOW=$(shell date +%Y%m%d%H%M%S) 4 | FXN_NAME?=cc-$(NOW) 5 | S3_BUCKET?=candid-serverlessrepo 6 | OUTPUT_CF=$(OUT)/serverless.yaml 7 | REGION?=us-east-1 8 | APPLICATION_NAME=LambdaScale 9 | APPLICATION_ID=arn:aws:serverlessrepo:$(REGION):$(AWS_ACN):applications/$(APPLICATION_NAME) 10 | VERSION?=1.0.$(NOW) 11 | 12 | INDEX_ZIP = $(OUT)/index.zip 13 | STAMP_SETUP = $(OUT)/stamp-setup 14 | 15 | AWS=aws --profile $(AWS_PROFILE) 16 | 17 | .DEFAULT_GOAL := $(OUTPUT_CF) 18 | 19 | $(STAMP_SETUP): | $(OUT) 20 | npm i --prefix $(OUT) aws-sdk && touch $@ 21 | 22 | $(INDEX_ZIP): index.js tags | $(OUT) 23 | nodejs -c $< 24 | zip $@ $< 25 | 26 | upgrade: $(INDEX_ZIP) $(STAMP_SETUP) 27 | $(AWS) lambda update-function-code \ 28 | --zip-file fileb://$< \ 29 | --function-name $(FXN_NAME) 30 | 31 | clean: 32 | rm -rf $(OUT) 33 | 34 | $(OUTPUT_CF): crawl.yaml index.js | $(OUT) 35 | sam package \ 36 | --template-file $< \ 37 | --output-template-file $(OUTPUT_CF) \ 38 | --s3-bucket $(S3_BUCKET) 39 | 40 | package: $(OUTPUT_CF) 41 | 42 | deploy-cf: $(OUTPUT_CF) 43 | aws cloudformation deploy --template-file $< --stack-name $(FXN_NAME) --capabilities CAPABILITY_IAM 44 | 45 | deploy: CONF_YAML=$(APPLICATION_NAME)-$(VERSION).yaml 46 | deploy: $(OUTPUT_CF) 47 | $(AWS) s3 cp $< s3://$(S3_BUCKET)/$(CONF_YAML) 48 | $(AWS) serverlessrepo create-application-version \ 49 | --application-id $(APPLICATION_ID) \ 50 | --semantic-version $(VERSION) \ 51 | --source-code-url https://github.com/candidpartners/lambda-at-scale \ 52 | --template-url s3://$(S3_BUCKET)/$(CONF_YAML) 53 | $(AWS) s3 rm s3://$(S3_BUCKET)/$(CONF_YAML) 54 | 55 | destroy: 56 | $(AWS) cloudformation delete-stack --stack-name $(FXN_NAME) 57 | 58 | test: | $(OUT) 59 | $(AWS) lambda invoke --function-name $(FXN_NAME) --invocation-type Event $(OUT)/test.$(shell date +%s) 60 | 61 | tags: index.js 62 | ctags --recurse=yes . 63 | 64 | $(OUT): 65 | mkdir -p $@ 66 | 67 | .PHONY: upgrade clean package deploy deploy-cf destroy test 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lambda-at-scale 2 | Proof-of-concept Lambda function for massive parallelism, using up to 20k concurrent Lambda executions 3 | 4 | Currently, to use this you need to go to the Lambda function that the stack creates and create a test event. You can then invoke it from the console. 5 | 6 | Metrics are emitted to a separate queue to be drained offline. The metric generation rate can exceed what CloudWatch allows, so some throttling is needed. 7 | 8 | ## Deployment 9 | The Makefile publishes the application to the AWS Serverless Application Repository in your account; by default it isn't public. You'll need the AWS CLI and the AWS SAM CLI installed. These tools need an existing S3 bucket to stage the packaged template; creating that bucket is left as an exercise for the reader. Node and npm are used as well. You'll need to update or override the variables at the top of the Makefile to adapt it to your environment. 10 | 11 | The 'make deploy' target publishes an application version to the Serverless Application Repository that you can choose to make public. Alternatively, you can run 'make deploy-cf' to deploy the packaged template directly as a CloudFormation stack. Example invocations with variable overrides are shown below. 12 | 13 | We used an Ubuntu 16.04-based Linux distribution for our development and testing. It should be easily portable to similar environments.
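For example, assuming an AWS CLI profile named `my-profile`, your account ID exported via `AWS_ACN`, and an existing packaging bucket named `my-sam-bucket` (the profile, bucket, and stack names here are placeholders), a deployment might look like:

```sh
# Package crawl.yaml and publish a new application version to the Serverless Application Repository
make deploy AWS_PROFILE=my-profile AWS_ACN=123456789012 S3_BUCKET=my-sam-bucket

# Or deploy the packaged template directly as a CloudFormation stack
make deploy-cf AWS_PROFILE=my-profile S3_BUCKET=my-sam-bucket FXN_NAME=my-crawl-stack
```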
14 | 15 | ## Running 16 | The 'run' script takes a single argument, the name of the stack from the deployment stage. It isn't strictly necessary, but it ties together the cleaning out of various queues and outputs links for the dashboards. It is also useful to override the number of chunks to process or impose a concurrency limit. 17 | -------------------------------------------------------------------------------- /crawl.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Parameters: 4 | NumberOfWorkers: 5 | Type: Number 6 | Description: 'How many concurrent workers do you want' 7 | Default: 4 8 | NumberOfChunks: 9 | Type: Number 10 | Description: 'How many index files do you want to consider' 11 | Default: 2 12 | IndexBucket: 13 | Type: String 14 | Description: 'Bucket to look for the index key' 15 | Default: 'commoncrawl' 16 | IndexKey: 17 | Type: String 18 | Description: 'Key for the index object' 19 | Default: 'crawl-data/CC-MAIN-2018-17/warc.paths.gz' 20 | SearchRegex: 21 | Type: String 22 | Description: 'The regular expression to apply to the data chunks' 23 | Default: '(\([0-9]{3}\) |[0-9]{3}-)[0-9]{3}-[0-9]{4}' 24 | 25 | Resources: 26 | 27 | Drainer: 28 | Type: AWS::Serverless::Function 29 | Properties: 30 | Handler: index.metric_handler 31 | Runtime: nodejs8.10 32 | MemorySize: 128 33 | Timeout: 5 34 | Policies: 35 | - SQSPollerPolicy: 36 | QueueName: 37 | Fn::GetAtt: [ "MetricQueue", "QueueName" ] 38 | - CloudWatchPutMetricPolicy: {} 39 | 40 | DrainerTrigger: 41 | Type: AWS::Lambda::EventSourceMapping 42 | Properties: 43 | BatchSize: 10 44 | Enabled: true 45 | EventSourceArn: !GetAtt MetricQueue.Arn 46 | FunctionName: !GetAtt Drainer.Arn 47 | 48 | Worker: 49 | Type: AWS::Serverless::Function 50 | Properties: 51 | Handler: index.handler 52 | Runtime: nodejs8.10 53 | MemorySize: 1024 54 | Timeout: 300 55 | Policies: 56 | - SQSPollerPolicy: 57 | QueueName: 58 | Fn::GetAtt: [ "CrawlQueue", "QueueName" ] 59 | - SQSPollerPolicy: 60 | QueueName: 61 | Fn::GetAtt: [ "MetricQueue", "QueueName" ] 62 | - SQSSendMessagePolicy: 63 | QueueName: 64 | Fn::GetAtt: [ "CrawlQueue", "QueueName" ] 65 | - SQSSendMessagePolicy: 66 | QueueName: 67 | Fn::GetAtt: [ "MetricQueue", "QueueName" ] 68 | - S3ReadPolicy: 69 | BucketName: !Ref IndexBucket 70 | - CloudWatchPutMetricPolicy: {} 71 | # NB: this allows us to invoke all functions in the stack 72 | - LambdaInvokePolicy: 73 | FunctionName: !Ref AWS::StackName 74 | Environment: 75 | Variables: 76 | QUEUE_URL: !Ref CrawlQueue 77 | METRIC_URL: !Ref MetricQueue 78 | MAX_WORKERS: !Ref NumberOfWorkers 79 | MAX_CHUNKS: !Ref NumberOfChunks 80 | CRAWL_INDEX_BUCKET: !Ref IndexBucket 81 | CRAWL_INDEX_KEY: !Ref IndexKey 82 | CrawlQueue: 83 | Type: AWS::SQS::Queue 84 | Properties: 85 | VisibilityTimeout: 300 86 | MetricQueue: 87 | Type: AWS::SQS::Queue 88 | Properties: 89 | VisibilityTimeout: 30 90 | 91 | Outputs: 92 | Worker: 93 | Value: !Ref Worker 94 | MetricQueue: 95 | Value: !GetAtt MetricQueue.QueueName 96 | CrawlQueue: 97 | Value: !GetAtt CrawlQueue.QueueName 98 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const AWS = require('aws-sdk') 4 | 5 | AWS.config.update({region: 'us-east-1'}); // TODO: pull from environment or something 6 | 7 | const zlib = require('zlib') 8 | const { Transform 
} = require('stream'); 9 | 10 | const cloudWatch = new AWS.CloudWatch() 11 | const s3 = new AWS.S3() 12 | const sqs = new AWS.SQS() 13 | const lambda = new AWS.Lambda() 14 | 15 | const BUCKET = process.env.CRAWL_INDEX_BUCKET || 'commoncrawl' 16 | const KEY = process.env.CRAWL_INDEX_KEY || 'crawl-data/CC-MAIN-2018-17/warc.paths.gz' 17 | 18 | const INPUT_URL = process.env.QUEUE_URL 19 | const METRIC_URL = process.env.METRIC_URL 20 | const MAX_WORKERS = parseInt(process.env.MAX_WORKERS || '4', 10) 21 | const DEFAULT_REGEX = '(\\([0-9]{3}\\) |[0-9]{3}-)[0-9]{3}-[0-9]{4}' // NB: backslashes doubled so the compiled RegExp matches literal parentheses 22 | const DEFAULT_REGEX_FLAGS = 'gm' 23 | const REGEX = new RegExp(process.env.REGEX || DEFAULT_REGEX, process.env.REGEX_FLAGS || DEFAULT_REGEX_FLAGS) 24 | 25 | const REQUEST_REGEX = new RegExp('\nWARC-Type: request', 'gm') 26 | 27 | const WORK_REQUEST = 'work' 28 | const START_REQUEST = 'start' 29 | 30 | const ONE_MINUTE_MILLIS = 60 * 1000 31 | 32 | let invocations = 0 33 | 34 | async function sandbag(delay){ 35 | return new Promise(resolve => setTimeout(resolve, delay)) 36 | } 37 | 38 | // borrows from SO 39 | function shuffle(input) { 40 | for (let i = input.length - 1; i > 0; i--) { 41 | const j = Math.floor(Math.random() * (i + 1)); 42 | [input[i], input[j]] = [input[j], input[i]]; // eslint-disable-line no-param-reassign 43 | } 44 | return input 45 | } 46 | 47 | function annoying(err) { 48 | console.log("ANNOY: " + err) 49 | } 50 | 51 | function fatal(err) { 52 | console.log("FATAL: " + err) 53 | process.exit(0) 54 | } 55 | 56 | async function get_object(bucket, key) { 57 | const params = { Bucket: bucket, Key : key } 58 | return s3.getObject(params).promise() 59 | .catch(err => fatal("Unable to get S3 data: " + err)) 60 | 61 | } 62 | 63 | async function gunzipBuf(buffer) { 64 | return new Promise((resolve, reject) => { 65 | zlib.gunzip(buffer, (err, data) => { 66 | if (err) { 67 | reject(err) 68 | } else { 69 | resolve(data) 70 | } 71 | }) 72 | }) 73 | } 74 | 75 | async function run_lambda(fxn_name, type, run_id, worker_id, launch_count) { 76 | const params = { 77 | FunctionName: fxn_name, 78 | Payload: JSON.stringify({ type, run_id, worker_id, launch_count }), 79 | InvocationType: 'Event' 80 | } 81 | 82 | return lambda.invoke(params).promise() 83 | .catch(err => fatal('Something went wrong invoking lambda ' + err)) 84 | } 85 | 86 | async function populate_queue(){ 87 | const max = process.env.MAX_CHUNKS ? parseInt(process.env.MAX_CHUNKS) : 2 88 | 89 | const content = await get_object(BUCKET, KEY) 90 | const manifest = await gunzipBuf(content.Body) 91 | const all_archives = manifest.toString().split("\n") 92 | 93 | // mix things up so we can test random archives other than the first couple 94 | const input = process.env.SHUFFLE ?
shuffle(all_archives) : all_archives 95 | 96 | const lines = input.slice(0, max).filter(data => 0 !== data.length) // limit for now 97 | 98 | console.log(`Populating with ${lines.length} archive entries`) 99 | 100 | const enqueuers = [] 101 | let count = 0 102 | for (let index = 0; index < lines.length; index = index + 10) { 103 | let counter = 0 104 | const entries = lines.slice(index, index + 10).map(line => { 105 | return { 106 | MessageBody: line, 107 | Id: `${counter++}` // There has to be a better way to do this 108 | } 109 | }) 110 | 111 | const message = { Entries: entries, QueueUrl: INPUT_URL } 112 | count += entries.length 113 | enqueuers.push(sqs.sendMessageBatch(message).promise()) 114 | } 115 | 116 | await Promise.all(enqueuers) 117 | .catch(err => fatal("Something bad happened while enqueueing: " + err)) 118 | 119 | // return what we did so we can wait for it 120 | return lines 121 | } 122 | 123 | async function get_queue_size(){ 124 | const attr_params = { 125 | QueueUrl: INPUT_URL, 126 | AttributeNames: [ 'ApproximateNumberOfMessages', 'ApproximateNumberOfMessagesNotVisible' ] 127 | } 128 | 129 | const results = await sqs.getQueueAttributes(attr_params).promise() 130 | .catch(err => fatal("Unable to determine queue depth: " + err)) 131 | return parseInt(results.Attributes.ApproximateNumberOfMessages) + parseInt(results.Attributes.ApproximateNumberOfMessagesNotVisible) 132 | } 133 | 134 | async function await_queue(target){ 135 | // the promises have all run, now we need to make sure SQS shows all of them 136 | // or our lambdas may start and immediately be done 137 | 138 | while (true){ 139 | const ready = await get_queue_size() 140 | if (ready >= target){ // NB: type coercion here 141 | console.log("Queue reports " + ready + " messages available") 142 | break 143 | } 144 | console.log("Waiting for messages to show in SQS, see " + ready + ", want " + target) 145 | } 146 | 147 | return true 148 | } 149 | 150 | async function warm_target(launch_count){ 151 | const depth = await get_queue_size() 152 | const remaining = Math.max(0, MAX_WORKERS - launch_count) 153 | const initial = process.env.INITIAL_COUNT || 3000 154 | const step = process.env.INCREMENT_STEP || 500 155 | const limit = 0 === launch_count ? 
initial : step 156 | // if we don't have a full step to do, do what we need 157 | const full_limit = Math.min(limit, remaining) 158 | 159 | // if our queue is empty then we don't want spin any more up, 160 | // or if we have half the increment step in the queue, don't spin up the full step 161 | console.log(`Calculating warm target from ${limit}, ${remaining} and ${depth}`) 162 | return Math.min(full_limit, depth) 163 | } 164 | 165 | async function driver(fxn_name, memorySize, run_id, launch_count) { 166 | const full_start_time = new Date().getTime() 167 | const first_run = 0 === launch_count 168 | 169 | let lines = [] 170 | if (first_run) { 171 | console.log("Starting ", fxn_name, run_id) 172 | lines = await populate_queue() 173 | await await_queue(lines.length) 174 | } 175 | 176 | const target = await warm_target(launch_count) 177 | 178 | console.log(`Launching ${target} workers`) 179 | const workers = [] 180 | for (let worker_id = 0; worker_id !== target; worker_id++) { 181 | workers.push(run_lambda(fxn_name, WORK_REQUEST, run_id, launch_count + worker_id)) 182 | } 183 | 184 | if (first_run) { 185 | const launch_start_time = new Date().getTime() 186 | 187 | const metrics = [ 188 | create_metric('memory_size', memorySize, 'Megabytes'), 189 | create_metric('full_start_time', full_start_time, 'Milliseconds'), 190 | create_metric('launch_start_time', launch_start_time, 'Milliseconds'), 191 | create_metric('total_workers', MAX_WORKERS), 192 | create_metric('total_chunks', lines.length) 193 | ] 194 | 195 | await on_metrics(metrics, fxn_name, run_id) 196 | } 197 | 198 | console.log("Waiting for " + workers.length + " workers to spin up...") 199 | await Promise.all(workers) 200 | .catch(err => fatal("Something went wrong starting workers: " + err)) 201 | .then(() => "All launched for " + run_id) 202 | 203 | const current_count = target + launch_count 204 | if (0 === target){ 205 | console.log(`All workers launched`) 206 | return 207 | } 208 | 209 | // we want to launch every minute as close as possible 210 | const next_run_time = full_start_time + ONE_MINUTE_MILLIS 211 | const sleep_time = Math.max(0, next_run_time - new Date().getTime() ) 212 | 213 | await sandbag(sleep_time) 214 | console.log(`Recursing with launch count of ${current_count}`) 215 | return run_lambda(fxn_name, START_REQUEST, run_id, 0, current_count) 216 | } 217 | 218 | function on_regex(data, run_id){ 219 | // if you want to do something on getting a hit, put it here 220 | } 221 | 222 | async function handle_stream(stream, run_id){ 223 | let uncompressed_bytes = 0 224 | let compressed_bytes = 0 225 | let total_requests = 0 226 | let count = 0 227 | 228 | const start_time = new Date().getTime() 229 | const extractor = new Transform({ 230 | transform(chunk, encoding, callback) { 231 | try { 232 | const matches = chunk.toString().match(REGEX) || [] 233 | matches.forEach(data => this.push(data)) 234 | } catch (error){ 235 | annoying("Failed to extract: " + error) 236 | } finally { 237 | callback() 238 | } 239 | } 240 | }) 241 | 242 | const gunzipper = zlib.createGunzip() 243 | let data_ts = 0 244 | 245 | const log_traffic = () => { 246 | const ts = new Date().getTime() / 1000 247 | const elapsed = ts - data_ts 248 | if (10 <= elapsed){ 249 | console.log("Received data: " + compressed_bytes) 250 | data_ts = ts 251 | } 252 | } 253 | 254 | await new Promise((resolve, _reject) => { 255 | const reject = message => { 256 | console.log("Failed: " + message) 257 | _reject(message) 258 | } 259 | 260 | const gunzipStream = stream 261 | 
.on('error', err => fatal("GZip stream error " + err)) 262 | .on('data', log_traffic) 263 | .on('data', data => compressed_bytes += data.length) 264 | .on('end', () => console.log("End of base stream")) 265 | .pipe(gunzipper) 266 | 267 | const extractorStream = gunzipStream 268 | .on('error', err => fatal("Extract stream error " + err)) 269 | .on('data', data => { 270 | // technically our stream could split our request 271 | // marker, but that will be rare, and over the total 272 | // number of requests we have we should be ok 273 | try { 274 | const requests = data.toString().match(REQUEST_REGEX) || [] 275 | total_requests += requests.length 276 | } catch (error) { 277 | annoying("Failed to match batches: " + error) 278 | } 279 | }) 280 | .on('data', data => uncompressed_bytes += data.length) 281 | .on('end', () => console.log("End of gzip stream")) 282 | .pipe(extractor) 283 | 284 | const extractedStream = extractorStream 285 | .on('error', err => fatal("Extracted stream error " + err)) 286 | .on('data', () => count++) 287 | .on('data', data => on_regex(data, run_id)) 288 | .on('end', () => { 289 | console.log("Streaming complete") 290 | resolve("complete") 291 | }) 292 | }) 293 | 294 | const now = new Date().getTime() 295 | // we're measuring time internally, this isn't 100%, but go ahead 296 | // and round up to the nearest 100ms 297 | const elapsed = Math.ceil((now - start_time) / 100) * 100 298 | 299 | return [ 300 | create_metric('start_time', start_time, 'Milliseconds'), 301 | create_metric('regex_hits', count), 302 | create_metric('total_requests', total_requests), 303 | create_metric('compressed_bytes', compressed_bytes, 'Bytes'), 304 | create_metric('uncompressed_bytes', uncompressed_bytes, 'Bytes'), 305 | create_metric('elapsed_ms', elapsed, 'Milliseconds') 306 | ] 307 | } 308 | 309 | async function handle_path(path, run_id) { 310 | const params = { 311 | Bucket : BUCKET, 312 | Key : path 313 | } 314 | 315 | const stream = s3.getObject(params) 316 | .on('httpHeaders', (code, headers) => { 317 | const requestId = headers['x-amz-request-id'] 318 | const amzId = headers['x-amz-id-2'] 319 | console.log("Streaming as x-amz-id-2=" + amzId + ", x-amz-request-id=" + requestId + "/" + JSON.stringify(params)) 320 | }).createReadStream() 321 | 322 | return handle_stream(stream, run_id) 323 | } 324 | 325 | function create_metric(key, value, unit){ 326 | return { 327 | key, 328 | value, 329 | unit: unit || 'Count' 330 | } 331 | } 332 | 333 | async function on_metrics(metrics, fxn_name, run_id){ 334 | const date = new Date() 335 | const metricList = metrics.map(metric => { 336 | console.log(metric.key + "." + run_id, ' -> ', metric.value) 337 | return { 338 | MetricName: metric.key, 339 | Dimensions: [ 340 | { Name: 'run_id', Value: run_id } 341 | ], 342 | Timestamp: date, 343 | Unit: metric.unit, 344 | Value: metric.value 345 | } 346 | }) 347 | 348 | const update = { 349 | 'MetricData' : metricList, 350 | 'Namespace' : fxn_name 351 | } 352 | 353 | // ... 
so for now, shunt to this queue 354 | return sqs.sendMessage({ 355 | MessageBody: JSON.stringify(update), 356 | QueueUrl : METRIC_URL 357 | }).promise() 358 | .catch(err => fatal("Unable to send metric: " + err)) 359 | } 360 | 361 | async function setVisibilityTimeout(message, time){ 362 | return sqs.changeMessageVisibility({ QueueUrl : INPUT_URL, ReceiptHandle: message.ReceiptHandle, VisibilityTimeout: time }) 363 | .promise() 364 | .then(() => console.log(`Message ${message.ReceiptHandle} timeout set to ${time}`)) 365 | .catch(err => annoying("Failed to reset the visibility: " + err)) 366 | } 367 | 368 | async function handle_message(fxn_name, run_id, worker_id, end_time) { 369 | const response = await sqs.receiveMessage({ QueueUrl : INPUT_URL }).promise() 370 | .catch(err => fatal("Failed to receive message from queue: " + err)) 371 | const messages = response.Messages || [] 372 | 373 | // we're at the end of our queue, so send our done time 374 | // sometimes SQS gives us no work when we hammer it, so try a couple times before give up 375 | if (0 === messages.length && 0 !== invocations){ 376 | console.log(worker_id + ": No work to do") 377 | const metric = create_metric('end_time', new Date().getTime(), 'Milliseconds') 378 | await on_metrics([metric], fxn_name, run_id) 379 | return "All done" 380 | } 381 | 382 | invocations++ 383 | 384 | for(const message of messages){ 385 | let metrics = [create_metric('messages_attempted', 1)] 386 | 387 | // if we aren't done by panic_time then we need to push the message back 388 | const panic_time = end_time - new Date().getTime() 389 | 390 | // with high concurrency, S3 gets mad causing timeouts, this handles that for us by pushing the message back on the queue 391 | const timer = setTimeout(async () => await setVisibilityTimeout(message, 0), panic_time) 392 | try { 393 | const message_metrics = await handle_path(message.Body, run_id) 394 | metrics.push(create_metric("metrics_handled", 1)) 395 | metrics = metrics.concat(message_metrics) 396 | clearTimeout(timer) 397 | } catch (error) { 398 | metrics.push(create_metric("messages_error", 1)) 399 | annoying("Failed to handle path: " + error) 400 | await Promise.all([ 401 | setVisibilityTimeout(message, 0), 402 | on_metrics(metrics, fxn_name, run_id) 403 | ]) 404 | break // don't delete, punt immediately 405 | } 406 | await sqs.deleteMessage({ QueueUrl : INPUT_URL, ReceiptHandle: message.ReceiptHandle }).promise() 407 | .catch(err => fatal("Failed to delete message from queue: " + err)) 408 | await on_metrics(metrics, fxn_name, run_id) 409 | } 410 | 411 | const run = await run_lambda(fxn_name, WORK_REQUEST, run_id, worker_id) 412 | console.log(run) 413 | return "All done" 414 | } 415 | 416 | exports.metric_handler = async (event) => { 417 | const records = event.Records || [] 418 | 419 | const metrics = records.map(record => { 420 | const raw_payload = record.body 421 | return JSON.parse(raw_payload) 422 | }) 423 | 424 | await persist_metric_batch(metrics) 425 | console.log(`Inserted ${metrics.length} metrics`) 426 | } 427 | 428 | async function persist_metric_batch(messages){ 429 | for (const message of messages){ 430 | await cloudWatch.putMetricData(message).promise() 431 | .catch(err => fatal("Error putting metric data: " + err)) 432 | } 433 | 434 | return true 435 | } 436 | 437 | exports.handler = async (args, context) => { 438 | const { type, run_id, worker_id, launch_count } = args 439 | 440 | // peel off N seconds to give us a bit of a buffer for the event to clear and code to run 441 | const 
grace_period = process.env.PANIC_GRACE_PERIOD || 3000 442 | const end_time = new Date().getTime() + context.getRemainingTimeInMillis() - grace_period 443 | 444 | switch (type){ 445 | case WORK_REQUEST: 446 | return handle_message(context.functionName, run_id, worker_id, end_time) 447 | case START_REQUEST: 448 | const memorySize = parseInt(context.memoryLimitInMB) 449 | // prefer a RUN_ID from the environment, else fall back to the one passed in the event 450 | const id = process.env.RUN_ID || run_id 451 | return driver(context.functionName, memorySize, id, launch_count || 0) 452 | } 453 | } 454 | 455 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | 5 | STACK=out/stack.json 6 | 7 | if [ -z "${1:-}" ] 8 | then 9 | echo "Need a stack name" 10 | exit 1 11 | fi 12 | 13 | mkdir -p out 14 | aws cloudformation describe-stacks --stack-name $1 > $STACK 15 | 16 | FXN_NAME=`cat $STACK | jq '.[]' | jq '.[0] | .Outputs[] | select(.OutputKey=="Worker") | .OutputValue ' | tr -d '"'` 17 | CRAWL_QUEUE=`cat $STACK | jq '.[]' | jq '.[0] | .Outputs[] | select(.OutputKey=="CrawlQueue") | .OutputValue ' | tr -d '"'` 18 | METRIC_QUEUE=`cat $STACK | jq '.[]' | jq '.[0] | .Outputs[] | select(.OutputKey=="MetricQueue") | .OutputValue ' | tr -d '"'` 19 | 20 | export SIZE=1024 21 | export MAX_WORKERS=19900 22 | export INITIAL_COUNT=3000 23 | export MAX_CHUNKS=75000 24 | export INCREMENT_STEP=500 25 | export AWS_PROFILE=caws 26 | export FXN_NAME=$FXN_NAME 27 | export METRIC_URL=https://sqs.us-east-1.amazonaws.com/$AWS_ACN/$METRIC_QUEUE 28 | export INPUT_URL=https://sqs.us-east-1.amazonaws.com/$AWS_ACN/$CRAWL_QUEUE 29 | export SINGLE_BUNDLE=crawl-data/CC-MAIN-2018-17/segments/1524125944742.25/warc/CC-MAIN-20180420213743-20180420233743-00299.warc.gz 30 | export CRAWL_INDEX_BUCKET=commoncrawl 31 | export CRAWL_INDEX_KEY=crawl-data/CC-MAIN-2018-17/warc.paths.gz 32 | export RUN_ID=$(date +%s) 33 | 34 | aws --profile $AWS_PROFILE sqs purge-queue --queue-url $METRIC_URL || true 35 | aws --profile $AWS_PROFILE sqs purge-queue --queue-url $INPUT_URL || true 36 | aws --profile $AWS_PROFILE lambda update-function-configuration --function-name $FXN_NAME --memory-size $SIZE --environment "Variables={RUN_ID=$RUN_ID,METRIC_URL=$METRIC_URL,QUEUE_URL=$INPUT_URL,CRAWL_INDEX_BUCKET=$CRAWL_INDEX_BUCKET,CRAWL_INDEX_KEY=$CRAWL_INDEX_KEY,MAX_CHUNKS=$MAX_CHUNKS,MAX_WORKERS=$MAX_WORKERS,SHUFFLE=t,SINGLE_BUNDLE=$SINGLE_BUNDLE,INITIAL_COUNT=$INITIAL_COUNT,INCREMENT_STEP=$INCREMENT_STEP}" 37 | 38 | aws --profile $AWS_PROFILE lambda invoke --function-name $FXN_NAME --invocation-type Event out/test.`date +%s` --payload '{ "type" : "start" }' 39 | 40 | echo $RUN_ID | tee -a run.ids 41 | 42 | echo
"https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#metricsV2:graph=~(view~'singleValue~stacked~false~metrics~(~(~'NAME~'memory_size~'run_id~'RUN_ID~(id~'memory~stat~'Average~period~2592000~label~'Lambda*20Size))~(~(expression~'0.00001667*20*2a*20*28memory*20*2f*201024*29*20*2a*20compute_sec~label~'Cost~id~'e6))~(~(expression~'*28end*20-*20start*29*20*2f*201000~label~'Wall*20Secs~id~'wall_sec~yAxis~'left))~(~(expression~'elapsed_sum*20*2f*201000~label~'Compute*20Secs~id~'compute_sec))~(~'NAME~'total_workers~'run_id~'RUN_ID~(id~'workers~label~'Workers~period~2592000))~(~'.~'start_time~'.~'.~(visible~false~id~'start~period~2592000~stat~'Minimum))~(~'.~'total_chunks~'.~'.~(id~'archives~label~'Archives~period~2592000))~(~'.~'regex_hits~'.~'.~(id~'regex_hits~period~2592000~stat~'Sum~label~'Numbers*20Found))~(~(expression~'regex_hits*20*2f*20wall_sec~label~'Numbers*20Found*20*2f*20Sec~id~'e5))~(~'NAME~'end_time~'run_id~'RUN_ID~(visible~false~id~'end~stat~'Maximum~period~2592000))~(~'.~'compressed_bytes~'.~'.~(id~'cbytes~stat~'Sum~period~2592000~label~'Compressed*20Bytes*20Scanned))~(~'.~'uncompressed_bytes~'.~'.~(id~'ucbytes~stat~'Sum~period~2592000~label~'Uncompressed*20Bytes*20Scanned))~(~(expression~'cbytes*20*2f*20wall_sec~label~'Compressed*20Bytes*20*2f*20Sec~id~'e3))~(~(expression~'ucbytes*20*2f*20wall_sec~label~'Uncompressed*20Bytes*20*2f*20Sec~id~'e4))~(~(expression~'cbytes*20*2f*20workers~label~'Compressed*20Bytes*20*2f*20Worker~id~'e1))~(~(expression~'ucbytes*20*2f*20workers~label~'Uncompressed*20Bytes*20*2f*20Worker~id~'e8))~(~'NAME~'elapsed_ms~'run_id~'RUN_ID~(id~'elapsed_sum~stat~'Sum~visible~false~period~2592000))~(~'.~'total_requests~'.~'.~(id~'requests~period~2592000~label~'HTTP*20Requests*20Scanned~stat~'Sum))~(~(expression~'requests*20*2f*20workers~label~'HTTP*20Requests*20*2f*20Worker~id~'e2))~(~(expression~'requests*20*2f*20wall_sec~label~'HTTP*20Requests*20*2f*20Sec~id~'e7)))~region~'us-east-1~start~'-P7D~end~'P0D~title~'SIZEmb*20Run);namespace=NAME;dimensions=run_id" | \ 43 | sed "s/NAME/${FXN_NAME}/g" | \ 44 | sed "s/RUN_ID/$RUN_ID/g" | \ 45 | sed "s/SIZE/$SIZE/g" 46 | 47 | echo 48 | 49 | echo "https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#metricsV2:graph=~(metrics~(~(~'NAME~'full_start_time~'run_id~'RUN_ID~(period~1~visible~false))~(~'.~'memory_size~'.~'.~(period~1~visible~false))~(~'.~'launch_start_time~'.~'.~(period~1~visible~false))~(~'.~'total_workers~'.~'.~(period~1~visible~false))~(~'.~'total_chunks~'.~'.~(period~1~visible~false))~(~'.~'regex_hits~'.~'.~(period~1~stat~'Sum))~(~'.~'elapsed_ms~'.~'.~(period~1~visible~false))~(~'.~'total_requests~'.~'.~(period~1~stat~'Sum))~(~'.~'uncompressed_bytes~'.~'.~(period~1~stat~'Sum))~(~'.~'start_time~'.~'.~(period~1~visible~false))~(~'.~'compressed_bytes~'.~'.~(period~1~stat~'Sum))~(~'.~'end_time~'.~'.~(period~1~visible~false)))~view~'timeSeries~stacked~false~region~'us-east-1);namespace=NAME;dimensions=run_id" | \ 50 | sed "s/NAME/${FXN_NAME}/g" | \ 51 | sed "s/RUN_ID/$RUN_ID/g" 52 | 53 | --------------------------------------------------------------------------------