├── Makefile ├── README.md ├── crawl.yaml ├── index.js └── run /Makefile: -------------------------------------------------------------------------------- 1 | OUT=out 2 | 3 | NOW=$(shell date +%Y%m%d%H%M%S) 4 | FXN_NAME?=cc-$(NOW) 5 | S3_BUCKET?=candid-serverlessrepo 6 | OUTPUT_CF=$(OUT)/serverless.yaml 7 | REGION?=us-east-1 8 | APPLICATION_NAME=LambdaScale 9 | APPLICATION_ID=arn:aws:serverlessrepo:$(REGION):$(AWS_ACN):applications/$(APPLICATION_NAME) 10 | VERSION?=1.0.$(NOW) 11 | 12 | INDEX_ZIP = $(OUT)/index.zip 13 | STAMP_SETUP = $(OUT)/stamp-setup 14 | 15 | AWS=aws --profile $(AWS_PROFILE) 16 | 17 | .DEFAULT_GOAL := $(OUTPUT_CF) 18 | 19 | $(STAMP_SETUP): | $(OUT) 20 | npm i --prefix $(OUT) aws-sdk && touch $@ 21 | 22 | $(INDEX_ZIP): index.js tags | $(OUT) 23 | nodejs -c $< 24 | zip $@ $< 25 | 26 | upgrade: $(INDEX_ZIP) $(STAMP_SETUP) 27 | $(AWS) lambda update-function-code \ 28 | --zip-file fileb://$< \ 29 | --function-name $(FXN_NAME) 30 | 31 | clean: 32 | rm -rf $(OUT) 33 | 34 | $(OUTPUT_CF): crawl.yaml index.js | $(OUT) 35 | sam package \ 36 | --template-file $< \ 37 | --output-template-file $(OUTPUT_CF) \ 38 | --s3-bucket $(S3_BUCKET) 39 | 40 | package: $(OUTPUT_CF) 41 | 42 | deploy-cf: $(OUTPUT_CF) 43 | aws cloudformation deploy --template-file $< --stack-name $(FXN_NAME) --capabilities CAPABILITY_IAM 44 | 45 | deploy: CONF_YAML=$(APPLICATION_NAME)-$(VERSION).yaml 46 | deploy: $(OUTPUT_CF) 47 | $(AWS) s3 cp $< s3://$(S3_BUCKET)/$(CONF_YAML) 48 | $(AWS) serverlessrepo create-application-version \ 49 | --application-id $(APPLICATION_ID) \ 50 | --semantic-version $(VERSION) \ 51 | --source-code-url https://github.com/candidpartners/lambda-at-scale \ 52 | --template-url s3://$(S3_BUCKET)/$(CONF_YAML) 53 | $(AWS) s3 rm s3://$(S3_BUCKET)/$(CONF_YAML) 54 | 55 | destroy: 56 | $(AWS) cloudformation delete-stack --stack-name $(FXN_NAME) 57 | 58 | test: | $(OUT) 59 | $(AWS) lambda invoke --function-name $(FXN_NAME) --invocation-type Event $(OUT)/test.$(shell date +%s) 60 | 61 | tags: index.js 62 | ctags --recurse=yes . 63 | 64 | $(OUT): 65 | mkdir -p $@ 66 | 67 | .PHONY: upgrade clean package deploy deploy-cf destroy test 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lambda-at-scale 2 | Proof-of-concept Lambda function for massive parallelism, using up to 20k concurrent Lambda executions 3 | 4 | Currently, to use this you need to go to the Lambda function that the stack creates and create a test event. You can then invoke it from the console. 5 | 6 | Metrics are emitted to a separate queue to be drained offline. The metric generation rate can exceed what CloudWatch allows, so some throttling is needed. 7 | 8 | ## Deployment 9 | The Makefile publishes the application to the AWS Serverless Application Repository in your account; by default it isn't public. You'll need the AWS CLI and the AWS SAM CLI installed. These tools need an existing S3 bucket to stage the packaged template; creating that bucket is left as an exercise for the reader. Node and npm are used as well. You'll need to update or override the variables at the top of the Makefile to adapt it to your environment. 10 | 11 | The 'make deploy' target publishes an application version to the Serverless Application Repository that you can choose to make public. Alternatively, you can run 'make deploy-cf' to deploy the packaged template directly as a CloudFormation stack. Example invocations with variable overrides are shown below. 12 | 13 | We used an Ubuntu 16.04-based Linux distribution for our development and testing. It should be easily portable to similar environments.
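For example, assuming an AWS CLI profile named `my-profile`, your account ID exported via `AWS_ACN`, and an existing packaging bucket named `my-sam-bucket` (the profile, bucket, and stack names here are placeholders), a deployment might look like:

```sh
# Package crawl.yaml and publish a new application version to the Serverless Application Repository
make deploy AWS_PROFILE=my-profile AWS_ACN=123456789012 S3_BUCKET=my-sam-bucket

# Or deploy the packaged template directly as a CloudFormation stack
make deploy-cf AWS_PROFILE=my-profile S3_BUCKET=my-sam-bucket FXN_NAME=my-crawl-stack
```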
14 | 15 | ## Running 16 | The 'run' script takes a single argument, the name of the stack from the deployment stage. It isn't strictly necessary, but it ties together the cleaning out of various queues and outputs links for the dashboards. It is also useful to override the number of chunks to process or impose a concurrency limit. 17 | -------------------------------------------------------------------------------- /crawl.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Parameters: 4 | NumberOfWorkers: 5 | Type: Number 6 | Description: 'How many concurrent workers do you want' 7 | Default: 4 8 | NumberOfChunks: 9 | Type: Number 10 | Description: 'How many index files do you want to consider' 11 | Default: 2 12 | IndexBucket: 13 | Type: String 14 | Description: 'Bucket to look for the index key' 15 | Default: 'commoncrawl' 16 | IndexKey: 17 | Type: String 18 | Description: 'Key for the index object' 19 | Default: 'crawl-data/CC-MAIN-2018-17/warc.paths.gz' 20 | SearchRegex: 21 | Type: String 22 | Description: 'The regular expression to apply to the data chunks' 23 | Default: '(\([0-9]{3}\) |[0-9]{3}-)[0-9]{3}-[0-9]{4}' 24 | 25 | Resources: 26 | 27 | Drainer: 28 | Type: AWS::Serverless::Function 29 | Properties: 30 | Handler: index.metric_handler 31 | Runtime: nodejs8.10 32 | MemorySize: 128 33 | Timeout: 5 34 | Policies: 35 | - SQSPollerPolicy: 36 | QueueName: 37 | Fn::GetAtt: [ "MetricQueue", "QueueName" ] 38 | - CloudWatchPutMetricPolicy: {} 39 | 40 | DrainerTrigger: 41 | Type: AWS::Lambda::EventSourceMapping 42 | Properties: 43 | BatchSize: 10 44 | Enabled: true 45 | EventSourceArn: !GetAtt MetricQueue.Arn 46 | FunctionName: !GetAtt Drainer.Arn 47 | 48 | Worker: 49 | Type: AWS::Serverless::Function 50 | Properties: 51 | Handler: index.handler 52 | Runtime: nodejs8.10 53 | MemorySize: 1024 54 | Timeout: 300 55 | Policies: 56 | - SQSPollerPolicy: 57 | QueueName: 58 | Fn::GetAtt: [ "CrawlQueue", "QueueName" ] 59 | - SQSPollerPolicy: 60 | QueueName: 61 | Fn::GetAtt: [ "MetricQueue", "QueueName" ] 62 | - SQSSendMessagePolicy: 63 | QueueName: 64 | Fn::GetAtt: [ "CrawlQueue", "QueueName" ] 65 | - SQSSendMessagePolicy: 66 | QueueName: 67 | Fn::GetAtt: [ "MetricQueue", "QueueName" ] 68 | - S3ReadPolicy: 69 | BucketName: !Ref IndexBucket 70 | - CloudWatchPutMetricPolicy: {} 71 | # NB: this allows us to invoke all functions in the stack 72 | - LambdaInvokePolicy: 73 | FunctionName: !Ref AWS::StackName 74 | Environment: 75 | Variables: 76 | QUEUE_URL: !Ref CrawlQueue 77 | METRIC_URL: !Ref MetricQueue 78 | MAX_WORKERS: !Ref NumberOfWorkers 79 | MAX_CHUNKS: !Ref NumberOfChunks 80 | CRAWL_INDEX_BUCKET: !Ref IndexBucket 81 | CRAWL_INDEX_KEY: !Ref IndexKey 82 | CrawlQueue: 83 | Type: AWS::SQS::Queue 84 | Properties: 85 | VisibilityTimeout: 300 86 | MetricQueue: 87 | Type: AWS::SQS::Queue 88 | Properties: 89 | VisibilityTimeout: 30 90 | 91 | Outputs: 92 | Worker: 93 | Value: !Ref Worker 94 | MetricQueue: 95 | Value: !GetAtt MetricQueue.QueueName 96 | CrawlQueue: 97 | Value: !GetAtt CrawlQueue.QueueName 98 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const AWS = require('aws-sdk') 4 | 5 | AWS.config.update({region: 'us-east-1'}); // TODO: pull from environment or something 6 | 7 | const zlib = require('zlib') 8 | const { Transform 
} = require('stream'); 9 | 10 | const cloudWatch = new AWS.CloudWatch() 11 | const s3 = new AWS.S3() 12 | const sqs = new AWS.SQS() 13 | const lambda = new AWS.Lambda() 14 | 15 | const BUCKET = process.env.CRAWL_INDEX_BUCKET || 'commoncrawl' 16 | const KEY = process.env.CRAWL_INDEX_KEY || 'crawl-data/CC-MAIN-2018-17/warc.paths.gz' 17 | 18 | const INPUT_URL = process.env.QUEUE_URL 19 | const METRIC_URL = process.env.METRIC_URL 20 | const MAX_WORKERS = parseInt(process.env.MAX_WORKERS || '4', 10) 21 | const DEFAULT_REGEX = '(\\([0-9]{3}\\) |[0-9]{3}-)[0-9]{3}-[0-9]{4}' // NB: backslashes doubled so the compiled RegExp matches literal parentheses 22 | const DEFAULT_REGEX_FLAGS = 'gm' 23 | const REGEX = new RegExp(process.env.REGEX || DEFAULT_REGEX, process.env.REGEX_FLAGS || DEFAULT_REGEX_FLAGS) 24 | 25 | const REQUEST_REGEX = new RegExp('\nWARC-Type: request', 'gm') 26 | 27 | const WORK_REQUEST = 'work' 28 | const START_REQUEST = 'start' 29 | 30 | const ONE_MINUTE_MILLIS = 60 * 1000 31 | 32 | let invocations = 0 33 | 34 | async function sandbag(delay){ 35 | return new Promise(resolve => setTimeout(resolve, delay)) 36 | } 37 | 38 | // borrows from SO 39 | function shuffle(input) { 40 | for (let i = input.length - 1; i > 0; i--) { 41 | const j = Math.floor(Math.random() * (i + 1)); 42 | [input[i], input[j]] = [input[j], input[i]]; // eslint-disable-line no-param-reassign 43 | } 44 | return input 45 | } 46 | 47 | function annoying(err) { 48 | console.log("ANNOY: " + err) 49 | } 50 | 51 | function fatal(err) { 52 | console.log("FATAL: " + err) 53 | process.exit(0) 54 | } 55 | 56 | async function get_object(bucket, key) { 57 | const params = { Bucket: bucket, Key : key } 58 | return s3.getObject(params).promise() 59 | .catch(err => fatal("Unable to get S3 data: " + err)) 60 | 61 | } 62 | 63 | async function gunzipBuf(buffer) { 64 | return new Promise((resolve, reject) => { 65 | zlib.gunzip(buffer, (err, data) => { 66 | if (err) { 67 | reject(err) 68 | } else { 69 | resolve(data) 70 | } 71 | }) 72 | }) 73 | } 74 | 75 | async function run_lambda(fxn_name, type, run_id, worker_id, launch_count) { 76 | const params = { 77 | FunctionName: fxn_name, 78 | Payload: JSON.stringify({ type, run_id, worker_id, launch_count }), 79 | InvocationType: 'Event' 80 | } 81 | 82 | return lambda.invoke(params).promise() 83 | .catch(err => fatal('Something went wrong invoking lambda ' + err)) 84 | } 85 | 86 | async function populate_queue(){ 87 | const max = process.env.MAX_CHUNKS ? parseInt(process.env.MAX_CHUNKS) : 2 88 | 89 | const content = await get_object(BUCKET, KEY) 90 | const manifest = await gunzipBuf(content.Body) 91 | const all_archives = manifest.toString().split("\n") 92 | 93 | // mix things up so we can test random archives other than the first couple 94 | const input = process.env.SHUFFLE ?
shuffle(all_archives) : all_archives 95 | 96 | const lines = input.slice(0, max).filter(data => 0 !== data.length) // limit for now 97 | 98 | console.log(`Populating with ${lines.length} archive entries`) 99 | 100 | const enqueuers = [] 101 | let count = 0 102 | for (let index = 0; index < lines.length; index = index + 10) { 103 | let counter = 0 104 | const entries = lines.slice(index, index + 10).map(line => { 105 | return { 106 | MessageBody: line, 107 | Id: `${counter++}` // There has to be a better way to do this 108 | } 109 | }) 110 | 111 | const message = { Entries: entries, QueueUrl: INPUT_URL } 112 | count += entries.length 113 | enqueuers.push(sqs.sendMessageBatch(message).promise()) 114 | } 115 | 116 | await Promise.all(enqueuers) 117 | .catch(err => fatal("Something bad happened while enqueueing: " + err)) 118 | 119 | // return what we did so we can wait for it 120 | return lines 121 | } 122 | 123 | async function get_queue_size(){ 124 | const attr_params = { 125 | QueueUrl: INPUT_URL, 126 | AttributeNames: [ 'ApproximateNumberOfMessages', 'ApproximateNumberOfMessagesNotVisible' ] 127 | } 128 | 129 | const results = await sqs.getQueueAttributes(attr_params).promise() 130 | .catch(err => fatal("Unable to determine queue depth: " + err)) 131 | return parseInt(results.Attributes.ApproximateNumberOfMessages) + parseInt(results.Attributes.ApproximateNumberOfMessagesNotVisible) 132 | } 133 | 134 | async function await_queue(target){ 135 | // the promises have all run, now we need to make sure SQS shows all of them 136 | // or our lambdas may start and immediately be done 137 | 138 | while (true){ 139 | const ready = await get_queue_size() 140 | if (ready >= target){ // NB: type coercion here 141 | console.log("Queue reports " + ready + " messages available") 142 | break 143 | } 144 | console.log("Waiting for messages to show in SQS, see " + ready + ", want " + target) 145 | } 146 | 147 | return true 148 | } 149 | 150 | async function warm_target(launch_count){ 151 | const depth = await get_queue_size() 152 | const remaining = Math.max(0, MAX_WORKERS - launch_count) 153 | const initial = process.env.INITIAL_COUNT || 3000 154 | const step = process.env.INCREMENT_STEP || 500 155 | const limit = 0 === launch_count ? 
initial : step 156 | // if we don't have a full step to do, do what we need 157 | const full_limit = Math.min(limit, remaining) 158 | 159 | // if our queue is empty then we don't want spin any more up, 160 | // or if we have half the increment step in the queue, don't spin up the full step 161 | console.log(`Calculating warm target from ${limit}, ${remaining} and ${depth}`) 162 | return Math.min(full_limit, depth) 163 | } 164 | 165 | async function driver(fxn_name, memorySize, run_id, launch_count) { 166 | const full_start_time = new Date().getTime() 167 | const first_run = 0 === launch_count 168 | 169 | let lines = [] 170 | if (first_run) { 171 | console.log("Starting ", fxn_name, run_id) 172 | lines = await populate_queue() 173 | await await_queue(lines.length) 174 | } 175 | 176 | const target = await warm_target(launch_count) 177 | 178 | console.log(`Launching ${target} workers`) 179 | const workers = [] 180 | for (let worker_id = 0; worker_id !== target; worker_id++) { 181 | workers.push(run_lambda(fxn_name, WORK_REQUEST, run_id, launch_count + worker_id)) 182 | } 183 | 184 | if (first_run) { 185 | const launch_start_time = new Date().getTime() 186 | 187 | const metrics = [ 188 | create_metric('memory_size', memorySize, 'Megabytes'), 189 | create_metric('full_start_time', full_start_time, 'Milliseconds'), 190 | create_metric('launch_start_time', launch_start_time, 'Milliseconds'), 191 | create_metric('total_workers', MAX_WORKERS), 192 | create_metric('total_chunks', lines.length) 193 | ] 194 | 195 | await on_metrics(metrics, fxn_name, run_id) 196 | } 197 | 198 | console.log("Waiting for " + workers.length + " workers to spin up...") 199 | await Promise.all(workers) 200 | .catch(err => fatal("Something went wrong starting workers: " + err)) 201 | .then(() => "All launched for " + run_id) 202 | 203 | const current_count = target + launch_count 204 | if (0 === target){ 205 | console.log(`All workers launched`) 206 | return 207 | } 208 | 209 | // we want to launch every minute as close as possible 210 | const next_run_time = full_start_time + ONE_MINUTE_MILLIS 211 | const sleep_time = Math.max(0, next_run_time - new Date().getTime() ) 212 | 213 | await sandbag(sleep_time) 214 | console.log(`Recursing with launch count of ${current_count}`) 215 | return run_lambda(fxn_name, START_REQUEST, run_id, 0, current_count) 216 | } 217 | 218 | function on_regex(data, run_id){ 219 | // if you want to do something on getting a hit, put it here 220 | } 221 | 222 | async function handle_stream(stream, run_id){ 223 | let uncompressed_bytes = 0 224 | let compressed_bytes = 0 225 | let total_requests = 0 226 | let count = 0 227 | 228 | const start_time = new Date().getTime() 229 | const extractor = new Transform({ 230 | transform(chunk, encoding, callback) { 231 | try { 232 | const matches = chunk.toString().match(REGEX) || [] 233 | matches.forEach(data => this.push(data)) 234 | } catch (error){ 235 | annoying("Failed to extract: " + error) 236 | } finally { 237 | callback() 238 | } 239 | } 240 | }) 241 | 242 | const gunzipper = zlib.createGunzip() 243 | let data_ts = 0 244 | 245 | const log_traffic = () => { 246 | const ts = new Date().getTime() / 1000 247 | const elapsed = ts - data_ts 248 | if (10 <= elapsed){ 249 | console.log("Received data: " + compressed_bytes) 250 | data_ts = ts 251 | } 252 | } 253 | 254 | await new Promise((resolve, _reject) => { 255 | const reject = message => { 256 | console.log("Failed: " + message) 257 | _reject(message) 258 | } 259 | 260 | const gunzipStream = stream 261 | 
.on('error', err => fatal("GZip stream error " + err)) 262 | .on('data', log_traffic) 263 | .on('data', data => compressed_bytes += data.length) 264 | .on('end', () => console.log("End of base stream")) 265 | .pipe(gunzipper) 266 | 267 | const extractorStream = gunzipStream 268 | .on('error', err => fatal("Extract stream error " + err)) 269 | .on('data', data => { 270 | // technically our stream could split our request 271 | // marker, but that will be rare, and over the total 272 | // number of requests we have we should be ok 273 | try { 274 | const requests = data.toString().match(REQUEST_REGEX) || [] 275 | total_requests += requests.length 276 | } catch (error) { 277 | annoying("Failed to match batches: " + error) 278 | } 279 | }) 280 | .on('data', data => uncompressed_bytes += data.length) 281 | .on('end', () => console.log("End of gzip stream")) 282 | .pipe(extractor) 283 | 284 | const extractedStream = extractorStream 285 | .on('error', err => fatal("Extracted stream error " + err)) 286 | .on('data', () => count++) 287 | .on('data', data => on_regex(data, run_id)) 288 | .on('end', () => { 289 | console.log("Streaming complete") 290 | resolve("complete") 291 | }) 292 | }) 293 | 294 | const now = new Date().getTime() 295 | // we're measuring time internally, this isn't 100%, but go ahead 296 | // and round up to the nearest 100ms 297 | const elapsed = Math.ceil((now - start_time) / 100) * 100 298 | 299 | return [ 300 | create_metric('start_time', start_time, 'Milliseconds'), 301 | create_metric('regex_hits', count), 302 | create_metric('total_requests', total_requests), 303 | create_metric('compressed_bytes', compressed_bytes, 'Bytes'), 304 | create_metric('uncompressed_bytes', uncompressed_bytes, 'Bytes'), 305 | create_metric('elapsed_ms', elapsed, 'Milliseconds') 306 | ] 307 | } 308 | 309 | async function handle_path(path, run_id) { 310 | const params = { 311 | Bucket : BUCKET, 312 | Key : path 313 | } 314 | 315 | const stream = s3.getObject(params) 316 | .on('httpHeaders', (code, headers) => { 317 | const requestId = headers['x-amz-request-id'] 318 | const amzId = headers['x-amz-id-2'] 319 | console.log("Streaming as x-amz-id-2=" + amzId + ", x-amz-request-id=" + requestId + "/" + JSON.stringify(params)) 320 | }).createReadStream() 321 | 322 | return handle_stream(stream, run_id) 323 | } 324 | 325 | function create_metric(key, value, unit){ 326 | return { 327 | key, 328 | value, 329 | unit: unit || 'Count' 330 | } 331 | } 332 | 333 | async function on_metrics(metrics, fxn_name, run_id){ 334 | const date = new Date() 335 | const metricList = metrics.map(metric => { 336 | console.log(metric.key + "." + run_id, ' -> ', metric.value) 337 | return { 338 | MetricName: metric.key, 339 | Dimensions: [ 340 | { Name: 'run_id', Value: run_id } 341 | ], 342 | Timestamp: date, 343 | Unit: metric.unit, 344 | Value: metric.value 345 | } 346 | }) 347 | 348 | const update = { 349 | 'MetricData' : metricList, 350 | 'Namespace' : fxn_name 351 | } 352 | 353 | // ... 
so for now, shunt to this queue 354 | return sqs.sendMessage({ 355 | MessageBody: JSON.stringify(update), 356 | QueueUrl : METRIC_URL 357 | }).promise() 358 | .catch(err => fatal("Unable to send metric: " + err)) 359 | } 360 | 361 | async function setVisibilityTimeout(message, time){ 362 | return sqs.changeMessageVisibility({ QueueUrl : INPUT_URL, ReceiptHandle: message.ReceiptHandle, VisibilityTimeout: time }) 363 | .promise() 364 | .then(() => console.log(`Message ${message.ReceiptHandle} timeout set to ${time}`)) 365 | .catch(err => annoying("Failed to reset the visibility: " + err)) 366 | } 367 | 368 | async function handle_message(fxn_name, run_id, worker_id, end_time) { 369 | const response = await sqs.receiveMessage({ QueueUrl : INPUT_URL }).promise() 370 | .catch(err => fatal("Failed to receive message from queue: " + err)) 371 | const messages = response.Messages || [] 372 | 373 | // we're at the end of our queue, so send our done time 374 | // sometimes SQS gives us no work when we hammer it, so try a couple times before give up 375 | if (0 === messages.length && 0 !== invocations){ 376 | console.log(worker_id + ": No work to do") 377 | const metric = create_metric('end_time', new Date().getTime(), 'Milliseconds') 378 | await on_metrics([metric], fxn_name, run_id) 379 | return "All done" 380 | } 381 | 382 | invocations++ 383 | 384 | for(const message of messages){ 385 | let metrics = [create_metric('messages_attempted', 1)] 386 | 387 | // if we aren't done by panic_time then we need to push the message back 388 | const panic_time = end_time - new Date().getTime() 389 | 390 | // with high concurrency, S3 gets mad causing timeouts, this handles that for us by pushing the message back on the queue 391 | const timer = setTimeout(async () => await setVisibilityTimeout(message, 0), panic_time) 392 | try { 393 | const message_metrics = await handle_path(message.Body, run_id) 394 | metrics.push(create_metric("metrics_handled", 1)) 395 | metrics = metrics.concat(message_metrics) 396 | clearTimeout(timer) 397 | } catch (error) { 398 | metrics.push(create_metric("messages_error", 1)) 399 | annoying("Failed to handle path: " + error) 400 | await Promise.all([ 401 | setVisibilityTimeout(message, 0), 402 | on_metrics(metrics, fxn_name, run_id) 403 | ]) 404 | break // don't delete, punt immediately 405 | } 406 | await sqs.deleteMessage({ QueueUrl : INPUT_URL, ReceiptHandle: message.ReceiptHandle }).promise() 407 | .catch(err => fatal("Failed to delete message from queue: " + err)) 408 | await on_metrics(metrics, fxn_name, run_id) 409 | } 410 | 411 | const run = await run_lambda(fxn_name, WORK_REQUEST, run_id, worker_id) 412 | console.log(run) 413 | return "All done" 414 | } 415 | 416 | exports.metric_handler = async (event) => { 417 | const records = event.Records || [] 418 | 419 | const metrics = records.map(record => { 420 | const raw_payload = record.body 421 | return JSON.parse(raw_payload) 422 | }) 423 | 424 | await persist_metric_batch(metrics) 425 | console.log(`Inserted ${metrics.length} metrics`) 426 | } 427 | 428 | async function persist_metric_batch(messages){ 429 | for (const message of messages){ 430 | await cloudWatch.putMetricData(message).promise() 431 | .catch(err => fatal("Error putting metric data: " + err)) 432 | } 433 | 434 | return true 435 | } 436 | 437 | exports.handler = async (args, context) => { 438 | const { type, run_id, worker_id, launch_count } = args 439 | 440 | // peel off N seconds to give us a bit of a buffer for the event to clear and code to run 441 | const 
grace_period = process.env.PANIC_GRACE_PERIOD || 3000 442 | const end_time = new Date().getTime() + context.getRemainingTimeInMillis() - grace_period 443 | 444 | switch (type){ 445 | case WORK_REQUEST: 446 | return handle_message(context.functionName, run_id, worker_id, end_time) 447 | case START_REQUEST: 448 | const memorySize = parseInt(context.memoryLimitInMB) 449 | // prefer a RUN_ID from the environment, else fall back to the one passed in the event 450 | const id = process.env.RUN_ID || run_id 451 | return driver(context.functionName, memorySize, id, launch_count || 0) 452 | } 453 | } 454 | 455 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | 5 | STACK=out/stack.json 6 | 7 | if [ -z "${1:-}" ] 8 | then 9 | echo "Need a stack name" 10 | exit 1 11 | fi 12 | 13 | mkdir -p out 14 | aws cloudformation describe-stacks --stack-name $1 > $STACK 15 | 16 | FXN_NAME=`cat $STACK | jq '.[]' | jq '.[0] | .Outputs[] | select(.OutputKey=="Worker") | .OutputValue ' | tr -d '"'` 17 | CRAWL_QUEUE=`cat $STACK | jq '.[]' | jq '.[0] | .Outputs[] | select(.OutputKey=="CrawlQueue") | .OutputValue ' | tr -d '"'` 18 | METRIC_QUEUE=`cat $STACK | jq '.[]' | jq '.[0] | .Outputs[] | select(.OutputKey=="MetricQueue") | .OutputValue ' | tr -d '"'` 19 | 20 | export SIZE=1024 21 | export MAX_WORKERS=19900 22 | export INITIAL_COUNT=3000 23 | export MAX_CHUNKS=75000 24 | export INCREMENT_STEP=500 25 | export AWS_PROFILE=caws 26 | export FXN_NAME=$FXN_NAME 27 | export METRIC_URL=https://sqs.us-east-1.amazonaws.com/$AWS_ACN/$METRIC_QUEUE 28 | export INPUT_URL=https://sqs.us-east-1.amazonaws.com/$AWS_ACN/$CRAWL_QUEUE 29 | export SINGLE_BUNDLE=crawl-data/CC-MAIN-2018-17/segments/1524125944742.25/warc/CC-MAIN-20180420213743-20180420233743-00299.warc.gz 30 | export CRAWL_INDEX_BUCKET=commoncrawl 31 | export CRAWL_INDEX_KEY=crawl-data/CC-MAIN-2018-17/warc.paths.gz 32 | export RUN_ID=$(date +%s) 33 | 34 | aws --profile $AWS_PROFILE sqs purge-queue --queue-url $METRIC_URL || true 35 | aws --profile $AWS_PROFILE sqs purge-queue --queue-url $INPUT_URL || true 36 | aws --profile $AWS_PROFILE lambda update-function-configuration --function-name $FXN_NAME --memory-size $SIZE --environment "Variables={RUN_ID=$RUN_ID,METRIC_URL=$METRIC_URL,QUEUE_URL=$INPUT_URL,CRAWL_INDEX_BUCKET=$CRAWL_INDEX_BUCKET,CRAWL_INDEX_KEY=$CRAWL_INDEX_KEY,MAX_CHUNKS=$MAX_CHUNKS,MAX_WORKERS=$MAX_WORKERS,SHUFFLE=t,SINGLE_BUNDLE=$SINGLE_BUNDLE,INITIAL_COUNT=$INITIAL_COUNT,INCREMENT_STEP=$INCREMENT_STEP}" 37 | 38 | aws --profile $AWS_PROFILE lambda invoke --function-name $FXN_NAME --invocation-type Event out/test.`date +%s` --payload '{ "type" : "start" }' 39 | 40 | echo $RUN_ID | tee -a run.ids 41 | 42 | echo
"https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#metricsV2:graph=~(view~'singleValue~stacked~false~metrics~(~(~'NAME~'memory_size~'run_id~'RUN_ID~(id~'memory~stat~'Average~period~2592000~label~'Lambda*20Size))~(~(expression~'0.00001667*20*2a*20*28memory*20*2f*201024*29*20*2a*20compute_sec~label~'Cost~id~'e6))~(~(expression~'*28end*20-*20start*29*20*2f*201000~label~'Wall*20Secs~id~'wall_sec~yAxis~'left))~(~(expression~'elapsed_sum*20*2f*201000~label~'Compute*20Secs~id~'compute_sec))~(~'NAME~'total_workers~'run_id~'RUN_ID~(id~'workers~label~'Workers~period~2592000))~(~'.~'start_time~'.~'.~(visible~false~id~'start~period~2592000~stat~'Minimum))~(~'.~'total_chunks~'.~'.~(id~'archives~label~'Archives~period~2592000))~(~'.~'regex_hits~'.~'.~(id~'regex_hits~period~2592000~stat~'Sum~label~'Numbers*20Found))~(~(expression~'regex_hits*20*2f*20wall_sec~label~'Numbers*20Found*20*2f*20Sec~id~'e5))~(~'NAME~'end_time~'run_id~'RUN_ID~(visible~false~id~'end~stat~'Maximum~period~2592000))~(~'.~'compressed_bytes~'.~'.~(id~'cbytes~stat~'Sum~period~2592000~label~'Compressed*20Bytes*20Scanned))~(~'.~'uncompressed_bytes~'.~'.~(id~'ucbytes~stat~'Sum~period~2592000~label~'Uncompressed*20Bytes*20Scanned))~(~(expression~'cbytes*20*2f*20wall_sec~label~'Compressed*20Bytes*20*2f*20Sec~id~'e3))~(~(expression~'ucbytes*20*2f*20wall_sec~label~'Uncompressed*20Bytes*20*2f*20Sec~id~'e4))~(~(expression~'cbytes*20*2f*20workers~label~'Compressed*20Bytes*20*2f*20Worker~id~'e1))~(~(expression~'ucbytes*20*2f*20workers~label~'Uncompressed*20Bytes*20*2f*20Worker~id~'e8))~(~'NAME~'elapsed_ms~'run_id~'RUN_ID~(id~'elapsed_sum~stat~'Sum~visible~false~period~2592000))~(~'.~'total_requests~'.~'.~(id~'requests~period~2592000~label~'HTTP*20Requests*20Scanned~stat~'Sum))~(~(expression~'requests*20*2f*20workers~label~'HTTP*20Requests*20*2f*20Worker~id~'e2))~(~(expression~'requests*20*2f*20wall_sec~label~'HTTP*20Requests*20*2f*20Sec~id~'e7)))~region~'us-east-1~start~'-P7D~end~'P0D~title~'SIZEmb*20Run);namespace=NAME;dimensions=run_id" | \ 43 | sed "s/NAME/${FXN_NAME}/g" | \ 44 | sed "s/RUN_ID/$RUN_ID/g" | \ 45 | sed "s/SIZE/$SIZE/g" 46 | 47 | echo 48 | 49 | echo "https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#metricsV2:graph=~(metrics~(~(~'NAME~'full_start_time~'run_id~'RUN_ID~(period~1~visible~false))~(~'.~'memory_size~'.~'.~(period~1~visible~false))~(~'.~'launch_start_time~'.~'.~(period~1~visible~false))~(~'.~'total_workers~'.~'.~(period~1~visible~false))~(~'.~'total_chunks~'.~'.~(period~1~visible~false))~(~'.~'regex_hits~'.~'.~(period~1~stat~'Sum))~(~'.~'elapsed_ms~'.~'.~(period~1~visible~false))~(~'.~'total_requests~'.~'.~(period~1~stat~'Sum))~(~'.~'uncompressed_bytes~'.~'.~(period~1~stat~'Sum))~(~'.~'start_time~'.~'.~(period~1~visible~false))~(~'.~'compressed_bytes~'.~'.~(period~1~stat~'Sum))~(~'.~'end_time~'.~'.~(period~1~visible~false)))~view~'timeSeries~stacked~false~region~'us-east-1);namespace=NAME;dimensions=run_id" | \ 50 | sed "s/NAME/${FXN_NAME}/g" | \ 51 | sed "s/RUN_ID/$RUN_ID/g" 52 | 53 | --------------------------------------------------------------------------------