├── .gitignore ├── LICENSE ├── Makefile ├── algolia └── dump_ddb.py ├── docs ├── architecture.png ├── dynamodb.png └── statemachine.png ├── graphql ├── QueryDdbByBlogsourceAndTimest-request.vtl ├── QueryDdbByBlogsourceAndTimest-response.vtl ├── QueryDdbByVisibleAndTimest-request.vtl ├── QueryDdbByVisibleAndTimest-response.vtl ├── QueryDdbGetDetailText-request.vtl ├── QueryDdbGetDetailText-response.vtl ├── QueryDdbItemCountAll-request.vtl ├── QueryDdbItemCountAll-response.vtl ├── QueryDdbItemCountPerBlog-request.vtl ├── QueryDdbItemCountPerBlog-response.vtl └── schema.graphql ├── lambda-crawl ├── crawl.py ├── feeds.txt └── requirements.txt ├── lambda-getfeed ├── getfeed.py └── requirements.txt ├── lambda-layer └── requirements.txt ├── lambda-pagecount ├── pagecount.py └── requirements.txt ├── readme.md ├── statemachine └── rssblog.asl.json └── template.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | packaged.yaml 2 | lambda/libs 3 | samconfig.toml 4 | .aws-sam/ 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Marek Kuczynski 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build 2 | build: 3 | sam build --parallel -u --cached 4 | 5 | .PHONY: init 6 | init: 7 | sam build --parallel -u --cached 8 | sam deploy -g 9 | 10 | .PHONY: deploy 11 | deploy: 12 | sam build --parallel -u --cached 13 | sam deploy 14 | -------------------------------------------------------------------------------- /algolia/dump_ddb.py: -------------------------------------------------------------------------------- 1 | # A simple script to dump the CSV results from the RSS DynamoDB table 2 | # You can use the CSV dump to bulk import blogs into Algolia 3 | 4 | import botocore, boto3 5 | 6 | region = 'eu-west-1' 7 | table = '' 8 | proj_expression = "guid, timest, datestr, blogsource, category, link, description, author, title" 9 | 10 | ddb = boto3.resource('dynamodb', region_name = region, config = botocore.client.Config(max_pool_connections = 50)).Table(table) 11 | 12 | def dump_records_to_csv(): 13 | 14 | res = [] 15 | queryres = ddb.scan(ProjectionExpression = proj_expression) 16 | 17 | for x in queryres['Items']: 18 | 19 | if x['timest'] != 0: 20 | y = [x['guid'], x['guid'], str(x['timest']), x['datestr'], x['blogsource'], x['category'], x['link'], x['description'], x['author'],x ['title']] 21 | res.append('"' + '","'.join(y) + '"') 22 | 23 | while 'LastEvaluatedKey' in queryres: 24 | 25 | queryres = ddb.scan(ExclusiveStartKey = queryres['LastEvaluatedKey'], ProjectionExpression = proj_expression) 26 | 27 | for x in queryres['Items']: 28 | 29 | if x['timest'] != 0: 30 | y = [x['guid'], x['guid'], str(x['timest']), x['datestr'], x['blogsource'], x['category'], x['link'], x['description'], x['author'],x ['title']] 31 | res.append('"' + '","'.join(y) + '"') 32 | 33 | z = open('out.csv', 'w') 34 | z.write('ObjectID,guid,timest,datestr,blogsource,category,link,description,author,title\n') 35 | for x in res: 36 | z.write(x+'\n') 37 | z.close() 38 | 39 | dump_records_to_csv() 40 | -------------------------------------------------------------------------------- /docs/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marekq/rss-lambda/d276d9374223f217e3403a33d2a072c8f957d395/docs/architecture.png -------------------------------------------------------------------------------- /docs/dynamodb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marekq/rss-lambda/d276d9374223f217e3403a33d2a072c8f957d395/docs/dynamodb.png -------------------------------------------------------------------------------- /docs/statemachine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marekq/rss-lambda/d276d9374223f217e3403a33d2a072c8f957d395/docs/statemachine.png -------------------------------------------------------------------------------- /graphql/QueryDdbByBlogsourceAndTimest-request.vtl: -------------------------------------------------------------------------------- 1 | { 2 | "version" : "2017-02-28", 3 | "operation" : "Query", 4 | "index" : "timest", 5 | "limit" : 25, 6 | "scanIndexForward" : false, 7 | "select" : "ALL_PROJECTED_ATTRIBUTES", 8 | "query" : { 9 | "expression" : "blogsource = :bs and timest > :ts", 10 | "expressionValues" : { 11 | ":bs" : {"S": "$context.arguments.blogsource"}, 12 | ":ts" : 
{"N": $context.arguments.timest} 13 | } 14 | }, 15 | "nextToken": #if( $context.args.nextToken ) "$context.args.nextToken" #else null #end 16 | } -------------------------------------------------------------------------------- /graphql/QueryDdbByBlogsourceAndTimest-response.vtl: -------------------------------------------------------------------------------- 1 | $util.toJson($context.result) 2 | -------------------------------------------------------------------------------- /graphql/QueryDdbByVisibleAndTimest-request.vtl: -------------------------------------------------------------------------------- 1 | { 2 | "version" : "2017-02-28", 3 | "operation" : "Query", 4 | "index" : "visible", 5 | "limit" : 25, 6 | "scanIndexForward" : false, 7 | "select" : "ALL_PROJECTED_ATTRIBUTES", 8 | "query" : { 9 | "expression" : "visible = :visible and timest > :timest", 10 | "expressionValues" : { 11 | ":visible" : { "S" : "$context.arguments.visible" }, 12 | ":timest" : { "N": $context.arguments.timest } 13 | } 14 | }, 15 | "nextToken": #if( $context.args.nextToken ) "$context.args.nextToken" #else null #end 16 | } 17 | -------------------------------------------------------------------------------- /graphql/QueryDdbByVisibleAndTimest-response.vtl: -------------------------------------------------------------------------------- 1 | $util.toJson($context.result) 2 | -------------------------------------------------------------------------------- /graphql/QueryDdbGetDetailText-request.vtl: -------------------------------------------------------------------------------- 1 | { 2 | "version" : "2017-02-28", 3 | "operation" : "Query", 4 | "query" : { 5 | "expression" : "guid = :guid", 6 | "expressionValues" : { 7 | ":guid" : {"S": "$context.arguments.guid"} 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /graphql/QueryDdbGetDetailText-response.vtl: -------------------------------------------------------------------------------- 1 | $util.toJson($context.result) 2 | -------------------------------------------------------------------------------- /graphql/QueryDdbItemCountAll-request.vtl: -------------------------------------------------------------------------------- 1 | { 2 | "version" : "2017-02-28", 3 | "operation" : "Query", 4 | "index" : "visible", 5 | "select" : "ALL_PROJECTED_ATTRIBUTES", 6 | "query" : { 7 | "expression" : "visible = :visible and timest = :ts", 8 | "expressionValues" : { 9 | ":visible" : { "S" : "y" }, 10 | ":ts" : { "N" : 0 } 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /graphql/QueryDdbItemCountAll-response.vtl: -------------------------------------------------------------------------------- 1 | $util.toJson($context.result) 2 | -------------------------------------------------------------------------------- /graphql/QueryDdbItemCountPerBlog-request.vtl: -------------------------------------------------------------------------------- 1 | { 2 | "version" : "2017-02-28", 3 | "operation" : "Query", 4 | "index" : "timest", 5 | "select" : "ALL_PROJECTED_ATTRIBUTES", 6 | "query" : { 7 | "expression" : "blogsource = :bs and timest = :ts", 8 | "expressionValues" : { 9 | ":bs" : { "S" : "$context.arguments.blogsource" }, 10 | ":ts" : { "N" : 0 } 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /graphql/QueryDdbItemCountPerBlog-response.vtl: -------------------------------------------------------------------------------- 1 | 
$util.toJson($context.result) 2 | -------------------------------------------------------------------------------- /graphql/schema.graphql: -------------------------------------------------------------------------------- 1 | schema { 2 | query: Query 3 | } 4 | 5 | type Query { 6 | QueryDdbByBlogsourceAndTimest(filter: TableddbsourceFilterInput, blogsource: String!, timest: Int!, nextToken: String): listourceConnection 7 | QueryDdbByVisibleAndTimest(filter: TableddbsourceFilterInput, visible: String!, timest: Int!, nextToken: String): listourceConnection 8 | QueryDdbGetDetailText(guid: String!): itemsourceConnection 9 | QueryDdbItemCountPerBlog(blogsource: String!): pagecountConnection 10 | QueryDdbItemCountAll(timest: Int!): pagecountConnection 11 | } 12 | 13 | type listsource { 14 | blogsource: String 15 | guid: String 16 | timest: Int 17 | link: String 18 | author: String 19 | title: String 20 | } 21 | 22 | type listourceConnection { 23 | items: [listsource] 24 | nextToken: String 25 | } 26 | 27 | type itemsource { 28 | blogsource: String 29 | guid: String 30 | timest: Int 31 | link: String 32 | author: String 33 | description: String 34 | rawhtml: String 35 | } 36 | 37 | type itemsourceConnection { 38 | items: [itemsource] 39 | } 40 | 41 | type pagecount { 42 | blogsource: String 43 | timest: Int 44 | visible: String 45 | articlecount: Int 46 | } 47 | 48 | type pagecountConnection { 49 | items: [pagecount] 50 | } 51 | 52 | input TableddbsourceFilterInput { 53 | blogsource: TableStringFilterInput 54 | guid: TableStringFilterInput 55 | timest: TableIntFilterInput 56 | visible: TableStringFilterInput 57 | } 58 | 59 | input TableStringFilterInput { 60 | beginsWith: String 61 | between: [String] 62 | contains: String 63 | eq: String 64 | ge: String 65 | gt: String 66 | le: String 67 | lt: String 68 | ne: String 69 | notContains: String 70 | } 71 | 72 | input TableIntFilterInput { 73 | between: [Int] 74 | contains: Int 75 | eq: Int 76 | ge: Int 77 | gt: Int 78 | le: Int 79 | lt: Int 80 | ne: Int 81 | notContains: Int 82 | } -------------------------------------------------------------------------------- /lambda-crawl/crawl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # @marekq 3 | # www.marek.rocks 4 | 5 | import base64, botocore, boto3, csv, feedparser 6 | import gzip, json, os, re, readability, requests 7 | import queue, sys, threading, time 8 | 9 | from aws_lambda_powertools import Logger, Tracer 10 | from boto3.dynamodb.conditions import Key, Attr 11 | from datetime import date, datetime, timedelta 12 | from bs4 import BeautifulSoup 13 | 14 | logger = Logger() 15 | modules_to_be_patched = ["botocore", "boto3", "requests"] 16 | tracer = Tracer(patch_modules = modules_to_be_patched) 17 | 18 | # establish a session with SES, DynamoDB and Comprehend 19 | ddb = boto3.resource('dynamodb', region_name = os.environ['dynamo_region'], config = botocore.client.Config(max_pool_connections = 50)).Table(os.environ['dynamo_table']) 20 | s3 = boto3.client('s3') 21 | 22 | 23 | # create a queue for multiprocessing 24 | q1 = queue.Queue() 25 | 26 | 27 | # get the blogpost guids that are already stored in DynamoDB table 28 | @tracer.capture_method(capture_response = False) 29 | def get_guids(ts): 30 | guids = [] 31 | 32 | # get the guid values up to x days ago 33 | queryres = ddb.query(ScanIndexForward = True, IndexName = 'visible', ProjectionExpression = 'guid', KeyConditionExpression = Key('visible').eq('y') & Key('timest').gt(ts)) 34 | 35 | 
for x in queryres['Items']: 36 | if 'guid' in x: 37 | if x['guid'] not in guids: 38 | guids.append(x['guid']) 39 | 40 | # paginate the query in case more than 100 results are returned 41 | while 'LastEvaluatedKey' in queryres: 42 | queryres = ddb.query(ExclusiveStartKey = queryres['LastEvaluatedKey'], ScanIndexForward = True, IndexName = 'visible', ProjectionExpression = 'guid', KeyConditionExpression = Key('visible').eq('y') & Key('timest').gt(ts)) 43 | 44 | for x in queryres['Items']: 45 | if 'guid' in x: 46 | if x['guid'] not in guids: 47 | guids.append(x['guid']) 48 | 49 | print('guids found in last day : '+str(len(guids))) 50 | return guids 51 | 52 | 53 | # read the url's from 'feeds.txt' stored in the lambda function 54 | @tracer.capture_method(capture_response = False) 55 | def read_feed(): 56 | result = {} 57 | filen = 'feeds.txt' 58 | count = 0 59 | 60 | # open the feeds.txt file and read line by line 61 | with open(filen) as fp: 62 | line = fp.readline() 63 | while line: 64 | 65 | # get the src and url value delimited by a ',' 66 | src, url = line.split(',') 67 | 68 | # add src and url to dict 69 | result[src.strip()] = url.strip() 70 | line = fp.readline() 71 | 72 | # add one to the count if less than 50, else we will spawn too many threads 73 | if count < 50: 74 | count += 1 75 | 76 | # return the result and count value 77 | return result, count 78 | 79 | # check if the s3 object exists by listing current s3 objects 80 | def get_s3_files(): 81 | s3list = s3.list_objects(Bucket = os.environ['s3_bucket']) 82 | 83 | return s3list 84 | 85 | 86 | # get the contents of the dynamodb table for json object on S3 87 | @tracer.capture_method(capture_response = False) 88 | def get_feed(x): 89 | url = x[0] 90 | blogsource = x[1] 91 | 92 | # if the blog json is available on s3 93 | if str(blogsource + '.json') in s3files: 94 | 95 | ts_old = int(time.time()) - (days_to_retrieve * 1) 96 | 97 | # if the blog json does not exist on s3 98 | else: 99 | 100 | # set the days to retrieve value based on the given setting 101 | ts_old = int(time.time()) - (days_to_retrieve * 86400) 102 | 103 | print(ts_old, url, blogsource) 104 | res.append({'ts': ts_old, 'url': url, 'blogsource': blogsource, 'daystoretrieve': days_to_retrieve}) 105 | 106 | # worker for queue jobs 107 | @tracer.capture_method(capture_response = False) 108 | def worker(): 109 | while not q1.empty(): 110 | get_feed(q1.get()) 111 | q1.task_done() 112 | 113 | # lambda handler 114 | @logger.inject_lambda_context(log_event = True) 115 | @tracer.capture_lambda_handler 116 | def handler(event, context): 117 | 118 | # set a default value of 1 for 'days_to_retrieve' 119 | global days_to_retrieve 120 | days_to_retrieve = int(1) 121 | 122 | send_mail = os.environ['send_mail'] 123 | 124 | # check if days input value was given in step function 125 | try: 126 | if int(event['msg']['days']) < 90: 127 | days_to_retrieve = int(event['msg']['days']) 128 | print('setting days_to_retrieve value to ' + str(days_to_retrieve) + ' based on state machine input') 129 | 130 | else: 131 | print('failed to get valid days input value from step function, proceeding with default value of 1') 132 | 133 | except Exception as e: 134 | print('failed to get valid days input value from step function, proceeding with default value of 1') 135 | print(e) 136 | 137 | # check if send email input value was given in step function 138 | try: 139 | if event['send_mail'] == 'y' or event['send_mail'] == 'yes': 140 | print('sending emails based on state machine input') 141 | send_mail 
= 'y' 142 | 143 | except Exception as e: 144 | print('failed to get valid send email input value from step function, proceeding with default value of n') 145 | print(e) 146 | 147 | print('sending emails: ' + str(send_mail)) 148 | 149 | # create global list for results 150 | global res 151 | res = [] 152 | 153 | # get s3 result files 154 | global s3files 155 | s3files = get_s3_files() 156 | 157 | # get the unix timestamp from days_to_retrieve days ago 158 | ts_old = int(time.time()) - (86400 * days_to_retrieve) 159 | 160 | # get post guids stored in dynamodb for days_to_retrieve 161 | guids = get_guids(ts_old) 162 | 163 | # get feed url's from local feeds.txt file 164 | feeds, thr = read_feed() 165 | 166 | # submit a thread per url feed to queue 167 | for blogsource, url in feeds.items(): 168 | q1.put([url, blogsource]) 169 | 170 | # start thread per feed 171 | for x in range(thr): 172 | t = threading.Thread(target = worker) 173 | t.daemon = True 174 | t.start() 175 | q1.join() 176 | 177 | # return results, guid and days to retrieve 178 | return { 179 | 'results': res, 180 | 'guids': guids, 181 | 'daystoretrieve': str(days_to_retrieve), 182 | 'send_mail': send_mail, 183 | 'algolia_app': os.environ['algolia_app'], 184 | 'algolia_apikey': os.environ['algolia_apikey'], 185 | 'algolia_index': os.environ['algolia_index'], 186 | 'dynamo_region': os.environ['dynamo_region'], 187 | 'dynamo_table': os.environ['dynamo_table'], 188 | 'from_email': os.environ['from_email'], 189 | 'to_email': os.environ['to_email'], 190 | 's3_bucket': os.environ['s3_bucket'], 191 | 'storepublics3': os.environ['storepublics3'], 192 | 'enable_algolia': os.environ['enable_algolia'], 193 | 'send_mail': os.environ['send_mail'] 194 | } 195 | -------------------------------------------------------------------------------- /lambda-crawl/feeds.txt: -------------------------------------------------------------------------------- 1 | apn, https://aws.amazon.com/blogs/apn/feed/ 2 | architecture, https://aws.amazon.com/blogs/architecture/feed/ 3 | big-data, https://aws.amazon.com/blogs/big-data/feed/ 4 | biz-prod, https://aws.amazon.com/blogs/business-productivity/feed/ 5 | cli, https://aws.amazon.com/blogs/developer/category/programing-language/aws-cli/feed/ 6 | compute, https://aws.amazon.com/blogs/compute/feed/ 7 | contact-center, https://aws.amazon.com/blogs/contact-center/feed/ 8 | containers, https://aws.amazon.com/blogs/containers/feed/ 9 | cost-mgmt, https://aws.amazon.com/blogs/aws-cost-management/feed/ 10 | database, https://aws.amazon.com/blogs/database/feed/ 11 | desktop, https://aws.amazon.com/blogs/desktop-and-application-streaming/feed/ 12 | developer, https://aws.amazon.com/blogs/developer/feed/ 13 | devops, https://aws.amazon.com/blogs/devops/feed/ 14 | enterprise-strat, https://aws.amazon.com/blogs/enterprise-strategy/feed/ 15 | gamedev, https://aws.amazon.com/blogs/gamedev/feed/ 16 | gametech, https://aws.amazon.com/blogs/gametech/feed/ 17 | governance, https://aws.amazon.com/blogs/mt/feed/ 18 | industries, https://aws.amazon.com/blogs/industries/feed/ 19 | infrastructure, https://aws.amazon.com/blogs/infrastructure-and-automation/feed/ 20 | iot, https://aws.amazon.com/blogs/iot/feed/ 21 | java, https://aws.amazon.com/blogs/developer/category/programing-language/java/feed/ 22 | management-tools, https://aws.amazon.com/blogs/mt/feed/ 23 | marketplace, https://aws.amazon.com/blogs/awsmarketplace/feed/ 24 | media, https://aws.amazon.com/blogs/media/feed/ 25 | messaging, 
https://aws.amazon.com/blogs/messaging-and-targeting/feed/ 26 | ml, https://aws.amazon.com/blogs/machine-learning/feed/ 27 | mobile, https://aws.amazon.com/blogs/mobile/feed/ 28 | modernizing, https://aws.amazon.com/blogs/modernizing-with-aws/feed/ 29 | networking, https://aws.amazon.com/blogs/networking-and-content-delivery/feed/ 30 | newsblog, https://aws.amazon.com/blogs/aws/feed/ 31 | open-source, https://aws.amazon.com/blogs/opensource/feed/ 32 | public-sector, https://aws.amazon.com/blogs/publicsector/feed/ 33 | quantum, https://aws.amazon.com/blogs/quantum-computing/feed/ 34 | robotics, https://aws.amazon.com/blogs/robotics/feed/ 35 | sap, https://aws.amazon.com/blogs/awsforsap/feed/ 36 | security, https://aws.amazon.com/blogs/security/feed/ 37 | security-bulletins, https://aws.amazon.com/security/security-bulletins/feed/ 38 | serverless, https://aws.amazon.com/blogs/aws/category/serverless/feed/ 39 | startups, https://aws.amazon.com/blogs/startups/feed/ 40 | storage, https://aws.amazon.com/blogs/storage/feed/ 41 | training, https://aws.amazon.com/blogs/training-and-certification/feed/ 42 | whats-new, https://aws.amazon.com/new/feed/ 43 | yan, https://feeds.feedburner.com/theburningmonk 44 | corey, https://www.lastweekinaws.com/blog/feed 45 | cloudguru, https://acloudguru.com/blog/feed 46 | werner, https://www.allthingsdistributed.com/index.xml 47 | jeremy, https://offbynone.io/feed -------------------------------------------------------------------------------- /lambda-crawl/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marekq/rss-lambda/d276d9374223f217e3403a33d2a072c8f957d395/lambda-crawl/requirements.txt -------------------------------------------------------------------------------- /lambda-getfeed/getfeed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # @marekq 3 | # www.marek.rocks 4 | 5 | import base64, botocore, boto3, csv, feedparser 6 | import gzip, json, os, re, readability, requests 7 | import queue, sys, threading, time 8 | from algoliasearch.search_client import SearchClient 9 | 10 | from aws_lambda_powertools import Logger, Tracer 11 | from boto3.dynamodb.conditions import Key, Attr 12 | from datetime import date, datetime, timedelta 13 | from bs4 import BeautifulSoup 14 | 15 | modules_to_be_patched = [ "boto3", "requests" ] 16 | tracer = Tracer(patch_modules = modules_to_be_patched) 17 | 18 | logger = Logger() 19 | tracer = Tracer() 20 | 21 | 22 | # establish a session with SES, DynamoDB and Comprehend 23 | ddb = boto3.resource('dynamodb', region_name = os.environ['AWS_REGION'], config = botocore.client.Config(max_pool_connections = 50)).Table(os.environ['dynamo_table']) 24 | com = boto3.client(service_name = 'comprehend', region_name = os.environ['AWS_REGION']) 25 | ses = boto3.client('ses') 26 | s3 = boto3.client('s3') 27 | 28 | 29 | # get the RSS feed through feedparser 30 | @tracer.capture_method(capture_response = False) 31 | def get_rss(url): 32 | return feedparser.parse(url) 33 | 34 | 35 | # update the item count in dynamodb by 1 36 | @tracer.capture_method(capture_response = False) 37 | def update_itemcount(blogsource): 38 | 39 | # update guid: , timest: 0 40 | ddb.update_item( 41 | Key = { "guid" : blogsource, "timest" : 0 }, 42 | ExpressionAttributeValues = { ":inc" : 1 }, 43 | UpdateExpression = "ADD articlecount :inc" 44 | ) 45 | 46 | print('incremented ' + blogsource + ' count by 1') 47 | 48 | 49 | # 
write the blogpost record into DynamoDB 50 | @tracer.capture_method(capture_response = False) 51 | def put_dynamo(timest_post, title, cleantxt, rawhtml, description, link, blogsource, author, guid, tags, category, datestr_post, table, event): 52 | 53 | # if no description was submitted, put a dummy value to prevent issues parsing the output 54 | if len(description) == 0: 55 | description = '...' 56 | 57 | # create item payload for algolia 58 | smallitem = { 59 | 'objectID' : guid, # add unique object id for Algolia search 60 | 'timest' : timest_post, # store the unix timestamp of the post as an int 61 | 'title' : title, 62 | 'description' : description, # store the short rss feed description of the content 63 | 'link' : link, 64 | 'blogsource' : blogsource, 65 | 'author' : author, 66 | 'guid' : guid # store the blogpost guid as a unique key 67 | } 68 | 69 | # add additional attributes for dynamodb record 70 | extraitem = { 71 | 'category' : category, 72 | 'datestr' : datestr_post, # store the human friendly timestamp of the post 73 | 'fulltxt': cleantxt, # store the "clean" text of the blogpost, using \n as a line delimiter 74 | 'lower-tag' : tags.lower(), # convert the tags to lowercase, which makes it easier to search or match these 75 | 'rawhtml': rawhtml, # store the raw html output of the readability plugin, in order to include the blog content with text markup 76 | 'tag' : tags, # set the comprehend tags 77 | 'visible' : 'y' # set the blogpost to visible by default - this "hack" allows for a simple query on a static primary key 78 | } 79 | 80 | # optionally, put the small record in your Algolia search DB if the API key is set 81 | if event['enable_algolia'] == 'y': 82 | 83 | client = SearchClient.create(event['algolia_app'], event['algolia_apikey']) 84 | index = client.init_index(event['algolia_index']) 85 | index.save_objects([smallitem]) 86 | 87 | # merge small and extra item for dynamodb 88 | def Merge(dict1, dict2): 89 | res = {**dict1, **dict2} 90 | return res 91 | 92 | # create fullitem for dynamodb 93 | fullitem = Merge(smallitem, extraitem) 94 | 95 | # put the full record into dynamodb 96 | ddb.put_item( 97 | TableName = table, 98 | Item = fullitem 99 | ) 100 | 101 | # increment dynamodb counter for blog category by 1 102 | update_itemcount(blogsource) 103 | 104 | # increment dynamodb counter for all blogs by 1 105 | update_itemcount('all') 106 | 107 | 108 | # retrieve the url of a blogpost 109 | @tracer.capture_method(capture_response = False) 110 | def retrieve_url(url): 111 | 112 | # set a "real" user agent 113 | firefox = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:79.0) Gecko/20100101 Firefox/79.0" 114 | 115 | # retrieve the main text section from the url using the readability module and using the Chrome user agent 116 | req = requests.get(url, headers = {'User-Agent' : firefox}) 117 | doc = readability.Document(req.text) 118 | rawhtml = doc.summary(html_partial = True) 119 | 120 | # remove any html tags from output 121 | soup = BeautifulSoup(rawhtml, 'html.parser') 122 | cleantext = soup.get_text().strip('\n').encode('utf-8') 123 | 124 | return str(rawhtml), str(cleantext) 125 | 126 | 127 | # analyze the text of a blogpost using the AWS Comprehend service 128 | @tracer.capture_method(capture_response = False) 129 | def comprehend(cleantxt, title): 130 | detections = [] 131 | found = False 132 | 133 | fulltext = title + " " + cleantxt 134 | 135 | # cut down the text to less than 5000 bytes as this is the file limit for Comprehend 136 | strlen = 
sys.getsizeof(fulltext) 137 | 138 | while strlen > 5000: 139 | fulltext = fulltext[:-1] 140 | strlen = sys.getsizeof(fulltext) 141 | 142 | # check whether organization or title labels were found by Comprehend 143 | for x in com.detect_entities(Text = fulltext, LanguageCode = 'en')['Entities']: 144 | if x['Type'] == 'ORGANIZATION' or x['Type'] == 'TITLE' or x['Type'] == 'COMMERCIAL_ITEM' or x['Type'] == 'PERSON': 145 | if x['Text'] not in detections: 146 | detections.append(x['Text']) 147 | found = True 148 | 149 | # if no tags were retrieved, add a default tag 150 | if found: 151 | tags = ', '.join(detections) 152 | 153 | else: 154 | tags = 'none' 155 | 156 | # return tag values 157 | return(tags) 158 | 159 | 160 | # send an email out whenever a new blogpost was found - this feature is optional 161 | @tracer.capture_method(capture_response = False) 162 | def send_email(recpt, title, blogsource, author, rawhtml, link, datestr_post): 163 | 164 | # create a simple html body for the email 165 | mailmsg = '
<html><body>Posted by '+str(author)+' in ' +str(blogsource) + ' blog on ' + str(datestr_post) + '<br><br>' 166 | mailmsg += '<a href="' + link + '">view post here</a><br><br>' + str(rawhtml) + '</body></html>
' 167 | 168 | # send the email using SES 169 | ses.send_email( 170 | Source = event['from_email'], 171 | Destination = {'ToAddresses': [recpt]}, 172 | Message = { 173 | 'Subject': { 174 | 'Data': blogsource.upper() + ' - ' + title 175 | }, 176 | 'Body': { 177 | 'Html': { 178 | 'Data': mailmsg 179 | } 180 | } 181 | } 182 | ) 183 | 184 | print('sent email with subject ' + blogsource.upper() + ' - ' + title + ' to ' + recpt) 185 | 186 | 187 | # main function to kick off collection of an rss feed 188 | @tracer.capture_method(capture_response = False) 189 | def get_feed(url, blogsource, guids, table, event): 190 | 191 | # create a variable about blog update and list to store new blogs 192 | blogupdate = False 193 | newblogs = [] 194 | 195 | # get the rss feed 196 | rssfeed = get_rss(url) 197 | 198 | print('found ' + str(len(rssfeed['entries'])) + ' blog entries') 199 | 200 | # check all the retrieved articles for published dates 201 | for x in rssfeed['entries']: 202 | 203 | # retrieve post guid 204 | guid = str(x['guid']) 205 | timest_post = int(time.mktime(x['updated_parsed'])) 206 | timest_now = int(time.time()) 207 | 208 | # retrieve blog date and description text 209 | datestr_post = time.strftime('%d-%m-%Y %H:%M', x['updated_parsed']) 210 | 211 | # if the post guid is not found in dynamodb and newer than the specified amount of days, retrieve the record 212 | if guid not in guids and (timest_now < (timest_post + (86400 * days_to_retrieve))): 213 | 214 | # retrieve other blog post values, remove double quotes from title 215 | link = str(x['link']) 216 | title = str(x['title']).replace('"', "'") 217 | 218 | # retrieve the blogpost author if available 219 | author = 'blank' 220 | 221 | if x.has_key('author'): 222 | author = str(x['author']) 223 | 224 | # retrieve blogpost link 225 | print('retrieving '+str(title)+' in '+str(blogsource)+' using url '+str(link)+'\n') 226 | rawhtml, cleantxt = retrieve_url(link) 227 | 228 | # discover tags with comprehend on html output 229 | tags = comprehend(cleantxt, title) 230 | 231 | # clean up blog post description text and remove unwanted characters such as double quotes and spaces (this can be improved further) 232 | des = str(x['description']) 233 | r = re.compile(r'<[^>]+>') 234 | description = r.sub('', str(des)).strip(' ').replace('"', "'").strip('\n') 235 | 236 | # submit the retrieved tag values discovered by comprehend to the list 237 | category_tmp = [] 238 | category = 'none' 239 | 240 | # join category fields in one string 241 | if x.has_key('tags'): 242 | for tag in x['tags']: 243 | category_tmp.append(str(tag['term'])) 244 | 245 | category = str(', '.join(category_tmp)) 246 | 247 | # update the blogpost 248 | blogupdate = True 249 | 250 | # put record to dynamodb 251 | put_dynamo(timest_post, title, cleantxt, rawhtml, description, link, blogsource, author, guid, tags, category, datestr_post, table, event) 252 | 253 | # add blog to newblogs list 254 | newblogs.append(str(blogsource) + ' ' + str(title) + ' ' + str(guid)) 255 | 256 | # if sendemails enabled, generate the email message body for ses and send email 257 | if send_mail == 'y': 258 | 259 | # get mail title and email recepient 260 | title = blogsource.upper()+' - '+title 261 | recpt = event['to_email'] 262 | 263 | # send the email 264 | send_email(recpt, title, blogsource, author, rawhtml, link, datestr_post) 265 | 266 | return blogupdate, newblogs 267 | 268 | 269 | # check if new items were uploaded to s3 270 | @tracer.capture_method(capture_response = False) 271 | def 
get_s3_json_age(bucket): 272 | 273 | # set variable for s3 update operation 274 | updateblog = False 275 | 276 | # list objects in s3 277 | s3list = s3.list_objects_v2(Bucket = bucket) 278 | 279 | print('get s3 list ' + str(s3list)) 280 | 281 | # iterate over present files in s3 282 | if 'Contents' in s3list: 283 | for s3file in s3list['Contents']: 284 | 285 | # get last modified time of item 286 | s3time = s3file['LastModified'] 287 | 288 | objtime = int(time.mktime(s3time.timetuple())) 289 | nowtime = int(time.time()) 290 | difftime = nowtime - objtime 291 | 292 | # if an s3 file was created in the last 300 seconds, update the blog feed 293 | if difftime < 300: 294 | updateblog = True 295 | 296 | print(str(difftime) + " " + str(s3file['Key'])) 297 | 298 | # return true/false about blog update status 299 | return updateblog 300 | 301 | 302 | # get the contents of the dynamodb table for json object on S3 303 | @tracer.capture_method(capture_response = False) 304 | def get_table_json(blogsource, bucket): 305 | 306 | # create a list for found guids from json stored on s3 307 | s3guids = [] 308 | 309 | # create a list for s3 objects that were found 310 | s3files = [] 311 | 312 | # check if the s3 object exists by listing current s3 objects 313 | s3list = s3.list_objects_v2(Bucket = bucket) 314 | 315 | # set days_to_get value 316 | days_to_get = int(days_to_retrieve) 317 | 318 | 319 | # iterate over present files in s3 320 | if 'Contents' in s3list: 321 | 322 | for x in s3list['Contents']: 323 | s3files.append(x['Key']) 324 | 325 | # if the blog json is available on s3 326 | if str(blogsource + '.json') in s3files: 327 | 328 | # retrieve the object from s3 329 | s3obj = s3.get_object(Bucket = bucket, Key = blogsource + '.json') 330 | 331 | # create list for results from json 332 | res = json.loads(s3obj['Body'].read()) 333 | 334 | # add guids from json file to s3guids list 335 | for s3file in res: 336 | s3guids.append(s3file['guid']) 337 | 338 | # if the blog json does not exist on s3 339 | else: 340 | 341 | # since the previous results can not be found, create an emptylist for results and get current time 342 | res = [] 343 | 344 | print('could not find ' + blogsource + '.json file on s3') 345 | 346 | # get the current timestamp 347 | now_ts = datetime.now() 348 | 349 | # get timestamp based on days_to_retrieve 350 | old_ts = now_ts - timedelta(days = days_to_get) 351 | diff_ts = int(time.mktime(old_ts.timetuple())) 352 | 353 | if blogsource != 'all': 354 | 355 | # query the dynamodb table for blogposts of a specific category from up to 1 day ago 356 | blogs = ddb.query(IndexName = "timest", ScanIndexForward = True, ProjectionExpression = 'blogsource, datestr, timest, title, author, description, link, guid', KeyConditionExpression = Key('blogsource').eq(blogsource) & Key('timest').gt(diff_ts)) 357 | 358 | else: 359 | 360 | # query the dynamodb table for all category blogposts from up to 1 day ago 361 | blogs = ddb.query(IndexName = "visible", ScanIndexForward = True, ProjectionExpression = 'blogsource, datestr, timest, title, author, description, link, guid', KeyConditionExpression = Key('visible').eq('y') & Key('timest').gt(diff_ts)) 362 | 363 | # iterate over the returned items 364 | for a in blogs['Items']: 365 | 366 | # if guid not present in s3 json file 367 | if a['guid'] not in s3guids: 368 | 369 | b = {'timest': str(a['timest']), 'blogsource': a['blogsource'], 'title': a['title'], 'datestr': a['datestr'], 'guid': a['guid'], 'link': a['link'], 'description': a['description'].strip(), 
'author': a['author']} 370 | 371 | # add the json object to the result list 372 | res.append(b) 373 | 374 | # retrieve additional items if lastevaluatedkey was found 375 | while 'LastEvaluatedKey' in blogs: 376 | lastkey = blogs['LastEvaluatedKey'] 377 | 378 | if blogsource != 'all': 379 | 380 | # query the dynamodb table for blogposts of a specific category 381 | blogs = ddb.query(IndexName = "timest", ScanIndexForward = True, ExclusiveStartKey = lastkey, ProjectionExpression = 'blogsource, datestr, timest, title, author, description, link, guid', KeyConditionExpression = Key('blogsource').eq(blogsource) & Key('timest').gt(diff_ts)) 382 | 383 | else: 384 | 385 | # query the dynamodb table for all category blogposts from up to 30 days old 386 | blogs = ddb.query(IndexName = "visible", ScanIndexForward = True, ExclusiveStartKey = lastkey, ProjectionExpression = 'blogsource, datestr, timest, title, author, description, link, guid', KeyConditionExpression = Key('visible').eq('y') & Key('timest').gt(diff_ts)) 387 | 388 | # add an entry per blog to the output list 389 | for a in blogs['Items']: 390 | 391 | # if guid not present in s3 json file 392 | if a['guid'] not in s3guids: 393 | 394 | b = {'timest': str(a['timest']), 'blogsource': a['blogsource'], 'title': a['title'], 'datestr': a['datestr'], 'guid': a['guid'], 'author': a['author'], 'link': a['link'], 'description': a['description'].strip()} 395 | 396 | # add the json object to the result list 397 | res.append(b) 398 | 399 | return res 400 | 401 | 402 | # copy the file to s3 with a public acl 403 | @tracer.capture_method(capture_response = False) 404 | def cp_s3(blogsource, bucket): 405 | 406 | # put object to s3 407 | s3.put_object( 408 | Bucket = bucket, 409 | Body = open('/tmp/' + blogsource + '.json', 'rb'), 410 | Key = blogsource + '.json', 411 | ACL = 'public-read', 412 | CacheControl = 'public', 413 | ContentType = 'application/json' 414 | ) 415 | 416 | 417 | # update json objects on S3 for single page web apps 418 | @tracer.capture_method(capture_response = False) 419 | def update_json_s3(blog, bucket): 420 | 421 | print('updating json for ' + blog) 422 | 423 | # get the json content from DynamoDB 424 | out = get_table_json(blog, bucket) 425 | 426 | # create the json and return path 427 | make_json(out, blog) 428 | 429 | # upload the json to s3 430 | cp_s3(blog, bucket) 431 | 432 | 433 | # create a json file from blog content 434 | def make_json(content, blogsource): 435 | 436 | # write the json file to /tmp/ 437 | fpath = '/tmp/' + blogsource + '.json' 438 | 439 | # create empty list for filteredcontent 440 | filteredcontent = [] 441 | 442 | # filter blog posts for category 443 | for blog in content: 444 | if blog['blogsource'] == blogsource or blogsource == 'all': 445 | filteredcontent.append(blog) 446 | 447 | # sort the keys by timestamp 448 | dumpfile = sorted(filteredcontent, key = lambda k: k['timest'], reverse = True) 449 | 450 | with open(fpath, "w") as outfile: 451 | json.dump(dumpfile, outfile) 452 | 453 | print('wrote to ' + fpath) 454 | 455 | 456 | # lambda handler 457 | @logger.inject_lambda_context(log_event = True) 458 | @tracer.capture_lambda_handler 459 | def handler(event, context): 460 | 461 | print('event ' + str(event)) 462 | 463 | # set default value for 'days_to_retrieve' 464 | global days_to_retrieve 465 | days_to_retrieve = int(1) 466 | 467 | # set send email boolean, newblog and blogupdate default values 468 | global send_mail 469 | send_mail = event['send_mail'] 470 | blogupdate = False 471 | newblogs = 
'' 472 | 473 | bucket = event['s3_bucket'] 474 | table = os.environ['dynamo_table'] 475 | 476 | # if updating all blogposts, set source to 'all' and skip blogpost retrieval 477 | if event['msg'] == 'all': 478 | blogsource = 'all' 479 | 480 | # check if there are files on s3 less than 60 seconds old 481 | blogupdate = get_s3_json_age(bucket) 482 | 483 | else: 484 | 485 | # get submitted values from blog to retrieve 486 | url = event['msg']['url'] 487 | blogsource = event['msg']['blogsource'] 488 | guids = event['guids'] 489 | days_to_retrieve = int(event['msg']['daystoretrieve']) 490 | send_mail = event['send_mail'] 491 | 492 | # get feed and boolean indicating if an update to s3 is required 493 | blogupdate, newblogs = get_feed(url, blogsource, guids, table, event) 494 | 495 | # if new blogposts found, create new json output on s3 496 | if blogupdate == True and event['storepublics3'] == 'y': 497 | print('updating json output on s3 for ' + blogsource) 498 | update_json_s3(blogsource, bucket) 499 | 500 | return newblogs 501 | -------------------------------------------------------------------------------- /lambda-getfeed/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marekq/rss-lambda/d276d9374223f217e3403a33d2a072c8f957d395/lambda-getfeed/requirements.txt -------------------------------------------------------------------------------- /lambda-layer/requirements.txt: -------------------------------------------------------------------------------- 1 | algoliasearch 2 | aws-lambda-powertools 3 | BeautifulSoup4 4 | readability-lxml 5 | feedparser 6 | requests 7 | -------------------------------------------------------------------------------- /lambda-pagecount/pagecount.py: -------------------------------------------------------------------------------- 1 | import boto3, os 2 | from boto3.dynamodb.conditions import Key, Attr 3 | 4 | ddb = boto3.resource('dynamodb', region_name = os.environ['dynamo_region']).Table(os.environ['dynamo_table']) 5 | 6 | blogs = ["apn", "architecture", "big-data", "biz-prod", "cli", "cloudguru", "compute", "contact-center", "containers", "corey", "cost-mgmt", "database", "desktop", "developer", "devops", "enterprise-strat", "gamedev", "gametech", "governance", "industries", "infrastructure", "iot", "java", "jeremy", "management-tools", "marketplace", "media", "messaging", "ml", "mobile", "modernizing", "networking", "newsblog", "open-source", "public-sector", "quantum", "robotics", "sap", "security", "security-bulletins", "serverless", "startups", "storage", "training", "werner", "whats-new", "yan", "all"] 7 | 8 | # get blogsource item count per category 9 | def getblog_count(blogsource): 10 | 11 | count = 0 12 | 13 | if blogsource == 'all': 14 | 15 | # get all blogs with timestamp greater than 1 16 | blogs = ddb.query(IndexName = "visible", Select = 'COUNT', KeyConditionExpression = Key('visible').eq('y') & Key('timest').gt(1)) 17 | 18 | count += int(blogs['Count']) 19 | 20 | while 'LastEvaluatedKey' in blogs: 21 | 22 | blogs = ddb.query(ExclusiveStartKey = blogs['LastEvaluatedKey'], IndexName = "visible", Select = 'COUNT', KeyConditionExpression = Key('visible').eq('y') & Key('timest').gt(1)) 23 | 24 | count += int(blogs['Count']) 25 | 26 | else: 27 | 28 | # get a count of blogpost per category 29 | blogs = ddb.query(IndexName = "timest", Select = 'COUNT', KeyConditionExpression = Key('blogsource').eq(blogsource) & Key('timest').gt(1)) 30 | 31 | count += int(blogs['Count']) 32 | 
33 | while 'LastEvaluatedKey' in blogs: 34 | blogs = ddb.query(ExclusiveStartKey = blogs['LastEvaluatedKey'], IndexName = "timest", Select = 'COUNT', KeyConditionExpression = Key('blogsource').eq(blogsource) & Key('timest').gt(1)) 35 | 36 | count += int(blogs['Count']) 37 | 38 | # write the page count record to dynamodb 39 | ddb.put_item( 40 | TableName = os.environ['dynamo_table'], 41 | Item = { 42 | 'timest' : 0, 43 | 'guid': blogsource, 44 | 'blogsource' : blogsource, 45 | 'articlecount' : int(count), 46 | 'visible': 'y' 47 | } 48 | ) 49 | 50 | # print status 51 | print('updated ' + str(count) + ' page count for ' + blogsource) 52 | 53 | def handler(event, context): 54 | for blog in blogs: 55 | getblog_count(blog) 56 | -------------------------------------------------------------------------------- /lambda-pagecount/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/marekq/rss-lambda/d276d9374223f217e3403a33d2a072c8f957d395/lambda-pagecount/requirements.txt -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | rss-lambda 2 | ========== 3 | 4 | Monitor your favourite blogs through RSS and get a notification whenever a new blog is posted. New blogposts are stored in DynamoDB and (optionally) sent out to your e-mail address using SES. The Step Functions state machine that retrieves the blogs runs every 15 minutes by default. The cost of running the solution should be less than $3 per month, which is mostly influenced by the polling frequency of the function. 5 | 6 | You can extend the blog scraper by adding your own RSS feeds to monitor. By default various AWS related feeds are included, but you can add any of your own feeds in the *lambda-crawl/feeds.txt* file. Within the DynamoDB table that is deployed, you can find various details about the blogposts as well as the text and HTML versions of the content. This can be helpful in case you are building your own feed scraper or notification service. You can also use the included AppSync endpoint to read data from the table using GraphQL. 7 | 8 | Optionally, a JSON output for every blog category can be uploaded as a public S3 object. These files can be included in a single page app, such as the one at https://marek.rocks . In the future, the output may be compressed (for example with Brotli) to save on S3 storage and bandwidth costs. 9 | 10 | The feed retrieval feature uses a "readability" library which works similarly to the "Reader View" function of the Apple Safari browser. This makes it convenient to read the full text of a blogpost in your email client or on mobile. All of the links, images and text markup are preserved. 11 | 12 | Finally, an AppSync public endpoint can be deployed which retrieves the blogposts from DynamoDB. You can include the endpoint in a single page app to query blogpost content in real time in a (web) application. 13 | 14 | 15 | ![alt text](./docs/architecture.png) 16 | 17 | 18 | The following fields are stored in DynamoDB per blog article; the example queries below show how they can be retrieved through the optional AppSync endpoint. 
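A rough sketch of two queries against the AppSync resolvers in the *graphql* folder, assuming the *CreateAppSync* template parameter was set to 'y'; the blog category, timestamp and guid values are placeholders:

```graphql
# Example queries for the optional AppSync endpoint.
# The blogsource, timest and guid values below are placeholders.

# Page through recent posts of one blog category (timest is a Unix epoch lower bound).
query RecentComputePosts {
  QueryDdbByBlogsourceAndTimest(blogsource: "compute", timest: 1577836800) {
    items {
      title
      author
      link
      timest
      guid
    }
    nextToken
  }
}

# Fetch the full text of a single article by its guid.
query PostDetail {
  QueryDdbGetDetailText(guid: "https://aws.amazon.com/blogs/compute/example-post/") {
    items {
      author
      description
      rawhtml
    }
  }
}
```

The first query uses the *timest* index to list posts per category, the second reads one item from the base table by guid.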
In the screenshot below, the large HTML and text outputs were omitted: 19 | 20 | 21 | ![alt text](./docs/dynamodb.png) 22 | 23 | 24 | Finally, the following State Machine is created to retrieve blog posts: 25 | 26 | 27 | ![alt text](./docs/statemachine.png) 28 | 29 | 30 | Installation 31 | ------------ 32 | 33 | - Make sure the AWS SAM CLI and Docker are installed and configured on your local machine. 34 | - If you want, you can edit the RSS feeds in 'lambda-crawl/feeds.txt'. The file contains various AWS blogs I read by default. 35 | - Run 'make init' to deploy the stack for the first time. Once the 'samconfig.toml' file is present, you can use 'make deploy'. 36 | - If you choose to enable email notifications through SES, make sure the SES sender and recipient email addresses are verified in your account beforehand. There is unfortunately no simple way to provision this using SAM. 37 | 38 | You can now run the Step Function to trigger the blog refresh. The URL of the Step Function is provided as an output value of the CloudFormation stack. 39 | 40 | 41 | Roadmap 42 | ------- 43 | 44 | - [ ] Switch to Step Functions Express to save on costs. The Express option can be used today, but is more difficult to debug in case of Lambda failures. 45 | - [X] Add AppSync endpoint for retrieval of blog posts through Amplify. 46 | - [X] Decompose the "monolith" Lambda function into smaller functions. This will allow for easier retries and debugging of blogpost retrieval. 47 | - [X] Implement Step Function for better coordination of individual functionality. 48 | - [X] Add Lambda Extension to monitor network and CPU usage of the RSS function. 49 | - [X] Optimize Lambda memory and timeout settings to lower cost. 50 | - [X] Add "smart" text extraction of the full blogpost, so that the full content of a post can be stored in DynamoDB or sent through e-mail. 51 | - [X] Add generation of JSON files with blogposts to S3 for easier inclusion in a single page app (as seen on https://marek.rocks ). 52 | - [X] Add support for retrieval of non-AWS blogposts using RSS. 53 | - [X] Add DynamoDB Global Secondary Indexes for (partial) data retrieval based on GUID, timestamp and blog categories. 54 | 55 | 56 | About the repo contents 57 | ----------------------- 58 | 59 | The following briefly describes what the files and folders contain: 60 | 61 | - Run *make init* to deploy the stack to AWS. It will download all of the Lambda dependencies, package and upload them to S3, and deploy a CloudFormation stack using SAM. After the initial run, you can use *make deploy* for incremental changes to your SAM stack. 62 | - The *template.yaml* file is the SAM CloudFormation stack for the deployment. You do not need to edit this file directly. 63 | - The *lambda-crawl* folder contains the Lambda function that discovers the RSS feeds, checks whether files are present on S3 and determines how many days of data need to be retrieved. It is triggered at the start of the Step Function. 64 | - The *lambda-getfeed* folder contains the source code of the function that checks every feed individually. It is triggered in the map state of the Step Function. 65 | - The *statemachine* folder contains the Step Function definition in JSON (Amazon States Language). 66 | - The *lambda-layer* folder contains the *requirements.txt* file for the Lambda layer of the blog retrieval function. 67 | - The *graphql* folder contains the GraphQL schema and VTL resolvers for AppSync. 68 | 69 | 70 | License 71 | ------- 72 | 73 | MIT-0, please see the 'LICENSE' file for more info. 
74 | 75 | 76 | Contact 77 | ------- 78 | 79 | In case of questions or bugs, please raise an issue or reach out to @marekq! 80 | -------------------------------------------------------------------------------- /statemachine/rssblog.asl.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "Check AWS Blog feeds, get last timestamp stored on S3", 3 | "StartAt": "Discover Blog Feeds", 4 | "States": { 5 | "Discover Blog Feeds": { 6 | "Type": "Task", 7 | "Resource": "arn:aws:states:::lambda:invoke", 8 | "OutputPath": "$.Payload", 9 | "Parameters": { 10 | "FunctionName": "${rsscrawl}", 11 | "Payload": { 12 | "msg.$": "$" 13 | } 14 | }, 15 | "Next": "Get RSS Blog Feed" 16 | }, 17 | "Get RSS Blog Feed": { 18 | "Type": "Map", 19 | "InputPath": "$", 20 | "ItemsPath": "$.results", 21 | "OutputPath": "$", 22 | "ResultPath": "$.map", 23 | "Next": "Finish", 24 | "Parameters": { 25 | "msg.$": "$$.Map.Item.Value", 26 | "guids.$": "$.guids", 27 | "s3_bucket.$": "$.s3_bucket", 28 | "algolia_app.$": "$.algolia_app", 29 | "algolia_apikey.$": "$.algolia_apikey", 30 | "algolia_index.$": "$.algolia_index", 31 | "dynamo_region.$": "$.dynamo_region", 32 | "dynamo_table.$": "$.dynamo_table", 33 | "from_email.$": "$.from_email", 34 | "to_email.$": "$.to_email", 35 | "storepublics3.$": "$.storepublics3", 36 | "enable_algolia.$": "$.enable_algolia", 37 | "send_mail.$": "$.send_mail" 38 | }, 39 | "Iterator": { 40 | "StartAt": "Get RSS Blogs", 41 | "States": { 42 | "Get RSS Blogs": { 43 | "Type": "Task", 44 | "Resource": "arn:aws:states:::lambda:invoke", 45 | "InputPath": "$", 46 | "OutputPath": "$.Payload", 47 | "Retry": [ 48 | { 49 | "ErrorEquals": [ "States.ALL" ], 50 | "IntervalSeconds": 1, 51 | "BackoffRate": 2, 52 | "MaxAttempts": 3 53 | } 54 | ], 55 | "Parameters": { 56 | "FunctionName": "${rssgetfeed}", 57 | "Payload": { 58 | "msg.$": "$.msg", 59 | "guids.$": "$.guids", 60 | "send_mail.$": "$.send_mail", 61 | "s3_bucket.$": "$.s3_bucket", 62 | "dynamo_table.$": "$.dynamo_table", 63 | "enable_algolia.$": "$.enable_algolia", 64 | "algolia_app.$": "$.algolia_app", 65 | "algolia_apikey.$": "$.algolia_apikey", 66 | "algolia_index.$": "$.algolia_index", 67 | "storepublics3.$": "$.storepublics3" 68 | } 69 | }, 70 | "End": true 71 | } 72 | } 73 | } 74 | }, 75 | "Finish": { 76 | "Type": "Succeed", 77 | "InputPath": "$", 78 | "OutputPath": "$" 79 | } 80 | } 81 | } -------------------------------------------------------------------------------- /template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: 'AWS::Serverless-2016-10-31' 3 | Description: An AWS Serverless Specification template describing your function. 4 | 5 | Parameters: 6 | SourceEmail: 7 | Type: String 8 | Description: The sending email address for notification emails. 9 | Default: aws@marek.rocks 10 | 11 | DestEmail: 12 | Type: String 13 | Description: The receiving email address for notification emails. 14 | Default: marek.kuczynski@gmail.com 15 | 16 | SendEmails: 17 | Description: Set whether to send SES emails or not (default 'n'). 18 | Default: 'n' 19 | Type: String 20 | AllowedValues: 21 | - 'y' 22 | - 'n' 23 | 24 | StorePublicS3: 25 | Description: Store a JSON object of blogposts as a public S3 file (default 'n'). 
26 | Default: 'n' 27 | Type: String 28 | AllowedValues: 29 | - 'y' 30 | - 'n' 31 | 32 | CreateAppSync: 33 | Description: Create a read only AppSync endpoint for the blogs stored in DynamoDB 34 | Default: 'n' 35 | Type: String 36 | AllowedValues: 37 | - 'y' 38 | - 'n' 39 | 40 | EnableAlgolia: 41 | Description: Optional - enable Algolia search index support 42 | Default: 'n' 43 | Type: 'String' 44 | AllowedValues: 45 | - 'y' 46 | - 'n' 47 | 48 | AlgoliaApp: 49 | Description: Optional - add the Algolia App ID 50 | Default: '' 51 | Type: String 52 | 53 | AlgoliaApikey: 54 | Description: Optional - add the Algolia API key 55 | Default: '' 56 | Type: String 57 | 58 | AlgoliaIndex: 59 | Description: Optional - add the Algolia Index name 60 | Default: '' 61 | Type: String 62 | 63 | # appsync create condition 64 | Conditions: 65 | EnableAppSync: !Equals [ !Ref CreateAppSync, y ] 66 | 67 | Resources: 68 | 69 | # create per rss feed retrieval function 70 | rssgetfeed: 71 | Type: 'AWS::Serverless::Function' 72 | Properties: 73 | Handler: getfeed.handler 74 | Runtime: python3.8 75 | CodeUri: lambda-getfeed/ 76 | Description: 'Retrieve RSS feeds and store them in DynamoDB' 77 | Policies: 78 | - Version: '2012-10-17' 79 | Statement: 80 | - Effect: Allow 81 | Action: 82 | - 'ses:SendEmail' 83 | Resource: '*' 84 | - ComprehendFullAccess 85 | - arn:aws:iam::aws:policy/CloudWatchLambdaInsightsExecutionRolePolicy 86 | - DynamoDBCrudPolicy: 87 | TableName: !Ref rssfeed 88 | - S3CrudPolicy: 89 | BucketName: !Ref PublicJsonBucket 90 | MemorySize: 512 91 | Timeout: 30 92 | Environment: 93 | Variables: 94 | dynamo_table: !Ref rssfeed 95 | 96 | Tracing: Active 97 | ReservedConcurrentExecutions: 50 98 | Layers: 99 | - !Ref lambdalayer 100 | - !Sub "arn:aws:lambda:${AWS::Region}:580247275435:layer:LambdaInsightsExtension:14" 101 | 102 | # create rss feed crawl function 103 | rsscrawl: 104 | Type: 'AWS::Serverless::Function' 105 | Properties: 106 | Handler: crawl.handler 107 | Runtime: python3.8 108 | CodeUri: lambda-crawl/ 109 | Description: 'Retrieve RSS feeds and check files stored on S3' 110 | Policies: 111 | - arn:aws:iam::aws:policy/CloudWatchLambdaInsightsExecutionRolePolicy 112 | - DynamoDBReadPolicy: 113 | TableName: !Ref rssfeed 114 | - S3ReadPolicy: 115 | BucketName: !Ref PublicJsonBucket 116 | MemorySize: 256 117 | Timeout: 30 118 | Environment: 119 | Variables: 120 | algolia_app: !Ref AlgoliaApp 121 | algolia_apikey: !Ref AlgoliaApikey 122 | algolia_index: !Ref AlgoliaIndex 123 | dynamo_region: !Ref 'AWS::Region' 124 | dynamo_table: !Ref rssfeed 125 | from_email: !Ref SourceEmail 126 | to_email: !Ref DestEmail 127 | s3_bucket: !Ref PublicJsonBucket 128 | storepublics3: !Ref StorePublicS3 129 | send_mail: !Ref SendEmails 130 | enable_algolia: !Ref EnableAlgolia 131 | Tracing: Active 132 | ReservedConcurrentExecutions: 1 133 | Layers: 134 | - !Ref lambdalayer 135 | - !Sub "arn:aws:lambda:${AWS::Region}:580247275435:layer:LambdaInsightsExtension:14" 136 | 137 | # refresh pagecount stored in dynamodb using a manual lambda invoke 138 | pagecount: 139 | Type: 'AWS::Serverless::Function' 140 | Properties: 141 | Handler: pagecount.handler 142 | Runtime: python3.8 143 | CodeUri: lambda-pagecount/ 144 | Description: 'Retrieve the total article count for blogs stored in DynamoDB' 145 | Policies: 146 | - arn:aws:iam::aws:policy/CloudWatchLambdaInsightsExecutionRolePolicy 147 | - DynamoDBCrudPolicy: 148 | TableName: !Ref rssfeed 149 | MemorySize: 256 150 | Timeout: 30 151 | Environment: 152 | Variables: 153 | 
dynamo_region: !Ref 'AWS::Region' 154 | dynamo_table: !Ref rssfeed 155 | POWERTOOLS_SERVICE_NAME: rssgetpagecount 156 | POWERTOOLS_TRACE_DISABLED: "false" 157 | Tracing: Active 158 | ReservedConcurrentExecutions: 1 159 | Layers: 160 | - !Ref lambdalayer 161 | - !Sub "arn:aws:lambda:${AWS::Region}:580247275435:layer:LambdaInsightsExtension:14" 162 | 163 | # create lambda layer with dependencies 164 | lambdalayer: 165 | Type: AWS::Serverless::LayerVersion 166 | Properties: 167 | LayerName: rsslayer 168 | Description: python3 dependencies for XRay, BeautifulSoup4, feedparser and requests 169 | ContentUri: lambda-layer/ 170 | CompatibleRuntimes: 171 | - python3.8 172 | LicenseInfo: 'MIT-0' 173 | RetentionPolicy: Delete 174 | Metadata: 175 | BuildMethod: python3.8 176 | 177 | # dynamodb table for blog articles 178 | rssfeed: 179 | Type: 'AWS::DynamoDB::Table' 180 | Properties: 181 | BillingMode: PAY_PER_REQUEST 182 | AttributeDefinitions: 183 | - AttributeName: guid 184 | AttributeType: S 185 | - AttributeName: timest 186 | AttributeType: N 187 | - AttributeName: visible 188 | AttributeType: S 189 | - AttributeName: blogsource 190 | AttributeType: S 191 | KeySchema: 192 | - AttributeName: guid 193 | KeyType: HASH 194 | - AttributeName: timest 195 | KeyType: RANGE 196 | GlobalSecondaryIndexes: 197 | - IndexName: visible 198 | KeySchema: 199 | - AttributeName: visible 200 | KeyType: HASH 201 | - AttributeName: timest 202 | KeyType: RANGE 203 | Projection: 204 | ProjectionType: ALL 205 | - IndexName: timest 206 | KeySchema: 207 | - AttributeName: blogsource 208 | KeyType: HASH 209 | - AttributeName: timest 210 | KeyType: RANGE 211 | Projection: 212 | ProjectionType: ALL 213 | 214 | # log group 215 | rssblog: 216 | Type: AWS::Logs::LogGroup 217 | 218 | # public s3 bucket 219 | PublicJsonBucket: 220 | Type: AWS::S3::Bucket 221 | 222 | # state machine to coordinate the workflow 223 | blogstatemachine: 224 | Type: AWS::Serverless::StateMachine 225 | Properties: 226 | Type: STANDARD 227 | Tracing: 228 | Enabled: true 229 | DefinitionUri: statemachine/rssblog.asl.json 230 | DefinitionSubstitutions: 231 | rsscrawl: !GetAtt rsscrawl.Arn 232 | rssgetfeed: !GetAtt rssgetfeed.Arn 233 | Policies: 234 | - LambdaInvokePolicy: 235 | FunctionName: !Ref rsscrawl 236 | - LambdaInvokePolicy: 237 | FunctionName: !Ref rssgetfeed 238 | - CloudWatchFullAccess 239 | Logging: 240 | IncludeExecutionData: true 241 | Destinations: 242 | - CloudWatchLogsLogGroup: 243 | LogGroupArn: !GetAtt rssblog.Arn 244 | Events: 245 | ScheduledEventEvery15Min: 246 | Type: Schedule 247 | Properties: 248 | Schedule: rate(15 minutes) 249 | 250 | # graphql api role 251 | GraphQLApiRole: 252 | Condition: EnableAppSync 253 | Type: 'AWS::IAM::Role' 254 | Properties: 255 | AssumeRolePolicyDocument: 256 | Version: '2012-10-17' 257 | Statement: 258 | - Effect: Allow 259 | Principal: 260 | Service: 'appsync.amazonaws.com' 261 | Action: 'sts:AssumeRole' 262 | Policies: 263 | - PolicyName: CWLogs 264 | PolicyDocument: 265 | Version: '2012-10-17' 266 | Statement: 267 | - Effect: Allow 268 | Action: 269 | - 'logs:CreateLogGroup' 270 | - 'logs:CreateLogStream' 271 | - 'logs:PutLogEvents' 272 | Resource: '*' 273 | - PolicyName: DDBRead 274 | PolicyDocument: 275 | Version: '2012-10-17' 276 | Statement: 277 | - Effect: Allow 278 | Action: 279 | - 'dynamodb:GetItem' 280 | - 'dynamodb:Query' 281 | - 'dynamodb:Scan' 282 | Resource: 283 | - !GetAtt rssfeed.Arn 284 | - !Sub '${rssfeed.Arn}/*' 285 | 286 | # create graphql api 287 | GraphQLApi: 288 | Condition: 
EnableAppSync 289 | Type: 'AWS::AppSync::GraphQLApi' 290 | Properties: 291 | XrayEnabled: true 292 | Name: !Ref 'AWS::StackName' 293 | AuthenticationType: API_KEY 294 | LogConfig: 295 | CloudWatchLogsRoleArn: !GetAtt 'GraphQLApiRole.Arn' 296 | FieldLogLevel: ALL 297 | 298 | # define graphql schema 299 | GraphQLSchema: 300 | Condition: EnableAppSync 301 | Type: 'AWS::AppSync::GraphQLSchema' 302 | Properties: 303 | DefinitionS3Location: './graphql/schema.graphql' 304 | ApiId: !GetAtt 'GraphQLApi.ApiId' 305 | 306 | # define dynamodb source 307 | DDBDataSource: 308 | Condition: EnableAppSync 309 | Type: 'AWS::AppSync::DataSource' 310 | Properties: 311 | Type: AMAZON_DYNAMODB 312 | ServiceRoleArn: !GetAtt 'GraphQLApiRole.Arn' 313 | ApiId: !GetAtt 'GraphQLApi.ApiId' 314 | Name: ddbsourcce 315 | DynamoDBConfig: 316 | TableName: !Ref rssfeed 317 | AwsRegion: !Ref 'AWS::Region' 318 | 319 | # create appsync api key 320 | ApiKey: 321 | Condition: EnableAppSync 322 | Type: 'AWS::AppSync::ApiKey' 323 | Properties: 324 | ApiId: !GetAtt 'GraphQLApi.ApiId' 325 | 326 | # create per blogsource resolver for appsync 327 | BlogSourceResolver: 328 | Condition: EnableAppSync 329 | Type: 'AWS::AppSync::Resolver' 330 | Properties: 331 | TypeName: Query 332 | DataSourceName: !GetAtt 'DDBDataSource.Name' 333 | RequestMappingTemplateS3Location: './graphql/QueryDdbByBlogsourceAndTimest-request.vtl' 334 | ResponseMappingTemplateS3Location: './graphql/QueryDdbByBlogsourceAndTimest-response.vtl' 335 | ApiId: !GetAtt 'GraphQLApi.ApiId' 336 | FieldName: QueryDdbByBlogsourceAndTimest 337 | 338 | # create visible blogs resolver for appsync 339 | VisibleResolver: 340 | Condition: EnableAppSync 341 | Type: 'AWS::AppSync::Resolver' 342 | Properties: 343 | TypeName: Query 344 | DataSourceName: !GetAtt 'DDBDataSource.Name' 345 | RequestMappingTemplateS3Location: './graphql/QueryDdbByVisibleAndTimest-request.vtl' 346 | ResponseMappingTemplateS3Location: './graphql/QueryDdbByVisibleAndTimest-response.vtl' 347 | ApiId: !GetAtt 'GraphQLApi.ApiId' 348 | FieldName: QueryDdbByVisibleAndTimest 349 | 350 | # create single article resolver for appsync 351 | SingleBlogResolver: 352 | Condition: EnableAppSync 353 | Type: 'AWS::AppSync::Resolver' 354 | Properties: 355 | TypeName: Query 356 | DataSourceName: !GetAtt 'DDBDataSource.Name' 357 | RequestMappingTemplateS3Location: './graphql/QueryDdbGetDetailText-request.vtl' 358 | ResponseMappingTemplateS3Location: './graphql/QueryDdbGetDetailText-response.vtl' 359 | ApiId: !GetAtt 'GraphQLApi.ApiId' 360 | FieldName: QueryDdbGetDetailText 361 | 362 | # create page count resolver per blog for appsync 363 | PerBlogPageCountResolver: 364 | Condition: EnableAppSync 365 | Type: 'AWS::AppSync::Resolver' 366 | Properties: 367 | TypeName: Query 368 | DataSourceName: !GetAtt 'DDBDataSource.Name' 369 | RequestMappingTemplateS3Location: './graphql/QueryDdbItemCountPerBlog-request.vtl' 370 | ResponseMappingTemplateS3Location: './graphql/QueryDdbItemCountPerBlog-response.vtl' 371 | ApiId: !GetAtt 'GraphQLApi.ApiId' 372 | FieldName: QueryDdbItemCountPerBlog 373 | 374 | # create page count resolver for all blogs appsync 375 | AllBlogsPageCountResolver: 376 | Condition: EnableAppSync 377 | Type: 'AWS::AppSync::Resolver' 378 | Properties: 379 | TypeName: Query 380 | DataSourceName: !GetAtt 'DDBDataSource.Name' 381 | RequestMappingTemplateS3Location: './graphql/QueryDdbItemCountAll-request.vtl' 382 | ResponseMappingTemplateS3Location: './graphql/QueryDdbItemCountAll-response.vtl' 383 | ApiId: !GetAtt 
'GraphQLApi.ApiId' 384 | FieldName: QueryDdbItemCountAll 385 | 386 | # print the url of the state machine and graphql details 387 | Outputs: 388 | StateMachineURL: 389 | Value: !Sub 'https://${AWS::Region}.console.aws.amazon.com/states/home?region=${AWS::Region}#/statemachines/view/${blogstatemachine}' 390 | --------------------------------------------------------------------------------