├── README.md ├── medium_scrapper_tag_archive.py ├── medium_scrapper_post.py ├── DataExtraction.py └── Data_Extraction_Archive_Tags.py /README.md: -------------------------------------------------------------------------------- 1 | # Medium Web Scrapper 2 | 3 | To run this, user has to install scrapy library using 4 | pip install scrapy 5 | 6 | There are two scrappers 7 | 1. medium_scrapper_post.py 8 | This scrapper searches Medium for articles based on a user inputted search string. 9 | 10 | To run the scrapper, use 11 | 12 | scrapy runspider -a searchString=searchTerm medium_scrapper_post.py 13 | 14 | 2. medium_scrapper_tag_archive.py 15 | This scraper get all Articles for a particular tag slug in a given date range 16 | 17 | Note : If tag is Data Science, then pass tag as 'data-science' in tagSlug Parameter 18 | To run the scrapper, use 19 | 20 | 21 | scrapy runspider -a tagSlug='tagSlug' -a start_date=YYYYmmdd -a end_date=YYYYmmdd medium_scrapper_tag_archive.py 22 | 23 | 24 | # Medium Posts Data Extraction 25 | 26 | The file DataExtraction.py extracts information from the json files scrapped by the scrapper medium_scrapper_post.py. 27 | To extract information from json files scrapped by medium_scrapper_tag_archive.py (scrapping from tags archive) then use Data_Extraction_Archive_Tags.py 28 | -------------------------------------------------------------------------------- /medium_scrapper_tag_archive.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 1 23:30:54 2018 4 | 5 | @author: Aiswarya 6 | 7 | This scrapper extracts data for a given date range for a particular medium Tag 8 | """ 9 | 10 | import scrapy 11 | import codecs 12 | import json 13 | from datetime import datetime 14 | from datetime import timedelta 15 | import os 16 | 17 | def writeTofile(fileName,text): 18 | with codecs.open(fileName,'w','utf-8') as outfile: 19 | outfile.write(text) 20 | 21 | class MediumPost(scrapy.Spider): 22 | name='medium_scrapper' 23 | handle_httpstatus_list = [401,400] 24 | 25 | autothrottle_enabled=True 26 | def start_requests(self): 27 | 28 | start_urls = ['https://medium.com/tag/'+self.tagSlug.strip("'")+'/archive/'] 29 | print(start_urls) 30 | 31 | #Header and cookie information can be got from the Network Tab in Developer Tools 32 | cookie=cookie 33 | header =header 34 | 35 | startDate=datetime.strptime(self.start_date,"%Y%m%d") 36 | endDate=datetime.strptime(self.end_date,"%Y%m%d") 37 | delta=endDate-startDate 38 | print(delta) 39 | for i in range(delta.days + 1): 40 | d=datetime.strftime(startDate+timedelta(days=i),'%Y/%m/%d') 41 | for url in start_urls: 42 | print(url+d) 43 | yield scrapy.Request(url+d,method="GET",headers=header,cookies=cookie,callback=self.parse,meta={'reqDate':d}) 44 | 45 | #for url in start_urls: 46 | #yield scrapy.Request(url,method='GET',headers=header,cookies=cookie,callback=self.parse) 47 | #yield scrapy.Request(url,method='GET',body=json.dumps(formdata),headers=header,cookies=cookie,callback=self.parse) 48 | 49 | def parse(self,response): 50 | response_data=response.text 51 | response_split=response_data.split("while(1);") 52 | response_data=response_split[1] 53 | date_post=response.meta['reqDate'] 54 | date_post=date_post.replace("/","") 55 | directory=datetime.now().strftime("%Y%m%d") 56 | if not os.path.exists(directory): 57 | os.makedirs(directory) 58 | writeTofile(directory+"//"+self.tagSlug.replace("-","").strip("'")+"Tag"+date_post+".json",response_data) 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /medium_scrapper_post.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 07 20:00:44 2018 4 | 5 | @author: Aiswarya 6 | 7 | #https://medium.com/tag/data-science -to search my tag 8 | """ 9 | 10 | import scrapy 11 | import json 12 | import codecs 13 | import datetime 14 | 15 | 16 | def writeTofile(fileName,text): 17 | with codecs.open(fileName,'w','utf-8') as outfile: 18 | outfile.write(text) 19 | class MediumPost(scrapy.Spider): 20 | name='medium_scrapper' 21 | handle_httpstatus_list = [401,400] 22 | 23 | autothrottle_enabled=True 24 | def start_requests(self): 25 | 26 | start_urls = ['https://www.medium.com/search/posts?q='+self.searchString] 27 | 28 | #set cookie and header info by looking to Network Tab in Developer tools 29 | cookie=cookie 30 | header = header 31 | for url in start_urls: 32 | 33 | yield scrapy.Request(url,method='GET',headers=header,cookies=cookie,callback=self.parse) 34 | 35 | 36 | 37 | def parse(self,response): 38 | #writeTofile("Log"+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".txt",response.text) 39 | response_data=response.text 40 | response_split=response_data.split("while(1);") 41 | #num_split= len(response_split) 42 | response_data=response_split[1] 43 | filename="medium_"+self.searchString+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".json" 44 | writeTofile(filename,response_data) 45 | 46 | with codecs.open(filename,'r','utf-8') as infile: 47 | data=json.load(infile) 48 | #Check if there is a next tag in json data 49 | if 'paging' in data['payload']: 50 | data=data['payload']['paging'] 51 | if 'next' in data: 52 | #Make a post request 53 | print "In Paging, Next Loop" 54 | data=data['next'] 55 | formdata={ 56 | 'ignoredIds':data['ignoredIds'], 57 | 'page':data['page'], 58 | 'pageSize':data['pageSize'] 59 | } 60 | cookie=cookie 61 | 62 | header = header 63 | yield scrapy.Request('https://www.medium.com/search/posts?q='+self.searchString,method='POST',body=json.dumps(formdata),headers=header,cookies=cookie,callback=self.parse) 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /DataExtraction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 07 23:42:49 2018 4 | 5 | @author: Aiswarya 6 | 7 | This has to read all the json files scrapped and extract the information about each post, author and publisher (if any) 8 | 9 | """ 10 | import json 11 | import os 12 | import codecs 13 | import pandas as pd 14 | import datetime 15 | 16 | ''' 17 | Read all json files in the given path 18 | ''' 19 | def ReadData(path): 20 | posts=pd.DataFrame() 21 | tags=pd.DataFrame() 22 | users=pd.DataFrame() 23 | collections=pd.DataFrame() 24 | files=os.listdir(path) 25 | 26 | #files=["medium_datascience20180408_002322.json"] 27 | processedFiles=[] 28 | 29 | #total_records_processed=0 30 | 31 | for fileName in files : 32 | if '.json' in fileName: 33 | fileNamespilt=fileName.split("_") 34 | searchString=fileNamespilt[1] 35 | searchString=searchString.replace("%20"," ") 36 | dateScrapped=fileNamespilt[2] 37 | print "Processing "+fileName 38 | with codecs.open(path+fileName,'r','utf-8') as infile: 39 | data=json.load(infile) 40 | 41 | dfs=processPosts(data,searchString,dateScrapped) 42 | posts=posts.append(dfs[0],ignore_index=True) 43 | tags=tags.append(dfs[1],ignore_index=True) 44 | references=processReferences(data,searchString,dateScrapped) 45 | users=users.append(references[0],ignore_index=True) 46 | collections=collections.append(references[1],ignore_index=True) 47 | processedFiles.append(fileName) 48 | #remove duplicates in dataframe 49 | #posts=posts.drop_duplicates(keep='last') 50 | #users=users.drop_duplicates(keep='last') 51 | #collections=collections.drop_duplicates(keep='last') 52 | #tags=tags.drop_duplicates(keep='last') 53 | 54 | posts.to_csv(path+"Posts_"+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".csv",index=False,encoding='utf-8') 55 | tags.to_csv(path+"Tags_"+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".csv",index=False,encoding='utf-8') 56 | users.to_csv(path+"Users_"+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".csv",index=False,encoding='utf-8') 57 | collections.to_csv(path+"Collections_"+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".csv",index=False,encoding='utf-8') 58 | 59 | 60 | ''' 61 | This function, reads the posts data from the json 62 | ''' 63 | 64 | def processPosts(data_json,searchString,dateScrapped): 65 | #Check if there is a posts tag 66 | flag=False 67 | posts=pd.DataFrame() 68 | tags=pd.DataFrame() 69 | if "success" in data_json: 70 | if "payload" in data_json: 71 | if "value" in data_json["payload"]: 72 | print "value present" 73 | #print data_json["payload"]["value"] 74 | flag=True 75 | if(flag==True): 76 | print "Number of posts to process"+str(len(data_json["payload"]["value"])) 77 | data=data_json["payload"]["value"] 78 | for dat in data: 79 | posts_dict={} 80 | posts_dict["searchQuery"]=[searchString] 81 | posts_dict["ScrappingDate"]=[ datetime.datetime.strptime(dateScrapped,'%Y%m%d').strftime("%Y-%m-%d")] 82 | posts_dict["id"]=[dat["id"]] 83 | posts_dict["versionId"]=[dat["versionId"]] 84 | #print dat["creatorId"] 85 | posts_dict["creatorId"]=[dat["creatorId"]] 86 | posts_dict["collectionId"]=[dat["homeCollectionId"]] 87 | posts_dict["title"]=[dat["title"]] 88 | posts_dict["language"]=[dat["detectedLanguage"]] 89 | posts_dict["createdAt"]=[dat["createdAt"]] 90 | posts_dict["updatedAt"]=[dat["updatedAt"]] 91 | posts_dict["firstPublishedAt"]=[dat["firstPublishedAt"]] 92 | posts_dict["latestPublishedAt"]=[dat["latestPublishedAt"]] 93 | posts_dict["story_slug"]=[dat["slug"]] 94 | posts_dict["uniqueSlug"]=[dat['uniqueSlug']] 95 | posts_dict["vote"]=[dat['vote']] 96 | posts_dict["hasUnpublishedEdits"]=[dat["hasUnpublishedEdits"]] 97 | posts_dict["allowResponses"]=[dat["allowResponses"]] 98 | posts_dict["importedUrl"]=[dat["importedUrl"]] 99 | posts_dict["webCanonicalUrl"]=[dat["webCanonicalUrl"]] 100 | posts_dict["mediumUrl"]=[dat["mediumUrl"]] 101 | posts_dict["importedPublishedAt"]=[dat["importedPublishedAt"]] 102 | posts_dict["vote"]=[dat["vote"]] 103 | posts_dict["isApprovedTranslation"]=[dat["isApprovedTranslation"]] 104 | posts_dict["translationSourcePostId"]=[dat["translationSourcePostId"]] 105 | posts_dict["translationSourceCreatorId"]=[dat["translationSourceCreatorId"]] 106 | posts_dict["displayAuthor"]=[dat["displayAuthor"]] 107 | posts_dict["coverless"]=[dat["coverless"]] 108 | #What type of preview content are available 109 | prev_name=[] 110 | prev_type=[] 111 | prev_text=[] 112 | 113 | for prev_content in dat["previewContent"]["bodyModel"]["paragraphs"]: 114 | #print prev_content['name'] 115 | if "name" in prev_content: 116 | prev_name.append(prev_content["name"]) 117 | else: 118 | prev_name.append("") 119 | if "type" in prev_content: 120 | prev_type.append(prev_content["type"]) 121 | else: 122 | prev_type.append("") 123 | 124 | if("text" in prev_content): 125 | #print "Text tag present" 126 | text=prev_content["text"] 127 | 128 | prev_text.append(text) 129 | #print "Priting prev Text list" 130 | #print prev_text 131 | else: 132 | prev_text.append("") 133 | posts_dict["PreviewContent_Name"]=[prev_name] 134 | posts_dict["PreviewContent_Type"]=[prev_type] 135 | posts_dict["PreviewContent_Text"]=[prev_text] 136 | posts_dict["PreviewContent_isFullContent"]=[dat["previewContent"]["isFullContent"]] 137 | posts_dict["notifyFollowers"]=[dat["notifyFollowers"]] 138 | posts_dict["notifyTwitter"]=[dat["notifyTwitter"]] 139 | posts_dict["notifyFacebook"]=[dat["notifyFacebook"]] 140 | posts_dict["isSeries"]=[dat["isSeries"]] 141 | posts_dict["isSponsored"]=[dat["isSponsored"]] 142 | posts_dict["isSubscriptionLocked"]=[dat["isSubscriptionLocked"]] 143 | posts_dict["seriesLastAppendedAt"]=[dat["seriesLastAppendedAt"]] 144 | posts_dict["audioVersionDurationSec"]=[dat["audioVersionDurationSec"]] 145 | posts_dict["isNsfw"]=[dat["isNsfw"]] 146 | posts_dict["isEligibleForRevenue"]=[dat["isEligibleForRevenue"]] 147 | posts_dict["isBlockedFromHightower"]=[dat["isBlockedFromHightower"]] 148 | posts_dict["featureLockRequestAcceptedAt"]=[dat["featureLockRequestAcceptedAt"]] 149 | posts_dict["featureLockRequestMinimumGuaranteeAmount"]=[dat["featureLockRequestMinimumGuaranteeAmount"]] 150 | posts_dict["isElevate"]=[dat['isElevate']] 151 | posts_dict["isTitleSynthesized"]=[dat["isTitleSynthesized"]] 152 | posts_dict["inResponseToPostId"]=[dat["inResponseToPostId"]] 153 | 154 | 155 | #Get metadata from virtuals 156 | if "virtuals" in dat: 157 | virtuals=dat["virtuals"] 158 | #print virtuals 159 | if "statusForCollection" in virtuals: 160 | posts_dict["statusForCollection"]=[virtuals["statusForCollection"]] 161 | else: 162 | posts_dict["statusForCollection"]=[""] 163 | 164 | posts_dict["allowNotes"]=[virtuals["allowNotes"]] 165 | posts_dict["wordCount"]=[virtuals["wordCount"]] 166 | posts_dict["imageCount"]=[virtuals["imageCount"]] 167 | posts_dict["readingTime"]=[virtuals["readingTime"]] 168 | posts_dict["subTitle"]=[virtuals["subtitle"]] 169 | if "publishedInCount" in virtuals: 170 | posts_dict["publishedInCount"]=[virtuals["publishedInCount"]] 171 | else: 172 | posts_dict["publishedInCount"]=[""] 173 | posts_dict["recommends"]=[virtuals["recommends"]] 174 | posts_dict["isBookmarked"]=[virtuals["isBookmarked"]] 175 | posts_dict["socialRecommendsCount"]=[virtuals["socialRecommendsCount"]] 176 | posts_dict["responsesCreatedCount"]=[virtuals["responsesCreatedCount"]] 177 | posts_dict["isLockedPreviewOnly"]=[virtuals["isLockedPreviewOnly"]] 178 | posts_dict["sectionCount"]=[virtuals["sectionCount"]] 179 | posts_dict["metaDescription"]=[virtuals["metaDescription"]] 180 | posts_dict["totalClapCount"]=[virtuals["totalClapCount"]] 181 | posts_dict["readingList"]=[virtuals["readingList"]] 182 | 183 | #Get number of links in the story 184 | posts_dict["linksCount"]=len(virtuals["links"]["entries"]) 185 | #Get number of tags in the story 186 | posts_dict["tagsCount"]=len(virtuals["tags"]) 187 | #What are the tags associated with the story 188 | tag_name=[] 189 | tag_slug=[] 190 | tags_dict={} 191 | for tag in virtuals["tags"]: 192 | tag_slug.append(tag["slug"]) 193 | tag_name.append(tag["name"]) 194 | tags_dict["slug"]=[tag["slug"]] 195 | tags_dict["name"]=[tag["name"]] 196 | tags_dict["followerCount"]=[tag['metadata']["followerCount"]] 197 | tags_dict["postCount"]=[tag['metadata']["postCount"]] 198 | tags_dict["isFollowing"]=[tag['virtuals']['isFollowing']] 199 | tags_dict["tagDateScrapped"]=[ datetime.datetime.strptime(dateScrapped, '%Y%m%d').strftime("%Y-%m-%d")] 200 | tags_dict["Search Query"]=[searchString] 201 | tags=tags.append(pd.DataFrame(tags_dict),ignore_index=False) 202 | posts_dict["tags_slug"]=[tag_slug] 203 | posts_dict["tags_name"]=[tag_name] 204 | 205 | 206 | 207 | 208 | 209 | 210 | print "Complted creating dic" 211 | posts=posts.append(pd.DataFrame(posts_dict),ignore_index=True) 212 | 213 | return [posts,tags] 214 | 215 | 216 | 217 | ''' 218 | References tag contains information about the user and the Collections 219 | ''' 220 | 221 | def processReferences(data_json,searchString,dateScrapped): 222 | user_df=pd.DataFrame() 223 | collection_df=pd.DataFrame() 224 | flag=False 225 | 226 | if "success" in data_json: 227 | if "payload" in data_json: 228 | if "references" in data_json["payload"]: 229 | print "references present" 230 | #print data_json["payload"]["value"] 231 | flag=True 232 | if flag==True: 233 | data=data_json["payload"]["references"] 234 | if "User" in data: 235 | user_dict={} 236 | users=data["User"] 237 | print len(users) 238 | print type(users) 239 | user_key=[] 240 | for user in users: 241 | user_key.append(user) 242 | for key in user_key: 243 | user=users[key] 244 | user_dict["Search Query"]=[searchString] 245 | user_dict["DateScrapped"]=[ datetime.datetime.strptime(dateScrapped, '%Y%m%d').strftime("%Y-%m-%d")] 246 | user_dict["userId"]=[user["userId"]] 247 | user_dict["Name"]=[user["name"]] 248 | user_dict["userName"]=[user["username"]] 249 | user_dict["createdAt"]=[user["createdAt"]] 250 | user_dict["lastPostCreatedAt"]=[user["lastPostCreatedAt"]] 251 | user_dict["bio"]=[user["bio"]] 252 | if "twitterScreenName" in user: 253 | user_dict["twitterScreenName"]=[user["twitterScreenName"]] 254 | else: 255 | user_dict["twitterScreenName"]=[""] 256 | if "facebookAccountId" in user: 257 | user_dict["facebookAccountId"]=[user["facebookAccountId"]] 258 | else: 259 | user_dict["facebookAccountId"]=[""] 260 | user_dict["allowNotes"]=[user["allowNotes"]] 261 | user_dict["mediumMemberAt"]=[user["mediumMemberAt"]] 262 | user_dict["isNsfw"]=[user["isNsfw"]] 263 | 264 | user_df=user_df.append(pd.DataFrame(user_dict),ignore_index=True) 265 | if "Collection" in data: 266 | collection_dict={} 267 | collections=data["Collection"] 268 | collection_key=[] 269 | for collection in collections: 270 | collection_key.append(collection) 271 | for key in collection_key: 272 | coll=collections[key] 273 | collection_dict["Search Query"]=[searchString] 274 | collection_dict["DateScrapped"]=[ datetime.datetime.strptime(dateScrapped, '%Y%m%d').strftime("%Y-%m-%d")] 275 | collection_dict['id']=[coll["id"]] 276 | collection_dict["name"]=[coll["name"]] 277 | collection_dict["slug"]=[coll["slug"]] 278 | if "tags" in coll: 279 | collection_dict["tags"]=[coll["tags"]] 280 | else: 281 | collection_dict["tags"]="" 282 | collection_dict["creatorId"]=[coll["creatorId"]] 283 | collection_dict["description"]=[coll["description"]] 284 | collection_dict["shortDescription"]=[coll["shortDescription"]] 285 | collection_dict["followerCount"]=[coll["metadata"]["followerCount"]] 286 | if "twitterUsername" in coll: 287 | collection_dict["twitterUsername"]=[coll["twitterUsername"]] 288 | else: 289 | collection_dict["twitterUsername"]=[""] 290 | if "facebookPageName" in coll: 291 | collection_dict["facebookPageName"]=[coll["facebookPageName"]] 292 | else: 293 | collection_dict["facebookPageName"]=[""] 294 | if "publicEmail" in coll: 295 | collection_dict["publicEmail"]=[coll["publicEmail"]] 296 | else: 297 | collection_dict["publicEmail"]=[""] 298 | if "domain" in coll: 299 | 300 | collection_dict["domain"]=[coll["domain"]] 301 | else: 302 | collection_dict["domain"]=[""] 303 | if "lightText" in coll: 304 | collection_dict["lightText"]=[coll["lightText"]] 305 | else: 306 | collection_dict["lightText"]=[""] 307 | if "instantArticlesState" in coll: 308 | 309 | collection_dict["instantArticlesState"]=[coll["instantArticlesState"]] 310 | else: 311 | collection_dict["instantArticlesState"]=[""] 312 | if "acceleratedMobilePagesState" in coll: 313 | collection_dict["acceleratedMobilePagesState"]=[coll["acceleratedMobilePagesState"]] 314 | else: 315 | collection_dict["acceleratedMobilePagesState"]=[""] 316 | collection_df=collection_df.append(pd.DataFrame(collection_dict),ignore_index=True) 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | return [user_df,collection_df] 325 | 326 | 327 | 328 | ''' 329 | change path to folder where the scrapped json files are stored 330 | ''' 331 | path='C:\Users\Aiswarya\DataScienceArena\Web Scraping\medium_scrapper\medium_search_dataScience\scrapped\\' 332 | 333 | ReadData(path) 334 | 335 | 336 | 337 | 338 | -------------------------------------------------------------------------------- /Data_Extraction_Archive_Tags.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 16 22:51:19 2018 4 | 5 | @author: Aiswarya 6 | 7 | Extract Data from json files created using medium_scrapper_tag_archive.py 8 | 9 | """ 10 | 11 | #from scrapperFunctions import * 12 | import pandas as pd 13 | import datetime 14 | import codecs 15 | import os 16 | import json 17 | #import sqlalchemy 18 | #import urllib 19 | os.chdir("D:\Aiswarya\Web Scraping\medium_scrapper\\") 20 | 21 | def writeTofile(fileName,text): 22 | with codecs.open(fileName,'w','utf-8') as outfile: 23 | outfile.write(text) 24 | 25 | 26 | def getJsonResponse(fileName): 27 | with codecs.open(fileName,'r','utf-8') as infile: 28 | data=json.load(infile) 29 | return data 30 | 31 | #Check if success True in json 32 | def isRequestSuccess(data): 33 | if "success" in data: 34 | #print "success in data" 35 | #print data["success"] 36 | if data["success"]==True: 37 | return True 38 | return False 39 | 40 | def hasPayload(data): 41 | if "payload" in data: 42 | return True 43 | else: 44 | return False 45 | 46 | def hasTopicTag(data): 47 | if hasPayload(data)==True: 48 | data=data["payload"] 49 | if "topic" in data: 50 | return True 51 | return False 52 | 53 | def hasReferenceTag(data): 54 | if hasPayload(data)==True: 55 | data=data["payload"] 56 | if "references" in data: 57 | return True 58 | return False 59 | 60 | def processPost(dat): 61 | print("In Process Post") 62 | posts_dict={} 63 | tags=pd.DataFrame() 64 | posts=pd.DataFrame() 65 | posts_dict["id"]=[dat["id"]] 66 | print(dat["id"]) 67 | posts_dict["versionId"]=[dat["versionId"]] 68 | print(dat["creatorId"]) 69 | posts_dict["creatorId"]=[dat["creatorId"]] 70 | posts_dict["collectionId"]=[dat["homeCollectionId"]] 71 | 72 | posts_dict["title"]=[dat["title"]] 73 | posts_dict["language"]=[dat["detectedLanguage"]] 74 | posts_dict["createdAt"]=[dat["createdAt"]] 75 | posts_dict["updatedAt"]=[dat["updatedAt"]] 76 | posts_dict["firstPublishedAt"]=[dat["firstPublishedAt"]] 77 | posts_dict["latestPublishedAt"]=[dat["latestPublishedAt"]] 78 | posts_dict["story_slug"]=[dat["slug"]] 79 | posts_dict["uniqueSlug"]=[dat['uniqueSlug']] 80 | posts_dict["vote"]=[dat['vote']] 81 | posts_dict["hasUnpublishedEdits"]=[dat["hasUnpublishedEdits"]] 82 | posts_dict["allowResponses"]=[dat["allowResponses"]] 83 | posts_dict["importedUrl"]=[dat["importedUrl"]] 84 | posts_dict["webCanonicalUrl"]=[dat["webCanonicalUrl"]] 85 | posts_dict["mediumUrl"]=[dat["mediumUrl"]] 86 | posts_dict["importedPublishedAt"]=[dat["importedPublishedAt"]] 87 | posts_dict["vote"]=[dat["vote"]] 88 | posts_dict["isApprovedTranslation"]=[dat["isApprovedTranslation"]] 89 | posts_dict["translationSourcePostId"]=[dat["translationSourcePostId"]] 90 | posts_dict["translationSourceCreatorId"]=[dat["translationSourceCreatorId"]] 91 | posts_dict["displayAuthor"]=[dat["displayAuthor"]] 92 | posts_dict["coverless"]=[dat["coverless"]] 93 | #What type of preview content are available 94 | prev_name=[] 95 | prev_type=[] 96 | prev_text=[] 97 | 98 | for prev_content in dat["previewContent"]["bodyModel"]["paragraphs"]: 99 | #print prev_content['name'] 100 | if "name" in prev_content: 101 | prev_name.append(prev_content["name"]) 102 | else: 103 | prev_name.append("") 104 | if "type" in prev_content: 105 | prev_type.append(prev_content["type"]) 106 | else: 107 | prev_type.append("") 108 | 109 | if("text" in prev_content): 110 | #print "Text tag present" 111 | text=prev_content["text"] 112 | 113 | prev_text.append(text) 114 | #print "Priting prev Text list" 115 | #print prev_text 116 | else: 117 | prev_text.append("") 118 | posts_dict["PreviewContent_Name"]=[prev_name] 119 | posts_dict["PreviewContent_Type"]=[prev_type] 120 | posts_dict["PreviewContent_Text"]=[prev_text] 121 | posts_dict["PreviewContent_isFullContent"]=[dat["previewContent"]["isFullContent"]] 122 | posts_dict["notifyFollowers"]=[dat["notifyFollowers"]] 123 | posts_dict["notifyTwitter"]=[dat["notifyTwitter"]] 124 | posts_dict["notifyFacebook"]=[dat["notifyFacebook"]] 125 | posts_dict["isSeries"]=[dat["isSeries"]] 126 | posts_dict["isSponsored"]=[dat["isSponsored"]] 127 | posts_dict["isSubscriptionLocked"]=[dat["isSubscriptionLocked"]] 128 | posts_dict["seriesLastAppendedAt"]=[dat["seriesLastAppendedAt"]] 129 | posts_dict["audioVersionDurationSec"]=[dat["audioVersionDurationSec"]] 130 | posts_dict["isNsfw"]=[dat["isNsfw"]] 131 | posts_dict["isEligibleForRevenue"]=[dat["isEligibleForRevenue"]] 132 | posts_dict["isBlockedFromHightower"]=[dat["isBlockedFromHightower"]] 133 | posts_dict["featureLockRequestAcceptedAt"]=[dat["featureLockRequestAcceptedAt"]] 134 | posts_dict["featureLockRequestMinimumGuaranteeAmount"]=[dat["featureLockRequestMinimumGuaranteeAmount"]] 135 | posts_dict["isElevate"]=[dat['isElevate']] 136 | posts_dict["isTitleSynthesized"]=[dat["isTitleSynthesized"]] 137 | posts_dict["inResponseToPostId"]=[dat["inResponseToPostId"]] 138 | 139 | 140 | #Get metadata from virtuals 141 | if "virtuals" in dat: 142 | virtuals=dat["virtuals"] 143 | #print virtuals 144 | if "statusForCollection" in virtuals: 145 | posts_dict["statusForCollection"]=[virtuals["statusForCollection"]] 146 | else: 147 | posts_dict["statusForCollection"]=[""] 148 | 149 | posts_dict["allowNotes"]=[virtuals["allowNotes"]] 150 | posts_dict["wordCount"]=[virtuals["wordCount"]] 151 | posts_dict["imageCount"]=[virtuals["imageCount"]] 152 | posts_dict["readingTime"]=[virtuals["readingTime"]] 153 | posts_dict["subTitle"]=[virtuals["subtitle"]] 154 | if "publishedInCount" in virtuals: 155 | posts_dict["publishedInCount"]=[virtuals["publishedInCount"]] 156 | else: 157 | posts_dict["publishedInCount"]=[""] 158 | posts_dict["recommends"]=[virtuals["recommends"]] 159 | posts_dict["isBookmarked"]=[virtuals["isBookmarked"]] 160 | posts_dict["socialRecommendsCount"]=[virtuals["socialRecommendsCount"]] 161 | posts_dict["responsesCreatedCount"]=[virtuals["responsesCreatedCount"]] 162 | posts_dict["isLockedPreviewOnly"]=[virtuals["isLockedPreviewOnly"]] 163 | posts_dict["sectionCount"]=[virtuals["sectionCount"]] 164 | posts_dict["metaDescription"]=[virtuals["metaDescription"]] 165 | posts_dict["totalClapCount"]=[virtuals["totalClapCount"]] 166 | posts_dict["readingList"]=[virtuals["readingList"]] 167 | 168 | #Get number of links in the story 169 | try: 170 | posts_dict["linksCount"]=len(virtuals["links"]["entries"]) 171 | except: 172 | posts_dict["linksCount"]=0 173 | #Get number of tags in the story 174 | posts_dict["tagsCount"]=len(virtuals["tags"]) 175 | #What are the tags associated with the story 176 | tag_name=[] 177 | tag_slug=[] 178 | tags_dict={} 179 | for tag in virtuals["tags"]: 180 | tag_slug.append(tag["slug"]) 181 | tag_name.append(tag["name"]) 182 | tags_dict["slug"]=[tag["slug"]] 183 | tags_dict["name"]=[tag["name"]] 184 | tags_dict["followerCount"]=[tag['metadata']["followerCount"]] 185 | tags_dict["postCount"]=[tag['metadata']["postCount"]] 186 | tags_dict["isFollowing"]=[tag['virtuals']['isFollowing']] 187 | 188 | tags=tags.append(pd.DataFrame(tags_dict),ignore_index=False) 189 | posts_dict["tags_slug"]=[tag_slug] 190 | posts_dict["tags_name"]=[tag_name] 191 | posts=posts.append(pd.DataFrame(posts_dict),ignore_index=True) 192 | return [posts,tags] 193 | 194 | 195 | def ReadData(path): 196 | posts=pd.DataFrame() 197 | tags=pd.DataFrame() 198 | users=pd.DataFrame() 199 | collections=pd.DataFrame() 200 | files=os.listdir(path) 201 | 202 | #files=["dataScienceTag20180430.json"] 203 | processedFiles=[] 204 | 205 | #total_records_processed=0 206 | 207 | for fileName in files : 208 | if '.json' in fileName: 209 | with codecs.open(path+fileName,'r','utf-8') as infile: 210 | data=json.load(infile) 211 | 212 | dfs=processTopicPost(data) 213 | posts=posts.append(dfs[0],ignore_index=True) 214 | tags=tags.append(dfs[1],ignore_index=True) 215 | posts['searchTag']=fileName.split("Tag")[0] 216 | print(fileName.split("Tag")[0]) 217 | archiveDate=fileName.split("Tag")[1] 218 | archiveDate=archiveDate.strip(".json") 219 | archiveDate=datetime.datetime.strptime(archiveDate, '%Y%m%d').strftime("%Y-%m-%d") 220 | posts['archiveDate']=archiveDate 221 | scrappedDate=path.strip("\\") 222 | posts['scrappedDate']=datetime.datetime.strptime(scrappedDate, '%Y%m%d').strftime("%Y-%m-%d") 223 | tags['scrappedDate']=datetime.datetime.strptime(scrappedDate, '%Y%m%d').strftime("%Y-%m-%d") 224 | references=processReferences(data,fileName.split("Tag")[1],scrappedDate) 225 | users=users.append(references[0],ignore_index=True) 226 | collections=collections.append(references[1],ignore_index=True) 227 | processedFiles.append(fileName) 228 | 229 | posts.to_csv(path+"Posts_ByTag_"+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".csv",index=False,encoding='utf-8') 230 | tags.to_csv(path+"Tags_ByTag_"+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".csv",index=False,encoding='utf-8') 231 | users.to_csv(path+"Users_ByTag_"+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".csv",index=False,encoding='utf-8') 232 | collections.to_csv(path+"Collections_ByTag_"+datetime.datetime.now().strftime("%Y%m%d_%H%M%S")+".csv",index=False,encoding='utf-8') 233 | 234 | def processTopicPost(data): 235 | 236 | #Check if request successful 237 | successStatus=isRequestSuccess(data) 238 | print(successStatus) 239 | postdata=pd.DataFrame() 240 | tags=pd.DataFrame() 241 | #check if references in payload 242 | if successStatus==True and hasReferenceTag(data)==True: 243 | data=data["payload"]["references"] 244 | if "Post" in data: 245 | posts=data["Post"] 246 | #Get all the post id 247 | post_id=[] 248 | for post in posts: 249 | post_id.append(post) 250 | #For each post id extract post information 251 | for pid in post_id: 252 | #print(pid) 253 | post=posts[pid] 254 | dfs=processPost(post) 255 | postdata=postdata.append(dfs[0],ignore_index=True) 256 | tags=tags.append(dfs[1],ignore_index=True) 257 | #postdata.to_csv("Posts.csv",encoding='utf-8') 258 | #tags.to_csv("Tags.csv",encoding='utf-8') 259 | return [postdata,tags] 260 | 261 | 262 | 263 | 264 | #If success, check if payload exists 265 | def processReferences(data_json,searchTag,dateScrapped): 266 | user_df=pd.DataFrame() 267 | collection_df=pd.DataFrame() 268 | flag=False 269 | 270 | if "success" in data_json: 271 | if "payload" in data_json: 272 | if "references" in data_json["payload"]: 273 | print("references present") 274 | #print data_json["payload"]["value"] 275 | flag=True 276 | if flag==True: 277 | data=data_json["payload"]["references"] 278 | if "User" in data: 279 | user_dict={} 280 | users=data["User"] 281 | #print len(users) 282 | #print type(users) 283 | user_key=[] 284 | for user in users: 285 | user_key.append(user) 286 | for key in user_key: 287 | user=users[key] 288 | user_dict["Search Tag"]=[searchTag] 289 | user_dict["ScrappedDate"]=[ datetime.datetime.strptime(dateScrapped, '%Y%m%d').strftime("%Y-%m-%d")] 290 | user_dict["userId"]=[user["userId"]] 291 | user_dict["Name"]=[user["name"]] 292 | user_dict["userName"]=[user["username"]] 293 | user_dict["createdAt"]=[user["createdAt"]] 294 | user_dict["lastPostCreatedAt"]=[user["lastPostCreatedAt"]] 295 | user_dict["bio"]=[user["bio"]] 296 | if "twitterScreenName" in user: 297 | user_dict["twitterScreenName"]=[user["twitterScreenName"]] 298 | else: 299 | user_dict["twitterScreenName"]=[""] 300 | if "facebookAccountId" in user: 301 | user_dict["facebookAccountId"]=[user["facebookAccountId"]] 302 | else: 303 | user_dict["facebookAccountId"]=[""] 304 | user_dict["allowNotes"]=[user["allowNotes"]] 305 | user_dict["mediumMemberAt"]=[user["mediumMemberAt"]] 306 | if "isNsfw" in user: 307 | user_dict["isNsfw"]=[user["isNsfw"]] 308 | else: 309 | user_dict["IsNsfw"]=[""] 310 | user_df=user_df.append(pd.DataFrame(user_dict),ignore_index=True) 311 | if "Collection" in data: 312 | collection_dict={} 313 | collections=data["Collection"] 314 | collection_key=[] 315 | for collection in collections: 316 | collection_key.append(collection) 317 | for key in collection_key: 318 | coll=collections[key] 319 | collection_dict["Search Tag"]=[searchTag] 320 | collection_dict["ScrappedDate"]=[ datetime.datetime.strptime(dateScrapped, '%Y%m%d').strftime("%Y-%m-%d")] 321 | collection_dict['id']=[coll["id"]] 322 | collection_dict["name"]=[coll["name"]] 323 | collection_dict["slug"]=[coll["slug"]] 324 | if "tags" in coll: 325 | collection_dict["tags"]=[coll["tags"]] 326 | else: 327 | collection_dict["tags"]="" 328 | collection_dict["creatorId"]=[coll["creatorId"]] 329 | collection_dict["description"]=[coll["description"]] 330 | collection_dict["shortDescription"]=[coll["shortDescription"]] 331 | collection_dict["followerCount"]=[coll["metadata"]["followerCount"]] 332 | if "twitterUsername" in coll: 333 | collection_dict["twitterUsername"]=[coll["twitterUsername"]] 334 | else: 335 | collection_dict["twitterUsername"]=[""] 336 | if "facebookPageName" in coll: 337 | collection_dict["facebookPageName"]=[coll["facebookPageName"]] 338 | else: 339 | collection_dict["facebookPageName"]=[""] 340 | if "publicEmail" in coll: 341 | collection_dict["publicEmail"]=[coll["publicEmail"]] 342 | else: 343 | collection_dict["publicEmail"]=[""] 344 | if "domain" in coll: 345 | 346 | collection_dict["domain"]=[coll["domain"]] 347 | else: 348 | collection_dict["domain"]=[""] 349 | if "lightText" in coll: 350 | collection_dict["lightText"]=[coll["lightText"]] 351 | else: 352 | collection_dict["lightText"]=[""] 353 | if "instantArticlesState" in coll: 354 | 355 | collection_dict["instantArticlesState"]=[coll["instantArticlesState"]] 356 | else: 357 | collection_dict["instantArticlesState"]=[""] 358 | if "acceleratedMobilePagesState" in coll: 359 | collection_dict["acceleratedMobilePagesState"]=[coll["acceleratedMobilePagesState"]] 360 | else: 361 | collection_dict["acceleratedMobilePagesState"]=[""] 362 | collection_df=collection_df.append(pd.DataFrame(collection_dict),ignore_index=True) 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | return [user_df,collection_df] 371 | 372 | 373 | 374 | 375 | 376 | 377 | # Read Data by giving folder path 378 | 379 | ReadData("20180504\\") 380 | --------------------------------------------------------------------------------