├── .gitignore ├── README.md ├── scraper.py └── scrapers ├── CSL.py ├── MAS.py ├── PepScraper.py ├── StateCourt.py ├── UN_Sanctions.py ├── __pycache__ ├── CSL.cpython-36.pyc ├── MAS.cpython-36.pyc ├── PepScraper.cpython-36.pyc ├── UN_Sanctions.cpython-36.pyc ├── cia_gov.cpython-36.pyc ├── google.cpython-36.pyc ├── panama.cpython-36.pyc ├── reddit.cpython-36.pyc └── twitter_scraper.cpython-36.pyc ├── cia_gov.py ├── fb_public_search.py ├── google.py ├── panama.py ├── reddit.py └── twitter_scraper.py /.gitignore: -------------------------------------------------------------------------------- 1 | results/ 2 | Scrap web.ipynb 3 | .ipynb_checkpoints/ 4 | .idea/ 5 | config_keys.py 6 | *.ipynb 7 | __pycache__/config_keys.cpython-36.pyc 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Person Scraper 2 | 3 | Given a name of a person, the scraper will cross check against the following databases: 4 | 5 | - [X] OFAC SDN 6 | - [X] The Panama Papers 7 | - [X] UN Sanctions 8 | - [X] US Sanctions 9 | - [X] MAS Sanctions and Freezing of assets 10 | - [X] PEP databases 11 | - [X] CIA Database 12 | - [ ] Credit bureaus 13 | - [X] Facebook 14 | - [ ] LinkedIn 15 | - [X] Twitter 16 | - [ ] Criminal Records 17 | - [X] Court Records 18 | - [X] Google 19 | - [ ] FATF 20 | - [X] Reddit 21 | 22 | 23 | 24 | ### Prerequisites 25 | 26 | The code runs on Python 3.X and these are the packages you need: 27 | 28 | ``` 29 | pandas 30 | beautifulsoup4 31 | urllib 32 | python-linkedin 33 | ``` 34 | 35 | ### Installing 36 | 37 | Install the following packages if you have not: 38 | 39 | ``` 40 | pip install pandas 41 | pip install urllib 42 | pip install beautifulsoup4 43 | pip install python-linkedin 44 | ``` 45 | 46 | 47 | ### Running the scrapper 48 | 49 | Here's an example if you want to scrape on "Osama Bin Laden" 50 | 51 | ``` 52 | python scraper.py -n "Osama Bin Laden" 53 | ``` 54 | 55 | And the output will be something like this: 56 | ``` 57 | Checking for Osama Bin Laden... 58 | CSL check: 59 | Found 1 matches in CSL 60 | Panama Papers check: 61 | Found 0 matches in Panama Papers 62 | ``` 63 | 64 | The search results can be found in the results/name folder 65 | ``` 66 | person-scraper/ 67 | |-- scrapers/ 68 | | |-- CSL.py 69 | | |-- panama.py 70 | | 71 | |-- results/ 72 | | |-- Osama_Bin_Laden/ 73 | | | |-- Osama_Bin_Laden_CSL.json 74 | | | |-- Osama_Bin_Laden_PanamaPapers.csv 75 | | 76 | |-- scraper.py 77 | |-- README 78 | |-- .gitignore 79 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from scrapers import CSL, panama, twitter_scraper, google, cia_gov, reddit, MAS, UN_Sanctions, PepScraper 4 | 5 | parser = argparse.ArgumentParser(description='Person Scrapper') 6 | requiredNamed = parser.add_argument_group('required named arguments') 7 | requiredNamed.add_argument('--name','-n', required=True, type=str, help='Example -n Osama') 8 | 9 | args = parser.parse_args() 10 | 11 | def scrape(name): 12 | folderName = name.replace(' ', '_') 13 | cwd = os.getcwd() 14 | newFolder = os.path.join(os.path.join(cwd,'results'),folderName) 15 | if not os.path.exists(newFolder): 16 | os.makedirs(newFolder) 17 | print("Checking for " + name + "...") 18 | print("CSL check:") 19 | CSL.checkCSL(name, newFolder) 20 | print("Panama Papers check:") 21 | panama.checkPanama(name, newFolder) 22 | print("CIA Gov check:") 23 | cia_gov.checkCIA(name, newFolder) 24 | print("Twitter check:") 25 | twitter_scraper.checkTwitter(name, newFolder) 26 | print("Google check:") 27 | google.checkGoogle(name, newFolder) 28 | print("Reddit check:") 29 | reddit.checkReddit(name, newFolder) 30 | print("MAS check:") 31 | MAS.getMASResults(name, newFolder) 32 | print("UN Sanctions check:") 33 | UN_Sanctions.getUnResults(name, newFolder) 34 | print("PEP Database check:") 35 | PepScraper.getPepResults(name, newFolder) 36 | scrape(args.name) -------------------------------------------------------------------------------- /scrapers/CSL.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import json 3 | 4 | def checkCSL(name, folder): 5 | urlname = name.replace(' ', '%20') 6 | url = 'https://api.trade.gov/consolidated_screening_list/search?api_key=AWsbLvhl2q_-vqIz1V00QX-o&name=%s&type=Individual' %(urlname) 7 | contents = urllib.request.urlopen(url).read() 8 | results = json.loads(contents)["results"] 9 | fileName = folder + '/' + name.replace(' ', '_') + '_CSL.json' 10 | #TO-DO process the data dump 11 | with open(fileName, 'w') as outfile: 12 | json.dump(results, outfile) 13 | print("Found " + str(len(results)) + " matches in CSL") -------------------------------------------------------------------------------- /scrapers/MAS.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import urllib 7 | from bs4 import BeautifulSoup 8 | import pandas as pd 9 | import re 10 | 11 | 12 | # In[16]: 13 | 14 | def getMASResults(name, folder): 15 | df = pd.DataFrame(columns=['Names','Data']) 16 | 17 | name_lower = name.lower() 18 | dprk = 'https://scsanctions.un.org/en/?keywords=dprk' 19 | drc = 'https://scsanctions.un.org/en/?keywords=drc' 20 | southSudan ='https://scsanctions.un.org/en/?keywords=southsudan' 21 | somalia = 'https://scsanctions.un.org/en/?keywords=somalia' 22 | iran = 'https://scsanctions.un.org/en/?keywords=iran' 23 | libya = 'https://scsanctions.un.org/en/?keywords=libya' 24 | sudan ='https://scsanctions.un.org/en/?keywords=sudan' 25 | yemen = 'https://scsanctions.un.org/en/?keywords=yemen' 26 | al_qaida = 'https://scsanctions.un.org/en/?keywords=al-qaida' 27 | taliban = 'https://scsanctions.un.org/en/?keywords=taliban' 28 | list_of_urls = [] 29 | list_of_urls.append(dprk) 30 | list_of_urls.append(drc) 31 | list_of_urls.append(southSudan) 32 | list_of_urls.append(somalia) 33 | list_of_urls.append(iran) 34 | list_of_urls.append(libya) 35 | list_of_urls.append(sudan) 36 | list_of_urls.append(yemen) 37 | list_of_urls.append(al_qaida) 38 | list_of_urls.append(taliban) 39 | fileName = folder + '/' + name.replace(' ', '_') + '_MASResults.csv' 40 | 41 | for sites in list_of_urls: 42 | page = urllib.request.urlopen(sites) 43 | soup = BeautifulSoup(page, 'html.parser') 44 | 45 | item_list = soup.findAll('tr', attrs={'class':'rowtext'}) 46 | 47 | for item in item_list: 48 | text = item.text 49 | text = text.replace('\n', '') 50 | text = text.replace('\t', '') 51 | text = text.replace('\xa0',' ') 52 | text = text.replace('click here', '') 53 | name_index = text.find('Name: 1:') 54 | if name_index != -1: 55 | first_name = text[name_index + len('Name: 1:'): text.find('2:')] 56 | second_name = text[text.find('2:')+ 1: text.find('3:')] 57 | third_name = text[text.find('3:')+1: text.find('4:')] 58 | full_name = first_name + second_name + third_name 59 | full_name = full_name.replace(' na ','') 60 | full_name= full_name.replace(':', '') 61 | full_name = full_name.strip() 62 | full_name = ' '.join(full_name.split()) 63 | if full_name.lower() ==name_lower: 64 | df.loc[0] = [name_lower, text] 65 | df.to_csv(fileName,encoding='utf-8',header=True, index=False) 66 | print(name + ' FOUND in MAS Sanctions List' + " " + sites) 67 | else: 68 | full_name = text[text.find('Name:')+ len('Name:') : text.find('A.k.a.')] 69 | full_name = full_name.replace(' na ','') 70 | full_name= full_name.replace(':', '') 71 | full_name = full_name.strip() 72 | full_name = ' '.join(full_name.split()) 73 | if full_name.lower() ==name_lower: 74 | df.loc[0] = [name_lower, text] 75 | df.to_csv(fileName,encoding='utf-8',header=True, index=False) 76 | print(name + ' FOUND in MAS Sanctions List' + " " + sites) 77 | print(name + ' NOT FOUND in MAS Sanctions List') 78 | 79 | 80 | 81 | # In[17]: 82 | 83 | 84 | # In[ ]: 85 | 86 | 87 | 88 | 89 | # In[ ]: 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /scrapers/PepScraper.py: -------------------------------------------------------------------------------- 1 | 2 | import urllib 3 | from bs4 import BeautifulSoup 4 | import pandas as pd 5 | 6 | def nameMatching(testName, dbName, threshold=0.7): 7 | testName = testName.lower() 8 | dbName = dbName.strip().lower() 9 | testList = testName.split(' ') 10 | dbNameList = dbName.split(' ') 11 | count = 0 12 | for name in testList: 13 | if name in dbNameList: 14 | count+= 1 15 | count = count/len(testList) 16 | if count > threshold: 17 | return True 18 | return False 19 | 20 | def getPepResults(name, folder): 21 | country_list = ["AF.html","AL.html","AG.html","AN.html","AO.html","AC.html","AR.html","AM.html","AA.html","AS.html","AU.html","AJ.html","BF.html","BA.html","BG.html","BB.html","BO.html","BE.html","BH.html","BN.html","BD.html","BT.html","BL.html","BK.html","BC.html","BR.html","BX.html","BU.html","UV.html","BM.html","BY.html","CV.html","CB.html","CM.html","CA.html","CT.html","CD.html","CI.html","CH.html","CO.html","CN.html","CG.html","CF.html","CW.html","CS.html","IV.html","HR.html","CU.html","CY.html","EZ.html","DA.html","DJ.html","DO.html","DR.html","EC.html","EG.html","ES.html","EK.html","ER.html","EN.html","WZ.html","ET.html","FJ.html","FI.html","FR.html","GB.html","GA.html","GG.html","GM.html","GH.html","GR.html","GJ.html","GT.html","GV.html","PU.html","GY.html","HA.html","VT.html","HO.html","HU.html","IC.html","IN.html","ID.html","IR.html","IZ.html","EI.html","IS.html","IT.html","JM.html","JA.html","JO.html","KZ.html","KE.html","KR.html","KN.html","KS.html","KV.html","KU.html","KG.html","LA.html","LG.html","LE.html","LT.html","LI.html","LY.html","LS.html","LH.html","LU.html","MK.html","MA.html","MI.html","MY.html","MV.html","ML.html","MT.html","RM.html","MR.html","MP.html","MX.html","FM.html","MD.html","MN.html","MG.html","MJ.html","MO.html","MZ.html","WA.html","NR.html","NP.html","NL.html","NZ.html","NU.html","NG.html","NI.html","NO.html","MU.html","PK.html","PS.html","PM.html","PP.html","PA.html","PE.html","RP.html","PL.html","PO.html","QA.html","RO.html","RS.html","RW.html","SC.html","ST.html","VC.html","WS.html","SM.html","TP.html","SA.html","SG.html","RI.html","SE.html","SL.html","SN.html","LO.html","SI.html","BP.html","SO.html","SF.html","OD.html","SP.html","CE.html","SU.html","NS.html","SW.html","SZ.html","SY.html","TW.html","TI.html","TZ.html","TH.html","TT.html","TO.html","TN.html","TD.html","TS.html","TU.html","TX.html","TV.html","UG.html","UP.html","AE.html","UK.html","UY.html","UZ.html","NH.html","VE.html","VM.html","YM.html","ZA.html","ZI.html"] 22 | names = [] 23 | websites = [] 24 | fileName = folder + '/' + name.replace(' ', '_') + '_PEPResults.csv' 25 | for country in country_list: 26 | 27 | sites= "https://www.cia.gov/library/publications/world-leaders-1/"+country 28 | page = urllib.request.urlopen(sites) 29 | soup = BeautifulSoup(page, 'html.parser') 30 | 31 | item_list = soup.findAll('span', attrs={'class':'cos_name'}) 32 | 33 | for item in item_list: 34 | text = item.text 35 | if nameMatching(name, text): 36 | print(name + ' is a PEP' + " " + sites) 37 | websites.append(sites) 38 | names.append(text.strip()) 39 | if len(names) == 0: 40 | print(name + ' NOT FOUND as a PEP') 41 | else: 42 | df = pd.DataFrame({'Names':names, 'Websites':websites}) 43 | df.to_csv(fileName, header=True, index=False) 44 | 45 | 46 | -------------------------------------------------------------------------------- /scrapers/StateCourt.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | from bs4 import BeautifulSoup 3 | import pandas as pd 4 | import re 5 | 6 | def getStateCourtResults(name): 7 | 8 | name_lower = name.lower() 9 | search_name = name_lower.replace(' ','%20') 10 | stateCourt = 'https://www.statecourts.gov.sg/Pages/BasicSearchResult.aspx?k='+ search_name 11 | page = urllib.request.urlopen(stateCourt) 12 | soup = BeautifulSoup(page, 'html.parser') 13 | 14 | table = soup.find('table') 15 | contents = table.text.lower() 16 | 17 | if contents.find(name_lower)!= -1: 18 | return name + ' FOUND in State Court Records' 19 | 20 | 21 | return name + ' NOT FOUND in State Court Records' 22 | 23 | 24 | 25 | #print(getStateCourtResults('kang CHI LOONG')) 26 | #print(getStateCourtResults('PANG CHI KANG')) 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /scrapers/UN_Sanctions.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[2]: 5 | 6 | import urllib 7 | from bs4 import BeautifulSoup 8 | import pandas as pd 9 | import re 10 | 11 | 12 | # In[8]: 13 | 14 | def getUnResults(name, folder): 15 | df = pd.DataFrame(columns=['Names','Data']) 16 | 17 | search_name1 = name.lower().replace(' ','%7C') 18 | search_name2= name.lower().replace(' ','+') 19 | name_lower = name.lower() 20 | sites= "https://scsanctions.un.org/en/?keywords=+%22" + search_name1+ "%22&per-page=2500§ions=s§ions=s&sort=id&includes=%22"+search_name2 + "%22&excludes=&committee=&nationality=&reference_from=&reference_to=" 21 | 22 | page = urllib.request.urlopen(sites) 23 | soup = BeautifulSoup(page, 'html.parser') 24 | 25 | item_list = soup.findAll('tr', attrs={'class':'rowtext'}) 26 | fileName = folder + '/' + name.replace(' ', '_') + '_UNResults.csv' 27 | 28 | for item in item_list: 29 | text = item.text 30 | text = text.replace('\n', '') 31 | text = text.replace('\t', '') 32 | text = text.replace('\xa0',' ') 33 | text = text.replace('click here', '') 34 | name_index = text.find('Name: 1:') 35 | if name_index != -1: 36 | first_name = text[name_index + len('Name: 1:'): text.find('2:')] 37 | second_name = text[text.find('2:')+ 1: text.find('3:')] 38 | third_name = text[text.find('3:')+1: text.find('4:')] 39 | full_name = first_name + second_name + third_name 40 | full_name = full_name.replace(' na ','') 41 | full_name= full_name.replace(':', '') 42 | full_name = full_name.strip() 43 | full_name = ' '.join(full_name.split()) 44 | if full_name.lower() == name_lower: 45 | 46 | df.loc[0] = [name_lower, text] 47 | df.to_csv(fileName,encoding='utf-8',header=True, index=False) 48 | print(name + ' FOUND in MAS Sanctions List' + " " + sites) 49 | else: 50 | full_name = text[text.find('Name:')+ len('Name:') : text.find('A.k.a.')] 51 | full_name = full_name.replace(' na ','') 52 | full_name= full_name.replace(':', '') 53 | full_name = full_name.strip() 54 | full_name = ' '.join(full_name.split()) 55 | if full_name.lower() == name_lower: 56 | 57 | df.loc[0] = [name_lower, text] 58 | df.to_csv(fileName,encoding='utf-8',header=True, index=False) 59 | print(name + ' FOUND in MAS Sanctions List' + " " + sites) 60 | print(name + ' NOT FOUND in MAS Sanctions List') 61 | 62 | 63 | # In[ ]: 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /scrapers/__pycache__/CSL.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/CSL.cpython-36.pyc -------------------------------------------------------------------------------- /scrapers/__pycache__/MAS.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/MAS.cpython-36.pyc -------------------------------------------------------------------------------- /scrapers/__pycache__/PepScraper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/PepScraper.cpython-36.pyc -------------------------------------------------------------------------------- /scrapers/__pycache__/UN_Sanctions.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/UN_Sanctions.cpython-36.pyc -------------------------------------------------------------------------------- /scrapers/__pycache__/cia_gov.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/cia_gov.cpython-36.pyc -------------------------------------------------------------------------------- /scrapers/__pycache__/google.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/google.cpython-36.pyc -------------------------------------------------------------------------------- /scrapers/__pycache__/panama.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/panama.cpython-36.pyc -------------------------------------------------------------------------------- /scrapers/__pycache__/reddit.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/reddit.cpython-36.pyc -------------------------------------------------------------------------------- /scrapers/__pycache__/twitter_scraper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/twitter_scraper.cpython-36.pyc -------------------------------------------------------------------------------- /scrapers/cia_gov.py: -------------------------------------------------------------------------------- 1 | # from selenium import webdriver 2 | # from selenium.webdriver.common.keys import Keys 3 | import urllib 4 | from bs4 import BeautifulSoup 5 | import pandas as pd 6 | import re 7 | 8 | def getCIAResults(name): 9 | subject = name.lower().replace(' ','+') 10 | url = 'https://www.cia.gov/search?q=' + subject + '&site=CIA&output=xml_no_dtd&client=CIA&myAction=/search&proxystylesheet=CIA&submitMethod=get&ie=UTF-8&ulang=en&ip=137.132.84.43&access=p&sort=date:D:L:d1&entqr=3&entqrm=0&wc=200&wc_mc=1&oe=UTF-8&ud=1&filter=0' 11 | page = urllib.request.urlopen(url) 12 | soup = BeautifulSoup(page, 'html.parser') 13 | result_info = soup.find(id="content-core") 14 | result_info_items = result_info.find_all('b', limit = 4) 15 | 16 | info = [] 17 | for ri in result_info_items: 18 | info.append(ri.text.strip()) 19 | 20 | test = info.pop(1) 21 | if test == name.lower(): 22 | return pd.DataFrame(columns={'title','date','description','link'}) 23 | 24 | num_results= int(info.pop(1)) 25 | num_pages = int(num_results/10 + (num_results % 10 > 0)) 26 | 27 | 28 | counter = 0 29 | pages = [] 30 | for i in range(0,num_pages): 31 | url2 = 'https://www.cia.gov/search?q=' + subject + '&site=CIA&output=xml_no_dtd&client=CIA&myAction=/search&proxystylesheet=CIA&submitMethod=get&ie=UTF-8&ulang=en&ip=137.132.84.43&access=p&sort=date:D:L:d1&entqr=3&entqrm=0&wc=200&wc_mc=1&oe=UTF-8&ud=1&filter=0&start=' + str(counter) 32 | pages.append(url2) 33 | counter += 10 34 | 35 | link = [] 36 | title = [] 37 | desc = [] 38 | date = [] 39 | 40 | for item in pages: 41 | pg = urllib.request.urlopen(item) 42 | soup2 = BeautifulSoup(pg, 'html.parser') 43 | result_list = soup2.find(id="content-core") 44 | 45 | #get link and title 46 | 47 | result_list_items = result_list.find_all('a', ctype = "c") 48 | for result in result_list_items: 49 | link.append(result.get('href')) #get links 50 | title.append(result.text) #get title 51 | 52 | result_list_desc = result_list.find_all('td', class_="s") 53 | for result2 in result_list_desc: 54 | temp = result2.text.strip() 55 | desc.append(temp[:-19]) 56 | date.append(temp[-10:]) 57 | 58 | data = pd.DataFrame({'title':title, 'date': date, 'description': desc, 'link':link}) 59 | return data 60 | 61 | 62 | def checkCIA(name, folder): 63 | results_table = getCIAResults(name) 64 | if len(results_table) == 0: 65 | print('No results found in www.cia.gov') 66 | else: 67 | print('Found ' + str(len(results_table)) + ' matches in www.cia.gov') 68 | fileName = folder + '/' + name.replace(' ', '_') + '_CIAResults.csv' 69 | results_table.to_csv(fileName, encoding='utf-8',header=True, index=False) 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /scrapers/fb_public_search.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | from bs4 import BeautifulSoup 3 | import pandas as pd 4 | import re 5 | 6 | def getFBProfiles(name): 7 | 8 | subject = name.lower().replace(' ','-') 9 | links= [] 10 | names= [] 11 | 12 | url = 'https://www.facebook.com/public/' + subject 13 | page = urllib.request.urlopen(url) 14 | soup = BeautifulSoup(page, 'html.parser') 15 | text = str(soup) 16 | result = re.findall('',text) 17 | 18 | for r in result: 19 | link = re.search('href="(.*)">',r) 20 | name = re.search('>(.*)<', r) 21 | links.append(link.group(1)) 22 | names.append(name.group(1)) 23 | data = pd.DataFrame({'Name':names, 'Link': links}) 24 | return data 25 | 26 | 27 | 28 | def checkFB(name,folder): 29 | results_table = getFBProfiles(name) 30 | if len(results_table) == 0: 31 | print('No results found in www.facebook.com') 32 | else: 33 | print('Found ' + str(len(results_table)) + ' matches in www.facebook.com') 34 | results_table.to_csv("FB.csv" , header = True, index = False ) 35 | fileName = folder + '/' + name.replace(' ', '_') + '_FBResults.csv' 36 | results_table.to_csv(fileName, encoding='utf-8',header=True, index=False) 37 | 38 | #test: 39 | #checkFB('Daniel Tan') 40 | 41 | -------------------------------------------------------------------------------- /scrapers/google.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import json 3 | import config_keys 4 | 5 | def checkGoogle(name, folder): 6 | urlname = name.replace(' ', '%20') 7 | url = 'https://www.googleapis.com/customsearch/v1?key=%s&cx=008469397288160540229:emgz62cres4&q=%s' % (config_keys.googleAPI, urlname) 8 | contents = urllib.request.urlopen(url).read() 9 | results = json.loads(contents)["items"] 10 | matchCount = json.loads(contents)['searchInformation']['totalResults'] 11 | fileName = folder + '/' + name.replace(' ', '_') + '_GoogleSearch.json' 12 | # TO-DO process the data dump 13 | with open(fileName, 'w') as outfile: 14 | json.dump(results, outfile) 15 | print("Found " + matchCount + " matches in Google Search") -------------------------------------------------------------------------------- /scrapers/panama.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | from bs4 import BeautifulSoup 3 | import pandas as pd 4 | import re 5 | 6 | 7 | 8 | def getPanamaResults(name): 9 | 10 | search_name = name.lower().replace(' ','+') 11 | initial_page = 'https://offshoreleaks.icij.org/search?cat=1&e=&q='+search_name+'&utf8=%E2%9C%93' 12 | 13 | page = urllib.request.urlopen(initial_page) 14 | 15 | soup = BeautifulSoup(page, 'html.parser') 16 | 17 | results_count = soup.find('div', attrs={'id':'results_wrapper'}) 18 | 19 | #Get number of officers/individual count 20 | officerCategory = str(results_count.find_all('li')[1]).replace('\n', '') 21 | countpat = r'(?<=\().+?(?=\))' 22 | resultsCount = int(re.findall(countpat,officerCategory)[0]) 23 | results_table = pd.DataFrame(columns=['Names', 'Node', 'Source'],index=range(resultsCount)) # I know the size 24 | 25 | #Iterate through the pages and search results 26 | pageCount = 0 27 | namepat = r'(?<=\>).+?(?=\<)' 28 | nodepat = r'(?<=\/).+?(?=\")' 29 | row_marker = 0 30 | for i in range((resultsCount//100)+1): 31 | newpage = 'https://offshoreleaks.icij.org/search?cat=1&e=&from=' + str(pageCount) + '&q='+search_name+'&utf8=%E2%9C%93' 32 | newpage = urllib.request.urlopen(newpage) 33 | newsoup = BeautifulSoup(newpage, 'html.parser') 34 | table_box = newsoup.find('table', attrs={'class':'search_results'}) 35 | names = table_box.find_all('a') 36 | for row in names: 37 | str_row = str(row).replace('\n','') 38 | if 'nodes' in str_row: 39 | person_name = re.findall(namepat, str_row)[0].strip() 40 | results_table.iat[row_marker, 0] = person_name 41 | node = re.findall(nodepat, str_row)[0].split('/')[1] 42 | results_table.iat[row_marker, 1] = node 43 | else: 44 | source_name = re.findall(namepat, str_row)[0] 45 | results_table.iat[row_marker, 2] = source_name 46 | row_marker += 1 47 | pageCount += 100 48 | return results_table 49 | 50 | 51 | 52 | #To Compare Names 53 | def nameMatching(testName, dbName, threshold=0.7): 54 | testName = testName.lower() 55 | dbName = dbName.lower() 56 | testList = testName.split(' ') 57 | dbNameList = dbName.split(' ') 58 | count = 0 59 | for name in testList: 60 | if name in dbNameList: 61 | count+= 1 62 | count = count/len(testList) 63 | if count > threshold: 64 | return True 65 | return False 66 | 67 | 68 | def checkPanama(testName, folder): 69 | results_table = getPanamaResults(testName) 70 | filteredNameDb = pd.DataFrame([results_table.iloc[row] for row in results_table.index if nameMatching(testName, results_table.iloc[row]['Names'])]) 71 | filteredNameDb.reset_index(drop=True, inplace=True) 72 | if len(filteredNameDb) == 0: 73 | print('No results found in Panama Papers') 74 | else: 75 | print('Found ' + str(len(filteredNameDb)) + ' matches in Panama Papers') 76 | filteredNameDb[['URL']] = 'https://offshoreleaks.icij.org/nodes/' + filteredNameDb[['Node']] 77 | filteredNameDb.drop(columns=['Node'], inplace=True) 78 | fileName = folder + '/' + testName.replace(' ', '_') + '_PanamaPapers.csv' 79 | filteredNameDb.to_csv(fileName,encoding='utf-8',header=True, index=False) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /scrapers/reddit.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Manager 2 | from datetime import datetime 3 | from bs4 import BeautifulSoup 4 | import argparse 5 | import requests 6 | import json 7 | import re 8 | 9 | 10 | def createSoup(url): 11 | REQUEST_AGENT = 'Mozilla/5.0 Chrome/47.0.2526.106 Safari/537.36' 12 | return BeautifulSoup(requests.get(url, headers={'User-Agent':REQUEST_AGENT}).text, 'lxml') 13 | 14 | def getSearchResults(searchUrl): 15 | posts = [] 16 | while True: 17 | resultPage = createSoup(searchUrl) 18 | posts += resultPage.findAll('div', {'class':'search-result-link'}) 19 | footer = resultPage.findAll('a', {'rel':'nofollow next'}) 20 | if footer: 21 | searchUrl = footer[-1]['href'] 22 | else: 23 | return posts 24 | 25 | def parsePost(post, results): 26 | time = post.find('time')['datetime'] 27 | date = datetime.strptime(time[:19], '%Y-%m-%dT%H:%M:%S') 28 | title = post.find('a', {'class':'search-title'}).text 29 | score = post.find('span', {'class':'search-score'}).text 30 | score = int(re.match(r'[+-]?\d+', score).group(0)) 31 | author = post.find('a', {'class':'author'}).text 32 | subreddit = post.find('a', {'class':'search-subreddit-link'}).text 33 | commentsTag = post.find('a', {'class':'search-comments'}) 34 | url = commentsTag['href'] 35 | numComments = int(re.match(r'\d+', commentsTag.text).group(0)) 36 | #print("\n" + str(date)[:19] + ":", numComments, score, author, subreddit, title) 37 | results.append({'title': title, 'url': url, 'date': str(date), 'score': score, 38 | 'author': author, 'subreddit': subreddit}) 39 | 40 | def checkReddit(name, folder): 41 | SITE_URL = 'https://old.reddit.com/' 42 | searchUrl = SITE_URL + 'search?q="' + name + '"' 43 | fileName = folder + '/' + name.replace(' ', '_') + '_redditResults.json' 44 | try: 45 | product = json.load(open(fileName)) 46 | except FileNotFoundError: 47 | print('Creating json file') 48 | product = {} 49 | print('Search URL:', searchUrl) 50 | posts = getSearchResults(searchUrl) 51 | if (len(posts) == 0): 52 | return "Found 0 matches in Reddit" 53 | print('Started scraping', len(posts), 'posts.') 54 | keyword = name.replace(' ', '-') 55 | product[keyword] = {} 56 | product[keyword]['subreddit'] = 'all' 57 | results = [] 58 | i = 0 59 | for post in posts: 60 | parsePost(post, results) 61 | if i == len(posts): 62 | break 63 | i += 1 64 | product[keyword]['posts'] = list(results) 65 | print('Found', len(product[keyword]['posts']), 'results') 66 | with open(fileName, 'w', encoding='utf-8') as f: 67 | json.dump(product, f, indent=4, ensure_ascii=False) -------------------------------------------------------------------------------- /scrapers/twitter_scraper.py: -------------------------------------------------------------------------------- 1 | from twitter import * 2 | 3 | import sys 4 | sys.path.append(".") 5 | import config_keys 6 | import pandas as pd 7 | 8 | #----------------------------------------------------------------------- 9 | # create twitter API object 10 | #----------------------------------------------------------------------- 11 | twitter = Twitter(auth = OAuth(config_keys.access_key, 12 | config_keys.access_secret, 13 | config_keys.consumer_key, 14 | config_keys.consumer_secret)) 15 | 16 | def checkTwitter(name, folder): 17 | query = twitter.search.tweets(q=name) 18 | created = [] 19 | screen_name = [] 20 | tweet = [] 21 | for result in query["statuses"]: 22 | created.append(result["created_at"]) 23 | screen_name.append(result["user"]["screen_name"]) 24 | tweet.append(result["text"]) 25 | result_df = pd.DataFrame({'Created At':created, 'User':screen_name,'Message':tweet}) 26 | print("Found " + str(len(result_df)) + " matches in Twitter") 27 | fileName = folder + '/' + name.replace(' ', '_') + '_tweets.csv' 28 | result_df.to_csv(fileName, encoding='utf-8', header=True, index=False) --------------------------------------------------------------------------------