├── .gitignore
├── README.md
├── scraper.py
└── scrapers
    ├── CSL.py
    ├── MAS.py
    ├── PepScraper.py
    ├── StateCourt.py
    ├── UN_Sanctions.py
    ├── __pycache__
        ├── CSL.cpython-36.pyc
        ├── MAS.cpython-36.pyc
        ├── PepScraper.cpython-36.pyc
        ├── UN_Sanctions.cpython-36.pyc
        ├── cia_gov.cpython-36.pyc
        ├── google.cpython-36.pyc
        ├── panama.cpython-36.pyc
        ├── reddit.cpython-36.pyc
        └── twitter_scraper.cpython-36.pyc
    ├── cia_gov.py
    ├── fb_public_search.py
    ├── google.py
    ├── panama.py
    ├── reddit.py
    └── twitter_scraper.py


/.gitignore:
--------------------------------------------------------------------------------
1 | results/
2 | Scrap web.ipynb
3 | .ipynb_checkpoints/
4 | .idea/
5 | config_keys.py
6 | *.ipynb
7 | __pycache__/config_keys.cpython-36.pyc
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Person Scraper
 2 | 
 3 | Given a name of a person, the scraper will cross check against the following databases:
 4 | 
 5 | - [X] OFAC SDN
 6 | - [X] The Panama Papers
 7 | - [X] UN Sanctions
 8 | - [X] US Sanctions
 9 | - [X] MAS Sanctions and Freezing of assets
10 | - [X] PEP databases
11 | - [X] CIA Database
12 | - [ ] Credit bureaus
13 | - [X] Facebook
14 | - [ ] LinkedIn
15 | - [X] Twitter
16 | - [ ] Criminal Records
17 | - [X] Court Records
18 | - [X] Google
19 | - [ ] FATF
20 | - [X] Reddit
21 | 
22 | 
23 | 
24 | ### Prerequisites
25 | 
26 | The code runs on Python 3.X and these are the packages you need:
27 | 
28 | ```
29 | pandas
30 | beautifulsoup4
31 | urllib
32 | python-linkedin
33 | ```
34 | 
35 | ### Installing
36 | 
37 | Install the following packages if you have not:
38 | 
39 | ```
40 | pip install pandas
41 | pip install urllib
42 | pip install beautifulsoup4
43 | pip install python-linkedin
44 | ```
45 | 
46 | 
47 | ### Running the scrapper
48 | 
49 | Here's an example if you want to scrape on "Osama Bin Laden"
50 | 
51 | ```
52 | python scraper.py -n "Osama Bin Laden"
53 | ```
54 | 
55 | And the output will be something like this:
56 | ```
57 | Checking for Osama Bin Laden...
58 | CSL check:
59 | Found 1 matches in CSL
60 | Panama Papers check:
61 | Found 0 matches in Panama Papers
62 | ```
63 | 
64 | The search results can be found in the results/name folder
65 | ```
66 | person-scraper/
67 | |-- scrapers/
68 | |   |-- CSL.py
69 | |   |-- panama.py
70 | |
71 | |-- results/
72 | |   |-- Osama_Bin_Laden/
73 | |   |   |-- Osama_Bin_Laden_CSL.json
74 | |   |   |-- Osama_Bin_Laden_PanamaPapers.csv
75 | |
76 | |-- scraper.py
77 | |-- README
78 | |-- .gitignore
79 | 


--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | from scrapers import CSL, panama, twitter_scraper, google, cia_gov, reddit, MAS, UN_Sanctions, PepScraper
 4 | 
 5 | parser = argparse.ArgumentParser(description='Person Scrapper')
 6 | requiredNamed = parser.add_argument_group('required named arguments')
 7 | requiredNamed.add_argument('--name','-n', required=True, type=str, help='Example -n Osama')
 8 | 
 9 | args = parser.parse_args()
10 | 
11 | def scrape(name):
12 |     folderName = name.replace(' ', '_')
13 |     cwd = os.getcwd()
14 |     newFolder = os.path.join(os.path.join(cwd,'results'),folderName)
15 |     if not os.path.exists(newFolder):
16 |         os.makedirs(newFolder)
17 |     print("Checking for " + name + "...")
18 |     print("CSL check:")
19 |     CSL.checkCSL(name, newFolder)
20 |     print("Panama Papers check:")
21 |     panama.checkPanama(name, newFolder)
22 |     print("CIA Gov check:")
23 |     cia_gov.checkCIA(name, newFolder)
24 |     print("Twitter check:")
25 |     twitter_scraper.checkTwitter(name, newFolder)
26 |     print("Google check:")
27 |     google.checkGoogle(name, newFolder)
28 |     print("Reddit check:")
29 |     reddit.checkReddit(name, newFolder)
30 |     print("MAS check:")
31 |     MAS.getMASResults(name, newFolder)
32 |     print("UN Sanctions check:")
33 |     UN_Sanctions.getUnResults(name, newFolder)
34 |     print("PEP Database check:")
35 |     PepScraper.getPepResults(name, newFolder)
36 | scrape(args.name)


--------------------------------------------------------------------------------
/scrapers/CSL.py:
--------------------------------------------------------------------------------
 1 | import urllib.request
 2 | import json
 3 | 
 4 | def checkCSL(name, folder):
 5 |     urlname = name.replace(' ', '%20')
 6 |     url = 'https://api.trade.gov/consolidated_screening_list/search?api_key=AWsbLvhl2q_-vqIz1V00QX-o&name=%s&type=Individual' %(urlname)
 7 |     contents = urllib.request.urlopen(url).read()
 8 |     results = json.loads(contents)["results"]
 9 |     fileName = folder + '/' + name.replace(' ', '_') + '_CSL.json'
10 |     #TO-DO process the data dump
11 |     with open(fileName, 'w') as outfile:
12 |         json.dump(results, outfile)
13 |     print("Found " + str(len(results)) + " matches in CSL")


--------------------------------------------------------------------------------
/scrapers/MAS.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # coding: utf-8
 3 | 
 4 | # In[1]:
 5 | 
 6 | import urllib
 7 | from bs4 import BeautifulSoup
 8 | import pandas as pd
 9 | import re
10 | 
11 | 
12 | # In[16]:
13 | 
14 | def getMASResults(name, folder):
15 |     df = pd.DataFrame(columns=['Names','Data'])
16 |     
17 |     name_lower = name.lower()
18 |     dprk = 'https://scsanctions.un.org/en/?keywords=dprk'
19 |     drc = 'https://scsanctions.un.org/en/?keywords=drc'
20 |     southSudan ='https://scsanctions.un.org/en/?keywords=southsudan'
21 |     somalia = 'https://scsanctions.un.org/en/?keywords=somalia'
22 |     iran = 'https://scsanctions.un.org/en/?keywords=iran'
23 |     libya = 'https://scsanctions.un.org/en/?keywords=libya'
24 |     sudan ='https://scsanctions.un.org/en/?keywords=sudan'
25 |     yemen = 'https://scsanctions.un.org/en/?keywords=yemen'
26 |     al_qaida = 'https://scsanctions.un.org/en/?keywords=al-qaida'
27 |     taliban = 'https://scsanctions.un.org/en/?keywords=taliban'
28 |     list_of_urls = []
29 |     list_of_urls.append(dprk)
30 |     list_of_urls.append(drc)
31 |     list_of_urls.append(southSudan)
32 |     list_of_urls.append(somalia)
33 |     list_of_urls.append(iran)
34 |     list_of_urls.append(libya)
35 |     list_of_urls.append(sudan)
36 |     list_of_urls.append(yemen)
37 |     list_of_urls.append(al_qaida)
38 |     list_of_urls.append(taliban)
39 |     fileName = folder + '/' + name.replace(' ', '_') + '_MASResults.csv'
40 | 
41 |     for sites in list_of_urls:
42 |         page = urllib.request.urlopen(sites)
43 |         soup = BeautifulSoup(page, 'html.parser')
44 | 
45 |         item_list = soup.findAll('tr', attrs={'class':'rowtext'})
46 | 
47 |         for item in item_list:
48 |             text = item.text
49 |             text = text.replace('\n', '')
50 |             text = text.replace('\t', '')
51 |             text = text.replace('\xa0',' ')
52 |             text = text.replace('click here', '')
53 |             name_index = text.find('Name: 1:')
54 |             if name_index != -1:
55 |                 first_name = text[name_index + len('Name: 1:'): text.find('2:')]
56 |                 second_name = text[text.find('2:')+ 1: text.find('3:')]
57 |                 third_name = text[text.find('3:')+1: text.find('4:')]
58 |                 full_name = first_name + second_name + third_name
59 |                 full_name = full_name.replace(' na ','')
60 |                 full_name= full_name.replace(':', '')
61 |                 full_name = full_name.strip()
62 |                 full_name = ' '.join(full_name.split())
63 |                 if full_name.lower() ==name_lower:
64 |                     df.loc[0] = [name_lower, text]
65 |                     df.to_csv(fileName,encoding='utf-8',header=True, index=False)
66 |                     print(name + ' FOUND in MAS Sanctions List' + " " + sites)
67 |             else: 
68 |                 full_name = text[text.find('Name:')+ len('Name:') : text.find('A.k.a.')]
69 |                 full_name = full_name.replace(' na ','')
70 |                 full_name= full_name.replace(':', '')
71 |                 full_name = full_name.strip()
72 |                 full_name = ' '.join(full_name.split())
73 |                 if full_name.lower() ==name_lower:
74 |                     df.loc[0] = [name_lower, text]
75 |                     df.to_csv(fileName,encoding='utf-8',header=True, index=False)
76 |                     print(name + ' FOUND in MAS Sanctions List' + " " + sites)
77 |     print(name + ' NOT FOUND in MAS Sanctions List')
78 |     
79 | 
80 | 
81 | # In[17]:
82 | 
83 | 
84 | # In[ ]:
85 | 
86 | 
87 | 
88 | 
89 | # In[ ]:
90 | 
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/scrapers/PepScraper.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import urllib
 3 | from bs4 import BeautifulSoup
 4 | import pandas as pd
 5 | 
 6 | def nameMatching(testName, dbName, threshold=0.7):
 7 |     testName = testName.lower()
 8 |     dbName = dbName.strip().lower()
 9 |     testList = testName.split(' ')
10 |     dbNameList = dbName.split(' ')
11 |     count = 0
12 |     for name in testList:
13 |         if name in dbNameList:
14 |             count+= 1
15 |     count = count/len(testList)
16 |     if count > threshold:
17 |         return True
18 |     return False
19 | 
20 | def getPepResults(name, folder):
21 |     country_list = ["AF.html","AL.html","AG.html","AN.html","AO.html","AC.html","AR.html","AM.html","AA.html","AS.html","AU.html","AJ.html","BF.html","BA.html","BG.html","BB.html","BO.html","BE.html","BH.html","BN.html","BD.html","BT.html","BL.html","BK.html","BC.html","BR.html","BX.html","BU.html","UV.html","BM.html","BY.html","CV.html","CB.html","CM.html","CA.html","CT.html","CD.html","CI.html","CH.html","CO.html","CN.html","CG.html","CF.html","CW.html","CS.html","IV.html","HR.html","CU.html","CY.html","EZ.html","DA.html","DJ.html","DO.html","DR.html","EC.html","EG.html","ES.html","EK.html","ER.html","EN.html","WZ.html","ET.html","FJ.html","FI.html","FR.html","GB.html","GA.html","GG.html","GM.html","GH.html","GR.html","GJ.html","GT.html","GV.html","PU.html","GY.html","HA.html","VT.html","HO.html","HU.html","IC.html","IN.html","ID.html","IR.html","IZ.html","EI.html","IS.html","IT.html","JM.html","JA.html","JO.html","KZ.html","KE.html","KR.html","KN.html","KS.html","KV.html","KU.html","KG.html","LA.html","LG.html","LE.html","LT.html","LI.html","LY.html","LS.html","LH.html","LU.html","MK.html","MA.html","MI.html","MY.html","MV.html","ML.html","MT.html","RM.html","MR.html","MP.html","MX.html","FM.html","MD.html","MN.html","MG.html","MJ.html","MO.html","MZ.html","WA.html","NR.html","NP.html","NL.html","NZ.html","NU.html","NG.html","NI.html","NO.html","MU.html","PK.html","PS.html","PM.html","PP.html","PA.html","PE.html","RP.html","PL.html","PO.html","QA.html","RO.html","RS.html","RW.html","SC.html","ST.html","VC.html","WS.html","SM.html","TP.html","SA.html","SG.html","RI.html","SE.html","SL.html","SN.html","LO.html","SI.html","BP.html","SO.html","SF.html","OD.html","SP.html","CE.html","SU.html","NS.html","SW.html","SZ.html","SY.html","TW.html","TI.html","TZ.html","TH.html","TT.html","TO.html","TN.html","TD.html","TS.html","TU.html","TX.html","TV.html","UG.html","UP.html","AE.html","UK.html","UY.html","UZ.html","NH.html","VE.html","VM.html","YM.html","ZA.html","ZI.html"]
22 |     names = []
23 |     websites = []
24 |     fileName = folder + '/' + name.replace(' ', '_') + '_PEPResults.csv'
25 |     for country in country_list:
26 | 
27 |         sites= "https://www.cia.gov/library/publications/world-leaders-1/"+country
28 |         page = urllib.request.urlopen(sites)
29 |         soup = BeautifulSoup(page, 'html.parser')
30 | 
31 |         item_list = soup.findAll('span', attrs={'class':'cos_name'})
32 | 
33 |         for item in item_list:
34 |             text = item.text
35 |             if nameMatching(name, text):
36 |                 print(name + ' is a PEP' + " " + sites)
37 |                 websites.append(sites)
38 |                 names.append(text.strip())
39 |     if len(names) == 0:
40 |         print(name + ' NOT FOUND as a PEP')
41 |     else:
42 |         df = pd.DataFrame({'Names':names, 'Websites':websites})
43 |         df.to_csv(fileName, header=True, index=False)
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/scrapers/StateCourt.py:
--------------------------------------------------------------------------------
 1 | import urllib
 2 | from bs4 import BeautifulSoup
 3 | import pandas as pd
 4 | import re
 5 | 
 6 | def getStateCourtResults(name):
 7 |     
 8 |     name_lower = name.lower()
 9 |     search_name = name_lower.replace(' ','%20')
10 |     stateCourt = 'https://www.statecourts.gov.sg/Pages/BasicSearchResult.aspx?k='+ search_name
11 |     page = urllib.request.urlopen(stateCourt)
12 |     soup = BeautifulSoup(page, 'html.parser')
13 |     
14 |     table = soup.find('table')
15 |     contents = table.text.lower()
16 |     
17 |     if contents.find(name_lower)!= -1:
18 |         return name + ' FOUND in State Court Records'
19 |     
20 |     
21 |     return name + ' NOT FOUND in State Court Records'
22 | 
23 | 
24 | 
25 | #print(getStateCourtResults('kang CHI LOONG'))
26 | #print(getStateCourtResults('PANG CHI KANG'))
27 | 
28 | 
29 | 
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/scrapers/UN_Sanctions.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # coding: utf-8
 3 | 
 4 | # In[2]:
 5 | 
 6 | import urllib
 7 | from bs4 import BeautifulSoup
 8 | import pandas as pd
 9 | import re
10 | 
11 | 
12 | # In[8]:
13 | 
14 | def getUnResults(name, folder):
15 |     df = pd.DataFrame(columns=['Names','Data'])
16 |     
17 |     search_name1 = name.lower().replace(' ','%7C')
18 |     search_name2= name.lower().replace(' ','+')
19 |     name_lower = name.lower()
20 |     sites= "https://scsanctions.un.org/en/?keywords=+%22" + search_name1+ "%22&per-page=2500&sections=s&sections=s&sort=id&includes=%22"+search_name2 + "%22&excludes=&committee=&nationality=&reference_from=&reference_to="
21 | 
22 |     page = urllib.request.urlopen(sites)
23 |     soup = BeautifulSoup(page, 'html.parser')
24 | 
25 |     item_list = soup.findAll('tr', attrs={'class':'rowtext'})
26 |     fileName = folder + '/' + name.replace(' ', '_') + '_UNResults.csv'
27 | 
28 |     for item in item_list:
29 |         text = item.text
30 |         text = text.replace('\n', '')
31 |         text = text.replace('\t', '')
32 |         text = text.replace('\xa0',' ')
33 |         text = text.replace('click here', '')
34 |         name_index = text.find('Name: 1:')
35 |         if name_index != -1:
36 |             first_name = text[name_index + len('Name: 1:'): text.find('2:')]
37 |             second_name = text[text.find('2:')+ 1: text.find('3:')]
38 |             third_name = text[text.find('3:')+1: text.find('4:')]
39 |             full_name = first_name + second_name + third_name
40 |             full_name = full_name.replace(' na ','')
41 |             full_name= full_name.replace(':', '')
42 |             full_name = full_name.strip()
43 |             full_name = ' '.join(full_name.split())
44 |             if full_name.lower() == name_lower:
45 |                 
46 |                 df.loc[0] = [name_lower, text]
47 |                 df.to_csv(fileName,encoding='utf-8',header=True, index=False)
48 |                 print(name + ' FOUND in MAS Sanctions List' + " " + sites)
49 |         else: 
50 |             full_name = text[text.find('Name:')+ len('Name:') : text.find('A.k.a.')]
51 |             full_name = full_name.replace(' na ','')
52 |             full_name= full_name.replace(':', '')
53 |             full_name = full_name.strip()
54 |             full_name = ' '.join(full_name.split())
55 |             if full_name.lower() == name_lower:
56 |                 
57 |                 df.loc[0] = [name_lower, text]
58 |                 df.to_csv(fileName,encoding='utf-8',header=True, index=False)
59 |                 print(name + ' FOUND in MAS Sanctions List' + " " + sites)
60 |     print(name + ' NOT FOUND in MAS Sanctions List')
61 | 
62 | 
63 | # In[ ]:
64 | 
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/scrapers/__pycache__/CSL.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/CSL.cpython-36.pyc


--------------------------------------------------------------------------------
/scrapers/__pycache__/MAS.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/MAS.cpython-36.pyc


--------------------------------------------------------------------------------
/scrapers/__pycache__/PepScraper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/PepScraper.cpython-36.pyc


--------------------------------------------------------------------------------
/scrapers/__pycache__/UN_Sanctions.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/UN_Sanctions.cpython-36.pyc


--------------------------------------------------------------------------------
/scrapers/__pycache__/cia_gov.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/cia_gov.cpython-36.pyc


--------------------------------------------------------------------------------
/scrapers/__pycache__/google.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/google.cpython-36.pyc


--------------------------------------------------------------------------------
/scrapers/__pycache__/panama.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/panama.cpython-36.pyc


--------------------------------------------------------------------------------
/scrapers/__pycache__/reddit.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/reddit.cpython-36.pyc


--------------------------------------------------------------------------------
/scrapers/__pycache__/twitter_scraper.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhunhung/person_scraper/1efbe373c15c52dd5121416e984d916804067806/scrapers/__pycache__/twitter_scraper.cpython-36.pyc


--------------------------------------------------------------------------------
/scrapers/cia_gov.py:
--------------------------------------------------------------------------------
 1 | # from selenium import webdriver
 2 | # from selenium.webdriver.common.keys import Keys
 3 | import urllib
 4 | from bs4 import BeautifulSoup
 5 | import pandas as pd
 6 | import re
 7 | 
 8 | def getCIAResults(name):    
 9 |     subject = name.lower().replace(' ','+')
10 |     url = 'https://www.cia.gov/search?q=' + subject + '&site=CIA&output=xml_no_dtd&client=CIA&myAction=/search&proxystylesheet=CIA&submitMethod=get&ie=UTF-8&ulang=en&ip=137.132.84.43&access=p&sort=date:D:L:d1&entqr=3&entqrm=0&wc=200&wc_mc=1&oe=UTF-8&ud=1&filter=0'
11 |     page = urllib.request.urlopen(url)
12 |     soup = BeautifulSoup(page, 'html.parser')    
13 |     result_info = soup.find(id="content-core")
14 |     result_info_items = result_info.find_all('b', limit = 4)
15 |     
16 |     info = []    
17 |     for ri in result_info_items:
18 |         info.append(ri.text.strip())
19 |     
20 |     test = info.pop(1)
21 |     if test == name.lower():
22 |         return pd.DataFrame(columns={'title','date','description','link'})
23 |     
24 |     num_results= int(info.pop(1))
25 |     num_pages = int(num_results/10 + (num_results % 10 > 0))
26 | 
27 | 
28 |     counter = 0    
29 |     pages = []
30 |     for i in range(0,num_pages):
31 |         url2 = 'https://www.cia.gov/search?q=' + subject + '&site=CIA&output=xml_no_dtd&client=CIA&myAction=/search&proxystylesheet=CIA&submitMethod=get&ie=UTF-8&ulang=en&ip=137.132.84.43&access=p&sort=date:D:L:d1&entqr=3&entqrm=0&wc=200&wc_mc=1&oe=UTF-8&ud=1&filter=0&start=' + str(counter)
32 |         pages.append(url2)
33 |         counter += 10
34 |     
35 |     link = []
36 |     title = []    
37 |     desc = []
38 |     date = []        
39 |         
40 |     for item in pages:
41 |         pg = urllib.request.urlopen(item)
42 |         soup2 = BeautifulSoup(pg, 'html.parser')
43 |         result_list = soup2.find(id="content-core")
44 |         
45 |         #get link and title
46 | 
47 |         result_list_items = result_list.find_all('a', ctype = "c")
48 |         for result in result_list_items:
49 |             link.append(result.get('href')) #get links
50 |             title.append(result.text) #get title
51 |             
52 |         result_list_desc = result_list.find_all('td', class_="s")
53 |         for result2 in result_list_desc:
54 |             temp = result2.text.strip() 
55 |             desc.append(temp[:-19])
56 |             date.append(temp[-10:])
57 |     
58 |     data = pd.DataFrame({'title':title, 'date': date, 'description': desc, 'link':link})
59 |     return data
60 | 
61 | 
62 | def checkCIA(name, folder):   
63 |     results_table = getCIAResults(name)
64 |     if len(results_table) == 0:
65 |         print('No results found in www.cia.gov')
66 |     else:
67 |         print('Found ' + str(len(results_table)) + ' matches in www.cia.gov')
68 |         fileName = folder + '/' + name.replace(' ', '_') + '_CIAResults.csv'
69 |         results_table.to_csv(fileName, encoding='utf-8',header=True, index=False)
70 | 
71 | 
72 | 
73 | 
74 | 
75 |     
76 |     
77 |     
78 | 
79 |     
80 |     
81 | 


--------------------------------------------------------------------------------
/scrapers/fb_public_search.py:
--------------------------------------------------------------------------------
 1 | import urllib
 2 | from bs4 import BeautifulSoup
 3 | import pandas as pd
 4 | import re
 5 | 
 6 | def getFBProfiles(name):
 7 |     
 8 |     subject = name.lower().replace(' ','-')
 9 |     links= []
10 |     names= []
11 |   
12 |     url = 'https://www.facebook.com/public/' + subject 
13 |     page = urllib.request.urlopen(url)
14 |     soup = BeautifulSoup(page, 'html.parser')    
15 |     text = str(soup)
16 |     result = re.findall('<a class="_32mo"(.*?)/span>',text)    
17 |     
18 |     for r in result:
19 |         link = re.search('href="(.*)">',r)
20 |         name = re.search('><span>(.*)<', r)
21 |         links.append(link.group(1))
22 |         names.append(name.group(1))            
23 |     data = pd.DataFrame({'Name':names, 'Link': links})
24 |     return data
25 | 
26 | 
27 |         
28 | def checkFB(name,folder):   
29 |     results_table = getFBProfiles(name)
30 |     if len(results_table) == 0:
31 |         print('No results found in www.facebook.com')
32 |     else:
33 |         print('Found ' + str(len(results_table)) + ' matches in www.facebook.com')
34 |         results_table.to_csv("FB.csv" , header = True, index = False )
35 |         fileName = folder + '/' + name.replace(' ', '_') + '_FBResults.csv'
36 |         results_table.to_csv(fileName, encoding='utf-8',header=True, index=False)
37 | 
38 | #test: 
39 | #checkFB('Daniel Tan')
40 |       
41 | 


--------------------------------------------------------------------------------
/scrapers/google.py:
--------------------------------------------------------------------------------
 1 | import urllib.request
 2 | import json
 3 | import config_keys
 4 | 
 5 | def checkGoogle(name, folder):
 6 |     urlname = name.replace(' ', '%20')
 7 |     url = 'https://www.googleapis.com/customsearch/v1?key=%s&cx=008469397288160540229:emgz62cres4&q=%s' % (config_keys.googleAPI, urlname)
 8 |     contents = urllib.request.urlopen(url).read()
 9 |     results = json.loads(contents)["items"]
10 |     matchCount = json.loads(contents)['searchInformation']['totalResults']
11 |     fileName = folder + '/' + name.replace(' ', '_') + '_GoogleSearch.json'
12 |     # TO-DO process the data dump
13 |     with open(fileName, 'w') as outfile:
14 |         json.dump(results, outfile)
15 |     print("Found " + matchCount + " matches in Google Search")


--------------------------------------------------------------------------------
/scrapers/panama.py:
--------------------------------------------------------------------------------
 1 | import urllib
 2 | from bs4 import BeautifulSoup
 3 | import pandas as pd
 4 | import re
 5 | 
 6 | 
 7 | 
 8 | def getPanamaResults(name):
 9 |     
10 |     search_name = name.lower().replace(' ','+')
11 |     initial_page = 'https://offshoreleaks.icij.org/search?cat=1&e=&q='+search_name+'&utf8=%E2%9C%93'
12 | 
13 |     page = urllib.request.urlopen(initial_page)
14 | 
15 |     soup = BeautifulSoup(page, 'html.parser')
16 | 
17 |     results_count = soup.find('div', attrs={'id':'results_wrapper'})
18 | 
19 |     #Get number of officers/individual count
20 |     officerCategory = str(results_count.find_all('li')[1]).replace('\n', '')
21 |     countpat = r'(?<=\().+?(?=\))'
22 |     resultsCount = int(re.findall(countpat,officerCategory)[0])
23 |     results_table = pd.DataFrame(columns=['Names', 'Node', 'Source'],index=range(resultsCount)) # I know the size 
24 | 
25 |     #Iterate through the pages and search results
26 |     pageCount = 0
27 |     namepat = r'(?<=\>).+?(?=\<)'
28 |     nodepat = r'(?<=\/).+?(?=\")'
29 |     row_marker = 0
30 |     for i in range((resultsCount//100)+1):
31 |         newpage = 'https://offshoreleaks.icij.org/search?cat=1&e=&from=' + str(pageCount) + '&q='+search_name+'&utf8=%E2%9C%93'
32 |         newpage = urllib.request.urlopen(newpage)
33 |         newsoup = BeautifulSoup(newpage, 'html.parser')
34 |         table_box = newsoup.find('table', attrs={'class':'search_results'})
35 |         names = table_box.find_all('a')
36 |         for row in names:
37 |             str_row = str(row).replace('\n','')
38 |             if 'nodes' in str_row:
39 |                 person_name = re.findall(namepat, str_row)[0].strip()
40 |                 results_table.iat[row_marker, 0] = person_name
41 |                 node = re.findall(nodepat, str_row)[0].split('/')[1]
42 |                 results_table.iat[row_marker, 1] = node  
43 |             else:
44 |                 source_name = re.findall(namepat, str_row)[0]
45 |                 results_table.iat[row_marker, 2] = source_name
46 |                 row_marker += 1
47 |         pageCount += 100
48 |     return results_table
49 | 
50 | 
51 | 
52 | #To Compare Names
53 | def nameMatching(testName, dbName, threshold=0.7):
54 |     testName = testName.lower()
55 |     dbName = dbName.lower()
56 |     testList = testName.split(' ')
57 |     dbNameList = dbName.split(' ')
58 |     count = 0
59 |     for name in testList:
60 |         if name in dbNameList:
61 |             count+= 1
62 |     count = count/len(testList)
63 |     if count > threshold:
64 |         return True
65 |     return False
66 | 
67 | 
68 | def checkPanama(testName, folder):
69 |     results_table = getPanamaResults(testName)
70 |     filteredNameDb = pd.DataFrame([results_table.iloc[row] for row in results_table.index if nameMatching(testName, results_table.iloc[row]['Names'])])
71 |     filteredNameDb.reset_index(drop=True, inplace=True)
72 |     if len(filteredNameDb) == 0:
73 |         print('No results found in Panama Papers')
74 |     else:
75 |         print('Found ' + str(len(filteredNameDb)) + ' matches in Panama Papers')
76 |         filteredNameDb[['URL']] = 'https://offshoreleaks.icij.org/nodes/' + filteredNameDb[['Node']]
77 |         filteredNameDb.drop(columns=['Node'], inplace=True)
78 |         fileName = folder + '/' + testName.replace(' ', '_') + '_PanamaPapers.csv'
79 |         filteredNameDb.to_csv(fileName,encoding='utf-8',header=True, index=False)
80 |     
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/scrapers/reddit.py:
--------------------------------------------------------------------------------
 1 | from multiprocessing import Process, Manager
 2 | from datetime import datetime
 3 | from bs4 import BeautifulSoup
 4 | import argparse
 5 | import requests
 6 | import json
 7 | import re
 8 | 
 9 | 
10 | def createSoup(url):
11 |     REQUEST_AGENT = 'Mozilla/5.0 Chrome/47.0.2526.106 Safari/537.36'
12 |     return BeautifulSoup(requests.get(url, headers={'User-Agent':REQUEST_AGENT}).text, 'lxml')
13 | 
14 | def getSearchResults(searchUrl):
15 |     posts = []
16 |     while True:
17 |         resultPage = createSoup(searchUrl)
18 |         posts += resultPage.findAll('div', {'class':'search-result-link'})
19 |         footer = resultPage.findAll('a', {'rel':'nofollow next'})
20 |         if footer:
21 |             searchUrl = footer[-1]['href']
22 |         else:
23 |             return posts
24 | 
25 | def parsePost(post, results):
26 |     time = post.find('time')['datetime']
27 |     date = datetime.strptime(time[:19], '%Y-%m-%dT%H:%M:%S')
28 |     title = post.find('a', {'class':'search-title'}).text
29 |     score = post.find('span', {'class':'search-score'}).text
30 |     score = int(re.match(r'[+-]?\d+', score).group(0))
31 |     author = post.find('a', {'class':'author'}).text
32 |     subreddit = post.find('a', {'class':'search-subreddit-link'}).text
33 |     commentsTag = post.find('a', {'class':'search-comments'})
34 |     url = commentsTag['href']
35 |     numComments = int(re.match(r'\d+', commentsTag.text).group(0))
36 |     #print("\n" + str(date)[:19] + ":", numComments, score, author, subreddit, title)
37 |     results.append({'title': title, 'url': url, 'date': str(date), 'score': score,
38 |                     'author': author, 'subreddit': subreddit})
39 | 
40 | def checkReddit(name, folder):
41 |     SITE_URL = 'https://old.reddit.com/'
42 |     searchUrl = SITE_URL + 'search?q="' + name + '"'
43 |     fileName = folder + '/' + name.replace(' ', '_') + '_redditResults.json'
44 |     try:
45 |         product = json.load(open(fileName))
46 |     except FileNotFoundError:
47 |         print('Creating json file')
48 |         product = {}
49 |     print('Search URL:', searchUrl)
50 |     posts = getSearchResults(searchUrl)
51 |     if (len(posts) == 0):
52 |         return "Found 0 matches in Reddit"
53 |     print('Started scraping', len(posts), 'posts.')
54 |     keyword = name.replace(' ', '-')
55 |     product[keyword] = {}
56 |     product[keyword]['subreddit'] = 'all'
57 |     results = []
58 |     i = 0
59 |     for post in posts:
60 |         parsePost(post, results)
61 |         if i == len(posts):
62 |             break
63 |         i += 1
64 |     product[keyword]['posts'] = list(results)
65 |     print('Found', len(product[keyword]['posts']), 'results')
66 |     with open(fileName, 'w', encoding='utf-8') as f:
67 |         json.dump(product, f, indent=4, ensure_ascii=False)


--------------------------------------------------------------------------------
/scrapers/twitter_scraper.py:
--------------------------------------------------------------------------------
 1 | from twitter import *
 2 | 
 3 | import sys
 4 | sys.path.append(".")
 5 | import config_keys
 6 | import pandas as pd
 7 | 
 8 | #-----------------------------------------------------------------------
 9 | # create twitter API object
10 | #-----------------------------------------------------------------------
11 | twitter = Twitter(auth = OAuth(config_keys.access_key,
12 |                   config_keys.access_secret,
13 |                   config_keys.consumer_key,
14 |                   config_keys.consumer_secret))
15 | 
16 | def checkTwitter(name, folder):
17 |     query = twitter.search.tweets(q=name)
18 |     created = []
19 |     screen_name = []
20 |     tweet = []
21 |     for result in query["statuses"]:
22 |         created.append(result["created_at"])
23 |         screen_name.append(result["user"]["screen_name"])
24 |         tweet.append(result["text"])
25 |     result_df = pd.DataFrame({'Created At':created, 'User':screen_name,'Message':tweet})
26 |     print("Found " + str(len(result_df)) + " matches in Twitter")
27 |     fileName = folder + '/' + name.replace(' ', '_') + '_tweets.csv'
28 |     result_df.to_csv(fileName, encoding='utf-8', header=True, index=False)


--------------------------------------------------------------------------------