├── CFTC.py ├── CME1.py ├── CME2.py ├── CME3.py ├── CQF.py ├── LICENSE ├── LME.py ├── MENA Newsletter.py ├── Macrotrends.py ├── README.md ├── SHFE.py ├── Springer.py ├── Tomtom.py ├── US Federal Holidays.py ├── US Treasury.py ├── WallStreetBets.py └── preview ├── cme1 html.PNG ├── cme1 tree.png ├── cme2 euronext.PNG ├── cme2 inspect element.png ├── cme2 json.PNG ├── cme2 link address.png ├── cme2 network.PNG ├── cme2 request url.PNG ├── cme2 url.PNG ├── cqf login link.PNG ├── cqf post form.PNG ├── cqf query.PNG ├── cqf request header.PNG ├── legality.PNG ├── mena bat file.PNG ├── mena bat format.PNG ├── mena check.PNG ├── mena create task.PNG ├── mena finito.PNG ├── mena freq.PNG ├── mena python path.PNG ├── mena script name.PNG ├── mena set time.PNG ├── mena start program.PNG ├── mena task name.PNG ├── mena task scheduler.PNG ├── proxy domain.PNG ├── proxy ie.png ├── proxy lan.PNG ├── shfe javascript.png ├── shfe regex.png └── web-scraping-profile.png /CFTC.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | 6 | #scrape cftc trader commitment report 7 | 8 | 9 | # In[1]: 10 | 11 | 12 | import requests 13 | import pandas as pd 14 | import re 15 | import os 16 | os.chdir('H:/') 17 | 18 | 19 | # In[2]: 20 | 21 | 22 | #scraping function 23 | def scrape(url): 24 | 25 | session=requests.Session() 26 | 27 | session.headers.update( 28 | {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}) 29 | 30 | response=session.get(url) 31 | 32 | return response 33 | 34 | 35 | # In[3]: 36 | 37 | 38 | #get data 39 | def etl(response): 40 | 41 | #create a list 42 | text=response.content.decode('utf-8').split('\r') 43 | 44 | 45 | #create index for each block 46 | assets=[i for i in text if 'CHICAGO MERCANTILE EXCHANGE' in i] 47 | ind=[text.index(i) for i in assets] 48 | 49 | 50 | overall=[] 51 | 52 | #etl 53 | for i in ind: 54 | 55 | commodity=text[i].split(' - CHICAGO MERCANTILE EXCHANGE')[0].replace('\n','') 56 | commodity_code=text[i].split('Code-')[-1].replace('\n','') 57 | date=re.search('\d{2}\/\d{2}\/\d{2}',text[i+1]).group() 58 | contractunit=re.search('(?<=\().*(?=OPEN INTEREST)',text[i+7]).group().replace(')','') 59 | open_interest=re.search('(?<=OPEN INTEREST\:).*',text[i+7]).group() 60 | non_commercial_long_commitment,non_commercial_short_commitment, \ 61 | non_commercial_spread_commitment,commercial_long_commitment, \ 62 | commercial_short_commitment,total_long_commitment,total_short_commitment, \ 63 | non_reportable_long_commitment,non_reportable_short_commitment=re.findall('\S+',text[i+9]) 64 | changedate=re.search('\d{2}\/\d{2}\/\d{2}',text[i+11]).group() 65 | change_open_interest=text[i+11].split(' ')[-1].replace(')','') 66 | non_commercial_long_change,non_commercial_short_change, \ 67 | non_commercial_spread_change,commercial_long_change, \ 68 | commercial_short_change,total_long_change,total_short_change, \ 69 | non_reportable_long_change,non_reportable_short_change=re.findall('\S+',text[i+12]) 70 | non_commercial_long_percent,non_commercial_short_percent, \ 71 | non_commercial_spread_percent,commercial_long_percent, \ 72 | commercial_short_percent,total_long_percent,total_short_percent, \ 73 | non_reportable_long_percent,non_reportable_short_percent=re.findall('\S+',text[i+15]) 74 | totaltraders=text[i+17].split(' ')[-1].replace(')','') 75 | non_commercial_long_traders,non_commercial_short_traders, \ 76 | 
non_commercial_spread_traders,commercial_long_traders, \ 77 | commercial_short_traders,total_long_traders,total_short_traders=re.findall('\S+',text[i+18]) 78 | 79 | temp=[commodity,commodity_code,date,contractunit,open_interest, 80 | non_commercial_long_commitment,non_commercial_short_commitment, 81 | non_commercial_spread_commitment,commercial_long_commitment, 82 | commercial_short_commitment,total_long_commitment, 83 | total_short_commitment,non_reportable_long_commitment, 84 | non_reportable_short_commitment,changedate,change_open_interest, 85 | non_commercial_long_change,non_commercial_short_change, 86 | non_commercial_spread_change,commercial_long_change, 87 | commercial_short_change,total_long_change,total_short_change, 88 | non_reportable_long_change,non_reportable_short_change, 89 | non_commercial_long_percent,non_commercial_short_percent, 90 | non_commercial_spread_percent,commercial_long_percent, 91 | commercial_short_percent,total_long_percent, 92 | total_short_percent,non_reportable_long_percent, 93 | non_reportable_short_percent,totaltraders, 94 | non_commercial_long_traders,non_commercial_short_traders, 95 | non_commercial_spread_traders,commercial_long_traders, 96 | commercial_short_traders,total_long_traders,total_short_traders] 97 | 98 | overall+=temp 99 | 100 | 101 | colnames=['commodity', 102 | 'commodity_code', 103 | 'date', 104 | 'contract_unit', 105 | 'open_interest', 106 | 'non_commercial_long_commitment', 107 | 'non_commercial_short_commitment', 108 | 'non_commercial_spread_commitment', 109 | 'commercial_long_commitment', 110 | 'commercial_short_commitment', 111 | 'total_long_commitment', 112 | 'total_short_commitment', 113 | 'non_reportable_long_commitment', 114 | 'non_reportable_short_commitment', 115 | 'change_date', 116 | 'change_open_interest', 117 | 'non_commercial_long_change', 118 | 'non_commercial_short_change', 119 | 'non_commercial_spread_change', 120 | 'commercial_long_change', 121 | 'commercial_short_change', 122 | 'total_long_change', 123 | 'total_short_change', 124 | 'non_reportable_long_change', 125 | 'non_reportable_short_change', 126 | 'non_commercial_long_percent', 127 | 'non_commercial_short_percent', 128 | 'non_commercial_spread_percent', 129 | 'commercial_long_percent', 130 | 'commercial_short_percent', 131 | 'total_long_percent', 132 | 'total_short_percent', 133 | 'non_reportable_long_percent', 134 | 'non_reportable_short_percent', 135 | 'total_traders', 136 | 'non_commercial_long_traders', 137 | 'non_commercial_short_traders', 138 | 'non_commercial_spread_traders', 139 | 'commercial_long_traders', 140 | 'commercial_short_traders', 141 | 'total_long_traders', 142 | 'total_short_traders'] 143 | 144 | 145 | #create dataframe 146 | df=pd.DataFrame(columns=colnames) 147 | 148 | 149 | for i in range(len(colnames)): 150 | df[colnames[i]]=overall[i::len(colnames)] 151 | 152 | 153 | #transform 154 | ind=['commodity', 'commodity_code','change_date', 155 | 'date', 'contract_unit', 'open_interest', 156 | 'change_open_interest','total_traders'] 157 | 158 | df=df.melt(id_vars=ind,value_vars=[i for i in df.columns if i not in ind]) 159 | 160 | #isolate position 161 | df['position']='' 162 | 163 | ind_long=df.loc[df['variable'].apply(lambda x: 'long' in x )].index 164 | ind_short=df.loc[df['variable'].apply(lambda x: 'short' in x )].index 165 | ind_spread=df.loc[df['variable'].apply(lambda x: 'spread' in x )].index 166 | 167 | for i in ind_spread: 168 | df.at[i,'position']='spread' 169 | for i in ind_short: 170 | df.at[i,'position']='short' 171 | for i in 
ind_long: 172 | df.at[i,'position']='long' 173 | 174 | df['variable']=df['variable'].str.replace('long_','').str.replace('short_','').str.replace('spread_','') 175 | 176 | #isolate type 177 | df['type']=df['variable'].apply(lambda x:'_'.join(x.split('_')[:-1])) 178 | 179 | #clean variable name 180 | df['variable']=df['variable'].apply(lambda x:x.split('_')[-1]) 181 | 182 | df['variable']=df['variable'].str.replace('percent', 183 | 'percent_of_open_interest_for_each_type_of_traders') 184 | 185 | df['variable']=df['variable'].str.replace('traders', 186 | 'number_of_traders_in_each_type') 187 | 188 | #change col order 189 | df=df[['commodity', 'commodity_code', 'change_date', 190 | 'date', 'contract_unit','open_interest', 191 | 'change_open_interest', 'total_traders', 192 | 'type','position','variable','value', ]] 193 | 194 | return df 195 | 196 | 197 | # In[4]: 198 | 199 | def main(): 200 | 201 | url='https://www.cftc.gov/dea/futures/deacmesf.htm' 202 | 203 | #scrape 204 | response=scrape(url) 205 | 206 | #get data 207 | df=etl(option_url) 208 | 209 | df.to_csv('trader commitment report.csv',index=False) 210 | 211 | 212 | if __name__ == "__main__": 213 | main() 214 | 215 | -------------------------------------------------------------------------------- /CME1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 20 16:23:53 2018 4 | 5 | """ 6 | 7 | #scraping CME is soooo effortless 8 | #just simple html parse tree 9 | #how i love Chicago 10 | import urllib.request as u 11 | import pandas as pd 12 | from bs4 import BeautifulSoup as bs 13 | import os 14 | os.chdir('H:/') 15 | 16 | 17 | # 18 | def scrape(category_name,commodity_name): 19 | 20 | #i use proxy handler cuz my uni network runs on its proxy 21 | #and i cannot authenticate python through the proxy 22 | #so i use empty proxy to bypass the authentication 23 | proxy_handler = u.ProxyHandler({}) 24 | opener = u.build_opener(proxy_handler) 25 | 26 | #cme officially forbids scraping 27 | #so a header must be used for disguise as an internet browser 28 | #the developers say no to scraping, it appears to be so 29 | #but actually they turn a blind eye to us, thx 30 | #i need different types of commodity 31 | #so i need to format the website for each commodity 32 | req=u.Request('http://www.cmegroup.com/trading/metals/%s/%s.html'%( 33 | category_name,commodity_name),headers={'User-Agent': 'Mozilla/5.0'}) 34 | response=opener.open(req) 35 | result=response.read() 36 | soup=bs(result,'html.parser') 37 | 38 | return soup 39 | 40 | 41 | # 42 | def etl(category_name,commodity_name): 43 | 44 | try: 45 | page=scrape(category_name,commodity_name) 46 | print(commodity_name) 47 | 48 | except Exception as e: 49 | print(e) 50 | 51 | 52 | #i need date, prior settle price and volume 53 | #it is essential to view source of the website first 54 | #then use beautiful soup to search specific class 55 | p1=page.find_all('span',class_='cmeNoWrap') 56 | p2=page.find_all('td',class_=['statusOK','statusNull','statusAlert']) 57 | p3=page.find_all('td',class_="cmeTableRight") 58 | 59 | a=[] 60 | b=[] 61 | c=[] 62 | 63 | for i in p1: 64 | a.append(i.text) 65 | 66 | #somehow prior settle is hard to get 67 | #we cannot find that specific tag 68 | #we can search for the previous tag instead 69 | #the find_next function of beautifulsoup allows us to get the next tag 70 | #the previous tag of prior settle is change 71 | for j in p2: 72 | temp=j.find_next() 73 | b.append(temp.text) 74 | 
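    #a minimal sketch of the find_next trick above, not part of the original flow
    #it assumes the quote table keeps the same class names used in this script
    """
    cell=page.find('td',class_=['statusOK','statusNull','statusAlert'])
    print(cell.text)             #the change cell
    print(cell.find_next().text) #the prior settle cell right behind it
    """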
75 | #the volume contains comma 76 | for k in p3: 77 | c.append(float(str(k).replace(',',''))) 78 | 79 | 80 | df=pd.DataFrame() 81 | df['expiration date']=a 82 | df['prior settle']=b 83 | df['volume']=c 84 | df['name']=commodity_name 85 | 86 | #for me, i wanna highlight the front month 87 | #The front month is the month where the majority of volume and liquidity occurs 88 | df['front month']=df['volume']==max(df['volume']) 89 | 90 | 91 | # 92 | def main(): 93 | 94 | #scraping and etl 95 | df1=etl('precious','silver') 96 | df2=etl('precious','gold') 97 | df3=etl('precious','palladium') 98 | df4=etl('base','copper') 99 | 100 | #concatenate then export 101 | dd=pd.concat([df1,df2,df3,df4]) 102 | dd.to_csv('cme.csv',encoding='utf_8_sig') 103 | 104 | 105 | if __name__ == "__main__": 106 | main() 107 | -------------------------------------------------------------------------------- /CME2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 9 11:33:03 2018 4 | 5 | """ 6 | #previously in CME1 7 | #i said scraping CME is soooo effortless 8 | #CME technical guys must have heard my voice 9 | #they changed the website from xml structure to json query 10 | #holy crap!! well, it would not scare off people like us!! 11 | 12 | #here is the trick 13 | #before we actually go to the website of CME quotes 14 | #we press ctrl+shift+i in chrome or f12 in ie 15 | #we can inspect element of the website 16 | #we just go to the network monitor 17 | #we will be able to see all the network activity 18 | #including where the data of CME is coming from 19 | #this is how we gon do it baby 20 | import pandas as pd 21 | import requests 22 | import os 23 | os.chdir('H:/') 24 | 25 | 26 | # 27 | def scrape(commodity_code): 28 | 29 | session=requests.Session() 30 | 31 | 32 | #cme officially forbids scraping 33 | #so a header must be used to disguise as a browser 34 | #technically speaking, the website should be able to detect that too 35 | #those tech guys just turn a blind eye, thx fellas 36 | session.headers.update( 37 | {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}) 38 | 39 | 40 | #now that we have found out where the data is coming from 41 | #we need to do a lil analysis on the url 42 | #e.g. http://www.cmegroup.com/CmeWS/mvc/Quotes/Future/437/G 43 | #it is quite obvious that 437 is a code name for commodity gold 44 | #but how do we know the code for each commodity 45 | #this is an issue raised by maysam19 46 | # https://github.com/je-suis-tm/web-scraping/issues/1 47 | #might as well as mention the solution here 48 | #there are two ways to solve it 49 | 50 | #if you only need very few types of commodity 51 | #you can go to websites one by one 52 | #e.g. 
https://www.cmegroup.com/trading/metals/precious/gold.html 53 | #you can right click and select view page source 54 | #search for /CmeWS/mvc/Quotes/Future/ 55 | #you should find the commodity code easily 56 | 57 | #if you got so many types of commodity to scrape 58 | #you should seek for the link that contains such information from inspect element 59 | #here is the hack that i have done for you, voila 60 | # https://www.cmegroup.com/CmeWS/mvc/ProductSlate/V2/List 61 | #it is a json file that contains codes of each commodity in cme 62 | #if you are visiting this script to understand json file 63 | #dont worry, we will talk about how to read it very soon 64 | response=session.get( 65 | 'http://www.cmegroup.com/CmeWS/mvc/Quotes/Future/%s/G'%(commodity_code)) 66 | 67 | return response 68 | 69 | 70 | # 71 | def etl(commodity_code,commodity_name): 72 | 73 | try: 74 | response=scrape(commodity_code) 75 | print(response) 76 | 77 | except Exception as e: 78 | print(e) 79 | 80 | 81 | #think of json file as dictionaries inside dictionaries 82 | #the simplest way to handle json files is pandas 83 | #remember, the solution is pandas package, not json package! 84 | #dataframe is a default way of reading json 85 | #if you dont like the structure 86 | #you can use pd.read_json with orient as a key argument 87 | #you can choose from index, columns, values, split, records 88 | df=pd.DataFrame(response.json()) 89 | 90 | #pandas turns json into a dataframe 91 | #still, for df['quotes'] 92 | #we end up with a bunch of dictionaries 93 | #we just treat things as normal dictionaries 94 | #we use the key to get value for each dictionary 95 | #and we form a new dataframe as output 96 | #for me, i only need prior settle price and expiration date 97 | #volume is used to detect the front month contract 98 | output=pd.DataFrame() 99 | output['prior settle']=[i['priorSettle'] for i in df['quotes']] 100 | output['expiration date']=[i['expirationDate'] for i in df['quotes']] 101 | output['volume']=[i['volume'] for i in df['quotes']] 102 | output['volume']=output['volume'].replace(',','').astype(float) 103 | output['name']=commodity_name 104 | output['front month']=output['volume']==max(output['volume']) 105 | 106 | return output 107 | 108 | 109 | # 110 | def main(): 111 | 112 | df1=etl('458','silver') 113 | df2=etl('437','gold') 114 | df3=etl('445','palladium') 115 | df4=etl('438','copper') 116 | 117 | 118 | #concatenate then export 119 | output=pd.concat([df1,df2,df3,df4]) 120 | output.to_csv('cme.csv',encoding='utf_8_sig') 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /CME3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | 6 | #without the help of my intern, this option data scraper would never exist 7 | #thank you, Olivia, much appreciated for the data etl 8 | 9 | # In[1]: 10 | 11 | 12 | import requests 13 | import pandas as pd 14 | import time 15 | import random as rd 16 | import os 17 | os.chdir('H:/') 18 | 19 | 20 | # In[2]: 21 | 22 | 23 | #scraping function 24 | def scrape(url): 25 | 26 | session=requests.Session() 27 | 28 | session.headers.update( 29 | {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}) 30 | 31 | time.sleep(rd.randint(0,10)) 32 | 33 | response=session.get(url,params={"_": int(time.time()*1000)}) 34 | 35 | return response 36 | 37 
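#the "_" parameter above is a cache buster
#the millisecond timestamp makes every request url unique
#so cme cannot serve us a stale cached copy of the quotes
#the random sleep merely spaces out consecutive requests to stay polite
#below is a purely illustrative sketch of what the query string ends up like
"""
params={"_": int(time.time()*1000)}
print(requests.Request('GET','https://www.cmegroup.com',params=params).prepare().url)
"""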
| 38 | # In[3]: 39 | 40 | 41 | #get options expiration id 42 | def get_expiration_data(expiration_json,options_id): 43 | 44 | expiration_dict=expiration_json[str(options_id)]['expirations'] 45 | 46 | return [(expiration_dict[i]['expiration'],expiration_dict[i]['label']) for i in expiration_dict] 47 | 48 | 49 | # In[4]: 50 | 51 | 52 | #get group id 53 | def get_groupid(jsondata): 54 | 55 | commoditygroup=pd.DataFrame.from_dict(jsondata['filters']['group']) 56 | 57 | var=locals() 58 | for i in range(len(commoditygroup)): 59 | var['a'+str(i)]=pd.DataFrame.from_dict(commoditygroup['children'].iloc[i]) 60 | var['a'+str(i)]['group']=commoditygroup['name'].iloc[i] 61 | 62 | groupid=pd.concat([var['a'+str(i)] for i in range(len(commoditygroup))]) 63 | groupid.reset_index(inplace=True,drop=True) 64 | 65 | return groupid 66 | 67 | #get product id 68 | def get_productid(jsondata): 69 | 70 | return pd.DataFrame.from_dict(jsondata['products']) 71 | 72 | 73 | # In[5]: 74 | 75 | 76 | #get option quote 77 | def get_data(jsondata): 78 | 79 | table=pd.DataFrame.from_dict(jsondata,orient='index').T 80 | 81 | #unpack option related data 82 | optionContractQuotes=table['optionContractQuotes'].iloc[0] 83 | 84 | var=locals() 85 | for i in range(len(optionContractQuotes)): 86 | var['a'+str(i)]=pd.DataFrame.from_dict(optionContractQuotes[i]).T 87 | 88 | var['a'+str(i)]['strikePrice']=var['a'+str(i)]['change'].loc['strikePrice'] 89 | var['a'+str(i)]['strikeRank']=var['a'+str(i)]['change'].loc['strikePrice'] 90 | var['a'+str(i)]['underlyingFutureContract']=var['a'+str(i)]['change'].loc['underlyingFutureContract'] 91 | var['a'+str(i)].drop(['strikePrice','strikeRank','underlyingFutureContract'], 92 | inplace=True) 93 | var['a'+str(i)].reset_index(inplace=True) 94 | var['a'+str(i)].columns=var['a'+str(i)].columns.str.replace('index','optiontype') 95 | 96 | options=pd.concat([var['a'+str(i)] for i in range(len(optionContractQuotes))]) 97 | options.columns=['options-'+i for i in options.columns] 98 | 99 | #unpack underlying future contract 100 | assert len(table)==1,"table length mismatch" 101 | underlyingFutureContractQuotes=pd.DataFrame.from_dict(table['underlyingFutureContractQuotes'].iloc[0]) 102 | 103 | assert len(underlyingFutureContractQuotes)==1,"underlyingFutureContractQuotes length mismatch" 104 | lastTradeDate_dict=underlyingFutureContractQuotes['lastTradeDate'].iloc[0] 105 | lastTradeDate=pd.DataFrame() 106 | for i in lastTradeDate_dict: 107 | lastTradeDate[i]=[lastTradeDate_dict[i]] 108 | 109 | priceChart_dict=underlyingFutureContractQuotes['priceChart'].iloc[0] 110 | priceChart=pd.DataFrame() 111 | for i in priceChart_dict: 112 | priceChart[i]=[priceChart_dict[i]] 113 | del underlyingFutureContractQuotes['lastTradeDate'] 114 | del underlyingFutureContractQuotes['priceChart'] 115 | priceChart.columns=priceChart.columns.str.replace('code','pricechartcode') 116 | 117 | futures=pd.concat([underlyingFutureContractQuotes,lastTradeDate,priceChart],axis=1) 118 | futures.columns=['futures-'+i for i in futures.columns] 119 | 120 | #concatenate options and futures 121 | output=options.copy(deep=True) 122 | 123 | assert len(futures)==1,"futures length mismatch" 124 | for i in futures: 125 | output[i]=futures[i].iloc[0] 126 | 127 | del table['optionContractQuotes'] 128 | del table['underlyingFutureContractQuotes'] 129 | for i in table: 130 | output[i]=table[i].iloc[0] 131 | 132 | return output 133 | 134 | 135 | 136 | # In[6]: 137 | 138 | def main(): 139 | 140 | 
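    #the numeric ids used below come from the product slate endpoint
    #here is a hedged sketch of how one might look up an id by product name
    #the 'name' column is an assumption, inspect the dataframe to confirm
    """
    slate=scrape('https://www.cmegroup.com/CmeWS/mvc/ProductSlate/V2/List').json()
    products=pd.DataFrame.from_dict(slate['products'])
    print(products[products['name'].str.contains('corn',case=False)])
    """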
id_url='https://www.cmegroup.com/CmeWS/mvc/ProductSlate/V2/List' 141 | 142 | #get group and product id to find the future contract 143 | response_id=scrape(id_url) 144 | groupid=get_groupid(response_id.json()) 145 | productid=get_productid(response_id.json()) 146 | 147 | #301 denotes corn option 148 | option_id=301 149 | 150 | #get expiration code from futures 151 | expiration_url=f'https://www.cmegroup.com/CmeWS/mvc/Options/Categories/List/{option_id}/G?optionTypeFilter=' 152 | response_expiration=scrape(expiration_url) 153 | target_exp_id=get_expiration_data(response_expiration.json()) 154 | 155 | #get option data 156 | for expiration_id,expiration_date in target_exp_id: 157 | 158 | option_url=f'https://www.cmegroup.com/CmeWS/mvc/Quotes/Option/{option_id}/G/{expiration_id}/ALL?optionProductId={option_id}&strikeRange=ALL' 159 | response_option=scrape(option_url) 160 | 161 | #not every expiration_id leads to concrete data 162 | try: 163 | df=get_data(response_option.json()) 164 | 165 | target=['options-optiontype', 166 | 'options-change', 167 | 'options-close', 168 | 'options-high', 169 | 'options-highLimit', 170 | 'options-last', 171 | 'options-low', 172 | 'options-lowLimit', 173 | 'options-mdKey', 174 | 'options-open', 175 | 'options-percentageChange', 176 | 'options-priorSettle', 177 | 'options-updated', 178 | 'options-volume', 179 | 'options-strikePrice', 180 | 'options-strikeRank', 181 | 'futures-change', 182 | 'futures-close', 183 | 'futures-expirationDate', 184 | 'futures-high', 185 | 'futures-highLimit', 186 | 'futures-last', 187 | 'futures-low', 188 | 'futures-lowLimit', 189 | 'futures-mdKey', 190 | 'futures-open', 191 | 'futures-optionUri', 192 | 'futures-percentageChange', 193 | 'futures-priorSettle', 194 | 'futures-productId', 195 | 'futures-productName', 196 | 'futures-updated', 197 | 'futures-volume', 198 | 'futures-default24', 199 | 'tradeDate'] 200 | 201 | df=df[target] 202 | 203 | #fix the expiration mismatch between futures and options 204 | #or you can use cme rule based month coding system 205 | # https://www.cmegroup.com/month-codes.html 206 | df['futures-expirationDate']=pd.to_datetime(expiration_date) 207 | 208 | df.to_csv(f'corn option {expiration_id}.csv',index=False) 209 | 210 | except ValueError: 211 | pass 212 | 213 | if __name__ == "__main__": 214 | main() 215 | 216 | 217 | -------------------------------------------------------------------------------- /CQF.py: -------------------------------------------------------------------------------- 1 | #this is a script to scrape website that requires login 2 | #make sure you understand the basics of a webpage 3 | #u should go through other simple scrapers in this repo before moving to this one 4 | # https://github.com/je-suis-tm/web-scraping 5 | 6 | #in the following context 7 | #the script is trying to get some articles from a website 8 | #this website called cqf only allows pdf download for registered users 9 | 10 | import requests 11 | from bs4 import BeautifulSoup as bs 12 | import re 13 | import os 14 | os.chdir('d:/') 15 | 16 | def main(): 17 | 18 | #input your username and password 19 | #ideally we should not store password 20 | #we should use getpass as followed 21 | """ 22 | import getpass 23 | getpass.getpass('input password:') 24 | """ 25 | session=requests.Session() 26 | username='' 27 | password='' 28 | prefix='https://www.cqfinstitute.org/cqf-access/nojs/' 29 | login_url='https://www.cqfinstitute.org/user/login?destination=cqf-access/nojs/' 30 | 31 | 32 | #the first stage is to get a list of what you 
want 33 | response=session.get('https://www.cqfinstitute.org/articles') 34 | page=bs(response.content,'html.parser') 35 | 36 | #in this case, we just need to find a list of all the articles 37 | #each article is assigned with a code 38 | #we only need (prefix+code) to visit the article download website 39 | articlelist=page.find_all('a',class_='use-ajax ctools-modal-cqf-popup') 40 | 41 | d={} 42 | for i in articlelist: 43 | if i.text: 44 | d[i.text]=re.search('(?<=nojs\/)\d*', 45 | i.get('href')).group() 46 | 47 | #d is a dictionary that contains all the articles and codes 48 | #for simplicity, we only wanna get the first article 49 | target=d[list(d.keys())[0]] 50 | 51 | 52 | #the second stage is authentication 53 | #for websites without captcha or other methods to detect bots 54 | #it will be as simple as followed 55 | #if we need to go through captcha or other human verification 56 | #we can use neural network to recognize stuff 57 | #or download the image and let human identify it 58 | #this script will not cover that part (cuz i am lazy) 59 | 60 | #u may wonder where i get the headers and data from 61 | #before writing any script at all 62 | #we should use browser to login and go through the process 63 | #while typing username and password in browser 64 | #we can right click and inspect element 65 | #in chrome, simply ctrl+shift+i 66 | #the top columns in a popup window are elements, console, sources, network... 67 | #we select network monitor before we login 68 | #next, we click sign in button 69 | #and we should see a lot of traffic in network monitor 70 | #usually there is something called login or sign-in or auth 71 | #when we click it, we can see our username and password in form data 72 | #voila, that is everything we need to post 73 | #an easy way is to copy as powershell and paste it in our ide 74 | #we just need to restructure headers and form data in a pythonic way 75 | #normally we dont include cookies as they may expire after a few weeks 76 | #and we can find login url in request url section 77 | auth=session.post(login_url+target, 78 | headers={"Cache-Control":"max-age=0", 79 | "Origin":"https://www.cqfinstitute.org", 80 | "Upgrade-Insecure-Requests":"1", 81 | "DNT":"1", 82 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36", 83 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 84 | "Referer":"https://www.cqfinstitute.org/user/login?destination=cqf-access/nojs/"+target, 85 | "Accept-Encoding":"gzip, deflate, br", 86 | "Accept-Language":"en-US,en;q=0.9"}, 87 | data={'name': username, 88 | 'pass': password, 89 | 'form_id': 'user_login', 90 | 'device_authentication_operating_system': 'Windows 10 64-bit', 91 | 'device_authentication_browser': 'Chrome', 92 | 'op': 'Log in'}) 93 | 94 | 95 | #normally when we finish login 96 | #we should take a look at the response 97 | #in most cases, login response is a json 98 | #we need to find something like token or auth 99 | #and update the session header as followed 100 | """ 101 | token=auth.json()["token"] 102 | session.headers.update({"Authorization": 'Token %s'%token}) 103 | """ 104 | 105 | 106 | #once we officially sign in as a user 107 | #the third stage is to download the pdf 108 | response=session.get(prefix+target) 109 | page=bs(response.content,'html.parser') 110 | 111 | pdf_link=(page.find('div',class_='file file-ext').find('a').get('href')) 112 | 113 | pdf=session.get(pdf_link) 114 | 115 | 
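    #an optional sanity check before saving the file, just a sketch
    #it relies on the standard requests api only, nothing site specific
    """
    assert pdf.status_code==200,'download failed'
    assert 'pdf' in pdf.headers.get('Content-Type','').lower(),'not a pdf response'
    """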
f=open('a.pdf','wb') 116 | f.write(pdf.content) 117 | f.close() 118 | 119 | 120 | return 121 | 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /LME.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | #this is a script to store scraped content into database 5 | #if we scrape a lot of websites or simply scrape a website everyday 6 | #we will end up with a huge amount of data 7 | #it is essential to create a data warehouse to keep everything organized 8 | import sqlite3 9 | import requests 10 | import pandas as pd 11 | from io import BytesIO 12 | import re 13 | import pyodbc 14 | 15 | 16 | #say if we wanna get the trader commitment report of lme from the link below 17 | # https://www.lme.com/en-GB/Market-Data/Reports-and-data/Commitments-of-traders#tabIndex=1 18 | #when we select aluminum and we will be redirected to a new link 19 | # https://www.lme.com/en-GB/Market-Data/Reports-and-data/Commitments-of-traders/Aluminium 20 | #if we try to view page source, we will find nothing in html parse tree 21 | #what do we do? 22 | #here is a very common scenario in web scraping 23 | #we simply right click and select inspect element 24 | #we will have to monitor the traffic one by one to identify where the report comes from 25 | #as usual, i have done it for you 26 | def get_download_link(): 27 | 28 | download_link='https://www.lme.com/api/Lists/DownloadLinks/%7B02E29CA4-5597-42E7-9A22-59BB73AE8F6B%7D' 29 | 30 | 31 | #there are quite a few pages of reports 32 | #for simplicity, we only care about the latest report 33 | #note that the page counting starts from 0 34 | session=requests.Session() 35 | response = session.get(download_link, 36 | params={"currentPage": 0}) 37 | 38 | 39 | #the response is a json file 40 | #i assume you should be familiar with json now 41 | #if not, plz check the link below 42 | # https://github.com/je-suis-tm/web-scraping/blob/master/CME2.py 43 | url_list=response.json()['content_items'] 44 | 45 | 46 | return url_list 47 | 48 | 49 | 50 | #once we find out where the download link is 51 | #we can get the actual report 52 | def get_report(url_list): 53 | 54 | prefix='https://www.lme.com' 55 | url=url_list[0]['Url'] 56 | 57 | 58 | session=requests.Session() 59 | response = session.get(prefix+url) 60 | 61 | 62 | #we also get the date of the data from url 63 | date=pd.to_datetime(re.search(r"\d{4}/\d{2}/\d{2}",url).group()) 64 | 65 | return response.content,date 66 | 67 | 68 | # 69 | def etl(content,date): 70 | 71 | #the first seven rows are annoying headers 72 | #we simply skip them 73 | df = pd.ExcelFile(BytesIO(content)).parse('AH', skiprows=7) 74 | 75 | #assume we only want positions of investment funds 76 | #lets do some etl 77 | df['Unnamed: 0'].fillna(method='ffill', 78 | inplace=True) 79 | 80 | col=list(df.columns) 81 | for i in range(1,len(col)): 82 | if 'Unnamed' in col[i]: 83 | col[i]=col[i-1] 84 | 85 | df.columns=col 86 | del df['Notation of the position quantity'] 87 | df.dropna(inplace=True) 88 | 89 | output=df['Investment Funds'][df['Unnamed: 0']=='Number of Positions'] 90 | output.columns=['long','short'] 91 | 92 | output=output.melt(value_vars=['long','short'], 93 | var_name='position', 94 | value_name='value') 95 | 96 | output['type']=df['LOTS'].drop_duplicates().tolist()*2 97 | output['date']=date 98 | 99 | return output 100 | 101 | 102 | #for sql server 103 | #we have to use pyodbc driver 104 | def connect( 105 | server=None, database=None, driver=None, 106 | username=None, password=None, 107 | autocommit=False 108 | ): 109 | """ get the db 
connection """ 110 | connection_string = "Driver={driver}; Server={server}; Database={database}" 111 | if username: 112 | connection_string += "; UID={username}" 113 | if password: 114 | connection_string += "; PWD={password}" 115 | if not driver: 116 | driver = [ 117 | d for d in sorted(pyodbc.drivers()) 118 | if re.match(r"(ODBC Driver \d+ for )?SQL Server", d) 119 | ][0] 120 | 121 | return pyodbc.connect( 122 | connection_string.format( 123 | server=server, 124 | database=database, 125 | driver=driver, 126 | username=username, 127 | password=password, 128 | ), 129 | autocommit=autocommit, 130 | ) 131 | 132 | 133 | #this function is to insert data into sqlite3 database 134 | #i will not go into details for sql grammar 135 | #for pythoners, sql is a piece of cake 136 | #go check out the following link for sql 137 | # https://www.w3schools.com/sql/ 138 | def database(df,SQL=False): 139 | 140 | #plz make sure u have created the database and the table to proceed 141 | #to create a table in database, first two lines are the same as below 142 | #just add a few more lines 143 | 144 | #c.execute("""CREATE TABLE lme (position TEXT, value FLOAT, type TEXT, date DATE);""") 145 | #conn.commit() 146 | #conn.close() 147 | 148 | #connect to sqlite3 149 | if not SQL: 150 | 151 | #to see what it looks like in the database 152 | #use microsoft access or toad or just pandas 153 | #db=pd.read_sql("""SELECT * FROM lme""",conn) 154 | conn = sqlite3.connect('database.db') 155 | else: 156 | SERVER='10.10.10.10' 157 | DATABASE='meme_stock' 158 | conn=connect(SERVER,DATABASE,'SQL Server') 159 | c = conn.cursor() 160 | 161 | #insert data 162 | for i in range(len(df)): 163 | try: 164 | c.execute("""INSERT INTO lme VALUES (?,?,?,?)""",df.iloc[i,:]) 165 | conn.commit() 166 | print('Updating...') 167 | except Exception as e: 168 | print(e) 169 | 170 | #always need to close it 171 | conn.close() 172 | 173 | print('Done.') 174 | 175 | return 176 | 177 | 178 | # 179 | def main(): 180 | 181 | url_list=get_download_link() 182 | 183 | content,date=get_report(url_list) 184 | 185 | output=etl(content,date) 186 | 187 | database(output) 188 | 189 | 190 | if __name__ == "__main__": 191 | main() 192 | -------------------------------------------------------------------------------- /MENA Newsletter.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | #this script is about the latest news of MENA region 4 | #we scrape different influential media websites, or so-called fake news, lol 5 | #and send only updates to the mailbox for daily newsletter 6 | #in order to do that, we need a db to store all the historical content of websites 7 | #and all the scraping techniques from html parse tree to regular expression 8 | #over time, i also discovered the issue of information overload in daily newsletter 9 | #hence, i invented a graph theory based algorithm to extract key information 10 | #a part of this algo will also be featured in this script to solve info redundancy 11 | #as u can see, this is the most advanced script in web scraping repository 12 | #it contains almost every technique we have introduced so far 13 | #make sure you have gone through all the other scripts before moving onto this one 14 | 15 | import pandas as pd 16 | from bs4 import BeautifulSoup as bs 17 | import requests 18 | import datetime as dt 19 | import win32com.client as win32 20 | import sqlite3 21 | import os 22 | import re 23 | import copy 24 | import time 25 | os.chdir('d:/') 26 | 27 | #this is a home 
made special package for text mining 28 | #it is designed to extract key information and remove similar contents 29 | #for details of this graph traversal algorithm plz refer to the following link 30 | # https://github.com/je-suis-tm/graph-theory/blob/master/Text%20Mining%20project/text_mining.py 31 | import text_mining 32 | 33 | 34 | #main stuff 35 | def main(): 36 | 37 | ec=scrape('https://www.economist.com/middle-east-and-africa/',economist) 38 | aj=scrape('https://www.aljazeera.com/topics/regions/middleeast.html',aljazeera) 39 | tr=scrape('https://www.reuters.com/news/archive/middle-east',reuters) 40 | bc=scrape('https://www.bbc.co.uk/news/world/middle_east',bbc) 41 | ws=scrape('https://www.wsj.com/news/types/middle-east-news',wsj) 42 | ft=scrape('https://www.ft.com/world/mideast',financialtimes) 43 | bb=scrape('https://www.bloomberg.com/view/topics/middle-east',bloomberg) 44 | cn=scrape('https://edition.cnn.com/middle-east',cnn) 45 | fo=scrape('https://fortune.com/tag/middle-east/',fortune) 46 | 47 | #concat scraped data via append, can use pd.concat as an alternative 48 | #unlike the previous version, current version does not sort information by source 49 | #the purpose of blending data together is to go through text mining pipeline 50 | df=ft 51 | for i in [aj,tr,bc,ws,cn,fo,ec,bb]: 52 | df=df.append(i) 53 | 54 | #CRUCIAL!!! 55 | #as we append dataframe together, we need to reset the index 56 | #otherwise, we would not be able to use reindex in database function call 57 | df.reset_index(inplace=True,drop=True) 58 | 59 | #first round, insert into database and remove outdated information 60 | df=database(df) 61 | 62 | #second round, use home made package to remove similar contents 63 | output=text_mining.remove_similar(df,text_mining.stopword) 64 | 65 | #if the link is not correctly captured 66 | #remove anything before www and add https:// 67 | for i in range(len(output)): 68 | if 'https://' not in output['link'][i]: 69 | temp=re.search('www',output['link'][i]).start() 70 | output.at[i,'link']='http://'+output['link'][i][temp:] 71 | 72 | print(output) 73 | 74 | 75 | #using html email template 76 | #check stripo for different templates 77 | # https://stripo.email/templates/ 78 | html=""" 79 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 95 | 97 | 98 | 99 | 100 |
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 |     <table align="center" width="600">
112 |     <tr><td align="center" style="font-family:Arial;">
113 |     <h1>Middle East</h1>
114 |     </td></tr>
115 | 
116 |     """
117 | 
118 | 
119 |     #there are a few ways to embed an image in an html email
120 |     #here, we use the link of the image
121 |     #it may be a lil bit slow to load the image, but it is the most efficient way
122 |     #alternatively, we can use mail.Attachments.add()
123 |     #we attach all images, and set <img src="cid:imagename"/> in the html body
124 |     #the downside is that we have to scrape the website repeatedly to get images
125 |     #or we can use <img src='data:image/jpg; base64, [remove the brackets and paste base64]'/>
126 |     #base64 can be generated via the following code
127 |     # from io import BytesIO
128 |     # import base64
129 |     # def create_image_in_html(fig):
130 |     #     tmpfile = BytesIO()
131 |     #     fig.savefig(tmpfile, format='png')
132 |     #     encoded = base64.b64encode(
133 |     #         tmpfile.getvalue()).decode('utf-8')
134 |     #     return encoded
135 |     #but this approach is blocked by most email clients including outlook 2016
136 |     for i in range(len(output)):
137 |         html+="""<tr>"""
138 | 
139 | 
140 |         html+="""
141 |         <td align="center"><a href="%s" style="font-family:Arial;">
142 |         %s</a></td>
143 | 
144 |         <td align="center"><img src="%s" width="210"/></td>
145 |         """%(output['link'][i],output['title'][i],output['image'][i])
146 |         html+="""</tr>
150 | 151 | 152 | """ 153 | 154 | 155 | send(html) 156 | 157 | 158 | #i use win32 to control outlook and send emails 159 | #when you have a win 10 pro, it is the easiest way to do it 160 | #cuz windows pro automatically launches outlook at startup 161 | #otherwise, there is a library called smtp for pop3/imap server 162 | #supposedly authentication of corporate email would kill u 163 | #i definitely recommend folks to use win32 library 164 | #note that using win32.email requires outlook to stay active 165 | #do not close the app until u actually send out the email 166 | 167 | #win32 library uses COM api to control windows 168 | #go to microsoft developer network 169 | #check mailitem object model to learn how to manipulate outlook emails 170 | #the website below is the home page of outlook vba reference 171 | # https://msdn.microsoft.com/en-us/vba/vba-outlook 172 | def send(html): 173 | 174 | #create an email with recipient, subject, context and attachment 175 | outlook = win32.Dispatch('outlook.application') 176 | mail = outlook.CreateItem(0) 177 | 178 | #these email addresses are fabricated, PLZ DO NOT HARASS OUR GODDESS 179 | #just some random pornstar i love 180 | receivers = ['lana.rhodes@brazzers.com', 181 | 'tori.black@brazzers.com', 182 | 'naomi.woods@brazzers.com'] 183 | 184 | #use ';' to separate receipients 185 | #this is a requirement of outlook 186 | mail.To = ';'.join(receivers) 187 | 188 | mail.Subject ='Mid East Newsletter %s'%(dt.datetime.now()) 189 | mail.BodyFormat=2 190 | 191 | #use html to make email looks more elegant 192 | #html is very simple 193 | #use br for line break, b for bold fonts 194 | #font for color and size, a href for hyperlink 195 | #check the website below to see more html tutorials 196 | # https://www.w3schools.com/html/ 197 | 198 | #Alternatively, we can use plain text email 199 | #remember to use '\r\n' to jump line 200 | #assuming html is a list of str 201 | #the code should be mail.Body = '\r\n'.join(html) 202 | mail.HTMLBody=html 203 | 204 | #i usually print out everything 205 | #need to check carefully before sending to stakeholders 206 | #we can use mail.Display() to see the draft instead 207 | condition=str(input('0/1 for no/yes:')) 208 | if condition=='1': 209 | mail.Send() 210 | print('\nSENT') 211 | else: 212 | print('\nABORT') 213 | 214 | return 215 | 216 | 217 | #database insertion and output the latest feeds 218 | #i assume you are familiar with sqlite3 219 | #if not, plz check the following link 220 | # https://github.com/je-suis-tm/web-scraping/blob/master/LME.py 221 | def database(df): 222 | 223 | temp=[] 224 | conn = sqlite3.connect('mideast_news.db') 225 | c = conn.cursor() 226 | 227 | #the table structure is simple 228 | #the table name is new 229 | #there are three columns, title, link and image 230 | #the data types of all of them are TEXT 231 | #title is the primary key which forbids duplicates 232 | for i in range(len(df)): 233 | try: 234 | c.execute("""INSERT INTO news VALUES (?,?,?)""",df.iloc[i,:]) 235 | conn.commit() 236 | 237 | print('Updating...') 238 | 239 | #the idea is very simple 240 | #insert each line from our scraped result into database 241 | #as the primary key has been set up 242 | #we have non-duplicate title constraint 243 | #insert what has already been in database would raise an error 244 | #if so, just ignore the error and pass to the next iteration 245 | #we can utilize the nature of database to pick out the latest information 246 | #every successful insertion into the database also goes to the output 247 
| #at the end, output contains nothing but latest updates of websites 248 | #that is what we call newsletter 249 | temp.append(i) 250 | 251 | except Exception as e: 252 | print(e) 253 | 254 | conn.close() 255 | 256 | #check if the output contains no updates 257 | if temp: 258 | output=df.loc[[i for i in temp]] 259 | output.reset_index(inplace=True,drop=True) 260 | else: 261 | output=pd.DataFrame() 262 | output['title']=['No updates yet.'] 263 | output['link']=output['image']=[''] 264 | 265 | return output 266 | 267 | 268 | #scraping webpages and do some etl 269 | def scrape(url,method): 270 | 271 | print('scraping webpage effortlessly') 272 | time.sleep(5) 273 | 274 | session=requests.Session() 275 | response = session.get(url,headers={'User-Agent': 'Mozilla/5.0'}) 276 | page=bs(response.content,'html.parser',from_encoding='utf_8_sig') 277 | 278 | df=method(page) 279 | out=database(df) 280 | 281 | return out 282 | 283 | 284 | """ 285 | the functions below are data etl of different media sources 286 | """ 287 | #the economist etl 288 | def economist(page): 289 | 290 | title,link,image=[],[],[] 291 | df=pd.DataFrame() 292 | prefix='https://www.economist.com' 293 | 294 | a=page.find_all('div',class_="topic-item-container") 295 | 296 | for i in a: 297 | 298 | link.append(prefix+i.find('a').get('href')) 299 | title.append(i.find('a').text) 300 | image.append(i.parent.find('img').get('src')) 301 | 302 | df['title']=title 303 | df['link']=link 304 | df['image']=image 305 | 306 | return df 307 | 308 | 309 | #fortune etl 310 | def fortune(page): 311 | 312 | title,link,image=[],[],[] 313 | df=pd.DataFrame() 314 | prefix='https://fortune.com' 315 | 316 | a=page.find_all('article') 317 | 318 | for i in a: 319 | 320 | link.append(prefix+i.find('a').get('href')) 321 | 322 | if 'http' in i.find('img').get('src'): 323 | image.append(i.find('img').get('src')) 324 | else: 325 | image.append('') 326 | 327 | temp=re.split('\s*',i.find_all('a')[1].text) 328 | temp.pop() 329 | temp.pop(0) 330 | title.append(' '.join(temp)) 331 | 332 | df['title']=title 333 | df['link']=link 334 | df['image']=image 335 | 336 | return df 337 | 338 | 339 | #cnn etl 340 | def cnn(page): 341 | 342 | title,link,image=[],[],[] 343 | df=pd.DataFrame() 344 | 345 | prefix='https://edition.cnn.com' 346 | 347 | a=page.find_all('div', class_='cd__wrapper') 348 | 349 | for i in a: 350 | title.append(i.find('span').text) 351 | link.append(prefix+i.find('a').get('href')) 352 | try: 353 | image.append('https:'+i.find('img').get('data-src-medium')) 354 | except: 355 | image.append('') 356 | 357 | df['title']=title 358 | df['link']=link 359 | df['image']=image 360 | 361 | return df 362 | 363 | 364 | #bloomberg etl 365 | def bloomberg(page): 366 | 367 | title,link,image=[],[],[] 368 | df=pd.DataFrame() 369 | prefix='https://www.bloomberg.com' 370 | 371 | a=page.find_all('h1') 372 | for i in a: 373 | try: 374 | link.append(prefix+i.find('a').get('href')) 375 | title.append(i.find('a').text.replace('’','\'')) 376 | except: 377 | pass 378 | 379 | 380 | b=page.find_all('li') 381 | for j in b: 382 | try: 383 | temp=j.find('article').get('style') 384 | 385 | image.append( \ 386 | re.search('(?<=url\()\S*(?=\))', \ 387 | temp).group() \ 388 | ) 389 | except: 390 | temp=j.find('article') 391 | 392 | try: 393 | temp2=temp.get('id') 394 | if not temp2: 395 | image.append('') 396 | except: 397 | pass 398 | 399 | 400 | df['title']=title 401 | df['link']=link 402 | df['image']=image 403 | 404 | return df 405 | 406 | 407 | #financial times etl 408 | def 
financialtimes(page): 409 | 410 | title,link,image=[],[],[] 411 | df=pd.DataFrame() 412 | prefix='https://www.ft.com' 413 | 414 | a=page.find_all('a',class_='js-teaser-heading-link') 415 | for i in a: 416 | link.append(prefix+i.get('href')) 417 | temp=i.text.replace('’','\'').replace('‘','\'') 418 | title.append(temp.replace('\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t','')) 419 | 420 | for j in a: 421 | temp=j.parent.parent.parent 422 | try: 423 | text=re.search('(?<=")\S*(?=next)',str(temp)).group() 424 | image.append(text+'next&fit=scale-down&compression=best&width=210') 425 | except: 426 | image.append('') 427 | 428 | df['title']=title 429 | df['link']=link 430 | df['image']=image 431 | 432 | return df 433 | 434 | 435 | #wall street journal etl 436 | def wsj(page): 437 | 438 | df=pd.DataFrame() 439 | 440 | text=str(page) 441 | 442 | link=re.findall('(?<=headline"> )',text) 443 | 444 | image=re.findall('(?<=img data-src=")\S*(?=")',text) 445 | 446 | title=[] 447 | for i in link: 448 | try: 449 | temp=re.search('(?<={}")>(.*?)<'.format(i),text).group() 450 | title.append(temp) 451 | except: 452 | pass 453 | 454 | for i in range(len(title)): 455 | title[i]=title[i].replace('’',"'").replace('<','').replace('>','') 456 | 457 | df['title']=title 458 | df['link']=link[:len(title)] 459 | df['image']=image+[''] if (len(image)!=len(title)) else image 460 | 461 | return df 462 | 463 | 464 | #bbc etl 465 | def bbc(page): 466 | 467 | title,link,image=[],[],[] 468 | df=pd.DataFrame() 469 | 470 | prefix='https://www.bbc.co.uk' 471 | 472 | a=page.find_all('span',class_='title-link__title-text') 473 | 474 | for i in a: 475 | temp=i.parent.parent.parent.parent 476 | b=(re.findall('(?<=src=")\S*(?=jpg)',str(temp))) 477 | 478 | if len(b)>0: 479 | b=copy.deepcopy(b[0])+'jpg' 480 | else: 481 | b='' 482 | 483 | image.append(b) 484 | 485 | for j in a: 486 | title.append(j.text) 487 | 488 | for k in a: 489 | temp=k.parent.parent 490 | c=re.findall('(?<=href=")\S*(?=">)',str(temp)) 491 | link.append(prefix+c[0]) 492 | 493 | df['title']=title 494 | df['link']=link 495 | df['image']=image 496 | 497 | return df 498 | 499 | 500 | #thompson reuters etl 501 | def reuters(page): 502 | title,link,image=[],[],[] 503 | df=pd.DataFrame() 504 | 505 | prefix='https://www.reuters.com' 506 | 507 | for i in page.find('div', class_='news-headline-list').find_all('h3'): 508 | temp=i.text.replace(' ','') 509 | title.append(temp.replace('\n','')) 510 | 511 | for j in page.find('div', class_='news-headline-list').find_all('a'): 512 | link.append(prefix+j.get('href')) 513 | link=link[0::2] 514 | 515 | for k in page.find('div', class_='news-headline-list').find_all('img'): 516 | if k.get('org-src'): 517 | image.append(k.get('org-src')) 518 | else: 519 | image.append('') 520 | 521 | 522 | df['title']=title 523 | df['link']=link 524 | df['image']=image 525 | 526 | return df 527 | 528 | 529 | #al jazeera etl 530 | def aljazeera(page): 531 | title,link,image=[],[],[] 532 | df=pd.DataFrame() 533 | 534 | prefix='https://www.aljazeera.com' 535 | 536 | a=page.find_all('div',class_='frame-container') 537 | for i in a: 538 | title.append(i.find('img').get('title')) 539 | image.append(prefix+i.find('img').get('src')) 540 | temp=i.find('a').get('href') 541 | link.append(temp if 'www' in temp else (prefix+temp)) 542 | 543 | b=page.find_all('div',class_='col-sm-7 topics-sec-item-cont') 544 | c=page.find_all('div',class_='col-sm-5 topics-sec-item-img') 545 | 546 | limit=max(len(b),len(c)) 547 | j,k=0,0 548 | while j\"\,)[0-9\.\"\:\-\, ]*',response.text) 
38 | text=re.findall('(?<=s\: \')\S+(?=\'\, freq)',response.text) 39 | 40 | #convert text to dict via json 41 | dicts=[json.loads('{'+i+'}') for i in num] 42 | 43 | #create dataframe 44 | df=pd.DataFrame() 45 | for ind,val in enumerate(text): 46 | df[val]=dicts[ind].values() 47 | df.index=dicts[ind].keys() 48 | 49 | return df 50 | 51 | 52 | def main(): 53 | 54 | url='https://www.macrotrends.net/stocks/charts/AAPL/apple/financial-statements' 55 | response=scrape(url) 56 | df=etl(response) 57 | df.to_csv('aapl financial statements.csv') 58 | 59 | return 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Scraping 2 | 3 |
4 | 5 | ## Intro 6 | 7 | My understanding of web scraping is patience and attention to detail. Scraping is not rocket science (deep learning is). When I do scraping, I typically spend 50% of my time analyzing the source (navigating the HTML parse tree or inspecting elements to find the post form) and the other 50% on ETL. The most useful tools for me are `requests`, `bs4` and `re`. Some people may recommend `selenium` for non-static websites. To be honest, I have never used `selenium` throughout my career, but dynamic websites like Facebook and Twitter are still within my grasp. You see? Patience and attention to detail matter. 8 | 9 | This repository contains a couple of Python web scrapers. These scrapers mainly target different commodity futures exchanges and influential media websites (or so-called fake news, lol). Most scripts were written during my early days of learning Python. Since this repository gained unexpected popularity, I have restructured everything to make it more user-friendly. All the scripts featured in this repository are ready for use. Each script is designed to feature a unique technique that I found useful throughout my experience of data engineering. 10 | 11 | Scripts inside this repository are classified into two groups, beginner and advanced. At the beginner level, a script merely demonstrates a technique to extract the data. As you progress, the scripts lean more towards data architecture and other functions that improve the end product. If you are experienced or simply come to get scrapers for free, you may want to skip the content and just look at the available scrapers. If you are here to learn, you may look at the table of contents to determine which part suits you best. In addition, there are some notes on annoying issues such as proxy authentication (usually on corporate or university networks) and legality (hopefully you won't come to that). 12 | 13 |
14 | 15 | ## Table of Contents 16 | 17 | #### Beginner 18 | 19 | 1. HTML Parse Tree Search (CME1) 20 | 21 | 2. JSON (CME2) 22 | 23 | 3. Regular Expression (SHFE) 24 | 25 | #### Advanced 26 | 27 | 1. Sign-in (CQF) 28 | 29 | 2. Database (LME) 30 | 31 | 3. Newsletter (MENA) 32 | 33 | #### Notes 34 | 35 | 1. Proxy Authentication 36 | 37 | 2. Legality 38 | 39 |
40 | 41 | ## Available Scrapers 42 | 43 | * Al Jazeera AJ 44 | 45 | * Bloomberg 46 | 47 | * British Broadcasting Corporation BBC 48 | 49 | * CFTC 50 | 51 | * Cable News Network CNN 52 | 53 | * Calendar Labs 54 | 55 | * Certificate in Quantitative Finance CQF 56 | 57 | * Chicago Mercantile Exchange CME Futures 58 | 59 | * Chicago Mercantile Exchange CME Options 60 | 61 | * Financial Times FT 62 | 63 | * Fortune 64 | 65 | * London Metal Exchange LME 66 | 67 | * Macrotrends 68 | 69 | * Reddit WallStreetBets 70 | 71 | * Shanghai Futures Exchange SHFE 72 | 73 | * Springer Nature 74 | 75 | * The Economist 76 | 77 | * Thomson Reuters 78 | 79 | * Tomtom 80 | 81 | * US Treasury 82 | 83 | * Wall Street Journal WSJ 84 | 85 |
86 | 87 | ## Beginner 88 | 89 | #### 1. HTML Parse Tree Search (CME1) 90 | 91 | Tree is an abstract data type in computer science. Now that you are a programmer, Binary Tree and AVL Tree must feel like primary school math (haha, I am joking, trees are my worst nightmare when it comes to interviews). For a webpage, if you right click and select view source (CTRL+U in both IE & Chrome), you will end up with a bunch of code like this. 92 | 93 | ![Alt Text](https://github.com/je-suis-tm/web-scraping/blob/master/preview/cme1%20html.PNG) 94 | 95 | The code is written in HTML. The whole HTML script is a tree structure as well. The HTML parse tree looks like this. 96 | 97 | ![Alt Text](https://github.com/je-suis-tm/web-scraping/blob/master/preview/cme1%20tree.png) 98 | 99 | There is something interesting about the HTML parse tree. The first word after the left angle bracket is the HTML tag (in tree structure we call it a node). In most cases, tags come in pairs. Of course, there are some exceptions such as the line break tag `<br>` or the doc type tag `<!DOCTYPE>`. Usually the opening tag is just the tag name but the closing tag has a slash before the name. Different tag names represent different functionalities. In most cases, there are only a few tags that contain the information we need, e.g., tag `<table>` usually defines a table, tag `<a>` creates a hyperlink (the link is at attribute `href` and it may skip the prefix if the prefix is the same as the current URL), tag `<img>` comes up with a pic (the link is hidden in attribute `src`), and tag `<p>` or `<h1>`-`<h6>` normally contains text. For more details of tagging, please refer to w3schools. 100 | 101 | It is vital to understand the basics of the HTML parse tree because most websites with a simple layout can easily be traversed via a library called BeautifulSoup. When we use urllib or other packages to request a specific website via Python, we end up with the HTML parse tree in bytes. When the bytes are parsed to BeautifulSoup, it makes life easier. It allows us to search the tag name and other attributes to get the content we need. The link to the documentation of BeautifulSoup is here.
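Before the snippets below make sense, you need the `result` object they all operate on. Here is a minimal sketch of how such an object could be built from a live page with `requests` and `bs4`; the URL is only a placeholder, not the actual CME page.

```python
import requests
from bs4 import BeautifulSoup

#placeholder url, swap in whichever page you want to scrape
url='https://www.example.com/news'

#pretend to be a browser so basic anti-bot filters do not reject the request
response=requests.get(url,headers={'User-Agent':'Mozilla/5.0'})

#parse the raw bytes into a searchable tree
#html.parser ships with python, lxml also works if installed
result=BeautifulSoup(response.content,'html.parser')

#result now supports .find and .find_all as shown below
print(result.title)
```

The scrapers in this repository follow the same pattern, just with their own URLs and headers.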
102 | 103 | For instance, if we would love to get the link to the quiz on Dragon Ball, we can do 104 | 105 | ```python 106 | result.find('div',class_='article article__list old__article-square').find('a').get('href') 107 | ``` 108 | or 109 | 110 | ```python 111 | result.find('div',attrs={'class':'article article__list old__article-square'}).find('a').get('href') 112 | ``` 113 | 114 | Here, `result` is a BeautifulSoup object. The attribute `find` returns the first matched tag. The attribute `get` enables us to look up attributes inside a tag. 115 | 116 | If we are interested in all the titles of the articles, we do 117 | 118 | ```python 119 | temp=result.find('div',class_='article article__list old__article-square').find_all('a') 120 | output=[i.text for i in temp] 121 | ``` 122 | 123 | or 124 | 125 | ```python 126 | temp=result.find('div',attrs={'class':'article article__list old__article-square'}).find_all('a') 127 | output=[i.text for i in temp] 128 | ``` 129 | 130 | The attribute `find_all` returns all the matched results. The `.text` attribute automatically gets all `str` values inside the current tag. The second article has a subtitle 'subscriber only', so we will get a rather longer title for the second article compared to the rest. 131 | 132 | You can refer to CME1 for more details. Please note that CME1 is an outdated script for Chicago Mercantile Exchange. Due to the change of the website, you cannot go through the HTML parse tree to extract data any more. Yet, the concept of the HTML parse tree is still applicable to other cases. 133 | 134 | #### 2. JSON (CME2) 135 | 136 | JSON is short for JavaScript Object Notation. Like csv, it is another format to store data. According to the official website of JSON, it is easy for humans to read and write. Pfff, are you fxxking with me? Anyway, an SVG image generated by D3.js is normally associated with JSON data. Finding the underlying JSON makes it possible to extract data from an interactive chart. If you open a JSON file with notepad, you will see something like this. 137 | 138 | ![Alt Text](https://github.com/je-suis-tm/web-scraping/blob/master/preview/cme2%20json.PNG) 139 | 140 | Gosh, the structure is messy and I will have a panic attack very soon. Duh! Just kidding. If you are familiar with the adjacency list in graph theory, you will find it very easy to understand JSON. If not, do not worry, JSON is merely dictionaries inside dictionaries (with some lists as well). To navigate through the data structure, all you need to know is the key of the value. 141 | 142 | Reading a JSON file in Python is straightforward. There are two ways. 143 | 144 | There is a built-in package called json; you can do 145 | 146 | ```python 147 | import json 148 | with open('data.json') as f: 149 | data = json.load(f) 150 | print(data) 151 | ```
152 | 153 | Nevertheless, I propose a much easier way. We can parse the content to pandas and treat it like a dataframe. You can do 154 | 155 | ```python 156 | import pandas as pd 157 | df=pd.read_json('data.json') 158 | print(df) 159 | ``` 160 | 161 | Reading JSON is not really the main purpose of this chapter. What really made me rewrite the scraper for CME is the change of the website structure. In April 2018, I could no longer extract data by searching for HTML tags. I came to realize that CME had created a dynamic website with JavaScript. The great era of BeautifulSoup was water under the bridge. At this critical point of either adapt or die, I had to find out where the data came from and develop a new script. Guess where? 162 | 163 | ![Alt Text](https://github.com/je-suis-tm/web-scraping/blob/master/preview/cme2%20url.PNG) 164 | 165 | The URL is still in page source! The HTML tag for the hidden link is `