├── requirements.txt
├── README.md
└── pleco.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.12.0
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ### TSX Company Directory
 2 | 
 3 | A tool that scrapes the Toronto Stock Exchange (TSX) to create a directory of:
 4 | 
 5 | 1. Company ticker symbols
 6 | 2. Company names
 7 | 3. Industry classifications
 8 | 
 9 | Note: This tool only lists companies. ETFs (Exchange Traded Funds) are not included in the directory.
10 | 
11 | The information is stored in an SQLite database and can be dumped to JSON with the --dump option.
12 | 
13 | ### Installation
14 | 
15 | 1. Clone this repository
16 | 2. Install Python dependencies:
17 | 
18 | ```bash
19 | pip install -r requirements.txt
20 | ```
21 | 
22 | ### Usage
23 | 
24 | Scrape company data:
25 | 
26 | ```bash
27 | ./pleco.py --all
28 | ```
29 | 
30 | Output JSON list of all companies:
31 | 
32 | ```bash
33 | ./pleco.py --dump
34 | ```
35 | 
36 | Example output:
37 | 
38 | ```json
39 | [
40 |   {
41 |     "symbol": "TSE:AW",
42 |     "name": "A & W Food Services of Canada Inc.",
43 |     "industry": "N/A"
44 |   },
45 |   {
46 |     "symbol": "TSE:AAB",
47 |     "name": "Aberdeen International Inc.",
48 |     "industry": "Finance"
49 |   }
50 | ]
51 | ```
52 | 


--------------------------------------------------------------------------------
/pleco.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import os
  3 | import sys
  4 | import re
  5 | import sqlite3
  6 | from bs4 import BeautifulSoup
  7 | import urllib.request
  8 | import hashlib
  9 | import time
 10 | import json
 11 | 
 12 | # This program will first download a list of stocks from the TSX. Then, for
 13 | # each stock, it grabs company information, including company name and
 14 | # quarterly EPS and revenue data, and adds it to an SQLITE database.
 15 | 
 16 | # All web pages downloaded are stored in a folder called "cache", so they will
 17 | # never need to be downloaded again. To get fresh data from the web, you will
 18 | # have to delete the cache folder.
 19 | 
 20 | # Here is the schema for the SQL database. Note that dates are recorded as the
 21 | # number of seconds since January 1, 1970
 22 | SCHEMA = """
 23 | CREATE TABLE COMPANIES (
 24 |     symbol TEXT PRIMARY KEY,
 25 |     company TEXT,
 26 |     industry TEXT
 27 | );
 28 | 
 29 | CREATE TABLE PRICES (
 30 |     symbol TEXT,
 31 |     date INTEGER,
 32 |     price INTEGER
 33 | );
 34 | 
 35 | 
 36 | CREATE TABLE FINANCIALS (
 37 |     symbol TEXT,
 38 |     type TEXT,
 39 |     date TEXT,
 40 |     value INTEGER
 41 | );
 42 | 
 43 | """
 44 | 
 45 | DATABASE_NAME = "pleco.db"
 46 | CACHE_FOLDER = "cache"
 47 | 
 48 | class Database:
 49 |     def __init__(self):
 50 |         create = not os.path.exists( DATABASE_NAME )
 51 | 
 52 |         self.conn = sqlite3.connect( DATABASE_NAME, timeout=99.0 )
 53 |         if create:
 54 |             c = self.conn.cursor()
 55 |             c.executescript( SCHEMA )
 56 | 
 57 |     def addCompany( self, symbol, company, industry ):
 58 |         c = self.conn.cursor()
 59 |         c.execute( "DELETE FROM COMPANIES WHERE symbol=?", (symbol,));
 60 |         c.execute( "INSERT INTO COMPANIES values ( ?, ?, ? )", 
 61 |                 (symbol, company, industry ) )
 62 |         self.conn.commit()
 63 | 
 64 |     def getCompanies(self):
 65 |         c = self.conn.cursor()
 66 |         c.execute( "SELECT * FROM COMPANIES" )
 67 |         return c.fetchall();
 68 | 
 69 |     def setPrice(self, symbol, date, price):
 70 |         c = self.conn.cursor()
 71 |         c.execute( "INSERT INTO PRICES VALUES (?, ?, ?)",
 72 |                 ( symbol, date, price ) )
 73 |         self.conn.commit()
 74 | 
 75 |     def getPrice(self, symbol ):
 76 |         c = self.conn.cursor()
 77 |         c.execute( "SELECT price FROM PRICES WHERE symbol=? ORDER BY DATE DESC",
 78 |                 ( symbol, ) )
 79 |         return c.fetchone()[0]
 80 | 
 81 |     def getPrice(self, symbol):
 82 |         c = self.conn.cursor()
 83 |         c.execute( "SELECT price FROM PRICES ORDER BY DATE DESC" )
 84 |         return c.fetchone()[0]
 85 | 
 86 |     def setFinancials( self, symbol, type, date, value ):
 87 |         c = self.conn.cursor()
 88 |         c.execute("DELETE FROM FINANCIALS WHERE symbol=? AND type=? and date=?",
 89 |                 (symbol, type, date))
 90 |         c.execute("INSERT INTO FINANCIALS VALUES (?, ?, ?, ?)",
 91 |                 ( symbol, type, date, value ) )
 92 |         self.conn.commit()
 93 | 
 94 |     def getFinancials( self, symbol, type ):
 95 |         c = self.conn.cursor()
 96 |         c.execute( "SELECT * FROM FINANCIALS WHERE symbol=? AND type=? ORDER BY DATE DESC",
 97 |                 (symbol, type))
 98 |         return c.fetchall()
 99 | 
100 |     def getEverything( self ):
101 |         c = self.conn.cursor()
102 |         c.execute( """
103 |                 SELECT COMPANIES.symbol, company, industry, type, value, price from
104 |                 COMPANIES, PRICES, FINANCIALS WHERE
105 |                 COMPANIES.symbol = PRICES.symbol AND PRICES.symbol =
106 |                 FINANCIALS.symbol""")
107 | 
108 |         return c.fetchall()
109 | 
110 | # This class will fetch a web page from the WWW. However, if the web page
111 | # exists in the cache, it will instead use the cached version.
112 | class PageCache:
113 |     def __init__(self):
114 |         if not os.path.exists( CACHE_FOLDER ):
115 |             os.mkdir( CACHE_FOLDER )
116 | 
117 |     def get( self, url, fname = None ):
118 |         if fname == None:
119 |             fname = hashlib.sha1(url.encode('utf-8')).hexdigest()
120 |         fname = os.path.join( CACHE_FOLDER, fname )
121 | 
122 |         if os.path.exists( fname ):
123 |             return open( fname, "rt" ).read()
124 |         else:
125 |             print("Retrieve %s" % url)
126 |             f = urllib.request.urlopen(url)
127 |             content = f.read().decode('utf-8')
128 |             f.close()
129 | 
130 |             f = open( fname, "w" );
131 |             f.write( content );
132 |             f.close()
133 | 
134 |             return content
135 | 
136 | class EmptyClass: pass
137 | 
138 | # The Pleco class contains logic for scraping the stock information from the
139 | # internet.
140 | class Pleco:
141 |     def __init__(self):
142 |         self.db = Database()
143 |         self.webCache = PageCache()
144 | 
145 |     # This function will, given a stock symbol, scrape the industry from
146 |     # the global and mail. It returns it as a string.
147 |     def scrapeIndustryForSymbol( self, symbol ):
148 |         symbol = symbol.upper()
149 |         if symbol.startswith("TSE:"):
150 |             symbol = symbol[4:]
151 |         symbol = symbol.replace(".", "-")
152 | 
153 |         # lookup file, otherwise retrieve the url
154 |         url = f"https://www.theglobeandmail.com/investing/markets/stocks/{symbol}-T/profile/"
155 |         page = self.webCache.get( url )
156 |         
157 |         soup = BeautifulSoup(page, 'html.parser')
158 |         industry_element = soup.find('barchart-field', {"name": "industryGroup"})
159 | 
160 |         if industry_element is None:
161 |             print(f"Warning: Cannot find industry in {url}")
162 |             return "N/A"
163 | 
164 |         return industry_element.get('value')
165 | 
166 |     # This function will, given a stock symbol, scrape the company name from
167 |     # Google Finance. It returns it as a string.
168 |     def scrapeCompanyNameForSymbol( self, symbol ):
169 |         url = "http://www.google.com/finance?q=%s&fstype=ii" % symbol.upper()
170 |         page = self.webCache.get( url )
171 | 
172 |         expr = re.compile(r"""Financial Statements for (.*?) - Google Finance""")
173 |         m = expr.search(page)
174 |         if m:
175 |             return BeautifulSoup(m.group(1), 'html.parser').contents[0].string
176 |         else:
177 |             return None
178 | 
179 |     # This function will return a list of all of the stock symbols on the TSX,
180 |     # scraped from the TSX web page.
181 |     def scrapeCompanies( self ):
182 |         letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
183 |         found = {}
184 | 
185 |         for s in letters:
186 |             url = f"https://www.tsx.com/json/company-directory/search/tsx/{s}"
187 |             page = self.webCache.get(url)
188 |             
189 |             try:
190 |                 data = json.loads(page)
191 |                 for company in data.get('results', []):
192 |                     symbol = "TSE:" + company['symbol']
193 |                     if symbol in found: 
194 |                         continue
195 |                         
196 |                     found[symbol] = 1
197 |                     name = company['name']
198 |                     industry = self.scrapeIndustryForSymbol(symbol)
199 |                     
200 |                     if name and industry:
201 |                         print(f"Found {name} ({symbol}) - {industry}")
202 |                         self.db.addCompany(symbol, name, industry)
203 |                         
204 |             except json.JSONDecodeError as e:
205 |                 print(f"Error parsing JSON for letter {s}: {e}", file=sys.stderr)
206 | 
207 |     # Assume the database already has the companies table filled in. This
208 |     # function will get the current price of every company that we know about
209 |     # and store it in the prices table, along with the date. 
210 |     def scrapePrices( self ):
211 |         date = int(time.time())
212 |         
213 |         # Given a list of symbols, we get the prices from YAHOO finance and
214 |         # insert them into the PRICES table of the database.
215 |         def getPrices(stocks, list):
216 |             prices = requestYahooPrices( convertToYahooFormat( list ) )
217 | 
218 |             for i in range(len(prices)):
219 |                 self.db.setPrice( list[i], date, prices[i] )
220 |                 print("%s = $%.2f" % (list[i], float(prices[i]) / 1000))
221 | 
222 |         # Given a stock symbol which may be in google finance format, we
223 |         # convert them to yahoo format (eg, ending in .to)
224 |         def convertToYahooFormat( list ):
225 |             # convert from google to yahoo format.
226 |             ret = []
227 |             for symbol in list:
228 |                 symbol = symbol[4:] # remove tse:
229 |                 symbol = symbol.lower().replace('.', '-') + ".to"
230 |                 ret.append( symbol )
231 | 
232 |             return ret
233 | 
234 |         # Given a list of stock symbols, we request the current prices from
235 |         # Yahoo and return them as a list. The prices are returned in the same
236 |         # order as the requested symbols so you can match them up.
237 |         def requestYahooPrices(symbols):
238 |             # form HTTP request
239 |             url = "http://finance.yahoo.com/d/quotes.csv?s=%s&f=l1&e=.csv" % \
240 |                 ( ",".join(symbols) )
241 |             prices = []
242 | 
243 |             content = self.webCache.get(url)
244 | 
245 |             # for each line,
246 |             for line in content.split("\n"):
247 |                 line = line.strip()
248 |                 if line == "": continue
249 |                 prices.append(int(float(line) * 1000))
250 | 
251 |             # return the list.
252 |             return prices
253 | 
254 |         stocks = {}
255 |         date = int(time.time())
256 |         for company in self.db.getCompanies():
257 |             stocks[company[0]] = 0
258 | 
259 |         # for each chunk of 64 stocks,
260 |         array = []
261 |         for key in stocks.keys():
262 |             array.append( key )
263 |             if len( array ) == 64:
264 |                 getPrices( stocks, array )
265 |                 array = []
266 | 
267 |         if len( array ) > 0:
268 |             getPrices( stocks, array )
269 |             array = []
270 | 
271 |     # Scrape the financial information from the quarterly reports of all
272 |     # companies and store in the database.
273 |     def scrapeFinancials( self ):
274 |         for company in self.db.getCompanies():
275 |             self.scrapeFinancialsForSymbol( company[0] )
276 | 
277 |     # Scrape the financial information from the quarterly reports of a single
278 |     # company and store in the database.
279 |     def scrapeFinancialsForSymbol( self, symbol ):
280 |         date = int(time.time())
281 | 
282 |         def checkPresence( page, pattern ):
283 |             for line in page:
284 |                 if line.find(pattern) != -1:
285 |                     return True
286 | 
287 |             return False
288 | 
289 |         def extractRow( soup, text ):
290 |             def byname(tag):
291 |                 return str(tag.string).rstrip() == text and tag.name == 'td'
292 | 
293 |             tag = soup.find(byname)
294 |             contents = []
295 |             while tag:
296 |                 tag = tag.findNextSibling('td')
297 |                 if tag == None: break
298 |                 contents.append(str(tag.find(text=True)))
299 |             return moneyToNumber(contents)
300 | 
301 |         def moneyToNumber( arr ):
302 |             ret = []
303 |             for a in arr:
304 |                 if a == '-':
305 |                     ret.append(0)
306 |                 else:
307 |                     ret.append(int(float(a.replace(",", "")) * 1000 ))
308 | 
309 |             return ret
310 | 
311 |         def extractDates( lines ):
312 |             values = []
313 |             expr = re.compile(r"""(\d{4}-\d{2}-\d{2})""")
314 |             for line in lines:
315 |                 m = expr.search(line)
316 |                 if m:
317 |                     values.append( m.group(0) )
318 |                 else:
319 |                     values.append("")
320 | 
321 |             return values
322 | 
323 |         def findLinesLike( page, pattern ):
324 |             lines = []
325 |             skipped = -1
326 |             pattern = re.compile(pattern)
327 |             for line in page:
328 |                 if pattern.search(line):
329 |                     lines.append( line )
330 |                     skipped = 0
331 |                 elif skipped >= 0:
332 |                     skipped += 1
333 |                     if skipped >= 5:
334 |                         break
335 |             return lines
336 | 
337 |         print("Scraping financials for %s" % symbol)
338 | 
339 |         # retrieve the web page
340 |         url = "http://www.google.com/finance?q=%s&fstype=ii" % symbol
341 |         page = self.webCache.get( url )
342 |         soup = BeautifulSoup(page, 'html.parser')
343 |         page = page.split('\n')
344 |         quarterlyPage = soup.find( "div", { "id" : "incinterimdiv" } )
345 |         annualPage = soup.find( "div", { "id" : "incannualdiv" } )
346 | 
347 |         qstr = str(quarterlyPage).split('\n')
348 |         astr = str(annualPage).split('\n')
349 | 
350 |         # Look for "In Millions of". If not there, error!
351 |         if not checkPresence( page, "In Millions of" ):
352 |             print("While processing %s could not find 'In Millions of' at %s" % (symbol, url), file=sys.stderr)
353 |             return False
354 | 
355 |         # Set multiplier to 1000000
356 |         multiplier = 1000000
357 | 
358 |         # build array of all lines like "3 months Ending"
359 |         quarterlyDates = extractDates(findLinesLike( qstr, r"""\d+ (months|weeks) ending""" ))
360 | 
361 |         # Build array of all lines like "12 months Ending"
362 |         annualDates = extractDates(findLinesLike( astr, r"""\d+ (months|weeks) ending""" ))
363 | 
364 |         # Look for td containing "Total Revenue"
365 |         # Extract all td elements in siblings that contain only a number
366 | 
367 |         # Build table for revenue
368 |         quarterlyRevenue = extractRow( quarterlyPage, "Revenue" )
369 |         annualRevenue = extractRow( annualPage, "Revenue" )
370 | 
371 |         # Build table for ";Diluted EPS Normalized EPS&"
372 |         quarterlyEPS = extractRow( quarterlyPage, "Diluted Normalized EPS" )
373 |         annualEPS = extractRow( annualPage, "Diluted Normalized EPS" )
374 | 
375 |         for i in range( len(quarterlyRevenue) ):
376 |             self.db.setFinancials( symbol, "QuarterlyRevenue", quarterlyDates[i],
377 |                     quarterlyRevenue[i] * multiplier )
378 |             self.db.setFinancials( symbol, "QuarterlyEPS", quarterlyDates[i],
379 |                     quarterlyEPS[i] )
380 | 
381 |         for i in range( len(annualRevenue) ):
382 |             self.db.setFinancials( symbol, "AnnualRevenue", annualDates[i],
383 |                     annualRevenue[i] * multiplier )
384 |             self.db.setFinancials( symbol, "AnnualEPS", annualDates[i],
385 |                     annualEPS[i] )
386 | 
387 |     def addProjected( self, symbol, type ):
388 |         financials = self.db.getFinancials( symbol, "Quarterly%s" % type )
389 |         if len(financials) < 4:
390 |             return
391 | 
392 |         projected = financials[0][3] + financials[1][3] + financials[2][3] + \
393 |                     financials[3][3]
394 | 
395 |         self.db.setFinancials( symbol, "Projected%s" % type, 0, projected )
396 | 
397 |     def addAverageGrowth( self, symbol, type ):
398 |         financials = self.db.getFinancials( symbol, "Annual%s" % type )
399 |         avgGrowth = 0.0
400 |         if len(financials) > 1:
401 |             projected = self.db.getFinancials( symbol, "Projected%s" % type )
402 |             financials.extend( projected )
403 |             financials.reverse()
404 |             first = financials[0][3]
405 |             count = 0
406 |             for val in financials:
407 |                 if first > 0:
408 |                     growth = float((val[3] - first)) / first
409 |                     avgGrowth += growth
410 |                     count += 1
411 |                 else:
412 |                     avgGrowth = 0.0
413 |                     count = 0
414 |                 first = val[3]
415 | 
416 |             if count < 2:
417 |                 avgGrowth = 0.0
418 |             else:
419 |                 avgGrowth /= count
420 |         
421 |         self.db.setFinancials( symbol, "Average%sGrowth" % type, 0, 
422 |                 round( avgGrowth * 100 ) )
423 | 
424 |     def addYearsOfGrowth( self, symbol, type ):
425 |         financials = self.db.getFinancials( symbol, "Annual%s" % type )
426 |         count = 0
427 |         if len(financials) > 0:
428 |             last = financials[0]
429 |             for line in financials[1:]:
430 |                 if line[3] < last:
431 |                     count += 1
432 |                 else:
433 |                     break
434 | 
435 |         self.db.setFinancials( symbol, "YearsOf%sGrowth" % type, 0, count )
436 | 
437 |     def addPE( self, symbol ):
438 |         price = self.db.getPrice( symbol )
439 |         financials = self.db.getFinancials( symbol, "ProjectedEPS" )
440 |         if len(financials) == 0:
441 |             return
442 | 
443 |         earnings = financials[0][3]
444 |         if earnings > 0:
445 |             pe = round(float(price)/float(earnings) * 10)
446 |         else:
447 |             pe = 0
448 | 
449 |         self.db.setFinancials( symbol, "PE", 0, pe );
450 | 
451 |     def addExtraInfo( self ):
452 |         for company in self.db.getCompanies():
453 |             symbol = company[0]
454 |             print("Processing %s...    \r" % symbol, end='')
455 |             sys.stdout.flush()
456 |             self.addProjected(symbol, "EPS")
457 |             self.addProjected(symbol, "Revenue")
458 |             self.addAverageGrowth( symbol, "EPS" )
459 |             self.addAverageGrowth( symbol, "Revenue" )
460 |             self.addYearsOfGrowth( symbol, "EPS" )
461 |             self.addYearsOfGrowth( symbol, "Revenue" )
462 |             self.addPE( symbol )
463 | 
464 |         print()
465 | 
466 |     def dump(self):
467 |         companies = []
468 |         for company in self.db.getCompanies():
469 |             companies.append({
470 |                 "symbol": company[0],
471 |                 "name": company[1],
472 |                 "industry": company[2]
473 |             })
474 |         print(json.dumps(companies, indent=2))
475 | 
476 |     def process(self):
477 |         stocks = {}
478 |         for record in self.db.getEverything():
479 |             symbol = record[0]
480 |             company = record[1]
481 |             industry = record[2]
482 |             type = record[3]
483 |             value = record[4]
484 |             price = record[5]
485 |             if symbol not in stocks:
486 |                 stock = { "symbol": symbol, 
487 |                     "price": price, 
488 |                     "company": company,
489 |                     "industry": industry}
490 |                 stocks[symbol] = stock
491 |             else:
492 |                 stock = stocks[symbol]
493 | 
494 |             stock[type] = value
495 | 
496 |         stocks = filter( self.filt, stocks.values() )
497 | 
498 |         stocks.sort( key = lambda stock: stock["AverageRevenueGrowth"] )
499 |         self.printTable(stocks)
500 | 
501 |     def filt(self, stock):
502 |         return \
503 |             stock["YearsOfRevenueGrowth"] >= 1 and \
504 |             stock["YearsOfEPSGrowth"] >= 1 and \
505 |             stock["AverageRevenueGrowth"] >= 5 and \
506 |             stock["AverageEPSGrowth"] >= 5 and \
507 |             "PE" in stock and \
508 |             stock["PE"] >= 0 and \
509 |             stock["PE"] <= 50 \
510 |             and stock["ProjectedEPS"] > 0 \
511 |             and stock["industry"].find("Oil") == -1 \
512 |             and stock["industry"].find("Mining") == -1 \
513 |             and stock["industry"].find("Metals") == -1 \
514 |             and stock["industry"].find("Diversified") == -1 \
515 |             and stock["industry"].find("Forestry") == -1 
516 | 
517 |     def printTable(self, stocks):
518 |         print("symbol, AverageRevenueGrowth, YearsOfRevenueGrowth, AverageEPSGrowth, YearsOfEPSGrowth, PE, Company")
519 |         for stock in stocks:
520 |             print(stock["symbol"].ljust(13), end=' ')
521 |             print(str(stock["AverageRevenueGrowth"]).ljust(5), end=' ')
522 |             print(str(stock["YearsOfRevenueGrowth"]).ljust(3), end=' ')
523 |             print(str(stock["AverageEPSGrowth"]).ljust(5), end=' ')
524 |             print(str(stock["YearsOfEPSGrowth"]).ljust(3), end=' ')
525 |             print(str(stock["PE"]).ljust(5), end=' ')
526 |             print(stock["company"])
527 | 
528 |     def run(self):
529 |         for i in range(1, len(sys.argv)):
530 |             if sys.argv[i] == "--companies":
531 |                 self.scrapeCompanies()
532 |             elif sys.argv[i] == "--prices":
533 |                 self.scrapePrices()
534 |             elif sys.argv[i] == "--financials":
535 |                 self.scrapeFinancials()
536 |             elif sys.argv[i] == '--extra':
537 |                 self.addExtraInfo()
538 |             elif sys.argv[i] == "--all":
539 |                 self.scrapeCompanies()
540 |                 self.scrapeFinancials()
541 |                 self.scrapePrices()
542 |                 self.addExtraInfo()
543 |             elif sys.argv[i] == "--test": 
544 |                 self.addPE("tse:g")
545 |             elif sys.argv[i] == "--process":
546 |                 self.process()
547 |             elif sys.argv[i] == "--dump":
548 |                 self.dump()
549 | 
550 | 
551 | Pleco().run()
552 | 
553 | 


--------------------------------------------------------------------------------