├── requirements.txt ├── README.md └── pleco.py /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.12.0 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### TSX Company Directory 2 | 3 | A tool that scrapes the Toronto Stock Exchange (TSX) to create a directory of: 4 | 5 | 1. Company ticker symbols 6 | 2. Company names 7 | 3. Industry classifications 8 | 9 | Note: This tool only lists companies. ETFs (Exchange Traded Funds) are not included in the directory. 10 | 11 | The information is stored in an SQLite database and can be dumped to JSON with the --dump option. 12 | 13 | ### Installation 14 | 15 | 1. Clone this repository 16 | 2. Install Python dependencies: 17 | 18 | ```bash 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | ### Usage 23 | 24 | Scrape company data: 25 | 26 | ```bash 27 | ./pleco.py --all 28 | ``` 29 | 30 | Output JSON list of all companies: 31 | 32 | ```bash 33 | ./pleco.py --dump 34 | ``` 35 | 36 | Example output: 37 | 38 | ```json 39 | [ 40 | { 41 | "symbol": "TSE:AW", 42 | "name": "A & W Food Services of Canada Inc.", 43 | "industry": "N/A" 44 | }, 45 | { 46 | "symbol": "TSE:AAB", 47 | "name": "Aberdeen International Inc.", 48 | "industry": "Finance" 49 | } 50 | ] 51 | ``` 52 | -------------------------------------------------------------------------------- /pleco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import re 5 | import sqlite3 6 | from bs4 import BeautifulSoup 7 | import urllib.request 8 | import hashlib 9 | import time 10 | import json 11 | 12 | # This program will first download a list of stocks from the TSX. Then, for 13 | # each stock, it grabs company information, including company name and 14 | # quarterly EPS and revenue data, and adds it to an SQLITE database. 15 | 16 | # All web pages downloaded are stored in a folder called "cache", so they will 17 | # never need to be downloaded again. To get fresh data from the web, you will 18 | # have to delete the cache folder. 19 | 20 | # Here is the schema for the SQL database. Note that dates are recorded as the 21 | # number of seconds since January 1, 1970 22 | SCHEMA = """ 23 | CREATE TABLE COMPANIES ( 24 | symbol TEXT PRIMARY KEY, 25 | company TEXT, 26 | industry TEXT 27 | ); 28 | 29 | CREATE TABLE PRICES ( 30 | symbol TEXT, 31 | date INTEGER, 32 | price INTEGER 33 | ); 34 | 35 | 36 | CREATE TABLE FINANCIALS ( 37 | symbol TEXT, 38 | type TEXT, 39 | date TEXT, 40 | value INTEGER 41 | ); 42 | 43 | """ 44 | 45 | DATABASE_NAME = "pleco.db" 46 | CACHE_FOLDER = "cache" 47 | 48 | class Database: 49 | def __init__(self): 50 | create = not os.path.exists( DATABASE_NAME ) 51 | 52 | self.conn = sqlite3.connect( DATABASE_NAME, timeout=99.0 ) 53 | if create: 54 | c = self.conn.cursor() 55 | c.executescript( SCHEMA ) 56 | 57 | def addCompany( self, symbol, company, industry ): 58 | c = self.conn.cursor() 59 | c.execute( "DELETE FROM COMPANIES WHERE symbol=?", (symbol,)); 60 | c.execute( "INSERT INTO COMPANIES values ( ?, ?, ? )", 61 | (symbol, company, industry ) ) 62 | self.conn.commit() 63 | 64 | def getCompanies(self): 65 | c = self.conn.cursor() 66 | c.execute( "SELECT * FROM COMPANIES" ) 67 | return c.fetchall(); 68 | 69 | def setPrice(self, symbol, date, price): 70 | c = self.conn.cursor() 71 | c.execute( "INSERT INTO PRICES VALUES (?, ?, ?)", 72 | ( symbol, date, price ) ) 73 | self.conn.commit() 74 | 75 | def getPrice(self, symbol ): 76 | c = self.conn.cursor() 77 | c.execute( "SELECT price FROM PRICES WHERE symbol=? ORDER BY DATE DESC", 78 | ( symbol, ) ) 79 | return c.fetchone()[0] 80 | 81 | def getPrice(self, symbol): 82 | c = self.conn.cursor() 83 | c.execute( "SELECT price FROM PRICES ORDER BY DATE DESC" ) 84 | return c.fetchone()[0] 85 | 86 | def setFinancials( self, symbol, type, date, value ): 87 | c = self.conn.cursor() 88 | c.execute("DELETE FROM FINANCIALS WHERE symbol=? AND type=? and date=?", 89 | (symbol, type, date)) 90 | c.execute("INSERT INTO FINANCIALS VALUES (?, ?, ?, ?)", 91 | ( symbol, type, date, value ) ) 92 | self.conn.commit() 93 | 94 | def getFinancials( self, symbol, type ): 95 | c = self.conn.cursor() 96 | c.execute( "SELECT * FROM FINANCIALS WHERE symbol=? AND type=? ORDER BY DATE DESC", 97 | (symbol, type)) 98 | return c.fetchall() 99 | 100 | def getEverything( self ): 101 | c = self.conn.cursor() 102 | c.execute( """ 103 | SELECT COMPANIES.symbol, company, industry, type, value, price from 104 | COMPANIES, PRICES, FINANCIALS WHERE 105 | COMPANIES.symbol = PRICES.symbol AND PRICES.symbol = 106 | FINANCIALS.symbol""") 107 | 108 | return c.fetchall() 109 | 110 | # This class will fetch a web page from the WWW. However, if the web page 111 | # exists in the cache, it will instead use the cached version. 112 | class PageCache: 113 | def __init__(self): 114 | if not os.path.exists( CACHE_FOLDER ): 115 | os.mkdir( CACHE_FOLDER ) 116 | 117 | def get( self, url, fname = None ): 118 | if fname == None: 119 | fname = hashlib.sha1(url.encode('utf-8')).hexdigest() 120 | fname = os.path.join( CACHE_FOLDER, fname ) 121 | 122 | if os.path.exists( fname ): 123 | return open( fname, "rt" ).read() 124 | else: 125 | print("Retrieve %s" % url) 126 | f = urllib.request.urlopen(url) 127 | content = f.read().decode('utf-8') 128 | f.close() 129 | 130 | f = open( fname, "w" ); 131 | f.write( content ); 132 | f.close() 133 | 134 | return content 135 | 136 | class EmptyClass: pass 137 | 138 | # The Pleco class contains logic for scraping the stock information from the 139 | # internet. 140 | class Pleco: 141 | def __init__(self): 142 | self.db = Database() 143 | self.webCache = PageCache() 144 | 145 | # This function will, given a stock symbol, scrape the industry from 146 | # the global and mail. It returns it as a string. 147 | def scrapeIndustryForSymbol( self, symbol ): 148 | symbol = symbol.upper() 149 | if symbol.startswith("TSE:"): 150 | symbol = symbol[4:] 151 | symbol = symbol.replace(".", "-") 152 | 153 | # lookup file, otherwise retrieve the url 154 | url = f"https://www.theglobeandmail.com/investing/markets/stocks/{symbol}-T/profile/" 155 | page = self.webCache.get( url ) 156 | 157 | soup = BeautifulSoup(page, 'html.parser') 158 | industry_element = soup.find('barchart-field', {"name": "industryGroup"}) 159 | 160 | if industry_element is None: 161 | print(f"Warning: Cannot find industry in {url}") 162 | return "N/A" 163 | 164 | return industry_element.get('value') 165 | 166 | # This function will, given a stock symbol, scrape the company name from 167 | # Google Finance. It returns it as a string. 168 | def scrapeCompanyNameForSymbol( self, symbol ): 169 | url = "http://www.google.com/finance?q=%s&fstype=ii" % symbol.upper() 170 | page = self.webCache.get( url ) 171 | 172 | expr = re.compile(r"""Financial Statements for (.*?) - Google Finance""") 173 | m = expr.search(page) 174 | if m: 175 | return BeautifulSoup(m.group(1), 'html.parser').contents[0].string 176 | else: 177 | return None 178 | 179 | # This function will return a list of all of the stock symbols on the TSX, 180 | # scraped from the TSX web page. 181 | def scrapeCompanies( self ): 182 | letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" 183 | found = {} 184 | 185 | for s in letters: 186 | url = f"https://www.tsx.com/json/company-directory/search/tsx/{s}" 187 | page = self.webCache.get(url) 188 | 189 | try: 190 | data = json.loads(page) 191 | for company in data.get('results', []): 192 | symbol = "TSE:" + company['symbol'] 193 | if symbol in found: 194 | continue 195 | 196 | found[symbol] = 1 197 | name = company['name'] 198 | industry = self.scrapeIndustryForSymbol(symbol) 199 | 200 | if name and industry: 201 | print(f"Found {name} ({symbol}) - {industry}") 202 | self.db.addCompany(symbol, name, industry) 203 | 204 | except json.JSONDecodeError as e: 205 | print(f"Error parsing JSON for letter {s}: {e}", file=sys.stderr) 206 | 207 | # Assume the database already has the companies table filled in. This 208 | # function will get the current price of every company that we know about 209 | # and store it in the prices table, along with the date. 210 | def scrapePrices( self ): 211 | date = int(time.time()) 212 | 213 | # Given a list of symbols, we get the prices from YAHOO finance and 214 | # insert them into the PRICES table of the database. 215 | def getPrices(stocks, list): 216 | prices = requestYahooPrices( convertToYahooFormat( list ) ) 217 | 218 | for i in range(len(prices)): 219 | self.db.setPrice( list[i], date, prices[i] ) 220 | print("%s = $%.2f" % (list[i], float(prices[i]) / 1000)) 221 | 222 | # Given a stock symbol which may be in google finance format, we 223 | # convert them to yahoo format (eg, ending in .to) 224 | def convertToYahooFormat( list ): 225 | # convert from google to yahoo format. 226 | ret = [] 227 | for symbol in list: 228 | symbol = symbol[4:] # remove tse: 229 | symbol = symbol.lower().replace('.', '-') + ".to" 230 | ret.append( symbol ) 231 | 232 | return ret 233 | 234 | # Given a list of stock symbols, we request the current prices from 235 | # Yahoo and return them as a list. The prices are returned in the same 236 | # order as the requested symbols so you can match them up. 237 | def requestYahooPrices(symbols): 238 | # form HTTP request 239 | url = "http://finance.yahoo.com/d/quotes.csv?s=%s&f=l1&e=.csv" % \ 240 | ( ",".join(symbols) ) 241 | prices = [] 242 | 243 | content = self.webCache.get(url) 244 | 245 | # for each line, 246 | for line in content.split("\n"): 247 | line = line.strip() 248 | if line == "": continue 249 | prices.append(int(float(line) * 1000)) 250 | 251 | # return the list. 252 | return prices 253 | 254 | stocks = {} 255 | date = int(time.time()) 256 | for company in self.db.getCompanies(): 257 | stocks[company[0]] = 0 258 | 259 | # for each chunk of 64 stocks, 260 | array = [] 261 | for key in stocks.keys(): 262 | array.append( key ) 263 | if len( array ) == 64: 264 | getPrices( stocks, array ) 265 | array = [] 266 | 267 | if len( array ) > 0: 268 | getPrices( stocks, array ) 269 | array = [] 270 | 271 | # Scrape the financial information from the quarterly reports of all 272 | # companies and store in the database. 273 | def scrapeFinancials( self ): 274 | for company in self.db.getCompanies(): 275 | self.scrapeFinancialsForSymbol( company[0] ) 276 | 277 | # Scrape the financial information from the quarterly reports of a single 278 | # company and store in the database. 279 | def scrapeFinancialsForSymbol( self, symbol ): 280 | date = int(time.time()) 281 | 282 | def checkPresence( page, pattern ): 283 | for line in page: 284 | if line.find(pattern) != -1: 285 | return True 286 | 287 | return False 288 | 289 | def extractRow( soup, text ): 290 | def byname(tag): 291 | return str(tag.string).rstrip() == text and tag.name == 'td' 292 | 293 | tag = soup.find(byname) 294 | contents = [] 295 | while tag: 296 | tag = tag.findNextSibling('td') 297 | if tag == None: break 298 | contents.append(str(tag.find(text=True))) 299 | return moneyToNumber(contents) 300 | 301 | def moneyToNumber( arr ): 302 | ret = [] 303 | for a in arr: 304 | if a == '-': 305 | ret.append(0) 306 | else: 307 | ret.append(int(float(a.replace(",", "")) * 1000 )) 308 | 309 | return ret 310 | 311 | def extractDates( lines ): 312 | values = [] 313 | expr = re.compile(r"""(\d{4}-\d{2}-\d{2})""") 314 | for line in lines: 315 | m = expr.search(line) 316 | if m: 317 | values.append( m.group(0) ) 318 | else: 319 | values.append("") 320 | 321 | return values 322 | 323 | def findLinesLike( page, pattern ): 324 | lines = [] 325 | skipped = -1 326 | pattern = re.compile(pattern) 327 | for line in page: 328 | if pattern.search(line): 329 | lines.append( line ) 330 | skipped = 0 331 | elif skipped >= 0: 332 | skipped += 1 333 | if skipped >= 5: 334 | break 335 | return lines 336 | 337 | print("Scraping financials for %s" % symbol) 338 | 339 | # retrieve the web page 340 | url = "http://www.google.com/finance?q=%s&fstype=ii" % symbol 341 | page = self.webCache.get( url ) 342 | soup = BeautifulSoup(page, 'html.parser') 343 | page = page.split('\n') 344 | quarterlyPage = soup.find( "div", { "id" : "incinterimdiv" } ) 345 | annualPage = soup.find( "div", { "id" : "incannualdiv" } ) 346 | 347 | qstr = str(quarterlyPage).split('\n') 348 | astr = str(annualPage).split('\n') 349 | 350 | # Look for "In Millions of". If not there, error! 351 | if not checkPresence( page, "In Millions of" ): 352 | print("While processing %s could not find 'In Millions of' at %s" % (symbol, url), file=sys.stderr) 353 | return False 354 | 355 | # Set multiplier to 1000000 356 | multiplier = 1000000 357 | 358 | # build array of all lines like "3 months Ending" 359 | quarterlyDates = extractDates(findLinesLike( qstr, r"""\d+ (months|weeks) ending""" )) 360 | 361 | # Build array of all lines like "12 months Ending" 362 | annualDates = extractDates(findLinesLike( astr, r"""\d+ (months|weeks) ending""" )) 363 | 364 | # Look for td containing "Total Revenue" 365 | # Extract all td elements in siblings that contain only a number 366 | 367 | # Build table for revenue 368 | quarterlyRevenue = extractRow( quarterlyPage, "Revenue" ) 369 | annualRevenue = extractRow( annualPage, "Revenue" ) 370 | 371 | # Build table for ";Diluted EPS Normalized EPS&" 372 | quarterlyEPS = extractRow( quarterlyPage, "Diluted Normalized EPS" ) 373 | annualEPS = extractRow( annualPage, "Diluted Normalized EPS" ) 374 | 375 | for i in range( len(quarterlyRevenue) ): 376 | self.db.setFinancials( symbol, "QuarterlyRevenue", quarterlyDates[i], 377 | quarterlyRevenue[i] * multiplier ) 378 | self.db.setFinancials( symbol, "QuarterlyEPS", quarterlyDates[i], 379 | quarterlyEPS[i] ) 380 | 381 | for i in range( len(annualRevenue) ): 382 | self.db.setFinancials( symbol, "AnnualRevenue", annualDates[i], 383 | annualRevenue[i] * multiplier ) 384 | self.db.setFinancials( symbol, "AnnualEPS", annualDates[i], 385 | annualEPS[i] ) 386 | 387 | def addProjected( self, symbol, type ): 388 | financials = self.db.getFinancials( symbol, "Quarterly%s" % type ) 389 | if len(financials) < 4: 390 | return 391 | 392 | projected = financials[0][3] + financials[1][3] + financials[2][3] + \ 393 | financials[3][3] 394 | 395 | self.db.setFinancials( symbol, "Projected%s" % type, 0, projected ) 396 | 397 | def addAverageGrowth( self, symbol, type ): 398 | financials = self.db.getFinancials( symbol, "Annual%s" % type ) 399 | avgGrowth = 0.0 400 | if len(financials) > 1: 401 | projected = self.db.getFinancials( symbol, "Projected%s" % type ) 402 | financials.extend( projected ) 403 | financials.reverse() 404 | first = financials[0][3] 405 | count = 0 406 | for val in financials: 407 | if first > 0: 408 | growth = float((val[3] - first)) / first 409 | avgGrowth += growth 410 | count += 1 411 | else: 412 | avgGrowth = 0.0 413 | count = 0 414 | first = val[3] 415 | 416 | if count < 2: 417 | avgGrowth = 0.0 418 | else: 419 | avgGrowth /= count 420 | 421 | self.db.setFinancials( symbol, "Average%sGrowth" % type, 0, 422 | round( avgGrowth * 100 ) ) 423 | 424 | def addYearsOfGrowth( self, symbol, type ): 425 | financials = self.db.getFinancials( symbol, "Annual%s" % type ) 426 | count = 0 427 | if len(financials) > 0: 428 | last = financials[0] 429 | for line in financials[1:]: 430 | if line[3] < last: 431 | count += 1 432 | else: 433 | break 434 | 435 | self.db.setFinancials( symbol, "YearsOf%sGrowth" % type, 0, count ) 436 | 437 | def addPE( self, symbol ): 438 | price = self.db.getPrice( symbol ) 439 | financials = self.db.getFinancials( symbol, "ProjectedEPS" ) 440 | if len(financials) == 0: 441 | return 442 | 443 | earnings = financials[0][3] 444 | if earnings > 0: 445 | pe = round(float(price)/float(earnings) * 10) 446 | else: 447 | pe = 0 448 | 449 | self.db.setFinancials( symbol, "PE", 0, pe ); 450 | 451 | def addExtraInfo( self ): 452 | for company in self.db.getCompanies(): 453 | symbol = company[0] 454 | print("Processing %s... \r" % symbol, end='') 455 | sys.stdout.flush() 456 | self.addProjected(symbol, "EPS") 457 | self.addProjected(symbol, "Revenue") 458 | self.addAverageGrowth( symbol, "EPS" ) 459 | self.addAverageGrowth( symbol, "Revenue" ) 460 | self.addYearsOfGrowth( symbol, "EPS" ) 461 | self.addYearsOfGrowth( symbol, "Revenue" ) 462 | self.addPE( symbol ) 463 | 464 | print() 465 | 466 | def dump(self): 467 | companies = [] 468 | for company in self.db.getCompanies(): 469 | companies.append({ 470 | "symbol": company[0], 471 | "name": company[1], 472 | "industry": company[2] 473 | }) 474 | print(json.dumps(companies, indent=2)) 475 | 476 | def process(self): 477 | stocks = {} 478 | for record in self.db.getEverything(): 479 | symbol = record[0] 480 | company = record[1] 481 | industry = record[2] 482 | type = record[3] 483 | value = record[4] 484 | price = record[5] 485 | if symbol not in stocks: 486 | stock = { "symbol": symbol, 487 | "price": price, 488 | "company": company, 489 | "industry": industry} 490 | stocks[symbol] = stock 491 | else: 492 | stock = stocks[symbol] 493 | 494 | stock[type] = value 495 | 496 | stocks = filter( self.filt, stocks.values() ) 497 | 498 | stocks.sort( key = lambda stock: stock["AverageRevenueGrowth"] ) 499 | self.printTable(stocks) 500 | 501 | def filt(self, stock): 502 | return \ 503 | stock["YearsOfRevenueGrowth"] >= 1 and \ 504 | stock["YearsOfEPSGrowth"] >= 1 and \ 505 | stock["AverageRevenueGrowth"] >= 5 and \ 506 | stock["AverageEPSGrowth"] >= 5 and \ 507 | "PE" in stock and \ 508 | stock["PE"] >= 0 and \ 509 | stock["PE"] <= 50 \ 510 | and stock["ProjectedEPS"] > 0 \ 511 | and stock["industry"].find("Oil") == -1 \ 512 | and stock["industry"].find("Mining") == -1 \ 513 | and stock["industry"].find("Metals") == -1 \ 514 | and stock["industry"].find("Diversified") == -1 \ 515 | and stock["industry"].find("Forestry") == -1 516 | 517 | def printTable(self, stocks): 518 | print("symbol, AverageRevenueGrowth, YearsOfRevenueGrowth, AverageEPSGrowth, YearsOfEPSGrowth, PE, Company") 519 | for stock in stocks: 520 | print(stock["symbol"].ljust(13), end=' ') 521 | print(str(stock["AverageRevenueGrowth"]).ljust(5), end=' ') 522 | print(str(stock["YearsOfRevenueGrowth"]).ljust(3), end=' ') 523 | print(str(stock["AverageEPSGrowth"]).ljust(5), end=' ') 524 | print(str(stock["YearsOfEPSGrowth"]).ljust(3), end=' ') 525 | print(str(stock["PE"]).ljust(5), end=' ') 526 | print(stock["company"]) 527 | 528 | def run(self): 529 | for i in range(1, len(sys.argv)): 530 | if sys.argv[i] == "--companies": 531 | self.scrapeCompanies() 532 | elif sys.argv[i] == "--prices": 533 | self.scrapePrices() 534 | elif sys.argv[i] == "--financials": 535 | self.scrapeFinancials() 536 | elif sys.argv[i] == '--extra': 537 | self.addExtraInfo() 538 | elif sys.argv[i] == "--all": 539 | self.scrapeCompanies() 540 | self.scrapeFinancials() 541 | self.scrapePrices() 542 | self.addExtraInfo() 543 | elif sys.argv[i] == "--test": 544 | self.addPE("tse:g") 545 | elif sys.argv[i] == "--process": 546 | self.process() 547 | elif sys.argv[i] == "--dump": 548 | self.dump() 549 | 550 | 551 | Pleco().run() 552 | 553 | --------------------------------------------------------------------------------