├── doc └── pot_stocks.ods ├── input ├── sql_cmd │ ├── select_notupdated1.txt │ ├── select_notupdated2.txt │ ├── select_notupdated3.txt │ └── clean.txt ├── ms_investment-types.csv ├── pot_stocks.json ├── api.json ├── api00.json ├── symbols.csv └── ctycodes.csv ├── LICENSE ├── main.py ├── README.md ├── dataframes.py ├── sample_rules_output.csv ├── fetch.py └── parse.py /doc/pot_stocks.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Todo/mstables/master/doc/pot_stocks.ods -------------------------------------------------------------------------------- /input/sql_cmd/select_notupdated1.txt: -------------------------------------------------------------------------------- 1 | SELECT 0, '', Tickers.id, Tickers.ticker FROM Tickers LEFT JOIN ( 2 | 3 | SELECT url_id AS UID, ticker_id AS TID, exch_id AS EID FROM Fetched_urls 4 | WHERE UID = {} AND strftime('%Y%W', fetch_date) = strftime('%Y%W', 'now') 5 | 6 | ) on Tickers.id = TID WHERE UID IS NULL 7 | -------------------------------------------------------------------------------- /input/sql_cmd/select_notupdated2.txt: -------------------------------------------------------------------------------- 1 | SELECT Exchanges.id, Exchanges.exchange_sym, Tickers.id, Tickers.ticker FROM Master LEFT JOIN ( 2 | 3 | SELECT url_id AS UID, exch_id AS EID0, ticker_id AS TID0 FROM Fetched_urls 4 | WHERE UID = {} AND strftime('%Y%W', fetch_date) = strftime('%Y%W', 'now') 5 | 6 | ) ON EID0 = Master.exchange_id AND TID0 = Master.ticker_id 7 | JOIN Tickers ON Tickers.id = Master.ticker_id 8 | JOIN Exchanges ON Exchanges.id = Master.exchange_id 9 | WHERE UID IS NULL 10 | -------------------------------------------------------------------------------- /input/sql_cmd/select_notupdated3.txt: -------------------------------------------------------------------------------- 1 | SELECT Exchanges.id, Exchanges.exchange_sym, Tickers.id, Tickers.ticker FROM Master LEFT JOIN ( 2 | 3 | SELECT url_id AS UID, exch_id AS EID0, ticker_id AS TID0 FROM Fetched_urls 4 | WHERE UID = {} AND strftime('%Y%j', fetch_date) = strftime('%Y%j', 'now') 5 | 6 | ) ON EID0 = Master.exchange_id AND TID0 = Master.ticker_id 7 | JOIN Tickers ON Tickers.id = Master.ticker_id 8 | JOIN Exchanges ON Exchanges.id = Master.exchange_id 9 | WHERE UID IS NULL AND Exchanges.exchange_sym in ("XNAS", "XNYS") 10 | -------------------------------------------------------------------------------- /input/sql_cmd/clean.txt: -------------------------------------------------------------------------------- 1 | delete from Tickers where ticker = ''; 2 | delete from Exchanges where exchange_sym = ''; 3 | delete from Master where exchange_id in ( 4 | select Master.exchange_id 5 | from Master left join Exchanges on Master.exchange_id = Exchanges.id 6 | where Exchanges.exchange_sym is null 7 | ); 8 | --UPDATE Fetched_urls SET source_text = NULL WHERE source_text IS NOT NULL; 9 | DELETE FROM Fetched_urls; --WHERE strftime('%Y%W', fetch_date) < strftime('%Y%W', 'now'); 10 | /*DELETE FROM Master 11 | WHERE update_date_id in ( 12 | SELECT id FROM TimeRefs 13 | WHERE substr(dates, 1, 1) = '2' AND length(dates) = 10 AND strftime('%Y%W', dates) < strftime('%Y%W', 'now') 14 | ORDER BY dates DESC 15 | );*/ 16 | -------------------------------------------------------------------------------- /input/ms_investment-types.csv: -------------------------------------------------------------------------------- 1 | 2 | Code,Investment Type 3 | BK,529 Benchmark 4 | PG,529 
Peergroup 5 | CP,529 Plan 6 | CT,529 Portfolio 7 | AG,Aggregate 8 | CA,Category Average 9 | FC,Closed-End Fund 10 | CU,Currency Exchange 11 | SP,Private Fund 12 | UA,Account 13 | EI,Economic Indicators 14 | FE,Exchange-Traded Fund 15 | FG,Euro Fund 16 | F0,Fixed Income 17 | FH,Hedge Fund 18 | H1,HFR Hedge Fund 19 | VS,eVestment Separate Accounts 20 | VH,eVestment Hedge Funds 21 | XI,Index 22 | PS,European Pension/Life Fund Wrappers 23 | FV,Insurance Product Fund 24 | PO,MF Objective 25 | FM,Money Market Fund 26 | FO,Open-End Fund 27 | SA,Separate Account 28 | ST,Stock 29 | V1,UK LP SubAccounts 30 | P1,UK Life and Pension Polices 31 | FI,Unit Investment Trust 32 | VP,VA Policy 33 | VA,VA Subaccount 34 | LP,VL Policy 35 | VL,VL Subaccount 36 | DF,Restricted Investors 37 | IF,Internal Only 38 | S1,UBS Separate Accounts 39 | PI,Special Pooled Funds for Unregistered VA 40 | -------------------------------------------------------------------------------- /input/pot_stocks.json: -------------------------------------------------------------------------------- 1 | [["ACB" , "CAN"], 2 | ["ACRGF" , "USA"], 3 | ["ACRG.U" , "CAN"], 4 | ["CL" , "CAN"], 5 | ["CNNX" , "CAN"], 6 | ["CURA" , "CAN"], 7 | ["CWEB" , "CAN"], 8 | ["EMH" , "CAN"], 9 | ["FIRE" , "CAN"], 10 | ["GGB" , "CAN"], 11 | ["GLH" , "CAN"], 12 | ["GTII" , "CAN"], 13 | ["HARV" , "CAN"], 14 | ["HEXO" , "CAN"], 15 | ["HIP" , "CAN"], 16 | ["IAN" , "CAN"], 17 | ["ISOL" , "CAN"], 18 | ["LHS" , "CAN"], 19 | ["MJAR" , "CAN"], 20 | ["MMEN" , "CAN"], 21 | ["MPXI" , "CAN"], 22 | ["MPXOF", "USA"], 23 | ["N" , "CAN"], 24 | ["OGI" , "CAN"], 25 | ["OH" , "CAN"], 26 | ["PLTH" , "CAN"], 27 | ["RIV" , "CAN"], 28 | ["SNN" , "CAN"], 29 | ["TER" , "CAN"], 30 | ["TGIF" , "CAN"], 31 | ["TGOD" , "CAN"], 32 | ["TILT" , "CAN"], 33 | ["TRST" , "CAN"], 34 | ["TRUL" , "CAN"], 35 | ["VIVO" , "CAN"], 36 | ["WAYL" , "CAN"], 37 | ["XLY" , "CAN"], 38 | ["APHA" , "USA"], 39 | ["CGC" , "USA"], 40 | ["CRON" , "USA"], 41 | ["CVSI" , "USA"], 42 | ["GRWG" , "USA"], 43 | ["GWPH" , "USA"], 44 | ["IIPR" , "USA"], 45 | ["KSHB" , "USA"], 46 | ["MRMD" , "USA"], 47 | ["TLRY" , "USA"], 48 | ["TRTC" , "USA"]] 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Caio Brandao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /input/api.json: -------------------------------------------------------------------------------- 1 | { 2 | "1":"https://www.morningstar.com/api/v2/search/Securities/500/usquote-us/?q={}{}", 3 | "2":"https://www.morningstar.com/api/v2/search/Securities/500/usquote-noneus/?q={}{}", 4 | "3":"https://www.morningstar.com/api/v2/search/securities/500/usquote-v2/?q={}{}", 5 | "4":"http://quotes.morningstar.com/stockq/c-company-profile?t={}:{}", 6 | "5":"http://quotes.morningstar.com/stockq/c-header?t={}:{}", 7 | "6":"http://financials.morningstar.com/valuate/valuation-history.action?type=price-earnings&t={}:{}&culture=en-US&order=asc", 8 | "7":"http://financials.morningstar.com/finan/financials/getKeyStatPart.html?t={}:{}&culture=en-US&order=asc", 9 | "8":"http://financials.morningstar.com/finan/financials/getFinancePart.html?t={}:{}&culture=en-US&order=asc", 10 | "9":"http://performance.morningstar.com/perform/Performance/stock/exportStockPrice.action?t={}:{}&pd=1yr&freq=d&pg=0&culture=en-US", 11 | "10":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=is&period=12&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 12 | "11":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=is&period=3&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 13 | "12":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=cf&period=12&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 14 | "13":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=cf&period=3&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 15 | "14":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=bs&period=12&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 16 | "15":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=bs&period=3&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 17 | "16":"http://insiders.morningstar.com/insiders/trading/insider-activity-data2.action?&t={}:{}®ion=usa&culture=en-US&cur=&yc=1&tc=&pageSize=100&_=1556547256995" 18 | } 19 | -------------------------------------------------------------------------------- /input/api00.json: -------------------------------------------------------------------------------- 1 | { 2 | "0":"https://finance.yahoo.com/quote/{}{}", 3 | "1":"https://www.morningstar.com/api/v2/search/Securities/500/usquote-us/?q={}{}", 4 | "2":"https://www.morningstar.com/api/v2/search/Securities/500/usquote-noneus/?q={}{}", 5 | "3":"https://www.morningstar.com/api/v2/search/securities/500/usquote-v2/?q={}{}", 6 | "4":"http://quotes.morningstar.com/stockq/c-company-profile?t={}:{}", 7 | "5":"http://quotes.morningstar.com/stockq/c-header?t={}:{}", 8 | "6":"http://financials.morningstar.com/valuate/valuation-history.action?type=price-earnings&t={}:{}&culture=en-US&order=asc", 9 | "7":"http://financials.morningstar.com/finan/financials/getKeyStatPart.html?t={}:{}&culture=en-US&order=asc", 10 | "8":"http://financials.morningstar.com/finan/financials/getFinancePart.html?t={}:{}&culture=en-US&order=asc", 11 | 
"9":"http://performance.morningstar.com/perform/Performance/stock/exportStockPrice.action?t={}:{}&pd=1yr&freq=d&pg=0&culture=en-US", 12 | "10":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=is&period=12&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 13 | "11":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=is&period=3&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 14 | "12":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=cf&period=12&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 15 | "13":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=cf&period=3&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 16 | "14":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=bs&period=12&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 17 | "15":"http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t={}:{}®ion=usa&culture=en-US&reportType=bs&period=3&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1", 18 | "16":"http://insiders.morningstar.com/insiders/trading/insider-activity-data2.action?&t={}:{}®ion=usa&culture=en-US&cur=&yc=1&tc=&pageSize=100&_=1556547256995" 19 | } 20 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from shutil import copyfile 4 | from datetime import datetime 5 | from importlib import reload 6 | import fetch, time, os, re, sqlite3 7 | 8 | __author__ = "Caio Brandao" 9 | __copyright__ = "Copyright 2019+, Caio Brandao" 10 | __license__ = "MIT" 11 | __version__ = "0.0" 12 | __maintainer__ = "Caio Brandao" 13 | __email__ = "caiobran88@gmail.com" 14 | 15 | 16 | # Create back-up file under /db/backup 17 | def backup_db(file): 18 | #today = datetime.today().strftime('%Y%m%d%H') 19 | new_file = db_file['db_backup'].format( 20 | input('Enter back-up file name:\n')) 21 | fetch.print_('Please wait while the database file is backed-up ...') 22 | copyfile(db_file['path'], new_file) 23 | return '\n~ Back-up file saved\t{}'.format(new_file) 24 | 25 | 26 | # Change variable for .sqlite file name based on user input 27 | def change_name(old_name): 28 | msg = 'Existing database files in directory \'db/\': {}\n' 29 | msg += 'Enter new name for .sqlite file (current = \'{}\'):\n' 30 | fname = lambda x: re.sub('.sqlite', '', x) 31 | files = [fname(f) for f in os.listdir('db/') if '.sqlite' in f] 32 | return input(msg.format(files, old_name)) 33 | 34 | 35 | # Print options menu 36 | def print_menu(names): 37 | gap = 22 38 | dash = '=' 39 | banner = ' Welcome to msTables ' 40 | file = '\'{}.sqlite\''.format(db_file['name']) 41 | menu = { 42 | '0' : 'Change database file name (current name = {})'.format(file), 43 | '1' : 'Create database tables and import latest symbols', 44 | '2' : 'Download Morningstar data into database', 45 | '3' : 'Erase all records from database tables', 46 | '4' : 'Delete all database tables', 47 | '5' : 'Erase all downloaded history from \'Fetched_urls\' table', 48 | #'X' : 'Parse (FOR TESTING PURPOSES)', 49 | '6' : 'Create a database back-up file' 
50 | } 51 | 52 | print(dash * (len(banner) + gap * 2)) 53 | print('{}{}{}'.format(dash * gap, banner, dash * gap)) 54 | print('\nAvailable actions:\n') 55 | for k, v in menu.items(): 56 | print(k, '-', v) 57 | print('\n' + dash * (len(banner) + gap * 2)) 58 | 59 | return menu 60 | 61 | 62 | # Print command line menu for user input 63 | def main(file): 64 | while True: 65 | 66 | # Print menu and capture user selection 67 | ops = print_menu(file) 68 | while True: 69 | try: 70 | inp0 = input('Enter action no.:\n').strip() 71 | break 72 | except KeyboardInterrupt: 73 | print('\nGoodbye!') 74 | exit() 75 | if inp0 not in ops.keys(): break 76 | reload(fetch) #Comment out after development 77 | start = time.time() 78 | inp = int(inp0) 79 | ans = 'y' 80 | 81 | # Ask user to confirm selection if input > 2 82 | if inp > 2: 83 | msg = '\nAre you sure you would like to {}? (Y/n):\n' 84 | ans = input(msg.format(ops[inp0].upper())).lower() 85 | 86 | # Call function according to user input 87 | if ans == 'y': 88 | print() 89 | try: 90 | # Change db file name 91 | if inp == 0: 92 | db_file['name'] = change_name(db_file['name']) 93 | start = time.time() 94 | db_file['path'] = db_file['npath'].format(db_file['name']) 95 | msg = ('~ Database file \'{}\' selected' 96 | .format(db_file['name'])) 97 | 98 | # Create database tables 99 | elif inp == 1: 100 | msg = fetch.create_tables(db_file['path']) 101 | 102 | # Download data from urls listed in api.json 103 | elif inp == 2: 104 | start = fetch.fetch(db_file['path']) 105 | msg = '\n~ Database updated successfully' 106 | 107 | # Erase records from all tables 108 | elif inp == 3: 109 | msg = fetch.erase_tables(db_file['path']) 110 | 111 | # Delete all tables 112 | elif inp == 4: 113 | msg = fetch.delete_tables(db_file['path']) 114 | 115 | # Delete Fetched_urls table records 116 | elif inp == 5: 117 | msg = fetch.del_fetch_history(db_file['path']) 118 | 119 | # Back-up database file 120 | elif inp == int(list(ops.keys())[-1]): 121 | msg = backup_db(db_file) 122 | 123 | # TESTING 124 | elif inp == 99: 125 | fetch.parse.parse(db_file['path']) 126 | msg = 'FINISHED' 127 | # except sqlite3.OperationalError as S: 128 | # msg = '### Error message - {}'.format(S) + \ 129 | # '\n### Scroll up for more details. If table does not ' + \ 130 | # 'exist, make sure to execute action 1 before choosing' + \ 131 | # ' other actions.' 
132 | # pass 133 | # except KeyboardInterrupt: 134 | # print('\nGoodbye!') 135 | # exit() 136 | except Exception as e: 137 | print('\a') 138 | #print('\n\n### Error @ main.py:\n {}\n'.format(e)) 139 | raise 140 | 141 | # Print output message 142 | #os.system('clear') 143 | print(msg) 144 | 145 | # Calculate and print execution time 146 | end = time.time() 147 | print('\n~ Execution Time\t{:.2f} sec\n'.format(end - start)) 148 | else: 149 | os.system('clear') 150 | 151 | 152 | # Define database (db) file and menu text variables 153 | db_file = dict() 154 | db_file['npath'] = 'db/{}.sqlite' 155 | db_file['name'] = 'mstables' 156 | db_file['path'] = db_file['npath'].format(db_file['name']) 157 | db_file['db_backup'] = 'db/backup/{}.sqlite' 158 | 159 | if __name__ == '__main__': 160 | os.system('clear') 161 | main(db_file) 162 | print('Goodbye!\n\n') 163 | -------------------------------------------------------------------------------- /input/symbols.csv: -------------------------------------------------------------------------------- 1 | "Country and Currency","Currency Code","Graphic Image ","Font: Code2000","Font: Arial Unicode MS","Unicode: Decimal","Unicode: Hex"," " 2 | "Albania Lek","ALL","","Lek","Lek","76, 101, 107","4c, 65, 6b"," " 3 | "Afghanistan Afghani","AFN","","؋","؋","1547","60b"," " 4 | "Argentina Peso","ARS","","$","$","36","24"," info" 5 | "Aruba Guilder","AWG","","ƒ","ƒ","402","192"," " 6 | "Australia Dollar","AUD","","$","$","36","24"," " 7 | "Azerbaijan Manat","AZN","","₼","₼","8380","20bc"," " 8 | "Bahamas Dollar","BSD","","$","$","36","24"," " 9 | "Barbados Dollar","BBD","","$","$","36","24"," " 10 | "Belarus Ruble","BYN","","Br","Br","66, 114","42, 72"," " 11 | "Belize Dollar","BZD","","BZ$","BZ$","66, 90, 36","42, 5a, 24"," " 12 | "Bermuda Dollar","BMD","","$","$","36","24"," " 13 | "Bolivia Bolíviano","BOB","","$b","$b","36, 98","24, 62"," " 14 | "Bosnia and Herzegovina Convertible Marka","BAM","","KM","KM","75, 77","4b, 4d"," " 15 | "Botswana Pula","BWP","","P","P","80","50"," " 16 | "Bulgaria Lev","BGN","","лв","лв","1083, 1074","43b, 432"," " 17 | "Brazil Real","BRL","","R$","R$","82, 36","52, 24"," info" 18 | "Brunei Darussalam Dollar","BND","","$","$","36","24"," " 19 | "Cambodia Riel","KHR","","៛","៛","6107","17db"," " 20 | "Canada Dollar","CAD","","$","$","36","24"," " 21 | "Cayman Islands Dollar","KYD","","$","$","36","24"," " 22 | "Chile Peso","CLP","","$","$","36","24"," info" 23 | "China Yuan Renminbi","CNY","","¥","¥","165","a5"," info" 24 | "China Yuan Renminbi","RMB","","¥","¥","165","a5"," info" 25 | "Colombia Peso","COP","","$","$","36","24"," " 26 | "Costa Rica Colon","CRC","","₡","₡","8353","20a1"," " 27 | "Croatia Kuna","HRK","","kn","kn","107, 110","6b, 6e"," " 28 | "Cuba Peso","CUP","","₱","₱","8369","20b1"," " 29 | "Czech Republic Koruna","CZK","","Kč","Kč","75, 269","4b, 10d"," " 30 | "Denmark Krone","DKK","","kr","kr","107, 114","6b, 72"," info" 31 | "Dominican Republic Peso","DOP","","RD$","RD$","82, 68, 36","52, 44, 24"," " 32 | "East Caribbean Dollar","XCD","","$","$","36","24"," " 33 | "Egypt Pound","EGP","","£","£","163","a3"," " 34 | "El Salvador Colon","SVC","","$","$","36","24"," " 35 | "Euro Member Countries","EUR","","€","€","8364","20ac"," " 36 | "Falkland Islands (Malvinas) Pound","FKP","","£","£","163","a3"," " 37 | "Fiji Dollar","FJD","","$","$","36","24"," " 38 | "Ghana Cedi","GHS","","¢","¢","162","a2"," " 39 | "Gibraltar Pound","GIP","","£","£","163","a3"," " 40 | "Guatemala Quetzal","GTQ","","Q","Q","81","51"," " 41 | 
"Guernsey Pound","GGP","","£","£","163","a3"," " 42 | "Guyana Dollar","GYD","","$","$","36","24"," " 43 | "Honduras Lempira","HNL","","L","L","76","4c"," " 44 | "Hong Kong Dollar","HKD","","$","$","36","24"," info" 45 | "Hungary Forint","HUF","","Ft","Ft","70, 116","46, 74"," " 46 | "Iceland Krona","ISK","","kr","kr","107, 114","6b, 72"," " 47 | "India Rupee","INR","","","","",""," info" 48 | "Indonesia Rupiah","IDR","","Rp","Rp","82, 112","52, 70"," " 49 | "Iran Rial","IRR","","﷼","﷼","65020","fdfc"," " 50 | "Isle of Man Pound","IMP","","£","£","163","a3"," " 51 | "Israel Shekel","ILS","","₪","₪","8362","20aa"," " 52 | "Jamaica Dollar","JMD","","J$","J$","74, 36","4a, 24"," " 53 | "Japan Yen","JPY","","¥","¥","165","a5"," info" 54 | "Jersey Pound","JEP","","£","£","163","a3"," " 55 | "Kazakhstan Tenge","KZT","","лв","лв","1083, 1074","43b, 432"," " 56 | "Korea (North) Won","KPW","","₩","₩","8361","20a9"," " 57 | "Korea (South) Won","KRW","","₩","₩","8361","20a9"," " 58 | "Kyrgyzstan Som","KGS","","лв","лв","1083, 1074","43b, 432"," " 59 | "Laos Kip","LAK","","₭","₭","8365","20ad"," " 60 | "Lebanon Pound","LBP","","£","£","163","a3"," " 61 | "Liberia Dollar","LRD","","$","$","36","24"," " 62 | "Macedonia Denar","MKD","","ден","ден","1076, 1077, 1085","434, 435, 43d"," " 63 | "Malaysia Ringgit","MYR","","RM","RM","82, 77","52, 4d"," " 64 | "Mauritius Rupee","MUR","","₨","₨","8360","20a8"," " 65 | "Mexico Peso","MXN","","$","$","36","24"," info" 66 | "Mongolia Tughrik","MNT","","₮","₮","8366","20ae"," " 67 | "Mozambique Metical","MZN","","MT","MT","77, 84","4d, 54"," " 68 | "Namibia Dollar","NAD","","$","$","36","24"," " 69 | "Nepal Rupee","NPR","","₨","₨","8360","20a8"," " 70 | "Netherlands Antilles Guilder","ANG","","ƒ","ƒ","402","192"," " 71 | "New Zealand Dollar","NZD","","$","$","36","24"," " 72 | "Nicaragua Cordoba","NIO","","C$","C$","67, 36","43, 24"," " 73 | "Nigeria Naira","NGN","","₦","₦","8358","20a6"," " 74 | "Norway Krone","NOK","","kr","kr","107, 114","6b, 72"," " 75 | "Oman Rial","OMR","","﷼","﷼","65020","fdfc"," " 76 | "Pakistan Rupee","PKR","","₨","₨","8360","20a8"," " 77 | "Panama Balboa","PAB","","B/.","B/.","66, 47, 46","42, 2f, 2e"," " 78 | "Paraguay Guarani","PYG","","Gs","Gs","71, 115","47, 73"," " 79 | "Peru Sol","PEN","","S/.","S/.","83, 47, 46","53, 2f, 2e"," info" 80 | "Philippines Peso","PHP","","₱","₱","8369","20b1"," " 81 | "Poland Zloty","PLN","","zł","zł","122, 322","7a, 142"," " 82 | "Qatar Riyal","QAR","","﷼","﷼","65020","fdfc"," " 83 | "Romania Leu","RON","","lei","lei","108, 101, 105","6c, 65, 69"," " 84 | "Russia Ruble","RUB","","₽","₽","8381","20bd"," " 85 | "Saint Helena Pound","SHP","","£","£","163","a3"," " 86 | "Saudi Arabia Riyal","SAR","","﷼","﷼","65020","fdfc"," " 87 | "Serbia Dinar","RSD","","Дин.","Дин.","1044, 1080, 1085, 46","414, 438, 43d, 2e"," " 88 | "Seychelles Rupee","SCR","","₨","₨","8360","20a8"," " 89 | "Singapore Dollar","SGD","","$","$","36","24"," " 90 | "Solomon Islands Dollar","SBD","","$","$","36","24"," " 91 | "Somalia Shilling","SOS","","S","S","83","53"," " 92 | "South Africa Rand","ZAR","","R","R","82","52"," " 93 | "Sri Lanka Rupee","LKR","","₨","₨","8360","20a8"," " 94 | "Sweden Krona","SEK","","kr","kr","107, 114","6b, 72"," info" 95 | "Switzerland Franc","CHF","","CHF","CHF","67, 72, 70","43, 48, 46"," " 96 | "Suriname Dollar","SRD","","$","$","36","24"," " 97 | "Syria Pound","SYP","","£","£","163","a3"," " 98 | "Taiwan New Dollar","TWD","","NT$","NT$","78, 84, 36","4e, 54, 24"," info" 99 | "Thailand 
Baht","THB","","฿","฿","3647","e3f"," " 100 | "Trinidad and Tobago Dollar","TTD","","TT$","TT$","84, 84, 36","54, 54, 24"," " 101 | "Turkey Lira","TRY","","","","",""," info" 102 | "Tuvalu Dollar","TVD","","$","$","36","24"," " 103 | "Ukraine Hryvnia","UAH","","₴","₴","8372","20b4"," " 104 | "United Kingdom Pound","GBP","","£","£","163","a3"," " 105 | "United States Dollar","USD","","$","$","36","24"," " 106 | "Uruguay Peso","UYU","","$U","$U","36, 85","24, 55"," " 107 | "Uzbekistan Som","UZS","","лв","лв","1083, 1074","43b, 432"," " 108 | "Venezuela Bolívar","VEF","","Bs","Bs","66, 115","42, 73"," " 109 | "Viet Nam Dong","VND","","₫","₫","8363","20ab"," " 110 | "Yemen Rial","YER","","﷼","﷼","65020","fdfc"," " 111 | "Zimbabwe Dollar","ZWD","","Z$","Z$","90, 36","5a, 24"," " 112 | -------------------------------------------------------------------------------- /input/ctycodes.csv: -------------------------------------------------------------------------------- 1 | "COUNTRY","A2 (ISO)","A3 (UN)","NUM (UN)","DIALING CODE" 2 | "Afghanistan","AF","AFG","4","93" 3 | "Albania","AL","ALB","8","355" 4 | "Algeria","DZ","DZA","12","213" 5 | "American Samoa","AS","ASM","16","1-684" 6 | "Andorra","AD","AND","20","376" 7 | "Angola","AO","AGO","24","244" 8 | "Anguilla","AI","AIA","660","1-264" 9 | "Antarctica","AQ","ATA","10","672" 10 | "Antigua and Barbuda","AG","ATG","28","1-268" 11 | "Argentina","AR","ARG","32","54" 12 | "Armenia","AM","ARM","51","374" 13 | "Aruba","AW","ABW","533","297" 14 | "Australia","AU","AUS","36","61" 15 | "Austria","AT","AUT","40","43" 16 | "Azerbaijan","AZ","AZE","31","994" 17 | "Bahamas","BS","BHS","44","1-242" 18 | "Bahrain","BH","BHR","48","973" 19 | "Bangladesh","BD","BGD","50","880" 20 | "Barbados","BB","BRB","52","1-246" 21 | "Belarus","BY","BLR","112","375" 22 | "Belgium","BE","BEL","56","32" 23 | "Belize","BZ","BLZ","84","501" 24 | "Benin","BJ","BEN","204","229" 25 | "Bermuda","BM","BMU","60","1-441" 26 | "Bhutan","BT","BTN","64","975" 27 | "Bolivia","BO","BOL","68","591" 28 | "Bonaire","BQ","BES","535","599" 29 | "Bosnia and Herzegovina","BA","BIH","70","387" 30 | "Botswana","BW","BWA","72","267" 31 | "Bouvet Island","BV","BVT","74","47" 32 | "Brazil","BR","BRA","76","55" 33 | "British Indian Ocean Territory","IO","IOT","86","246" 34 | "Brunei Darussalam","BN","BRN","96","673" 35 | "Bulgaria","BG","BGR","100","359" 36 | "Burkina Faso","BF","BFA","854","226" 37 | "Burundi","BI","BDI","108","257" 38 | "Cambodia","KH","KHM","116","855" 39 | "Cameroon","CM","CMR","120","237" 40 | "Canada","CA","CAN","124","1" 41 | "Cape Verde","CV","CPV","132","238" 42 | "Cayman Islands","KY","CYM","136","1-345" 43 | "Central African Republic","CF","CAF","140","236" 44 | "Chad","TD","TCD","148","235" 45 | "Chile","CL","CHL","152","56" 46 | "China","CN","CHN","156","86" 47 | "Christmas Island","CX","CXR","162","61" 48 | "Cocos (Keeling) Islands","CC","CCK","166","61" 49 | "Colombia","CO","COL","170","57" 50 | "Comoros","KM","COM","174","269" 51 | "Congo","CG","COG","178","242" 52 | "Democratic Republic of the Congo","CD","COD","180","243" 53 | "Cook Islands","CK","COK","184","682" 54 | "Costa Rica","CR","CRI","188","506" 55 | "Croatia","HR","HRV","191","385" 56 | "Cuba","CU","CUB","192","53" 57 | "Curacao","CW","CUW","531","599" 58 | "Cyprus","CY","CYP","196","357" 59 | "Czech Republic","CZ","CZE","203","420" 60 | "Cote d'Ivoire","CI","CIV","384","225" 61 | "Denmark","DK","DNK","208","45" 62 | "Djibouti","DJ","DJI","262","253" 63 | "Dominica","DM","DMA","212","1-767" 64 | "Dominican 
Republic","DO","DOM","214","1-809,1-829,1-849" 65 | "Ecuador","EC","ECU","218","593" 66 | "Egypt","EG","EGY","818","20" 67 | "El Salvador","SV","SLV","222","503" 68 | "Equatorial Guinea","GQ","GNQ","226","240" 69 | "Eritrea","ER","ERI","232","291" 70 | "Estonia","EE","EST","233","372" 71 | "Ethiopia","ET","ETH","231","251" 72 | "Falkland Islands (Malvinas)","FK","FLK","238","500" 73 | "Faroe Islands","FO","FRO","234","298" 74 | "Fiji","FJ","FJI","242","679" 75 | "Finland","FI","FIN","246","358" 76 | "France","FR","FRA","250","33" 77 | "French Guiana","GF","GUF","254","594" 78 | "French Polynesia","PF","PYF","258","689" 79 | "French Southern Territories","TF","ATF","260","262" 80 | "Gabon","GA","GAB","266","241" 81 | "Gambia","GM","GMB","270","220" 82 | "Georgia","GE","GEO","268","995" 83 | "Germany","DE","DEU","276","49" 84 | "Ghana","GH","GHA","288","233" 85 | "Gibraltar","GI","GIB","292","350" 86 | "Greece","GR","GRC","300","30" 87 | "Greenland","GL","GRL","304","299" 88 | "Grenada","GD","GRD","308","1-473" 89 | "Guadeloupe","GP","GLP","312","590" 90 | "Guam","GU","GUM","316","1-671" 91 | "Guatemala","GT","GTM","320","502" 92 | "Guernsey","GG","GGY","831","44" 93 | "Guinea","GN","GIN","324","224" 94 | "Guinea-Bissau","GW","GNB","624","245" 95 | "Guyana","GY","GUY","328","592" 96 | "Haiti","HT","HTI","332","509" 97 | "Heard Island and McDonald Islands","HM","HMD","334","672" 98 | "Holy See (Vatican City State)","VA","VAT","336","379" 99 | "Honduras","HN","HND","340","504" 100 | "Hong Kong","HK","HKG","344","852" 101 | "Hungary","HU","HUN","348","36" 102 | "Iceland","IS","ISL","352","354" 103 | "India","IN","IND","356","91" 104 | "Indonesia","ID","IDN","360","62" 105 | "Iran, Islamic Republic of","IR","IRN","364","98" 106 | "Iraq","IQ","IRQ","368","964" 107 | "Ireland","IE","IRL","372","353" 108 | "Isle of Man","IM","IMN","833","44" 109 | "Israel","IL","ISR","376","972" 110 | "Italy","IT","ITA","380","39" 111 | "Jamaica","JM","JAM","388","1-876" 112 | "Japan","JP","JPN","392","81" 113 | "Jersey","JE","JEY","832","44" 114 | "Jordan","JO","JOR","400","962" 115 | "Kazakhstan","KZ","KAZ","398","7" 116 | "Kenya","KE","KEN","404","254" 117 | "Kiribati","KI","KIR","296","686" 118 | "Korea, Democratic People's Republic of","KP","PRK","408","850" 119 | "Korea, Republic of","KR","KOR","410","82" 120 | "Kuwait","KW","KWT","414","965" 121 | "Kyrgyzstan","KG","KGZ","417","996" 122 | "Lao People's Democratic Republic","LA","LAO","418","856" 123 | "Latvia","LV","LVA","428","371" 124 | "Lebanon","LB","LBN","422","961" 125 | "Lesotho","LS","LSO","426","266" 126 | "Liberia","LR","LBR","430","231" 127 | "Libya","LY","LBY","434","218" 128 | "Liechtenstein","LI","LIE","438","423" 129 | "Lithuania","LT","LTU","440","370" 130 | "Luxembourg","LU","LUX","442","352" 131 | "Macao","MO","MAC","446","853" 132 | "Macedonia, the Former Yugoslav Republic of","MK","MKD","807","389" 133 | "Madagascar","MG","MDG","450","261" 134 | "Malawi","MW","MWI","454","265" 135 | "Malaysia","MY","MYS","458","60" 136 | "Maldives","MV","MDV","462","960" 137 | "Mali","ML","MLI","466","223" 138 | "Malta","MT","MLT","470","356" 139 | "Marshall Islands","MH","MHL","584","692" 140 | "Martinique","MQ","MTQ","474","596" 141 | "Mauritania","MR","MRT","478","222" 142 | "Mauritius","MU","MUS","480","230" 143 | "Mayotte","YT","MYT","175","262" 144 | "Mexico","MX","MEX","484","52" 145 | "Micronesia, Federated States of","FM","FSM","583","691" 146 | "Moldova, Republic of","MD","MDA","498","373" 147 | "Monaco","MC","MCO","492","377" 148 | 
"Mongolia","MN","MNG","496","976" 149 | "Montenegro","ME","MNE","499","382" 150 | "Montserrat","MS","MSR","500","1-664" 151 | "Morocco","MA","MAR","504","212" 152 | "Mozambique","MZ","MOZ","508","258" 153 | "Myanmar","MM","MMR","104","95" 154 | "Namibia","NA","NAM","516","264" 155 | "Nauru","NR","NRU","520","674" 156 | "Nepal","NP","NPL","524","977" 157 | "Netherlands","NL","NLD","528","31" 158 | "New Caledonia","NC","NCL","540","687" 159 | "New Zealand","NZ","NZL","554","64" 160 | "Nicaragua","NI","NIC","558","505" 161 | "Niger","NE","NER","562","227" 162 | "Nigeria","NG","NGA","566","234" 163 | "Niue","NU","NIU","570","683" 164 | "Norfolk Island","NF","NFK","574","672" 165 | "Northern Mariana Islands","MP","MNP","580","1-670" 166 | "Norway","NO","NOR","578","47" 167 | "Oman","OM","OMN","512","968" 168 | "Pakistan","PK","PAK","586","92" 169 | "Palau","PW","PLW","585","680" 170 | "Palestine, State of","PS","PSE","275","970" 171 | "Panama","PA","PAN","591","507" 172 | "Papua New Guinea","PG","PNG","598","675" 173 | "Paraguay","PY","PRY","600","595" 174 | "Peru","PE","PER","604","51" 175 | "Philippines","PH","PHL","608","63" 176 | "Pitcairn","PN","PCN","612","870" 177 | "Poland","PL","POL","616","48" 178 | "Portugal","PT","PRT","620","351" 179 | "Puerto Rico","PR","PRI","630","1" 180 | "Qatar","QA","QAT","634","974" 181 | "Romania","RO","ROU","642","40" 182 | "Russian Federation","RU","RUS","643","7" 183 | "Rwanda","RW","RWA","646","250" 184 | "Reunion","RE","REU","638","262" 185 | "Saint Barthelemy","BL","BLM","652","590" 186 | "Saint Helena","SH","SHN","654","290" 187 | "Saint Kitts and Nevis","KN","KNA","659","1-869" 188 | "Saint Lucia","LC","LCA","662","1-758" 189 | "Saint Martin (French part)","MF","MAF","663","590" 190 | "Saint Pierre and Miquelon","PM","SPM","666","508" 191 | "Saint Vincent and the Grenadines","VC","VCT","670","1-784" 192 | "Samoa","WS","WSM","882","685" 193 | "San Marino","SM","SMR","674","378" 194 | "Sao Tome and Principe","ST","STP","678","239" 195 | "Saudi Arabia","SA","SAU","682","966" 196 | "Senegal","SN","SEN","686","221" 197 | "Serbia","RS","SRB","688","381" 198 | "Seychelles","SC","SYC","690","248" 199 | "Sierra Leone","SL","SLE","694","232" 200 | "Singapore","SG","SGP","702","65" 201 | "Sint Maarten (Dutch part)","SX","SXM","534","1-721" 202 | "Slovakia","SK","SVK","703","421" 203 | "Slovenia","SI","SVN","705","386" 204 | "Solomon Islands","SB","SLB","90","677" 205 | "Somalia","SO","SOM","706","252" 206 | "South Africa","ZA","ZAF","710","27" 207 | "South Georgia and the South Sandwich Islands","GS","SGS","239","500" 208 | "South Sudan","SS","SSD","728","211" 209 | "Spain","ES","ESP","724","34" 210 | "Sri Lanka","LK","LKA","144","94" 211 | "Sudan","SD","SDN","729","249" 212 | "Suriname","SR","SUR","740","597" 213 | "Svalbard and Jan Mayen","SJ","SJM","744","47" 214 | "Swaziland","SZ","SWZ","748","268" 215 | "Sweden","SE","SWE","752","46" 216 | "Switzerland","CH","CHE","756","41" 217 | "Syrian Arab Republic","SY","SYR","760","963" 218 | "Taiwan","TW","TWN","158","886" 219 | "Tajikistan","TJ","TJK","762","992" 220 | "United Republic of Tanzania","TZ","TZA","834","255" 221 | "Thailand","TH","THA","764","66" 222 | "Timor-Leste","TL","TLS","626","670" 223 | "Togo","TG","TGO","768","228" 224 | "Tokelau","TK","TKL","772","690" 225 | "Tonga","TO","TON","776","676" 226 | "Trinidad and Tobago","TT","TTO","780","1-868" 227 | "Tunisia","TN","TUN","788","216" 228 | "Turkey","TR","TUR","792","90" 229 | "Turkmenistan","TM","TKM","795","993" 230 | "Turks and Caicos 
Islands","TC","TCA","796","1-649" 231 | "Tuvalu","TV","TUV","798","688" 232 | "Uganda","UG","UGA","800","256" 233 | "Ukraine","UA","UKR","804","380" 234 | "United Arab Emirates","AE","ARE","784","971" 235 | "United Kingdom","GB","GBR","826","44" 236 | "United States","US","USA","840","1" 237 | "United States Minor Outlying Islands","UM","UMI","581","1" 238 | "Uruguay","UY","URY","858","598" 239 | "Uzbekistan","UZ","UZB","860","998" 240 | "Vanuatu","VU","VUT","548","678" 241 | "Venezuela","VE","VEN","862","58" 242 | "Viet Nam","VN","VNM","704","84" 243 | "British Virgin Islands","VG","VGB","92","1-284" 244 | "US Virgin Islands","VI","VIR","850","1-340" 245 | "Wallis and Futuna","WF","WLF","876","681" 246 | "Western Sahara","EH","ESH","732","212" 247 | "Yemen","YE","YEM","887","967" 248 | "Zambia","ZM","ZMB","894","260" 249 | "Zimbabwe","ZW","ZWE","716","263" 250 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mstables 2 | ======== 3 | 4 | msTables is a [MorningStar.com](https://www.morningstar.com) scraper written in python that fetches, parses and stores financial and market data for over 70k securities into a relational SQLite database. The scraper provides a Command Line Interface (CLI) that allows the user greater flexibility for creating and managing multiple *.sqlite* files. Once data has been downloaded into the database files, [dataframes.py](dataframes.py) module can be used to easily create DataFrame objects from the database tables for further analysis. 5 | 6 | The scraper should work as long as the structure of the responses does not change for the URL's used. See [input/api.json](input/api.json) for the complete list of URL's. 7 | 8 | IMPORTANT: The Morningstar.com data is protected under "Copyright (c) 2018 Morningstar. All rights reserved." This tool should be for personal purposes only. See the following links for more information regarding Morningstar.com terms & conditions: 9 | - [Copyright][2] 10 | - [User Agreement][3] 11 | 12 | ## Motivation 13 | As a fan of [Benjamin Graham](https://en.wikipedia.org/wiki/Benjamin_Graham)'s [value investing](https://en.wikipedia.org/wiki/Value_investing), I have always searched for sources of consolidated financial data that would allow me to identify 'undervalued' companies from a large pool of global public stocks. However, most *(if not all)* financial services that provide such data consolidation are not free and, as a small retail investor, I was not willing to pay for their fees. In fact, most of the data I needed was already available for free on various financial website, just not in a consolidated format. Therefore, I decided to create a web scraper for [MorningStar.com](https://www.morningstar.com), which is the website that I found to have the most available data in a more standardized and structured format. MS was also one of the only website services that published free financial performance data for the past 10 yrs, while most sites only provided free data for last 5 yrs. 
14 | 15 | ## Next steps 16 | - Finalize instructions for the scraper CLI 17 | 18 | 19 | Instructions 20 | ------------ 21 | 22 | ### Program Requirements 23 | The scraper should run on any Linux distribution that has Python3 and the following modules installed: 24 | 25 | - [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/) 26 | - [requests](http://docs.python-requests.org/en/master/) 27 | - [sqlite3](https://docs.python.org/3/library/sqlite3.html) 28 | - [pandas](https://pandas.pydata.org/) 29 | - [numpy](http://www.numpy.org/) 30 | - [multiprocessing](https://docs.python.org/3/library/multiprocessing.html?highlight=multiprocessing#module-multiprocessing) 31 | 32 | To view the [notebook with data visualization examples][1] mentioned in the instructions below, you must also have [Jupyter](https://jupyter.org/) and [matplotlib](https://matplotlib.org/) installed. 33 | 34 | ### Installation 35 | Open a Linux terminal in the desired installation directory and execute `git clone https://github.com/caiobran/msTables.git` to download the project files. 36 | 37 | ### Using the scraper Command Line Interface (CLI) 38 | 39 | Execute `python main.py` from the project root directory to start the scraper CLI. If the program has started correctly, you should see the following interface: 40 | 41 | 42 | 43 | 1. If you are running the scraper for the first time, enter option `1` to create the initial SQLite database tables. 44 | 2. Once that action has been completed, and on subsequent runs, enter option `2` to download the latest data from the MorningStar [URL's](input/api.json). 45 | - You will be prompted to enter the number of records you would like to update. You can enter a large number such as `1000000` if you would like the scraper to update all records. You may also enter smaller quantities if you do not want the scraper to run for a long period of time. 46 | - On average, it has taken about three days to update all records with the current program parameters and an Internet speed > 100mbps. The program can be interrupted at any time using Ctrl+C. 47 | - One may want to increase the size of the multiprocessing pool in [main.py](main.py) that is used for URL requests to speed up the scraper. *However, I do not recommend doing that as the MorningStar servers will not be too happy about receiving many simultaneous GET requests from the same IP address.* 48 | 49 | *(documentation in progress, to be updated with instructions on remaining actions)* 50 | 51 | ### How to access the SQLite database tables using module _dataframes.py_ 52 | The scraper will automatically create a directory *db/* in the root folder to store the *.sqlite* database files generated. The current file name in use will be displayed on the scraper CLI under action `0` (see CLI figure above). Database files will contain a relational database with the following main tables: 53 | 54 | **Database Tables** 55 | 56 | - _**Master**_: Main bridge table with complete list of security and exchange symbol pairs, security name, sector, industry, security type, and FY end dates 57 | - _**MSheader**_: Quote Summary data with day hi, day lo, 52wk hi, 52wk lo, forward P/E, div. 
yield, volumes, and current P/B, P/S, and P/CF ratios 58 | - _**MSvaluation**_: 10yr stock valuation indicators (P/E, P/S, P/B, P/C) 59 | - _**MSfinancials**_: Key performance ratios for past 10 yrs 60 | - _**MSratio_cashflow**_, _**MSratio_financial**_, _**MSratio_growth**_, _**MSratio_profitability**_, _**MSratio_efficiency**_: Financial performance ratios for past 10 yrs 61 | - _**MSreport_is_yr**_, _**MSreport_is_qt**_: Income Statements for past 5 yrs and 5 qtrs, respectively 62 | - _**MSreport_bs_yr**_, _**MSreport_bs_qt**_: Balance Sheets for past 5 yrs and 5 qtrs, respectively 63 | - _**MSreport_cf_yr**_, _**MSreport_cf_qt**_: Cash Flow Statements for past 5 yrs and 5 qtrs, respectively 64 | - _**MSpricehistory**_: Current 50, 100 and 200 day price averages and 10 year price history (price history is compressed) 65 | - _**InsiderTransactions**_: Insider transactions for the past year from [http://insiders.morningstar.com](http://insiders.morningstar.com) (+600k transactions) 66 | 67 | **How to slice and dice the data using dataframes.py** 68 | 69 | Module _dataframes_ contains a class that can be used to generate pandas DataFrames for the data in the SQLite database file that is generated by the web crawler. 70 | 71 | See Jupyter notebook [data_overview.ipynb][1] for examples of how to create DataFrame objects to manipulate and visualize the data. Below is a list of all content found in the notebook: 72 | 73 | **Jupyter Notebook Content** 74 | 75 | 1. [Required modules and matplotlib backend][1] 76 | 1. [Creating a master (bridge table) DataFrame instance using the DataFrames class][1] 77 | 1. [Methods for creating DataFrame instances][1] 78 | 1. `quoteheader` - [MorningStar (MS) Quote Header][1] 79 | 1. `valuation` - [MS Valuation table with Price Ratios (P/E, P/S, P/B, P/C) for the past 10 yrs][1] 80 | 1. `keyratios` - [MS Ratio - Key Financial Ratios & Values][1] 81 | 1. `finhealth` - [MS Ratio - Financial Health][1] 82 | 1. `profitability` - [MS Ratio - Profitability][1] 83 | 1. `growth` - [MS Ratio - Growth][1] 84 | 1. `cfhealth` - [MS Ratio - Cash Flow Health][1] 85 | 1. `efficiency` - [MS Ratio - Efficiency][1] 86 | 1. `annualIS` - [MS Annual Income Statements][1] 87 | 1. `quarterlyIS` - [MS Quarterly Income Statements][1] 88 | 1. `annualBS` - [MS Annual Balance Sheets][1] 89 | 1. `quarterlyBS` - [MS Quarterly Balance Sheets][1] 90 | 1. `annualCF` - [MS Annual Cash Flow Statements][1] 91 | 1. `quarterlyCF` - [MS Quarterly Cash Flow Statements][1] 92 | 1. `insider_trades` - [Insider transactions for the past year][1] 93 | 1. [Performing statistical analysis][1] 94 | 1. [Count of database records][1] 95 | 1. [Last updated dates][1] 96 | 1. [Number of records by security type][1] 97 | 1. [Number of records by country, based on exchanges][1] 98 | 1. [Number of records per exchange][1] 99 | 1. [Number of stocks by sector][1] 100 | 1. [Number of stocks by industry][1] 101 | 1. [Mean price ratios (P/E, P/S, P/B, P/CF) of stocks by sectors][1] 102 | 1. [Applying various criteria to filter common stocks][1] 103 | 1. [CAGR > 7% for past 7 years][1] 104 | 1. [No earnings deficit (loss) for past 5 or 7 years][1] 105 | 1. [Uninterrupted and increasing Dividends for past 5 yrs][1] 106 | 1. [P/E Ratio of 25 or less for the past 7 yrs and less than 20 for TTM][1] 107 | 1. [Growth for the past year][1] 108 | 1. [Long-term debt < 50% of total capital][1] *(pending)* 109 | 1. 
[Stocks with insider buys in the past 3 months][1] 110 | 111 | **Below are sample snippets of code from [data_overview.ipynb][1]:** 112 | 113 | - Count of records downloaded from Morningstar.com by security type: 114 | 115 | 116 | - Plot of average P/E of US stocks by sector for the past 10 years: 117 | 118 | 119 | - Applying fundamental rules to screen the list of stocks ([see sample output](https://github.com/caiobran/mstables/blob/master/sample_rules_output.ods)): 120 | 121 | 122 | 123 |
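As a rough illustration of the screening idea above, a minimal sketch using the `DataFrames` class from [dataframes.py](dataframes.py) might look like the following. The column names (`ticker`, `PE_Forward`, `lastprice`, `_52wk_lo`, `_52wk_hi`) follow the `master` table assembled in `dataframes.py`; the thresholds are made-up example values, not the exact rules applied in the notebook, and a populated `db/mstables.sqlite` file is assumed.

```python
import pandas as pd
from dataframes import DataFrames

# Build the initial DataFrame objects from the default db/mstables.sqlite file
df = DataFrames()

# 'master' is the merged bridge table: ticker/exchange pairs joined with
# sector, country, security type and the quote-header columns
master = df.master.copy()

# 'fpe' is renamed to 'PE_Forward' when master is built, but it is not cast
# to float there, so coerce it to a number before filtering
master['PE_Forward'] = pd.to_numeric(master['PE_Forward'], errors='coerce')

# Example screen (illustrative thresholds only): forward P/E under 20 and a
# last price in the lower half of the 52-week range
screen = master[
    (master['PE_Forward'] < 20) &
    (master['lastprice'] <= (master['_52wk_lo'] + master['_52wk_hi']) / 2)
]

print(screen[['ticker', 'lastprice', 'PE_Forward', '_52wk_lo', '_52wk_hi']].head())
```

The same pattern extends to the other rules in the notebook by joining `master` with the DataFrames returned by methods such as `keyratios()` and `valuation()`.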
124 | 125 | MIT License 126 | ----------- 127 | 128 | Copyright (c) 2019 Caio Brandao 129 | 130 | Permission is hereby granted, free of charge, to any person obtaining a copy 131 | of this software and associated documentation files (the "Software"), to deal 132 | in the Software without restriction, including without limitation the rights 133 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 134 | copies of the Software, and to permit persons to whom the Software is 135 | furnished to do so, subject to the following conditions: 136 | 137 | The above copyright notice and this permission notice shall be included in all 138 | copies or substantial portions of the Software. 139 | 140 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 141 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 142 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 143 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 144 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 145 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 146 | SOFTWARE. 147 | 148 | [1]:https://github.com/caiobran/msTables/blob/master/data_overview.ipynb 149 | [2]:https://www.morningstar.com/about/copyright.html 150 | [3]:https://www.morningstar.com/about/user-agreement.html 151 | -------------------------------------------------------------------------------- /dataframes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sqlite3 4 | import json 5 | import sys 6 | import re 7 | import os 8 | 9 | 10 | class DataFrames(): 11 | 12 | db_file = 'db/mstables.sqlite' # Standard db file name 13 | 14 | def __init__(self, file = db_file): 15 | 16 | msg = 'Creating initial DataFrames objects from file {}...\n' 17 | print(msg.format(file)) 18 | 19 | self.conn = sqlite3.connect( 20 | file, detect_types=sqlite3.PARSE_COLNAMES) 21 | self.cur = self.conn.cursor() 22 | 23 | # Row Headers 24 | colheaders = self.table('ColHeaders', True) 25 | self.colheaders = colheaders.set_index('id') 26 | 27 | # Dates and time references 28 | timerefs = self.table('TimeRefs', True) 29 | self.timerefs = timerefs.set_index('id').replace(['', '—'], None) 30 | 31 | # Reference tables 32 | self.urls = self.table('URLs', True) 33 | self.securitytypes = self.table('SecurityTypes', True) 34 | self.tickers = self.table('Tickers', True) 35 | self.sectors = self.table('Sectors', True) 36 | self.industries = self.table('Industries', True) 37 | self.styles = self.table('StockStyles', True) 38 | self.exchanges = self.table('Exchanges', True) 39 | self.countries = (self.table('Countries', True) 40 | .rename(columns={'a2_iso':'country_c2', 'a3_un':'country_c3'})) 41 | self.companies = self.table('Companies', True) 42 | self.currencies = self.table('Currencies', True) 43 | self.stocktypes = self.table('StockTypes', True) 44 | 45 | #self.fetchedurls = self.table('Fetched_urls', True) 46 | 47 | # Master table 48 | self.master0 = self.table('Master', True) 49 | 50 | # Merge Tables 51 | self.master = (self.master0 52 | # Ticker Symbols 53 | .merge(self.tickers, left_on='ticker_id', right_on='id') 54 | .drop(['id'], axis=1) 55 | # Company / Security Name 56 | .merge(self.companies, left_on='company_id', right_on='id') 57 | .drop(['id', 'company_id'], axis=1) 58 | # Exchanges 59 | .merge(self.exchanges, left_on='exchange_id', 
right_on='id') 60 | .drop(['id'], axis=1) 61 | # Industries 62 | .merge(self.industries, left_on='industry_id', right_on='id') 63 | .drop(['id', 'industry_id'], axis=1) 64 | # Sectors 65 | .merge(self.sectors, left_on='sector_id', right_on='id') 66 | .drop(['id', 'sector_id'], axis=1) 67 | # Countries 68 | .merge(self.countries, left_on='country_id', right_on='id') 69 | .drop(['id', 'country_id'], axis=1) 70 | # Security Types 71 | .merge(self.securitytypes, left_on='security_type_id', right_on='id') 72 | .drop(['id', 'security_type_id'], axis=1) 73 | # Stock Types 74 | .merge(self.stocktypes, left_on='stock_type_id', right_on='id') 75 | .drop(['id', 'stock_type_id'], axis=1) 76 | # Stock Style Types 77 | .merge(self.styles, left_on='style_id', right_on='id') 78 | .drop(['id', 'style_id'], axis=1) 79 | # Quote Header Info 80 | .merge(self.quoteheader(), on=['ticker_id', 'exchange_id']) 81 | .rename(columns={'fpe':'PE_Forward'}) 82 | # Currency 83 | .merge(self.currencies, left_on='currency_id', right_on='id') 84 | .drop(['id', 'currency_id'], axis=1) 85 | # Fiscal Year End 86 | .merge(self.timerefs, left_on='fyend_id', right_on='id') 87 | .drop(['fyend_id'], axis=1) 88 | .rename(columns={'dates':'fy_end'}) 89 | ) 90 | # Change date columns to TimeFrames 91 | self.master['fy_end'] = pd.to_datetime(self.master['fy_end']) 92 | self.master['update_date'] = pd.to_datetime(self.master['update_date']) 93 | self.master['lastdate'] = pd.to_datetime(self.master['lastdate']) 94 | self.master['_52wk_hi'] = self.master['_52wk_hi'].astype('float') 95 | self.master['_52wk_lo'] = self.master['_52wk_lo'].astype('float') 96 | self.master['lastprice'] = self.master['lastprice'].astype('float') 97 | self.master['openprice'] = self.master['openprice'].astype('float') 98 | 99 | print('\nInitial DataFrames created successfully.') 100 | 101 | 102 | def quoteheader(self): 103 | return self.table('MSheader') 104 | 105 | 106 | def valuation(self): 107 | # Create DataFrame 108 | val = self.table('MSvaluation') 109 | 110 | # Rename column headers with actual year values 111 | yrs = val.iloc[0, 2:13].replace(self.timerefs['dates']).to_dict() 112 | cols = val.columns[:13].values.tolist() + list(map( 113 | lambda col: ''.join([col[:3], yrs[col[3:]]]), val.columns[13:])) 114 | val.columns = cols 115 | 116 | # Resize and reorder columns 117 | val = val.set_index(['exchange_id', 'ticker_id']).iloc[:, 11:] 118 | 119 | return val 120 | 121 | 122 | def keyratios(self): 123 | keyr = self.table('MSfinancials') 124 | yr_cols = ['Y0', 'Y1', 'Y2', 'Y3', 'Y4', 'Y5', 'Y6', 125 | 'Y7', 'Y8', 'Y9', 'Y10'] 126 | keyr = self.get_yrcolumns(keyr, yr_cols) 127 | keyr[yr_cols[:-1]] = keyr[yr_cols[:-1]].astype('datetime64') 128 | 129 | return keyr 130 | 131 | 132 | def finhealth(self): 133 | finan = self.table('MSratio_financial') 134 | yr_cols = [col for col in finan.columns if col.startswith('fh_Y')] 135 | finan = self.get_yrcolumns(finan, yr_cols) 136 | finan[yr_cols[:-1]] = finan[yr_cols[:-1]].astype('datetime64') 137 | 138 | return finan 139 | 140 | 141 | def profitability(self): 142 | profit= self.table('MSratio_profitability') 143 | yr_cols = [col for col in profit.columns if col.startswith('pr_Y')] 144 | profit = self.get_yrcolumns(profit, yr_cols) 145 | profit[yr_cols[:-1]] = profit[yr_cols[:-1]].astype('datetime64') 146 | 147 | return profit 148 | 149 | 150 | def growth(self): 151 | growth = self.table('MSratio_growth') 152 | yr_cols = [col for col in growth.columns if col.startswith('gr_Y')] 153 | growth = 
self.get_yrcolumns(growth, yr_cols) 154 | growth[yr_cols[:-1]] = growth[yr_cols[:-1]].astype('datetime64') 155 | 156 | return growth 157 | 158 | 159 | def cfhealth(self): 160 | cfhealth = self.table('MSratio_cashflow') 161 | yr_cols = [col for col in cfhealth.columns if col.startswith('cf_Y')] 162 | cfhealth = self.get_yrcolumns(cfhealth, yr_cols) 163 | cfhealth[yr_cols[:-1]] = cfhealth[yr_cols[:-1]].astype('datetime64') 164 | 165 | return cfhealth 166 | 167 | 168 | def efficiency(self): 169 | effic = self.table('MSratio_efficiency') 170 | yr_cols = [col for col in effic.columns if col.startswith('ef_Y')] 171 | effic = self.get_yrcolumns(effic, yr_cols) 172 | effic[yr_cols[:-1]] = effic[yr_cols[:-1]].astype('datetime64') 173 | 174 | return effic 175 | 176 | # Income Statement - Annual 177 | def annualIS(self): 178 | rep_is_yr = self.table('MSreport_is_yr') 179 | yr_cols = [col for col in rep_is_yr.columns 180 | if col.startswith('Year_Y')] 181 | rep_is_yr = self.get_yrcolumns(rep_is_yr, yr_cols) 182 | rep_is_yr[yr_cols[:-1]] = rep_is_yr[yr_cols[:-1]].astype('datetime64') 183 | 184 | return rep_is_yr 185 | 186 | # Income Statement - Quarterly 187 | def quarterlyIS(self): 188 | rep_is_qt = self.table('MSreport_is_qt') 189 | yr_cols = [col for col in rep_is_qt.columns 190 | if col.startswith('Year_Y')] 191 | rep_is_qt = self.get_yrcolumns(rep_is_qt, yr_cols) 192 | rep_is_qt[yr_cols[:-1]] = rep_is_qt[yr_cols[:-1]].astype('datetime64') 193 | 194 | return rep_is_qt 195 | 196 | # Balance Sheet - Annual 197 | def annualBS(self): 198 | rep_bs_yr = self.table('MSreport_bs_yr') 199 | yr_cols = [col for col in rep_bs_yr.columns 200 | if col.startswith('Year_Y')] 201 | rep_bs_yr = self.get_yrcolumns(rep_bs_yr, yr_cols) 202 | rep_bs_yr[yr_cols[:-1]] = rep_bs_yr[yr_cols[:-1]].astype('datetime64') 203 | 204 | return rep_bs_yr 205 | 206 | # Balance Sheet - Quarterly 207 | def quarterlyBS(self): 208 | rep_bs_qt = self.table('MSreport_bs_qt') 209 | yr_cols = [col for col in rep_bs_qt.columns 210 | if col.startswith('Year_Y')] 211 | rep_bs_qt = self.get_yrcolumns(rep_bs_qt, yr_cols) 212 | rep_bs_qt[yr_cols[:-1]] = rep_bs_qt[yr_cols[:-1]].astype('datetime64') 213 | 214 | return rep_bs_qt 215 | 216 | # Cashflow Statement - Annual 217 | def annualCF(self): 218 | rep_cf_yr = self.table('MSreport_cf_yr') 219 | yr_cols = [col for col in rep_cf_yr.columns 220 | if col.startswith('Year_Y')] 221 | rep_cf_yr = self.get_yrcolumns(rep_cf_yr, yr_cols) 222 | rep_cf_yr[yr_cols[:-1]] = rep_cf_yr[yr_cols[:-1]].astype('datetime64') 223 | 224 | return rep_cf_yr 225 | 226 | # Cashflow Statement - Quarterly 227 | def quarterlyCF(self): 228 | rep_cf_qt = self.table('MSreport_cf_qt') 229 | yr_cols = [col for col in rep_cf_qt.columns 230 | if col.startswith('Year_Y')] 231 | rep_cf_qt = self.get_yrcolumns(rep_cf_qt, yr_cols) 232 | rep_cf_qt[yr_cols[:-1]] = rep_cf_qt[yr_cols[:-1]].astype('datetime64') 233 | 234 | return rep_cf_qt 235 | 236 | # 10yr Price History 237 | def priceHistory(self): 238 | 239 | return self.table('MSpricehistory') 240 | 241 | 242 | def insider_trades(self): 243 | df_insiders = self.table('Insiders', False) 244 | df_tradetypes = self.table('TransactionType', False) 245 | df_trades = self.table('InsiderTransactions', False) 246 | df_trades['date'] = pd.to_datetime(df_trades['date']) 247 | df = (df_trades 248 | .merge(df_insiders, left_on='name_id', right_on='id') 249 | .drop(['id', 'name_id'], axis=1) 250 | .merge(df_tradetypes, left_on='transaction_id', right_on='id') 251 | .drop(['id', 'transaction_id'], 
axis=1) 252 | ) 253 | return df 254 | 255 | 256 | def get_yrcolumns(self, df, cols): 257 | for yr in cols: 258 | df = (df.merge(self.timerefs, left_on=yr, right_on='id') 259 | .drop(yr, axis=1).rename(columns={'dates':yr})) 260 | 261 | return df 262 | 263 | 264 | def table(self, tbl, prnt = False): 265 | self.cur.execute('SELECT * FROM {}'.format(tbl)) 266 | cols = list(zip(*self.cur.description))[0] 267 | 268 | try: 269 | if prnt == True: 270 | msg = '\t- DataFrame \'df.{}\' ...' 271 | print(msg.format(tbl.lower())) 272 | return pd.DataFrame(self.cur.fetchall(), columns=cols) 273 | except: 274 | raise 275 | 276 | 277 | def __del__(self): 278 | self.cur.close() 279 | self.conn.close() 280 | -------------------------------------------------------------------------------- /sample_rules_output.csv: -------------------------------------------------------------------------------- 1 | company,sector,industry,openprice,yield,avevol,CAGR_Rev,CAGR_OpeInc,CAGR_OpeCF,CAGR_FreeCF,Dividend_Y10,Rev_Growth_Y9,OpeInc_Growth_Y9,NetInc_Growth_Y9,PE_TTM,PB_TTM,PS_TTM,PC_TTM 2 | AB Sagax A,Real Estate,Real Estate Services,16.0,1.1,,19.6,18.1,31.0,31.0,1.8,20.2,17.2,6.8,12.7,2.8,17.8,25.2 3 | AF Poyry AB B,Industrials,Engineering & Construction,96.5,2.7,11383.0,10.9,16.4,15.5,15.7,4.7,10.4,20.7,14.2,18.4,2.7,1.1,18.1 4 | AVI Ltd,Consumer Defensive,Packaged Foods,5.8,4.5,,7.8,10.8,12.7,24.6,4.4,1.9,7.0,7.9,18.8,6.6,2.3,15.0 5 | Adler Real Estate AG,Real Estate,Real Estate Services,12.8,0.3,136030.0,86.2,91.3,64.1,62.9,0.0,41.9,49.1,109.5,3.6,0.7,2.3,7.6 6 | Ado Properties SA,Real Estate,Real Estate Services,46.5,1.3,54560.0,46.5,43.1,47.3,47.0,0.6,20.2,17.4,8.7,5.3,1.0,13.2,19.8 7 | Alimentation Couche-Tard Inc Class B,Consumer Defensive,Grocery Stores,52.9,0.5,,7.7,19.0,13.3,9.7,0.3,35.6,21.1,38.4,17.0,3.7,0.6,10.6 8 | Apple Inc,Technology,Consumer Electronics,184.9,1.4,30578092.1,9.2,7.7,7.6,7.5,2.9,15.9,15.6,23.1,16.4,8.0,3.8,12.9 9 | Armanino Foods of Distinction Inc,Consumer Defensive,Packaged Foods,3.3,2.8,19498.0,7.7,9.9,18.5,10.8,0.1,7.3,12.9,23.2,17.0,5.9,2.6,16.0 10 | Ascendas India Trust,Real Estate,Real Estate - General,0.9,,29988.0,8.3,12.2,12.6,12.4,0.0,20.1,22.8,37.5,6.0,1.4,6.4,8.0 11 | BOC Aviation Ltd,Industrials,Airports & Air Services,22.6,3.6,15000000.0,14.6,18.7,15.5,7.8,0.3,23.5,27.0,5.8,9.6,1.4,3.7,3.4 12 | Barratt Developments PLC,Consumer Cyclical,Residential Construction,7.1,4.5,,13.3,27.9,25.4,25.3,0.3,4.8,7.9,9.1,8.6,1.4,1.2,9.2 13 | Barratt Developments PLC ADR,Consumer Cyclical,Residential Construction,15.3,7.6,430.0,13.3,27.9,25.4,25.3,0.9,4.8,7.9,9.1,8.3,1.3,1.2,8.9 14 | Beijing North Star Co Ltd Class H,Real Estate,Real Estate - General,1.0,4.4,18000000.0,26.5,22.8,17.4,19.9,0.1,15.6,48.7,4.3,4.8,0.5,0.4,3.3 15 | Bellway PLC,Consumer Cyclical,Residential Construction,652.6,4.5,339100.0,21.6,34.0,33.9,34.4,1.4,15.6,14.2,14.5,7.1,1.4,1.2,13.4 16 | Billington Holdings PLC,Industrials,Engineering & Construction,290.1,3.9,16678.0,14.9,38.0,20.1,24.6,0.1,6.0,12.9,15.6,8.9,1.6,0.5,7.8 17 | Build King Holdings Ltd,Industrials,Engineering & Construction,0.8,2.0,36000000.0,23.5,98.3,24.0,29.6,0.0,5.3,111.7,123.7,4.4,2.0,0.3,2.8 18 | CK Infrastructure Holdings Ltd,Industrials,Infrastructure Operations,18.5,3.7,21000000.0,7.3,36.6,7.1,8.2,2.4,18.8,55.0,1.8,14.1,1.4,23.2,42.8 19 | CK Infrastructure Holdings Ltd ADR,Industrials,Infrastructure Operations,40.0,3.8,926.0,7.3,36.6,7.1,8.2,12.0,18.8,55.0,1.8,14.2,1.4,23.4,43.1 20 | CPL Resources PLC,Industrials,Staffing & Outsourcing 
Services,565.0,2.3,702.0,9.6,8.4,38.0,44.7,0.1,14.8,16.2,20.1,10.0,1.8,0.3,7.9 21 | Canadian Apartment Properties Real Estate Investment Trust,Real Estate,REIT - Residential,47.9,2.8,376599.0,7.6,7.9,10.6,17.4,1.3,7.8,10.2,45.5,5.7,1.2,10.0,16.0 22 | Castellum AB,Real Estate,Real Estate - General,47.8,2.1,122351.0,11.4,13.1,12.6,12.6,5.7,7.6,10.9,26.8,5.8,1.2,8.3,16.2 23 | Central Asia Metals PLC,Basic Materials,Copper,2.6,6.4,,30.4,26.9,19.1,20.3,0.2,87.6,65.4,32.0,9.5,1.6,2.7,6.3 24 | Central China Land Media Co Ltd,Basic Materials,Chemicals,8.0,2.2,41000000.0,25.6,25.4,37.3,43.2,0.2,10.1,15.7,5.9,11.1,1.1,0.9,14.4 25 | Chengdu Fusen Noble-House Industrial Co Ltd A,Consumer Cyclical,Home Furnishings & Fixtures,25.0,4.4,26000000.0,14.1,15.8,7.9,11.6,0.6,12.9,11.4,12.9,8.8,1.4,4.6,6.8 26 | China Lesso Group Holdings Ltd,Basic Materials,Building Materials,0.6,4.9,,12.7,12.7,24.0,59.9,0.2,16.6,22.4,8.7,5.8,1.0,0.6,3.3 27 | China Maple Leaf Educational Systems Ltd Shs Unitary 144A/Reg S,Consumer Defensive,Education & Training Services,0.4,2.5,151.3,23.3,28.3,24.6,60.7,0.1,23.8,20.2,32.2,16.4,2.6,6.6,11.8 28 | China National Building Material Co Ltd Class H ADR,Basic Materials,Building Materials,47.0,1.7,2065.0,13.2,9.6,33.0,98.5,5.2,71.6,73.4,133.4,6.6,0.6,0.2,1.1 29 | China Resources Land Ltd,Real Estate,Real Estate - General,9.9,2.8,67500038.0,16.8,28.4,13.9,13.6,0.8,22.3,33.7,27.5,8.3,1.4,1.7,7.4 30 | China Resources Land Ltd ADR,Real Estate,Real Estate - General,43.6,2.9,477.0,16.8,28.4,13.9,13.6,8.5,22.3,33.7,27.5,8.4,1.4,1.7,7.5 31 | China Resources Pharmaceutical Group Ltd,Healthcare,Drug Manufacturers - Specialty & Generic,3.7,0.9,22000000.0,10.2,8.0,15.0,41.5,0.1,9.9,14.9,15.9,17.4,1.8,0.4,8.5 32 | China Sunsine Chemical Holdings Ltd,Consumer Cyclical,Rubber & Plastics,0.8,2.1,0.0,14.1,41.0,33.6,101.8,0.1,19.9,37.2,87.9,4.7,1.2,0.9,3.5 33 | Citic Telecom International Holdings Ltd,Communication Services,Telecom Services,1.3,5.2,84000000.0,9.5,27.8,11.1,15.0,0.2,27.0,11.7,7.9,12.0,1.3,1.2,6.3 34 | Daiwa House Industry Co Ltd,Real Estate,Real Estate - General,24.7,3.5,0.0,13.6,22.1,18.4,13.9,110.2,8.1,12.0,17.2,8.4,1.3,0.5,6.6 35 | Daiwa House Industry Co Ltd ADR,Real Estate,Real Estate - General,28.0,3.6,25515.0,13.6,22.1,18.4,13.9,109.6,8.1,12.0,17.2,8.5,1.3,0.5,6.7 36 | Dream Global Real Estate Investment Trust,Real Estate,REIT - Office,9.0,5.8,7122.8,10.0,9.8,13.2,13.2,0.7,34.1,40.1,97.1,4.5,0.9,7.3,16.2 37 | Elanders AB Class B,Industrials,Business Services,8.6,3.2,,38.7,29.6,28.9,50.5,2.6,15.0,45.1,54.4,11.9,1.1,0.3,3.8 38 | Envea SA,Technology,Scientific & Technical Instruments,70.0,0.9,936.0,11.2,19.1,17.6,18.5,0.6,14.3,58.2,80.0,11.8,1.7,1.2,13.1 39 | Faes Farma SA,Healthcare,Medical Instruments & Supplies,4.0,3.5,48.0,12.5,16.8,13.0,11.7,0.1,18.1,23.3,22.2,19.7,3.0,3.3,18.3 40 | Fuyao Glass Industry Group Co Ltd,Consumer Cyclical,Auto Parts,14.1,6.4,184000000.0,12.0,15.0,15.4,18.4,1.5,8.1,22.3,30.9,14.6,2.9,3.0,11.2 41 | Gima TT SpA Ordinary Shares,Industrials,Diversified Industrials,6.9,6.0,337693.0,56.2,78.3,14.9,11.8,0.4,20.9,17.8,17.5,12.0,11.0,3.3,34.4 42 | HIRATA Corp,Industrials,Diversified Industrials,59.7,3.0,,19.8,83.5,24.6,20.6,100.0,16.9,13.6,13.2,16.0,1.7,1.0,29.1 43 | Hexpol AB B,Basic Materials,Specialty Chemicals,6.8,2.6,125.0,11.4,11.4,8.1,8.2,2.0,12.6,8.3,7.8,15.2,2.7,1.8,14.1 44 | Ho Bee Land Ltd,Real Estate,Real Estate Services,1.6,3.1,,7.2,39.3,26.2,27.9,0.1,19.6,34.2,8.3,6.2,0.5,8.5,7.5 45 | Hon Kwok Land Investment Co Ltd,Real Estate,Real Estate - 
General,4.1,3.0,87512.0,49.6,9.2,22.6,22.4,0.1,13.1,35.6,409.5,3.3,0.3,1.5, 46 | Howden Joinery Group PLC,Consumer Cyclical,Home Furnishings & Fixtures,509.0,2.2,21000000.0,9.6,10.9,13.4,13.6,0.1,7.7,2.4,2.9,16.3,5.4,2.1,19.0 47 | Howden Joinery Group PLC ADR,Consumer Cyclical,Home Furnishings & Fixtures,27.0,2.2,4948.0,9.6,10.9,13.4,13.6,0.4,7.7,2.4,2.9,16.7,5.6,2.1,19.5 48 | ISDN Holdings Ltd,Industrials,Engineering & Construction,0.6,2.7,109310.0,12.9,23.6,11.4,14.9,0.0,3.4,41.0,14.6,8.0,0.6,0.3,7.0 49 | Intrum AB,Financial Services,Credit Services,77.7,4.0,75566.0,24.0,26.2,21.7,21.8,9.5,41.5,42.5,29.0,14.2,1.3,2.2,5.2 50 | Intrum AB ADR,Financial Services,Credit Services,25.9,8.2,742.0,24.0,26.2,21.7,21.8,9.3,41.5,42.5,29.0,14.6,1.3,2.3,5.3 51 | Jones Lang LaSalle Inc,Real Estate,Real Estate Services,146.2,0.5,307288.0,29.6,14.0,15.6,19.4,0.8,105.7,31.4,90.6,14.6,1.9,0.4,11.7 52 | Klovern AB B,Real Estate,Real Estate Services,1.2,0.9,,8.7,8.8,13.7,13.5,0.4,11.4,5.7,28.1,3.6,0.6,3.1,13.2 53 | Knowit AB,Technology,Information Technology Services,88.3,2.5,569.0,9.3,27.1,20.9,21.0,4.8,12.8,12.3,14.7,19.0,3.9,1.4,15.5 54 | Kruk SA,Financial Services,Credit Services,172.5,2.0,2587.0,21.1,30.9,61.1,56.4,5.0,0.2,17.6,11.8,14.7,2.6,4.6, 55 | Link Real Estate Investment Trust,Real Estate,REIT - Retail,27.0,2.8,23500107.0,9.0,10.5,9.1,9.1,2.5,8.3,8.9,169.7,4.4,1.1,19.7,33.3 56 | Loomis AB B,Industrials,Business Services,96.4,2.6,10011.0,11.0,14.6,16.8,19.9,9.0,11.3,3.9,7.7,16.4,3.1,1.3,10.0 57 | Macfarlane Group PLC,Consumer Cyclical,Packaging & Containers,99.8,2.1,200301.0,8.5,14.9,32.0,27.2,0.0,10.9,19.2,17.7,18.6,2.6,0.7,13.7 58 | Marine Products Corp,Consumer Cyclical,Recreational Vehicles,13.8,2.6,,12.2,28.5,18.1,18.5,0.4,11.7,18.9,47.6,18.6,7.1,1.8,23.5 59 | Midea Group Co Ltd Class A,Technology,Consumer Electronics,51.4,2.3,390000000.0,16.5,29.5,22.6,22.9,1.2,7.9,64.0,17.0,17.2,4.2,1.3,12.5 60 | Morguard Corp,Real Estate,Real Estate Services,193.8,0.3,1157.0,17.8,16.3,10.8,9.7,0.6,5.5,2.3,3.1,6.9,0.6,1.9,7.6 61 | NSD Co Ltd,Technology,Software - Application,21.8,4.0,,9.7,12.0,14.2,15.6,52.0,5.2,10.4,18.2,19.9,2.5,1.9,20.7 62 | Nobility Homes Inc,Consumer Cyclical,Residential Construction,20.4,4.4,,17.7,43.1,21.7,21.7,0.2,14.0,31.4,50.0,16.1,1.7,2.0,10.7 63 | Nolato AB B,Technology,Communication Equipment,42.5,2.8,,12.4,18.1,15.3,10.4,12.5,20.6,21.8,26.2,16.1,4.5,1.4,11.1 64 | Nordic Waterproofing Holding A/S,Basic Materials,Building Materials,8.4,4.5,,9.9,9.4,16.8,16.2,3.8,22.6,8.4,11.1,14.2,2.0,0.8,11.7 65 | Northview Apartment Real Estate Investment Trust,Real Estate,REIT - Residential,28.2,5.8,127433.0,15.8,15.6,13.4,33.6,1.6,9.7,11.7,36.4,6.3,1.1,5.0,13.1 66 | OEM International AB B,Industrials,Diversified Industrials,19.2,2.9,,13.3,15.2,10.1,12.6,6.0,13.6,14.6,16.1,19.6,5.2,1.6,22.4 67 | PRO Real Estate Investment Trust,Real Estate,REIT - Diversified,2.3,9.1,100178.0,83.0,87.2,69.5,69.5,0.2,38.0,48.3,86.7,10.5,1.1,4.8,14.3 68 | Packaging Corp of America,Consumer Cyclical,Packaging & Containers,99.2,3.2,925768.0,13.9,15.8,14.2,11.0,3.0,8.8,16.8,10.4,12.0,3.5,1.3,7.9 69 | RPC Group PLC,Consumer Cyclical,Packaging & Containers,9.0,3.5,,28.9,40.9,35.0,50.7,0.3,36.4,76.5,92.3,12.6,1.6,0.8,9.0 70 | Sabra Health Care REIT Inc,Real Estate,REIT - Healthcare Facilities,17.1,9.1,,35.8,29.3,42.1,42.1,1.8,53.7,36.8,76.2,12.8,1.1,5.5,9.6 71 | SalMar ASA,Consumer Defensive,Farm Products,127.8,5.0,20675.0,12.7,19.8,20.3,17.9,19.0,5.1,71.3,56.9,12.3,4.8,3.9,15.8 72 | Shanghai Environment Group 
Ltd,Industrials,Waste Management,14.6,0.5,73000000.0,39.2,62.4,28.0,63.3,0.1,0.7,11.9,14.2,13.8,1.3,3.1,8.0 73 | Shanghai Pharmaceuticals Holding Co Ltd A,Healthcare,Medical Distribution,19.6,1.9,260000000.0,15.3,20.3,26.4,77.3,0.4,21.6,46.6,10.2,14.0,1.4,0.3,17.5 74 | Shanghai Pharmaceuticals Holding Co Ltd Class H,Healthcare,Medical Distribution,6.7,2.7,43000000.0,15.3,20.3,26.4,77.3,0.7,21.6,46.6,10.2,9.9,1.0,0.2,12.4 75 | Singapore Shipping Corp Ltd,Industrials,Shipping & Ports,0.2,3.5,,18.1,13.2,12.1,12.1,0.0,6.4,18.4,22.0,8.8,1.0,1.9,3.5 76 | Slate Office REIT,Real Estate,REIT - Office,4.3,12.1,9934.0,49.6,45.0,43.7,32.0,0.8,38.0,48.4,55.2,5.4,0.7,2.0,8.5 77 | Softronic AB B,Technology,Information Technology Services,1.8,4.2,,11.3,51.9,16.7,18.3,0.8,43.7,312.5,1.1,18.6,3.6,1.0,14.8 78 | Somero Enterprises Inc Shs Reg-S,Industrials,Farm & Construction Equipment,4.0,4.3,,15.9,37.0,24.6,26.9,0.2,9.8,14.7,17.0,11.5,4.5,2.6,10.4 79 | Stanley Electric Co Ltd,Consumer Cyclical,Auto Parts,23.8,1.7,,10.9,14.1,19.9,65.3,49.0,13.8,25.7,25.5,12.0,1.4,1.1,6.8 80 | Stellus Capital Investment Corp,Financial Services,Asset Management,14.3,9.5,138403.0,12.8,15.5,8.1,8.1,1.4,34.4,33.0,15.8,4.7,1.2,4.3, 81 | Suzuki Motor Corp,Consumer Cyclical,Auto Manufacturers,40.8,1.5,1175.0,7.8,20.9,18.6,55.7,81.0,18.5,40.3,34.9,9.6,1.6,0.6,5.3 82 | TDC Soft Inc,Technology,Software - Infrastructure,6.8,2.6,0.0,7.0,14.7,17.1,22.8,17.5,4.2,12.5,7.2,13.2,1.9,0.8,15.9 83 | TK Group (Holdings) Ltd,Consumer Cyclical,Rubber & Plastics,5.3,3.4,404392.0,13.9,18.0,21.8,21.7,0.2,23.5,18.0,16.0,12.6,4.1,1.9,9.0 84 | Takara Leben Co Ltd,Real Estate,Real Estate - General,2.7,4.7,,11.3,14.6,62.4,95.5,16.0,7.0,21.7,20.6,4.8,0.9,0.3,2.7 85 | Thor Industries Inc,Consumer Cyclical,Recreational Vehicles,62.8,2.3,799712.0,20.8,23.6,26.4,22.1,1.5,14.9,12.9,14.9,15.0,1.9,0.5,6.4 86 | Trelleborg AB B,Industrials,Diversified Industrials,51.2,3.0,37690.0,9.6,12.1,16.5,17.0,8.1,7.7,10.2,11.0,13.6,1.4,1.2,11.0 87 | UnitedHealth Group Inc,Healthcare,Health Care Plans,236.2,1.5,2069.0,13.1,12.5,17.6,19.2,3.4,12.4,12.6,13.5,18.1,4.1,1.0,21.6 88 | Vidrala SA,Consumer Cyclical,Packaging & Containers,82.0,1.3,,15.0,14.6,18.7,12.5,0.3,16.1,28.6,30.1,18.3,3.5,2.2,10.5 89 | Walgreens Boots Alliance Inc,Consumer Defensive,Pharmaceutical Retailers,47.7,3.1,0.0,12.7,11.7,14.0,17.4,1.7,11.3,14.8,23.2,10.0,2.0,0.4,8.2 90 | Warehouses De Pauw,Real Estate,REIT - Industrial,135.2,3.5,20803.0,15.5,15.1,14.6,15.6,4.5,20.6,20.0,39.8,9.3,1.9,15.2,17.0 91 | Winnebago Industries Inc,Consumer Cyclical,Recreational Vehicles,33.8,1.2,581877.0,20.2,29.9,52.7,55.8,0.4,30.4,30.0,43.5,10.5,2.0,0.6,9.2 92 | Wuxi Little Swan Co Ltd,Technology,Consumer Electronics,51.5,2.4,436489.0,22.0,41.0,23.7,22.6,1.0,10.5,50.2,23.6,14.7,3.2,1.2,11.4 93 | Yamaya Corp,Consumer Defensive,Beverages - Brewers,16.7,2.1,,7.1,15.8,38.8,74.1,44.0,1.0,41.0,93.7,6.5,0.7,0.1,3.7 94 | Yuzhou Properties Co Ltd,Real Estate,Real Estate - General,1.2,7.7,49500000.0,26.6,27.1,8.0,61.3,0.3,12.0,0.2,25.6,4.6,0.9,0.6,4.9 95 | ZENKOKU HOSHO Co Ltd,Financial Services,Credit Services,30.3,4.3,,9.6,23.8,14.0,14.1,80.0,10.2,10.8,12.9,11.3,2.3,6.3,8.0 96 | ZENKOKU HOSHO Co Ltd ADR,Financial Services,Credit Services,11.6,2.0,1436.0,9.6,23.8,14.0,14.1,25.6,10.2,10.8,12.9,12.4,2.5,6.9,8.8 97 | Zhejiang Hangmin Co Ltd,Consumer Cyclical,Textile Manufacturing,10.7,2.6,87000000.0,20.8,14.2,15.8,12.9,0.3,115.5,50.8,15.4,11.2,1.8,1.0,6.7 98 | -------------------------------------------------------------------------------- 
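Note: sample_rules_output.csv above is a sample of the screener output — one row per security, with valuation columns (PE_TTM, PB_TTM, PS_TTM, PC_TTM), growth columns (CAGR_* and *_Growth_Y9) and dividend/yield columns. A minimal, illustrative way to load and screen it with pandas is sketched below; the column names come from the CSV header above, but the filter thresholds are arbitrary examples and not the project's actual rules.

import pandas as pd

# Load the sample screener output shipped with the repository.
rules = pd.read_csv('sample_rules_output.csv')

# Example filter (thresholds are illustrative only): modest trailing P/E,
# double-digit revenue CAGR and a dividend yield above 2%.
screen = rules[(rules['PE_TTM'] < 15) &
               (rules['CAGR_Rev'] > 10) &
               (rules['yield'] > 2.0)]

print(screen[['company', 'sector', 'PE_TTM', 'CAGR_Rev', 'yield']]
      .sort_values('PE_TTM')
      .head(10))

Rows with missing values (blank yield or avevol cells) simply drop out of the boolean filter, so no extra NaN handling is needed for this kind of quick screen.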
/fetch.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import multiprocessing as mp 3 | from datetime import datetime 4 | from importlib import reload #Comment out once done using 5 | from csv import reader 6 | import numpy as np 7 | import pandas as pd 8 | import requests, sqlite3, time, json, zlib, re, os, parse 9 | 10 | 11 | def create_tables(db_file): 12 | 13 | def mssitemap(): 14 | urls1 = [] 15 | xml_files = [ 16 | 'sal-quote-stock-sitemap.xml', 'sal-quote-cefs-sitemap.xml', 17 | 'sal-quote-funds-sitemap.xml', 'sal-quote-etfs-sitemap.xml' 18 | ] 19 | url = 'https://www.morningstar.com/sitemaps/individual/{}' 20 | 21 | for xml_file in xml_files: 22 | type = re.findall('sal-quote-(.+?)-sitemap', xml_file)[0] 23 | 24 | print('\nFetching list of {} from MorningStar.com'.format(type)) 25 | xml = requests.get(url.format(xml_file)).text 26 | 27 | print('Parsing list of {}'.format(type)) 28 | tree = ET.fromstring(xml) 29 | url_tag = '{http://www.sitemaps.org/schemas/sitemap/0.9}url' 30 | loc_tag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' 31 | urls2 = tree.findall('{}/{}'.format(url_tag, loc_tag)) 32 | 33 | print('List of {} length = {}'.format(type, len(urls2))) 34 | 35 | def get_ticker(u, typ): 36 | while True: 37 | try: 38 | x = re.findall('/{}/(.+)/'.format(typ), 39 | u)[0].split('/')[1].upper() 40 | if x.find(' ') > 0: 41 | x = '' 42 | break 43 | except IndexError: 44 | typ = 'stocks' 45 | except: 46 | raise 47 | 48 | return x 49 | 50 | urls1 += [(get_ticker(url.text, type),) for url in urls2] 51 | #urls1 += [(url.text,) for url in urls2] 52 | 53 | print('\nTotal length = {}'.format(len(urls1))) 54 | return urls1 55 | 56 | # Create database connection 57 | print('\nPlease wait, database tables are being created ...') 58 | conn = sqlite3.connect(db_file) 59 | conn.execute('pragma auto_vacuum = 1') 60 | cur = conn.cursor() 61 | 62 | # Create database tables based on table.json 63 | for table in tbl_names: 64 | columns = ' '.join(['{} {}'.format(k, v) for k, v in tbl_js[table].items()]) 65 | sql = 'CREATE TABLE IF NOT EXISTS {} ({})'.format(table, columns) 66 | db_execute(cur, sql) 67 | 68 | std_list = [ 69 | 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 70 | 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 71 | '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' 72 | ] 73 | sql = 'INSERT OR IGNORE INTO tickers (ticker) VALUES (?)' 74 | cur.executemany(sql, mssitemap() + std_list) 75 | 76 | # Insert list of tickers and exchanges previously retrieved into database 77 | file = 'ticker_exch.json' 78 | if file in os.listdir(fd_input): 79 | with open(fd_input + file) as fi: 80 | tbls = json.load(fi) 81 | for tbl in tbls: 82 | if tbl == 'Tickers': 83 | col = '(id, ticker)' 84 | val = '(?, ?)' 85 | elif tbl == 'Exchanges': 86 | col = '(id, exchange, exchange_sym, country_id)' 87 | val = '(?, ?, ?, ?)' 88 | elif tbl == 'Master': 89 | col = '(ticker_id, exchange_id)' 90 | val = '(?, ?)' 91 | sql = 'INSERT OR IGNORE INTO {} {} VALUES {}'.format(tbl, col, val) 92 | cur.executemany(sql, tbls[tbl]) 93 | 94 | # Insert list of countries into Countries table 95 | sql = '''INSERT OR IGNORE INTO Countries 96 | (country, a2_iso, a3_un) VALUES (?, ?, ?)''' 97 | cur.executemany(sql, csv_content('input/ctycodes.csv', 3)) 98 | 99 | # Insert list of currencies into Currencies table 100 | sql = '''INSERT OR IGNORE INTO Currencies (currency, currency_code) 101 | VALUES (?, ?)''' 102 | 
cur.executemany(sql, csv_content('input/symbols.csv', 2)) 103 | 104 | # Insert list of types into SecurityTypes table 105 | sql = '''INSERT OR IGNORE INTO SecurityTypes 106 | (security_type_code, security_type) VALUES (?, ?)''' 107 | cur.executemany(sql, csv_content('input/ms_investment-types.csv', 2)) 108 | 109 | # Insert list of api URLs into URLs table 110 | for k, v in apis.items(): 111 | sql = sql_insert('URLs', '(id, url)', (k, v)) 112 | db_execute(cur, sql) 113 | 114 | save_db(conn) 115 | cur.close() 116 | conn.close() 117 | 118 | msg = '\n~ The following {} database tables were successfully created:\n' 119 | tbls = json.dumps(sorted(tbl_names), indent=2) 120 | tbls = re.sub('\[|\]|",|"\n', '', tbls) 121 | msg += re.sub('"', '- ', tbls) 122 | return msg.format(len(tbl_names)) 123 | 124 | 125 | def csv_content(file, columns, header=False): 126 | with open(file) as csvfile: 127 | info = reader(csvfile)#, delimiter=',', quotechar='"') 128 | if header == True: 129 | return [row[:columns] for row in info] 130 | return [row[:columns] for row in info][1:] 131 | 132 | 133 | def db_execute(cur, sql): 134 | x = 0 135 | while x < 100: 136 | try: 137 | sql = re.sub('\'Null\'|\'null\'|None', 'NULL', sql) 138 | return cur.execute(sql) 139 | except KeyboardInterrupt: 140 | print('\nGoodbye!') 141 | exit() 142 | except Exception as e: 143 | if x == 99: 144 | msg = '\n\n### Error occured while executing SQL cmd:' 145 | msg += '\n\n \'{}\'\n' 146 | print(msg.format(sql)) 147 | print('### Error type - {}'.format(type(e))) 148 | raise 149 | x += 1 150 | 151 | 152 | def db_execute_tpl(cur, sql, tpl): 153 | while True: 154 | try: 155 | return cur.execute(sql,tpl) 156 | except sqlite3.OperationalError as S: 157 | fetch.print_('') 158 | print('\tError - sqlite3 error: {}'.format(S)) 159 | except KeyboardInterrupt: 160 | print('\nGoodbye!') 161 | exit() 162 | except: 163 | print('\n\nSQL cmd = \'{}\'\n{}\n'.format(sql, tpl)) 164 | raise 165 | 166 | 167 | def delete_tables(db_file): 168 | print_('\nPlease wait, database tables are being deleted ...') 169 | 170 | # Create database connection 171 | conn = sqlite3.connect(db_file) 172 | cur = conn.cursor() 173 | 174 | # Drop tables and commit database 175 | for table in tbl_names: 176 | print_('Deleting database table {} ...'.format(table)) 177 | sql = 'DROP TABLE IF EXISTS ' + table 178 | db_execute(cur, sql) 179 | save_db(conn) 180 | 181 | cur.close() 182 | conn.close() 183 | 184 | msg = '\n~ {} database tables were sucessfully deleted.' 185 | return msg.format(len(tbl_names)) 186 | 187 | 188 | def del_fetch_history(db_file): 189 | print_('\nPlease wait, download history is being erased ...') 190 | 191 | # Create database connection 192 | conn = sqlite3.connect(db_file) 193 | cur = conn.cursor() 194 | 195 | # Drop tables and commit database 196 | table = 'Fetched_urls' 197 | sql = 'DELETE FROM ' + table 198 | db_execute(cur, sql) 199 | save_db(conn) 200 | 201 | cur.close() 202 | conn.close() 203 | 204 | return '\n~ Download history (table Fetched_urls) erased.' 205 | 206 | 207 | def erase_tables(db_file): 208 | print_('\nPlease wait, database tables are being erased ...') 209 | 210 | # Create database connection 211 | conn = sqlite3.connect(db_file) 212 | cur = conn.cursor() 213 | for table in tbl_names: 214 | print_('Erasing database table {} ...'.format(table)) 215 | sql = 'DELETE FROM ' + table 216 | db_execute(cur, sql) 217 | save_db(conn) 218 | cur.close() 219 | conn.close() 220 | 221 | msg = '\n~ Records from {} database tables were sucessfully erased.' 
222 | return msg.format(len(tbl_names)) 223 | 224 | 225 | def fetch(db_file): 226 | div = 150 227 | pool_size = 50 228 | 229 | # Get user input for stp (no. of tickers to update) 230 | while True: 231 | # User input for number of tickers to update 232 | try: 233 | msg = 'Qty. of records to be updated:\n' 234 | stp = int(input(msg)) 235 | except KeyboardInterrupt: 236 | print('\nGoodbye!') 237 | exit() 238 | except Exception: 239 | continue 240 | start = time.time() 241 | break 242 | 243 | # Fetch data for each API for tickers qty. = 'stp' 244 | dividend = max(stp, div) 245 | runs = dividend // div 246 | div = min(stp, div) 247 | tickers = [] 248 | for i in range(runs): 249 | t0 = time.time() 250 | 251 | # Create db connection 252 | conn = sqlite3.connect(db_file) 253 | cur = conn.cursor() 254 | 255 | # Get list of URL's to be retrieved and print current run info 256 | msg = '\nRun {} / {}' 257 | if i == 0: 258 | try: 259 | urls = get_url_list(cur) 260 | except KeyboardInterrupt: 261 | print('\nGoodbye!') 262 | exit() 263 | except: 264 | raise 265 | msg0 = '\nTotal URL requests pending =\t{:9,.0f}\n' 266 | msg0 += 'Total URL requests planned =\t{:9,.0f}\n' 267 | print(msg0.format(len(urls), 268 | min(len(urls), stp * len(apis)))) 269 | msg0 = '\t({} requests per API per run = {} requests per run)' 270 | msg += msg0.format(div, div*len(apis)) 271 | 272 | j = i * div * len(apis) 273 | items = urls[j:j + div * len(apis)] 274 | items_ct = len(items) 275 | sort0 = lambda x: (x[0], x[2], x[3]) 276 | #items = sorted(items, key=sort0) 277 | print(msg.format(i+1, '{:.0f}' 278 | .format(min(len(urls), stp * len(apis))/(div*len(apis))))) 279 | 280 | # Execute sql clean.txt and exit loop if no records remain to update 281 | if items_ct == 0: 282 | #with open(sql_cmds.format('clean.txt')) as file: 283 | # cur.executescript(file.read().strip()) 284 | break 285 | 286 | # Fetch data from API's using multiprocessing.Pool 287 | results = [] 288 | while True: 289 | try: 290 | with mp.Pool(pool_size) as p: 291 | #r = p.imap_unordered(fetch_api, items) 292 | #r = p.map(fetch_api, items) 293 | r = p.imap(fetch_api, items) 294 | for turn in range(len(items)): 295 | try: 296 | results.append(r.next(timeout=5)) 297 | except mp.context.TimeoutError: 298 | pass 299 | break 300 | except KeyboardInterrupt: 301 | print('\nGoodbye!') 302 | exit() 303 | except: 304 | raise 305 | 306 | # Fetch data from API's without multiprocessing.Pool 307 | '''for item in url_info: 308 | results.append(fetch_api(item))''' 309 | 310 | # Enter URL data into Fetched_urls 311 | if results != []: 312 | try: 313 | results = list(filter(lambda x: x is not None, results)) 314 | except: 315 | # DELETE 316 | print('\n\n\n{}\n\n'.format(results[:1])) 317 | raise 318 | 319 | msg = ' - Success rate:\t{:,.0f} out of {:,.0f} ({:.1%})' 320 | totreq = min(stp, div)*len(apis) 321 | srate = len(results)/totreq 322 | print_('') 323 | print(msg.format(len(results), totreq, srate)) 324 | 325 | # Insert new data 326 | msg = 'Storing source data into database table \'Fetched_urls\'...' 
327 | print_(msg) 328 | cols = 'url_id, ticker_id, exch_id, fetch_date, ' + \ 329 | 'status_code, source_text' 330 | sql = 'INSERT OR IGNORE INTO Fetched_urls ({}) VALUES ({})' 331 | sql = sql.format(cols, '?, ?, ?, date(?), ?, ?') 332 | #print('\n\nSQL = {}'.format(sql)) 333 | cur.executemany(sql, results) 334 | 335 | # Export new ticker and exchange lists to input folder 336 | output = {} 337 | with open(fd_input + 'ticker_exch.json', 'w') as file: 338 | sql = 'SELECT * FROM Tickers' 339 | ticks = cur.execute(sql).fetchall() 340 | output['Tickers'] = ticks 341 | sql = 'SELECT * FROM Exchanges' 342 | exchs = cur.execute(sql).fetchall() 343 | output['Exchanges'] = exchs 344 | sql = 'SELECT ticker_id, exchange_id FROM Master' 345 | fetched = cur.execute(sql).fetchall() 346 | output['Master'] = fetched 347 | file.write(json.dumps(output, indent=2)) 348 | 349 | # Save (commit) changes and close db 350 | save_db(conn) 351 | cur.close() 352 | conn.close() 353 | 354 | # Call parsing module from parse.py 355 | t1 = time.time() 356 | print_(' - Fetch Duration:\t{:.2f} sec\n'.format(t1-t0)) 357 | parse.parse(db_file) 358 | t1 = time.time() 359 | print_(' - Total Duration:\t{:.2f} sec\n'.format(t1-t0)) 360 | print(' - Speed:\t\t{:.2f} records/sec'.format( 361 | len(results)/(t1-t0))) 362 | 363 | return start 364 | 365 | 366 | def fetch_api(url_info): 367 | t0 = time.time() 368 | 369 | # Unpack variables 370 | url_id, url, ticker_id, exch_id = url_info 371 | num = ticker_list[url_id]['{}:{}'.format(exch_id, ticker_id)] 372 | ct = ticker_count[url_id] 373 | print_progress(url_id, num, ct) 374 | 375 | # Fetch URL data 376 | x = 0 377 | while True: 378 | try: 379 | page = requests.get(url) 380 | status_code = page.status_code 381 | data = re.sub('\'', '', page.text) 382 | #msg = '\n\nurl = {}\ntext = {}\n' 383 | #print(msg.format(url, page.text)) 384 | if data == '' or status_code != 200: 385 | return 386 | data = zlib.compress(data.encode()) 387 | break 388 | except requests.exceptions.ConnectionError: 389 | if x > 9: 390 | print_('') 391 | print('\n\tError: requests.exceptions.ConnectionError') 392 | msg = 'Ticker: {}, Exch: {}, URL: {}\n' 393 | print(msg.format(ticker_id, exch_id, url)) 394 | return 395 | except requests.exceptions.ChunkedEncodingError: 396 | print_('') 397 | print('\n\tError: requests.exceptions.ChunkedEncodingError') 398 | msg = 'Ticker: {}, Exch: {}, URL: {}\n' 399 | print(msg.format(ticker_id, exch_id, url)) 400 | time.sleep(4) 401 | return 402 | except KeyboardInterrupt: 403 | print('\nGoodbye!') 404 | exit() 405 | except: 406 | raise 407 | x += 1 408 | 409 | # Timer to attemp to slow down and 'align' Pool requests to every sec 410 | if False: 411 | time.sleep((1 - (time.time() % 1))) 412 | 413 | return (url_id, ticker_id, exch_id, today, status_code, data) 414 | 415 | 416 | def get_url_list(cur): 417 | 418 | urls = [] 419 | api = [(int(k), v) for k, v in apis.items()] 420 | with open(sql_cmds.format('select_notupdated1.txt')) as file: 421 | sql_cmd1 = file.read().strip() 422 | with open(sql_cmds.format('select_notupdated2.txt')) as file: 423 | sql_cmd2 = file.read().strip() 424 | with open(sql_cmds.format('select_notupdated3.txt')) as file: 425 | sql_cmd3 = file.read().strip() 426 | 427 | for url_id, url0 in api: 428 | 429 | # Select list of tickers not yet updated for current API 430 | print_('Creating URL list for API {} ...'.format(url_id)) 431 | if url_id < 4: 432 | sql = sql_cmd1.format(url_id) 433 | elif url_id == 9: 434 | sql = sql_cmd3.format(url_id) 435 | else: 436 | 
sql = sql_cmd2.format(url_id) 437 | tickers = db_execute(cur, sql).fetchall() 438 | ticker_count[url_id] = len(tickers) 439 | ticker_list[url_id] = {} 440 | 441 | # Create list of URL's for each ticker 442 | def url_list(ct, tick): 443 | exch_id, exch_sym = tick[0], tick[1] 444 | sym_id, symbol = tick[2], tick[3] 445 | url = url0.format(exch_sym, symbol) 446 | ticker_list[url_id]['{}:{}'.format(exch_id, sym_id)] = ct 447 | return (url_id, url, sym_id, exch_id) 448 | 449 | urls = urls + [url_list(c, ticker) 450 | for c, ticker in enumerate(tickers)] 451 | 452 | # Print API list and no. of tickers to be updated for each 453 | msg = '\nQty. of records pending update per API no.:\n\n' 454 | print_(msg) 455 | df_tickct = pd.DataFrame([(k, '{:8,.0f}'.format(v)) 456 | for k, v in ticker_count.items()]) 457 | print(df_tickct.rename(columns={0:'API', 1:'Pending'}) 458 | .set_index('API')) 459 | df_tickct = None 460 | 461 | #urls = sorted(urls, key=lambda x: (x[2], x[3], x[0])) 462 | urls = sorted(urls, key=lambda x: (x[0], x[2], x[3])) 463 | 464 | return urls 465 | 466 | 467 | def print_(msg): 468 | msg = 'echo -en "\\r\\e[K{}"'.format(msg) 469 | os.system(msg) 470 | 471 | 472 | def print_progress(api, num, ct): 473 | msg = 'Fetching API {:.0f}... {:7,.0f} / {:7,.0f} ({:.2%})' 474 | msg = msg.format(api, num+1, ct, (num+1)/ct) 475 | msg = 'echo -en "\\r\\e[K{}"'.format(msg) 476 | os.system(msg) 477 | 478 | 479 | def save_db(conn): 480 | err = None 481 | while True: 482 | try: 483 | conn.commit() 484 | except sqlite3.OperationalError as err1: 485 | if err != err1: 486 | err = err1 487 | print(err) 488 | continue 489 | except Exception as errs: 490 | print(errs) 491 | raise 492 | break 493 | 494 | 495 | def sql_insert(table, columns, values): 496 | if len(values) == 1: 497 | values = '(\'{}\')'.format(values[0]) 498 | 499 | sql = 'INSERT OR IGNORE INTO {} {} VALUES {}' 500 | sql = sql.format(table, columns, values) 501 | return sql 502 | 503 | 504 | def sql_insert_one_get_id(cur, tbl, col, val): 505 | 506 | # Insert value into db table 507 | column = '({})'.format(col) 508 | sql1 = sql_insert(tbl, column, (val,)) 509 | sql2 = sql_record_id(tbl, column, val) 510 | 511 | # Select ID from table for value 512 | try: 513 | db_execute(cur, sql1) 514 | id = db_execute(cur, sql2).fetchone()[0] 515 | except: 516 | print('\n\n\t# Error @ SQL1 =', sql1, '\n\nSQL2 =', sql2, '\n\n') 517 | raise 518 | 519 | return id 520 | 521 | 522 | def sql_record_id(table, column, value): 523 | if type(value) is str: 524 | sql = 'SELECT id FROM {} WHERE {} =\'{}\'' 525 | else: 526 | sql = 'SELECT id FROM {} WHERE {} ={}' 527 | return sql.format(table, column, value) 528 | 529 | 530 | def sql_update_record(table, dict1, dict2): 531 | updates = str(dict1).replace('{\'', '').replace(', \'', ', ') 532 | updates = updates.replace('}', '').replace('\':', ' =') 533 | conds = str(dict2).replace('{\'', '(') 534 | conds = conds.replace('}', ')').replace('\':', ' =') 535 | conds = conds.replace(', \'', ' AND ') 536 | sql = 'UPDATE OR IGNORE ' + table + ' SET ' + updates + ' WHERE ' + conds 537 | sql = re.sub('\'null\'', 'null', sql) 538 | return sql 539 | 540 | 541 | reload(parse) #Comment out after development 542 | 543 | # Reference variables 544 | ticker_list = {} 545 | ticker_count = {} 546 | fd_input = 'input/' 547 | today = datetime.today().strftime('%Y-%m-%d') 548 | sql_cmds = '{}sql_cmd/{}'.format(fd_input, '{}') 549 | with open('{}/api.json'.format(fd_input)) as file: 550 | apis = json.load(file) 551 | with 
open('{}/tables.json'.format(fd_input)) as file: 552 | tbl_js = json.load(file) 553 | tbl_names = list(tbl_js.keys()) 554 | -------------------------------------------------------------------------------- /parse.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup as bs 2 | from importlib import reload #Comment out once done using 3 | import datetime as DT 4 | from io import StringIO 5 | import pandas as pd 6 | import numpy as np 7 | import fetch, sqlite3, time, json, zlib, csv, sys, re 8 | 9 | 10 | # Manage database connection and fetch data to be parsed 11 | def parse(db_file): 12 | start = time.time() 13 | 14 | # Create db connection 15 | fetch.print_('Please wait while the database is being queried ...') 16 | 17 | while True: 18 | try: 19 | conn = sqlite3.connect(db_file) 20 | cur = conn.cursor() 21 | except sqlite3.OperationalError as S: 22 | fetch.print_('') 23 | print('\tError - sqlite3 error: {}'.format(S)) 24 | continue 25 | except KeyboardInterrupt: 26 | print('\nGoodbye!') 27 | exit() 28 | except: 29 | raise 30 | break 31 | 32 | # Get list of fetched urls from Fetched_urls 33 | cols = 'url_id, ticker_id, exch_id, fetch_date, source_text' 34 | sql = '''SELECT {} FROM Fetched_urls 35 | WHERE status_code = 200 AND source_text IS NOT NULL 36 | ORDER BY ticker_id asc, url_id desc''' 37 | sql = sql.format(cols) 38 | fetched = fetch.db_execute(cur, sql).fetchall() 39 | 40 | # Call parsing methods 41 | parsing(conn, cur, fetched) 42 | 43 | # Save db and close db connection 44 | fetch.save_db(conn) 45 | cur.close() 46 | conn.close() 47 | fetched = None 48 | 49 | 50 | # Parse data fetched from database table 'Fetched_urls' 51 | def parsing(conn, cur, items): 52 | stp = len(items) 53 | spds = [] 54 | if stp > 0: 55 | for i in range(stp): 56 | 57 | # Unpack record from Fetched_urls 58 | api = items[i][0] 59 | ticker_id = items[i][1] 60 | exch_id = items[i][2] 61 | fetch_date = items[i][3] 62 | source_text = items[i][4] 63 | parse = True 64 | 65 | # Decompress and check data integrity before parsing 66 | try: 67 | if source_text is not None: 68 | source_text = zlib.decompress(source_text).decode() 69 | except BaseException as B: 70 | print('\n\nB - {}'.format(str(B))) 71 | raise 72 | except Exception as E: 73 | print('\n\nE - {}'.format(str(E))) 74 | raise 75 | except KeyboardInterrupt: 76 | print('\nGoodbye!') 77 | exit() 78 | 79 | if (source_text is None or len(source_text) == 0 or 80 | 'Morningstar.com Error Page' in source_text or 81 | 'This page is temporarily unavailable' in source_text): 82 | parse = False 83 | code = 0 84 | 85 | # Print progress message 86 | msg = 'Parsing results into database...' 
87 | msg += '\t{:6,.0f} / {:6,.0f}\t({:6.1%} )' 88 | ct = i + 1 89 | pct = (i + 1) / stp 90 | fetch.print_(msg.format(ct, stp, pct)) 91 | 92 | # Invoke parsing function based on API number 93 | if parse == True: 94 | if api in [1, 2, 3]: 95 | code = parse_1(cur, source_text, api) 96 | elif api == 4: 97 | code = parse_2(cur, ticker_id, exch_id, source_text) 98 | elif api == 5: 99 | code = parse_3(cur, ticker_id, exch_id, source_text) 100 | elif api == 6: 101 | code = parse_4(cur, ticker_id, exch_id, source_text) 102 | elif api == 7: 103 | code = parse_5(cur, ticker_id, exch_id, source_text) 104 | elif api == 8: 105 | code = parse_6(cur, ticker_id, exch_id, source_text) 106 | elif api == 9: 107 | code = parse_7(cur, ticker_id, exch_id, source_text) 108 | elif api in [10, 11, 12, 13, 14, 15]: 109 | code = parse_8(cur, api, ticker_id, exch_id, source_text) 110 | elif api == 16: 111 | code = parse_9(cur, ticker_id, exch_id, source_text) 112 | elif api == 0: 113 | code = parse_10(cur, ticker_id, source_text) 114 | source_text = None 115 | 116 | # Updated record in Fetched_urls with results from parsing 117 | if True: 118 | dict1 = { 119 | 'status_code':code, 120 | 'source_text':source_text 121 | } 122 | dict2 = { 123 | 'url_id':api, 124 | 'ticker_id':ticker_id, 125 | 'exch_id':exch_id, 126 | 'fetch_date':fetch_date 127 | } 128 | sql = fetch.sql_update_record('Fetched_urls', dict1, dict2) 129 | 130 | # DELETE 131 | if dict1['source_text'] == '': 132 | print('\n\n\n{}\n\n'.format(sql)) 133 | raise 134 | 135 | fetch.db_execute(cur, sql) 136 | 137 | #print('\n{} SQL = {}\n\n'.format(api, sql)) 138 | 139 | if i % 1000 == 0 and i + 1 != stp: 140 | fetch.save_db(conn) 141 | 142 | 143 | # Parse table(s) from source html code 144 | def get_html_table(sp): 145 | 146 | tr_tags = sp.find_all('tr') 147 | table = [] 148 | for tr in tr_tags: 149 | td_tags = tr.find_all(['th', 'td']) 150 | if len(td_tags) > 1: 151 | table.append([tag.text for tag in td_tags]) 152 | 153 | return table 154 | 155 | 156 | # https://www.morningstar.com/api/v2/search/securities/5/usquote-v2/ 157 | def parse_1(cur, data, api): 158 | 159 | results = [] 160 | try: 161 | js = json.loads(data) 162 | if js['m'][0]['n'] != 0: 163 | results = js['m'][0]['r'] 164 | except KeyError: 165 | fetch.print_('') 166 | print('\tError: KeyError at Parse_1\n') 167 | return 1 168 | except KeyboardInterrupt: 169 | print('\nGoodbye!') 170 | exit() 171 | except: 172 | print('Data = {} {}\n'.format(data, len(data))) 173 | raise 174 | 175 | if results == []: 176 | return 1 177 | 178 | for result in results: 179 | # Read data from current result 180 | exch = result['OS01X'] 181 | symbol = result['OS001'] 182 | exch_sym = result['LS01Z'] 183 | country = result['XI018'] 184 | type = result['OS010'] 185 | comp = result['OS01W'] 186 | curr = result['OS05M'] 187 | 188 | if exch_sym == '' or symbol == '': 189 | continue 190 | 191 | # Fetch id's for data from db and update tables 192 | 193 | # Tickers 194 | ticker_id = int(fetch.sql_insert_one_get_id( 195 | cur, 'Tickers', 'ticker', symbol)) 196 | # Currencies 197 | curr_id = int(fetch.sql_insert_one_get_id( 198 | cur, 'Currencies', 'currency_code', curr)) 199 | # Companies 200 | comp_id = int(fetch.sql_insert_one_get_id( 201 | cur, 'Companies', 'company', comp)) 202 | # SecurityTypes 203 | type_id = int(fetch.sql_insert_one_get_id( 204 | cur, 'SecurityTypes', 'security_type_code', type)) 205 | # Countries 206 | country_id = int(fetch.sql_insert_one_get_id(cur, 207 | 'Countries', 'a3_un', country)) 208 | # Exchanges 
209 | exch_id = int(fetch.sql_insert_one_get_id(cur, 210 | 'Exchanges', 'exchange_sym', exch_sym)) 211 | dict1 = { 212 | 'exchange':exch, 213 | 'exchange_sym':exch_sym, 214 | 'country_id':country_id 215 | } 216 | sql = fetch.sql_update_record('Exchanges', dict1, {'id':exch_id}) 217 | fetch.db_execute(cur, sql) 218 | # Master Table 219 | columns = '(ticker_id, exchange_id)' 220 | sql = fetch.sql_insert('Master', columns, (ticker_id, exch_id)) 221 | fetch.db_execute(cur, sql) 222 | dict1 = { 223 | 'company_id':comp_id, 224 | 'security_type_id':type_id, 225 | 'update_date':DT.date.today().strftime('%Y-%m-%d') 226 | } 227 | dict2 = { 228 | 'ticker_id':ticker_id, 229 | 'exchange_id':exch_id 230 | } 231 | sql = fetch.sql_update_record('Master', dict1, dict2) 232 | fetch.db_execute(cur, sql) 233 | 234 | return 200 235 | 236 | 237 | # http://quotes.morningstar.com/stockq/c-company-profile 238 | def parse_2(cur, ticker_id, exch_id, data): 239 | 240 | soup = bs(data, 'html.parser') 241 | tags = soup.find_all('span') 242 | 243 | try: 244 | sector = tags[2].text.strip() 245 | industry = tags[4].text.strip() 246 | stype = tags[6].text.strip() 247 | fyend = tags[10].text.strip() 248 | style = tags[12].text.strip() 249 | except KeyboardInterrupt: 250 | print('\nGoodbye!') 251 | exit() 252 | except: 253 | print('\n\nTicker_id = {}, Exch_id = {}'.format(ticker_id, exch_id)) 254 | print('Data = {} {}\n'.format(data, len(data))) 255 | raise 256 | 257 | # Insert sector into Sectors 258 | sector_id = fetch.sql_insert_one_get_id(cur, 'Sectors', 'Sector', sector) 259 | 260 | # Insert industry into Industries 261 | sql = fetch.sql_insert('Industries', 262 | '(industry, sector_id)', (industry, sector_id)) 263 | fetch.db_execute(cur, sql) 264 | sql = fetch.sql_record_id('Industries', '(industry)', industry) 265 | industry_id = fetch.db_execute(cur, sql).fetchone()[0] 266 | 267 | # Insert stock_type into StockTypes 268 | stype_id = fetch.sql_insert_one_get_id( 269 | cur, 'StockTypes', 'stock_type', stype) 270 | 271 | # Insert fyend into FYEnds 272 | fyend_id = fetch.sql_insert_one_get_id(cur, 'TimeRefs', 'dates', fyend) 273 | 274 | # Insert style into StockStyles 275 | style_id = fetch.sql_insert_one_get_id(cur, 'StockStyles', 'style', style) 276 | 277 | # Update Tickers table with parsed data 278 | sql = fetch.sql_update_record('Master', {'industry_id':industry_id, 279 | 'stock_type_id':stype_id, 'fyend_id':fyend_id, 'style_id':style_id}, 280 | {'ticker_id':ticker_id, 'exchange_id':exch_id}) 281 | fetch.db_execute(cur, sql) 282 | 283 | return 200 284 | 285 | 286 | # http://quotes.morningstar.com/stockq/c-header 287 | # API No. 
5 288 | def parse_3(cur, ticker_id, exch_id, data): 289 | 290 | soup = bs(data, 'html.parser') 291 | tags = soup.find_all('span') + soup.find_all('div') 292 | 293 | # Parse data into info dictionary 294 | info = {} 295 | noise = ['', '-', '—', '— mil', '— bil'] 296 | for count, tag in enumerate(tags): 297 | 298 | attrs = tag.attrs 299 | text = re.sub('[\n\t]', '', tag.text.strip()) 300 | text = re.sub('\s\s*', ' ', text) 301 | 302 | try: 303 | if attrs.get('vkey') == 'Currency': 304 | if text in noise: 305 | info['currency_id'] = 'null' 306 | else: 307 | val = fetch.sql_insert_one_get_id( 308 | cur, 'Currencies', 'currency_code', text) 309 | info['currency_id'] = val 310 | 311 | elif attrs.get('vkey') == 'LastDate': 312 | if text == '': 313 | info['lastdate'] = 'null' 314 | else: 315 | info['lastdate'] = pd.to_datetime( 316 | text).strftime('%Y-%m-%d') 317 | elif attrs.get('vkey') == 'DayRange': 318 | text = re.sub('^-0.00', '0.00', text) 319 | vals = text.split('-') 320 | if '-' not in text or text in noise or '' in vals: 321 | info['day_lo'] = 'null' 322 | info['day_hi'] = 'null' 323 | else: 324 | info['day_lo'] = float(re.sub(',', '', vals[0])) 325 | info['day_hi'] = float(re.sub(',', '', vals[1])) 326 | elif attrs.get('vkey') == '_52Week': 327 | text = re.sub('^-0.00', '0.00', text) 328 | vals = text.split('-') 329 | if '-' not in text or text in noise or '' in vals: 330 | info['_52wk_lo'] = 'null' 331 | info['_52wk_hi'] = 'null' 332 | else: 333 | info['_52wk_lo'] = float(re.sub(',', '', vals[0])) 334 | info['_52wk_hi'] = float(re.sub(',', '', vals[1])) 335 | elif attrs.get('vkey') == 'Volume': 336 | if text in noise: 337 | info['lastvol'] = 'null' 338 | else: 339 | text = re.sub(',', '', text) 340 | unit = 1 341 | if ' mil' in text: 342 | unit = 10E6 343 | text = text.replace(' mil', '') 344 | elif ' bil' in text: 345 | unit = 10E9 346 | text = text.replace(' bil', '') 347 | elif ' tri' in text: 348 | unit = 10E12 349 | text = text.replace(' tri', '') 350 | info['lastvol'] = float(text) * unit 351 | elif attrs.get('vkey') == 'AverageVolume': 352 | if text in noise: 353 | info['avevol'] = 'null' 354 | else: 355 | text = re.sub(',', '', text) 356 | unit = 1 357 | if ' mil' in text: 358 | unit = 10E6 359 | text = text.replace(' mil', '') 360 | elif ' bil' in text: 361 | unit = 10E9 362 | text = text.replace(' bil', '') 363 | elif ' tri' in text: 364 | unit = 10E12 365 | text = text.replace(' tri', '') 366 | info['avevol'] = float(text) * unit 367 | elif attrs.get('gkey') == 'Forward': 368 | fpe = text 369 | elif attrs.get('vkey') == 'OpenPrice': 370 | if text in noise: 371 | info['openprice'] = 'null' 372 | else: 373 | info['openprice'] = float(re.sub(',', '', text)) 374 | elif attrs.get('vkey') == 'LastPrice': 375 | if text in noise: 376 | info['lastprice'] = 'null' 377 | else: 378 | info['lastprice'] = float(re.sub(',', '', text)) 379 | elif attrs.get('vkey') == 'ProjectedYield': 380 | if text in noise: 381 | info['yield'] = 'null' 382 | else: 383 | info['yield'] = float(re.sub('[%,]', '', text)) 384 | elif attrs.get('vkey') == 'PE': 385 | if text in noise: 386 | info['fpe'] = 'null' 387 | else: 388 | info['fpe'] = float(re.sub(',', '', text)) 389 | elif attrs.get('vkey') == 'PB': 390 | if text in noise: 391 | info['pb'] = 'null' 392 | else: 393 | info['pb'] = float(re.sub(',', '', text)) 394 | elif attrs.get('vkey') == 'PS': 395 | if text in noise: 396 | info['ps'] = 'null' 397 | else: 398 | info['ps'] = float(re.sub(',', '', text)) 399 | elif attrs.get('vkey') == 'PC': 400 | if text 
in noise: 401 | info['pc'] = 'null' 402 | else: 403 | info['pc'] = float(re.sub(',', '', text)) 404 | except: 405 | print('\n\n{' + text + '}\n') 406 | raise 407 | 408 | # Check if parsing was successful 409 | if info == {}: 410 | return 3 411 | 412 | if 'fpe' in locals() and fpe != 'Forward' and 'fpe' in info: 413 | del info['fpe'] 414 | 415 | # Remove 'empty' string values 416 | for k, v in info.items(): 417 | if v == '' or v == ' ': 418 | info[k] = 'null' 419 | 420 | # Insert data into MSheader table 421 | table = 'MSheader' 422 | # Update 423 | dict0 = {'ticker_id':ticker_id, 'exchange_id':exch_id} 424 | sql = fetch.sql_update_record(table, info, dict0) 425 | fetch.db_execute(cur, sql) 426 | # Insert 427 | if cur.rowcount == 0: 428 | info['ticker_id'] = ticker_id 429 | info['exchange_id'] = exch_id 430 | sql = fetch.sql_insert(table, tuple(info.keys()), tuple(info.values())) 431 | fetch.db_execute(cur, sql) 432 | 433 | return 200 434 | 435 | 436 | # http://financials.morningstar.com/valuate/valuation-history.action 437 | def parse_4(cur, ticker_id, exch_id, data): 438 | 439 | info = {} 440 | def clean_val(h, v): 441 | if v != '—': 442 | info[h] = v 443 | 444 | soup = bs(data, 'html.parser') 445 | table = get_html_table(soup) 446 | script = soup.find('script').text 447 | script = re.sub('[ \n\t]|\\n|\\t', '', script) 448 | script = re.findall('\[\[.+?\]\]', script)[0] 449 | columns = json.loads(script) 450 | 451 | # Parse Yr Columns 452 | for year, column in enumerate(columns): 453 | if column[0] % 2 == 0: 454 | yr = column[1] 455 | yr_id = fetch.sql_insert_one_get_id(cur, 'TimeRefs', 'dates', yr) 456 | header = 'Y{}'.format(int((year-1)/2)) 457 | info[header] = yr_id 458 | 459 | # Parse 'Price/Earnings' 460 | for yr, val in enumerate(table[1][1:]): 461 | header = 'PE_Y{}'.format(yr) 462 | clean_val(header, val) 463 | 464 | # Parse 'Price/Book' 465 | for yr, val in enumerate(table[4][1:]): 466 | header = 'PB_Y{}'.format(yr) 467 | clean_val(header, val) 468 | 469 | # Parse 'Price/Sales' 470 | for yr, val in enumerate(table[7][1:]): 471 | header = 'PS_Y{}'.format(yr) 472 | clean_val(header, val) 473 | 474 | # Parse 'Price/Cash Flow' 475 | for yr, val in enumerate(table[10][1:]): 476 | header = 'PC_Y{}'.format(yr) 477 | clean_val(header, val) 478 | 479 | # Check if parsing was successful 480 | if info == {}: 481 | return 4 482 | 483 | # Insert data into MSvaluation table 484 | table = 'MSvaluation' 485 | # Update 486 | dict0 = {'ticker_id':ticker_id, 'exchange_id':exch_id} 487 | sql1 = fetch.sql_update_record(table, info, dict0) 488 | # Insert 489 | info['ticker_id'] = ticker_id 490 | info['exchange_id'] = exch_id 491 | sql2 = fetch.sql_insert(table, tuple(info.keys()), tuple(info.values())) 492 | fetch.db_execute(cur, sql1) 493 | if cur.rowcount == 0: 494 | fetch.db_execute(cur, sql2) 495 | 496 | return 200 497 | 498 | 499 | # http://financials.morningstar.com/finan/financials/getKeyStatPart.html 500 | # API No. 
7 501 | def parse_5(cur, ticker_id, exch_id, data): 502 | 503 | # Check if source data has correct information 504 | try: 505 | js = json.loads(data)['componentData'] 506 | if js is None: 507 | return 5 508 | soup = bs(js, 'html.parser') 509 | except KeyboardInterrupt: 510 | print('\nGoodbye!') 511 | exit() 512 | except: 513 | print('\n\nTicker_id = {}, Exch_id = {}'.format(ticker_id, exch_id)) 514 | print('Data = {} {}\n'.format(data, len(data))) 515 | raise 516 | 517 | # Parse table 518 | tables = {} 519 | trows = soup.find_all('tr') 520 | tname = '' 521 | for trow in trows: 522 | div_id = trow.parent.parent.parent.attrs['id'] 523 | tname0 = re.sub('tab-', 'MSratio_', div_id) 524 | if tname != tname0: 525 | tname = tname0 526 | tables[tname] = {} 527 | 528 | row_tags = trow.find_all(['th', 'td']) 529 | for i, row_tag in enumerate(row_tags): 530 | if 'id' in row_tag.attrs: 531 | text = row_tag.text 532 | id = re.sub('-', '_', row_tag.attrs['id']) 533 | if i != 0: 534 | text_id = fetch.sql_insert_one_get_id( 535 | cur, 'TimeRefs', 'dates', text) 536 | else: 537 | text_id = fetch.sql_insert_one_get_id( 538 | cur, 'ColHeaders', 'header', text) 539 | tables[tname][id] = text_id 540 | elif 'headers' in row_tag.attrs: 541 | headers = row_tag.attrs['headers'] 542 | header = '_'.join([headers[2], headers[0]]) 543 | header = re.sub('-', '_', header) 544 | val = re.sub(',', '', row_tag.text) 545 | if val == '—': 546 | val = None 547 | else: 548 | try: 549 | val = float(val) 550 | except: 551 | val = None 552 | tables[tname][header] = val 553 | 554 | # Check if parsing was successful 555 | if tables == {}: 556 | return 5 557 | 558 | # Insert data into tables 559 | for table in tables: 560 | # Update 561 | info = tables[table] 562 | dict0 = {'ticker_id':ticker_id, 'exchange_id':exch_id} 563 | sql = fetch.sql_update_record(table, info, dict0) 564 | fetch.db_execute(cur, sql) 565 | # Insert 566 | if cur.rowcount == 0: 567 | tables[table]['ticker_id'] = ticker_id 568 | tables[table]['exchange_id'] = exch_id 569 | info = tables[table] 570 | sql = fetch.sql_insert( 571 | table, tuple(info.keys()), tuple(info.values())) 572 | fetch.db_execute(cur, sql) 573 | 574 | return 200 575 | 576 | 577 | # http://financials.morningstar.com/finan/financials/getFinancePart.html 578 | # API No. 
8 579 | def parse_6(cur, ticker_id, exch_id, data): 580 | 581 | # Check if source data has correct information 582 | try: 583 | js = json.loads(data)['componentData'] 584 | if js is None: 585 | return 6 586 | soup = bs(js, 'html.parser') 587 | except KeyboardInterrupt: 588 | print('\nGoodbye!') 589 | exit() 590 | except: 591 | print('\n\nTicker_id = {}, Exch_id = {}'.format(ticker_id, exch_id)) 592 | print('Data = {} {}\n'.format(data, len(data))) 593 | raise 594 | 595 | # Parse table 596 | table = {} 597 | trows = soup.find_all('tr') 598 | for trow in trows: 599 | row_tags = trow.find_all(['th', 'td']) 600 | for i, row_tag in enumerate(row_tags): 601 | if 'id' in row_tag.attrs: 602 | text = row_tag.text 603 | if i != 0: 604 | text_id = fetch.sql_insert_one_get_id( 605 | cur, 'TimeRefs', 'dates', text) 606 | else: 607 | text_id = fetch.sql_insert_one_get_id( 608 | cur, 'ColHeaders', 'header', text) 609 | table[row_tag.attrs['id']] = text_id 610 | elif 'headers' in row_tag.attrs: 611 | headers = row_tag.attrs['headers'] 612 | headers.reverse() 613 | val = re.sub(',', '', row_tag.text) 614 | if val == '—': 615 | val = None 616 | else: 617 | val = float(val) 618 | table['_'.join(headers)] = val 619 | 620 | if table == {}: 621 | return 6 622 | 623 | # Insert data into tables 624 | tname = 'MSfinancials' 625 | # Update 626 | dict0 = {'ticker_id':ticker_id, 'exchange_id':exch_id} 627 | sql = fetch.sql_update_record(tname, table, dict0) 628 | ct = fetch.db_execute(cur, sql) 629 | # Insert 630 | if cur.rowcount == 0: 631 | table['ticker_id'] = ticker_id 632 | table['exchange_id'] = exch_id 633 | sql = fetch.sql_insert( 634 | tname, tuple(table.keys()), tuple(table.values())) 635 | fetch.db_execute(cur, sql) 636 | 637 | return 200 638 | 639 | 640 | # http://performance.mor.../perform/Performance/stock/exportStockPrice.action 641 | # API No. 9 642 | def parse_7(cur, ticker_id, exch_id, data): 643 | 644 | tbl = pd.read_csv(StringIO(data), sep=',', header=1) 645 | tbl = tbl.where(tbl['Volume'] != '???').dropna(axis=0, how='all') 646 | tbl['diff'] = 100 * tbl['Close'].diff(-1) / tbl['Close'].shift(-1) 647 | 648 | if len(tbl) <= 1: 649 | return 99 650 | 651 | last_open0 = tbl.iloc[0, 4] 652 | last_open1 = tbl.iloc[1, 4] 653 | 654 | if last_open0 <= 0.0: 655 | return 99 656 | 657 | info = dict() 658 | info['last_open'] = last_open0 659 | info['last_close'] = tbl.iloc[0, 4] 660 | info['lastday_var'] = 100*(last_open0-last_open1)/last_open1 661 | info['ave_10d'] = tbl.iloc[:9, 4].mean() 662 | info['ave_50d'] = tbl.iloc[:49, 4].mean() 663 | info['ave_100d'] = tbl.iloc[:99, 4].mean() 664 | info['ave_200d'] = tbl.iloc[:199, 4].mean() 665 | for i in [5, 10, 30, 50, 100, 200]: 666 | info['max_var{}'.format(i)] = tbl['diff'].iloc[:i-1].max() 667 | info['max_var{}_date'.format(i)] = (DT. 668 | datetime.strptime(tbl[tbl['diff'] == info['max_var{}'.format(i)]] 669 | .iloc[0, 0],'%m/%d/%Y').strftime('%Y-%m-%d')) 670 | info['min_var{}'.format(i)] = tbl['diff'].iloc[:i-1].min() 671 | info['min_var{}_date'.format(i)] = (DT. 
672 | datetime.strptime(tbl[tbl['diff'] == info['min_var{}'.format(i)]] 673 | .iloc[0, 0],'%m/%d/%Y').strftime('%Y-%m-%d')) 674 | 675 | nonan = lambda x: (str(x[1]) != 'nan') and (str(x[1]) != 'inf') 676 | info = dict(filter(nonan, info.items())) 677 | 678 | # Insert data into tables 679 | # Update 680 | table = 'MSpricehistory' 681 | dict0 = {'ticker_id':ticker_id, 'exchange_id':exch_id} 682 | sql = fetch.sql_update_record(table, info, dict0) 683 | fetch.db_execute(cur, sql) 684 | # Insert 685 | if cur.rowcount == 0: 686 | info['ticker_id'] = ticker_id 687 | info['exchange_id'] = exch_id 688 | sql = fetch.sql_insert( 689 | table, tuple(info.keys()), tuple(info.values())) 690 | fetch.db_execute(cur, sql) 691 | 692 | return 200 693 | 694 | 695 | # http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html 696 | def parse_8(cur, api, ticker_id, exch_id, data): 697 | 698 | # Check if source data has correct information 699 | msg = 'There is no available information in our database to display.' 700 | if msg in data: 701 | return 8 702 | 703 | # Parse source data with JSON and BeautifulSoup 704 | try: 705 | js = json.loads(data) 706 | html = js['result'] 707 | soup = bs(html, 'html.parser') 708 | tags = soup.find_all('div') 709 | except KeyboardInterrupt: 710 | print('\nGoodbye!') 711 | exit() 712 | except: 713 | print('\n\n', data) 714 | raise 715 | 716 | info = {} 717 | info0 = {} 718 | type = 'MSreport' 719 | 720 | if api in [10, 11]: 721 | type += '_is' 722 | elif api in [12, 13]: 723 | type += '_cf' 724 | elif api in [14, 15]: 725 | type += '_bs' 726 | if api in [10, 12, 14]: 727 | type += '_yr' 728 | elif api in [11, 13, 15]: 729 | type += '_qt' 730 | #fname = 'test/{}.json'.format(type) 731 | 732 | '''with open(fname) as file: 733 | info0 = json.load(file)''' 734 | 735 | # Parse data into info dictionary 736 | for tag in tags: 737 | attrs = tag.attrs 738 | 739 | if 'id' in attrs: 740 | tag_id = tag['id'] 741 | value = tag.text 742 | 743 | # Parse currency and FY End month number 744 | if tag_id == 'unitsAndFiscalYear': 745 | info['fye_month'] = int(tag['fyenumber']) 746 | curr_id = fetch.sql_insert_one_get_id( 747 | cur, 'Currencies', 'currency_code', tag['currency']) 748 | info['currency_id'] = curr_id 749 | 750 | # Parse Yrly or Qtrly values 751 | elif tag_id[:2] == 'Y_': 752 | parent = tag.parent['id'] 753 | key = '{}_{}'.format(parent, tag_id) 754 | 755 | if 'rawvalue' in attrs: 756 | if tag['rawvalue'] in ['—', 'nbsp']: 757 | continue 758 | info[key] = float(re.sub(',', '', tag['rawvalue'])) 759 | #info0[key] = 'REAL,' 760 | else: 761 | if 'title' in attrs: 762 | value = tag['title'] 763 | value_id = fetch.sql_insert_one_get_id( 764 | cur, 'TimeRefs', 'dates', value) 765 | info[key] = value_id 766 | #info0[key] = 'INTEGER,' 767 | 768 | # Parse labels 769 | elif tag_id[:3] == 'lab' and 'padding' not in tag_id: 770 | value_id = fetch.sql_insert_one_get_id( 771 | cur, 'ColHeaders', 'header', value) 772 | info[tag_id] = value_id 773 | #info0[tag_id] = 'INTEGER,' 774 | 775 | # Check if parsing was successful 776 | if info == {} and info0 == {}: 777 | return 8 778 | 779 | # Insert data into tables 780 | # Update 781 | dict0 = {'ticker_id':ticker_id, 'exchange_id':exch_id} 782 | sql = fetch.sql_update_record(type, info, dict0) 783 | fetch.db_execute(cur, sql) 784 | # Insert 785 | if cur.rowcount == 0: 786 | info['ticker_id'] = ticker_id 787 | info['exchange_id'] = exch_id 788 | sql = fetch.sql_insert(type, tuple(info.keys()), tuple(info.values())) 789 | fetch.db_execute(cur, sql) 
790 | 791 | return 200 792 | 793 | 794 | # http://insiders.mor.../insiders/trading/insider-activity-data2.action 795 | # API No. 16 796 | def parse_9(cur, ticker_id, exch_id, data): 797 | 798 | data = re.sub('([A-Z])', r' \1', data) 799 | data = re.sub(' +', ' ', data) 800 | data = re.sub('\\n|\\t', '', data) 801 | soup = bs(data, 'html.parser') 802 | table = get_html_table(soup) 803 | 804 | if len(table) > 1: 805 | for row in table: 806 | date = '' 807 | info = {} 808 | if row[0] != '': 809 | info['date'] = DT.datetime.strptime( 810 | row[0], '%m/%d/%Y').strftime('%Y-%m-%d') 811 | try: 812 | info['quantity'] = float(re.sub(',', '', row[3])) 813 | info['value'] = float(re.sub(',', '', row[6])) 814 | except ValueError: 815 | info['quantity'] = 0 816 | info['value'] = 0 817 | except: 818 | raise 819 | 820 | name = row[1].strip() 821 | info['name_id'] = fetch.sql_insert_one_get_id( 822 | cur, 'Insiders', 'name', name) 823 | 824 | type = row[5].strip() 825 | if ' ' in type: 826 | type = type.split()[0] 827 | info['transaction_id'] = fetch.sql_insert_one_get_id( 828 | cur, 'TransactionType', 'type', type) 829 | 830 | # Insert data into tables 831 | info['ticker_id'] = ticker_id 832 | info['exchange_id'] = exch_id 833 | sql = fetch.sql_insert('InsiderTransactions', 834 | tuple(info.keys()), tuple(info.values())) 835 | fetch.db_execute(cur, sql) 836 | 837 | return 200 838 | 839 | 840 | # https://finance.yahoo.com/quote/ 841 | # API No. 0 842 | def parse_10(cur, ticker_id, data): 843 | 844 | sql = 'SELECT ticker FROM Tickers WHERE id = ?' 845 | ticker = fetch.db_execute_tpl(cur, sql, (ticker_id,)).fetchall()[0][0] 846 | 847 | #soup = bs(data, 'html.parser') 848 | tables = [] 849 | try: 850 | tables = pd.read_html(data) 851 | except: 852 | return 10 853 | 854 | if len(tables) == 2: 855 | 856 | info = dict() 857 | try: 858 | info['prev_close'] = float(tables[0].loc[0, 1]) 859 | info['open'] = float(tables[0].loc[1, 1]) 860 | info['beta'] = float(tables[1].loc[1, 1]) 861 | info['eps_ttm'] = float(tables[1].loc[3, 1]) 862 | info['pe_ttm'] = float(tables[1].loc[2, 1]) 863 | info['yr_target'] = float(tables[1].loc[7, 1]) 864 | except: 865 | pass 866 | 867 | try: 868 | date0 = tables[1].loc[6, 1] 869 | if isinstance(date0, float) == False: 870 | exdiv_date = DT.datetime.strptime(date0, '%Y-%m-%d') 871 | info['exdiv_date'] = exdiv_date.strftime('%Y-%m-%d') 872 | 873 | date0 = tables[1].loc[4, 1] 874 | if isinstance(date0, float) == False: 875 | if '-' in date0: 876 | date0 = date0.split('-')[0].strip() 877 | earn_date = DT.datetime.strptime(date0, '%b %d, %Y') 878 | info['earnings_date'] = earn_date.strftime('%Y-%m-%d') 879 | 880 | div_yield = tables[1].loc[5, 1] 881 | if '%' in div_yield: 882 | div_yield = div_yield.split('(')[1].split('%')[0] 883 | info['div_yield'] = float(div_yield) 884 | except: 885 | print('\n\nTicker: ' + ticker) 886 | print() 887 | for table in tables: 888 | print(table) 889 | raise 890 | 891 | nonan = lambda x: (str(x[1]) != 'nan') 892 | info = dict(filter(nonan, info.items())) 893 | 894 | # Insert data into tables 895 | if len(info) > 0: 896 | #print(json.dumps(info, indent=2)) 897 | # Update 898 | table = 'YahooQuote' 899 | dict0 = {'ticker_id':ticker_id} 900 | sql = fetch.sql_update_record(table, info, dict0) 901 | fetch.db_execute(cur, sql) 902 | # Insert 903 | if cur.rowcount == 0: 904 | info['ticker_id'] = ticker_id 905 | sql = fetch.sql_insert( 906 | table, tuple(info.keys()), tuple(info.values())) 907 | fetch.db_execute(cur, sql) 908 | 909 | return 200 910 | return 10 911 
| --------------------------------------------------------------------------------
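Taken together, fetch.py and parse.py implement the full pipeline: create_tables() builds the SQLite schema from input/tables.json and seeds the reference tables (Tickers, Countries, Currencies, SecurityTypes, URLs), fetch() downloads the pending Morningstar/Yahoo URLs with a multiprocessing pool and stores the compressed responses in Fetched_urls, and parse.parse() decompresses each stored response and routes it to the parse_1 … parse_10 handlers according to its API number; dataframes.py (whose tail is shown above) then wraps the resulting tables in pandas DataFrames via its table()/get_yrcolumns() helpers. A minimal driver session might look like the sketch below — the database file path is an assumption for illustration; the project's actual entry point is main.py, which is not shown here.

import fetch, parse

DB_FILE = 'db/mstables.sqlite'   # assumed path; the real default lives in main.py

# 1. Create the schema and seed the reference tables from the input/ folder.
print(fetch.create_tables(DB_FILE))

# 2. Download pending URLs (prompts for the number of records to update).
#    parse.parse() is called internally at the end of each run.
fetch.fetch(DB_FILE)

# 3. Optionally re-parse any rows still sitting in Fetched_urls with
#    status_code = 200 and a stored source_text.
parse.parse(DB_FILE)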