├── testclean.py
├── Data
│   └── Pickles
│       └── manager.dat
├── .gitignore
├── README
├── Utilities
│   ├── reset_manager.py
│   └── test_compression.py
├── data_reporter.py
├── cik_reader.py
├── yahoo_parser.py
├── stock.py
├── parser.py
├── scraper.py
├── rb_parser.py
├── stock_price.py
├── classifier.py
├── clean_scrape.py
├── stocks_downloader.py
├── Old
│   └── data_manager.py
└── manager.py

/testclean.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/Data/Pickles/manager.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/teamdandelion/RoboBuffett/HEAD/Data/Pickles/manager.dat
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | Data/Processed/*
2 | Data/Unprocessed/*
3 | Data/Exceptions/*
4 | Data/Active/*
5 | Data/Inactive/*
6 | Data/Pickles/CIKs*
7 | Data/2001/*
8 | testdoc
9 | *.csv
10 | *.pyc
11 | *.log
12 | TestData/*
13 | good_CIK.txt
14 | Utilities/*.dat
15 | Utilities/generate_CIKTicker_mapping.py
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Project goal: Predict future stock performance based on textual analysis of SEC filings.
2 | 
3 | Modules:
4 | 
5 | Scraper (Izaak):
6 |     Pull SEC documents from EDGAR.
7 | 
8 | Stock (Ahmad):
9 |     Get stock prices and maintain the CIK <-> ticker mapping.
10 | 
11 | Manager (Dan):
12 |     Manage and organize data; coordinates parsing, serialization, classification, and testing.
13 | 
14 | Parser (Dan):
15 |     Parse SEC filings, extracting header info and document text.
16 | 
17 | Classifier (Ahmad):
18 |     Generate training classifications for documents; calls on Stock. Called by Manager.
19 | 
20 | To be added: multinomial model generator, LLV (log-likelihood value) classifier. To be maintained by Dan.
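
A minimal usage sketch of how the modules are meant to fit together, assuming a filing has already been scraped to a local path. The path and the print handling are hypothetical; parse_quarterly_filing, build_word_count, and str2date are defined in parser.py, and CIK_to_ticker and get_close in stock.py (which expects validated_CIK.dat and stocks_dat/ to exist already):

    #!/usr/bin/env python
    # Sketch only: 'Data/Processed/example_filing.txt' is a placeholder path.
    import parser, stock

    header, filers, text = parser.parse_quarterly_filing('Data/Processed/example_filing.txt')
    word_count, num_words = parser.build_word_count(text)
    filed = parser.str2date(header['FilingDate'])   # 'FILED AS OF DATE' string -> datetime.date

    for cik in filers:
        ticker = stock.CIK_to_ticker(cik)           # None if we have no price data for this CIK
        if ticker is None:
            continue
        # get_close takes a (year, month, day) tuple and returns a [(date, close)] list
        print ticker, stock.get_close(ticker, (filed.year, filed.month, filed.day))
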
-------------------------------------------------------------------------------- /Utilities/reset_manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, shutil 4 | 5 | def clean_all(): 6 | DataDir = os.path.expanduser('~/Documents/Code/RoboBuffett/Data/') 7 | temp = DataDir + 'Temp/' 8 | ensure(temp) 9 | ensure(temp + 'Pickles/') 10 | to_delete = ('Preprocessed','Active','Inactive','Processed','Exceptions','Pickles/CIKs', 'Pickles/manager.dat') 11 | for item in to_delete: 12 | try: 13 | os.rename(DataDir + item, temp + item) 14 | except OSError as e: 15 | print str(e) +': ' + item 16 | print "Renamed, removing temp dir" 17 | shutil.rmtree(temp) 18 | 19 | def ensure(dir): 20 | if not os.path.exists(dir): 21 | os.makedirs(dir) 22 | 23 | if __name__ == "__main__": 24 | clean_all() -------------------------------------------------------------------------------- /data_reporter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import os.path 4 | from os.path import isdir 5 | import random 6 | datadir = '/Volumes/Conduit/RBData' 7 | years = map(str,range(1999,2012)) 8 | qtrs = ['QTR1','QTR2','QTR3','QTR4'] 9 | 10 | 11 | bad_years = [] 12 | bad_quarters = [] 13 | 14 | def djoin(dir1, dir2=""): 15 | return datadir + '/' + dir1 + '/' + dir2 16 | 17 | def missing_linebreaks(filee): 18 | if os.path.getsize(filee) > 0 and os.path.isfile(filee): 19 | with open(filee, 'r') as f: 20 | i = 0 21 | for line in f: 22 | i += 1 23 | if i > 2: 24 | return False 25 | print filee 26 | return True 27 | return False 28 | 29 | for year in years: 30 | if not isdir(djoin(year)): 31 | print "!Year {} not found".format(year) 32 | bad_years.append(year) 33 | continue 34 | print "----" 35 | for qtr in qtrs: 36 | dirr = djoin(year,qtr) 37 | if not isdir(dirr): 38 | print "!Quarter {} {} not found".format(qtr,year) 39 | bad_quarters.append((qtr,year)) 40 | continue 41 | 42 | os.chdir(dirr) 43 | files = len(os.listdir('.')) 44 | size = float(sum([os.path.getsize(f) for f in os.listdir('.') if os.path.isfile(f)])) / (1024**3) 45 | if files > 0: 46 | avg = size / files * (1024**2) 47 | else: 48 | avg = 0 49 | 50 | rsample = random.sample(os.listdir('.'),min(20,files)) 51 | missing_lb = any(map(missing_linebreaks, rsample)) 52 | 53 | print "Quarter {} {}: {:6d} files, {:1.2f}GB size, {:4.0f}kB avg size".format(qtr, year, files, size, avg) 54 | if missing_lb: 55 | print "Quarter {} {}: Missing linebreaks!".format(qtr,year) 56 | print zip(rsample,map(missing_linebreaks, rsample)) 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /cik_reader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from collections import defaultdict 4 | 5 | 6 | def main(): 7 | 8 | 9 | lst = open('stocks_CIK.txt', 'r').read().split('\n')[:-1] 10 | 11 | ''' Get rid of repeat CIK's : Find out a way to deal with them later''' 12 | remove_repeats = [] 13 | for s in lst: 14 | remove_repeats += [k for k in lst if s.split('\t')[3]==s.split('\t')[3]] 15 | for r in remove_repeats: 16 | try: lst.remove(r) 17 | except: pass 18 | 19 | ''' Properly parse stocks_CIK ''' 20 | 21 | collector = defaultdict(list) 22 | for s in lst: 23 | l = s.split('\t') 24 | ticker = l[0] # obtain ticker symbol 25 | name = l[1] # obtain company name 26 | flag = int(l[2]) # CIK flag 27 | cik = l[3] 28 | 
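        # Flag semantics, per the comment in stocks_downloader.cikgetter: -1 means the CIK
        # lookup failed, 100 means the CIK was found by the traditional method, and
        # 0 <= n <= 99 means n words were subtracted before a match was found.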
if flag == -1: 29 | pass 30 | elif flag == 100: 31 | collector[(flag,1)].append((ticker,cik,name)) 32 | elif (flag < 100) | (flag > -1): 33 | # If only one CIK 34 | if len(cik) == 10: 35 | collector[(flag,1)].append((ticker,cik,name)) 36 | else: 37 | cik_eval = eval(cik) 38 | collector[(flag,len(cik_eval))].append((ticker,cik,name)) 39 | else: 40 | print 'Encountered unexpected line, quit' 41 | exit() 42 | 43 | # Write good ticker, CIK pairs in here 44 | #writer = open('good_CIK.txt', 'w') 45 | #writer.write(ticker+'\t'+cik+'\n') # write pair to file 46 | #writer.close() 47 | 48 | d = dict(collector) 49 | 50 | k = d.keys() 51 | for i in k: 52 | if i[1] == 1: 53 | print i, len(collector[i]) 54 | 55 | if i[0] == 100: 56 | print i, len(collector[i]) 57 | 58 | if i[1] == 2: 59 | print i, len(collector[i]) 60 | 61 | if i[1] == 3: 62 | print i, len(collector[i]) 63 | 64 | with open('validated_CIK.txt','wb') as f: 65 | for k,v in d.iteritems(): 66 | if (k[0] == 100) | (k[1] == 1): 67 | for i in v: 68 | f.write(i[0]+'\t'+i[1]+'\t'+i[2]+'\n') 69 | 70 | ones_list = [] 71 | for k,v in d.iteritems(): 72 | if (k[0] == 2) & (k[1] == 1): 73 | for i in v: 74 | ones_list.append([i[0],i[1],i[2]]) 75 | 76 | matchcounter = 0 77 | for i in [i[1] for i in ones_list]: 78 | matchups = [(k[0],k[2]) for k in ones_list if k[1] == i] 79 | if len(matchups) > 1: 80 | print matchups 81 | matchcounter += 1 82 | 83 | print matchcounter 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /yahoo_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | 4 | from BeautifulSoup import BeautifulSoup as bs 5 | from collections import defaultdict 6 | from multiprocessing import Pool 7 | import urllib2 8 | try: import cPickle as pickle 9 | except: import pickle 10 | 11 | def main(): 12 | download_list = 0 13 | validate_list = 0 14 | compile_list = 0 15 | 16 | 17 | ''' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''' 18 | fulllistdat = 'raw_stocks_list.dat' 19 | if download_list: 20 | maximum = 23686; cnt = 1; fcnt = 1; collector = defaultdict(list); 21 | while cnt < maximum: 22 | soup = bs(urllib2.urlopen('http://screener.finance.yahoo.com/b?pr=0/&s=tk&vw=1&db=stocks&b=' + str(cnt))) 23 | table = soup.findAll("table")[1].contents[1].contents[1].contents[1] 24 | for n in range(21)[1:]: 25 | try: 26 | ticker = str(table.contents[n].find('a').string).replace(';','') 27 | name = str(table.contents[n].findAll('font')[1].string).replace('&','&') 28 | collector[ticker] = name 29 | print fcnt,'of',maximum,'\t',ticker,'\t',name 30 | fcnt += 1 31 | except: 32 | saver(collector, fulllistdat) 33 | cnt += 20 34 | saver(collector, fulllistdat) 35 | 36 | 37 | 38 | 39 | ''' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''' 40 | # Load pickled stocks data 41 | with open(fulllistdat) as f: 42 | d = pickle.load(f) 43 | 44 | if validate_list: 45 | # Clear file in which we record validation tickers 46 | open('record_stock_name_validation.txt', 'w').close() 47 | 48 | pool = Pool(processes=16) 49 | pool.map(validator, d.keys()) 50 | 51 | 52 | 53 | ''' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''' 54 | if compile_list: 55 | with open('raw_name_validation.txt', 'r') as f: 56 | rcrd = f.read().split('\n')[:-1] 57 | collector = defaultdict(list) 58 | for i in rcrd: 59 | collector[i.split('\t')[0]] = i.split('\t')[1] 60 | notdone = 
list(set(d.keys()).difference(set(collector.keys()))) 61 | 62 | pool = Pool(processes=8) 63 | pool.map(validator, notdone) 64 | 65 | final_list = defaultdict(list) 66 | p = 0; n = 0; f = 0; r = 0; 67 | for k,v in collector.iteritems(): 68 | if v == '': 69 | n += 1 70 | elif v == 'FAIL': 71 | f += 1 72 | elif v == 'PASS': 73 | final_list[k] = d[k] 74 | p += 1 75 | else: 76 | final_list[v] = d[k] 77 | r += 1 78 | 79 | with open('stocks_list.dat','wb') as fn: 80 | pickle.dump(dict(final_list),fn) 81 | 82 | print 'Total list:',len(final_list) 83 | print 'Nothing:',n,'| Fail:',f,'| Pass:',p,'| Replace:',r 84 | 85 | 86 | def validator(ticker): 87 | soup = bs(urllib2.urlopen('http://finance.yahoo.com/q?s=' + ticker.replace('&','%26') )) 88 | 89 | outcome = '' 90 | 91 | try: 92 | if ( str(soup.find('h3').contents[0]) == 'Changed Ticker Symbol' ): 93 | outcome = str(soup.findAll('p')[1].contents[1].contents[0]) 94 | except: pass 95 | 96 | try: 97 | if ( str(soup.findAll('h2')[2].contents[0]) == 'There are no All Markets results for' ): 98 | outcome = 'FAIL' 99 | except: pass 100 | 101 | try: 102 | tname = str(soup.findAll('h2')[3].contents[0]).split('(')[-1][:-1] 103 | # tname = fname[fname.find("(")+1:fname.find(")")] 104 | if ticker in tname: 105 | if ticker == tname: 106 | outcome = 'PASS' 107 | else: 108 | outcome = tname 109 | except: pass 110 | 111 | with open('raw_name_validation.txt','a') as f: 112 | f.write(ticker+'\t'+outcome+'\n') 113 | 114 | print ticker,'\t',outcome 115 | 116 | def saver(collector, fname): 117 | with open(fname,'wb') as f: 118 | pickle.dump(dict(collector),f) 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /stock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, collections, datetime 4 | try: import cPickle as pickle 5 | except: import pickle 6 | 7 | """Handles using stock prices that have already been scraped""" 8 | #CIKs are *string* values. Considered storing as ints but CIKs are 10-digit, could potentially overflow 32-bit ints. Since we never do arithmetic on them, and only use them is index keys, better to use strings. 9 | 10 | #Tickers will be (Exchange, Ticker) tuples, e.g. ('NYSE', 'GS'). 11 | 12 | #Prices will be integer values with 100 = $1 conversion. Thus Apple's price of $558.22 becomes 55822. 13 | 14 | # 'data' outputed is in form (open, high, low, close, volume) 15 | 16 | def ticker_to_CIK(ticker): 17 | ''' Return CIK if valid ticker, return None otherwise ''' 18 | with open('validated_CIK.dat') as f: 19 | d = pickle.load(f) 20 | try: return [i[1] for i in d if i[0] == ticker][0] 21 | except IndexError: return None 22 | 23 | def CIK_to_ticker(CIK): 24 | ''' Return ticker if valid CIK, return None otherwise ''' 25 | with open('validated_CIK.dat') as f: 26 | d = pickle.load(f) 27 | try: return [i[0] for i in d if i[1] == CIK][0] 28 | except IndexError: return None 29 | 30 | def good_CIKs(): 31 | ''' Return list of all CIKs for which we have trading info on. ''' 32 | with open('validated_CIK.dat') as f: 33 | d = pickle.load(f) 34 | return [i[1] for i in d] 35 | 36 | def good_tickers(): 37 | ''' Return list of all tickers for which we have trading info on. 
''' 38 | with open('validated_CIK.dat') as f: 39 | d = pickle.load(f) 40 | return [i[0] for i in d] 41 | 42 | def get_open(ticker, dates): 43 | ''' Return a price on that date, or the next available day ''' 44 | ''' Returns list of (date, price) tuples ''' 45 | if isinstance(dates,list): 46 | return [(i[0],i[1][0]) for i in get_data(ticker,dates)] 47 | elif isinstance(dates,tuple): 48 | return [(i[0],i[1][0]) for i in [get_datum(ticker,dates)]] 49 | else: 50 | return (None,None) 51 | 52 | def get_close(ticker, dates): 53 | ''' Return a price on that date, or the next available day ''' 54 | ''' Returns list of (date, price) tuples ''' 55 | if isinstance(dates,list): 56 | return [(i[0],i[1][3]) for i in get_data(ticker,dates)] 57 | elif isinstance(dates,tuple): 58 | return [(i[0],i[1][3]) for i in [get_datum(ticker,dates)]] 59 | else: 60 | return (None,None) 61 | 62 | def get_volume(ticker, dates): 63 | ''' Return volume on that date, or the next available day ''' 64 | ''' Returns list of (date, volume) tuples ''' 65 | if isinstance(dates,list): 66 | return [(i[0],i[1][4]) for i in get_data(ticker,dates)] 67 | elif isinstance(dates,tuple): 68 | return [(i[0],i[1][4]) for i in [get_datum(ticker,dates)]] 69 | else: 70 | return (None,None) 71 | 72 | def get_data(ticker, dates): 73 | out_list = [] 74 | for d in dates: 75 | out_list.append(get_datum(ticker,d)) 76 | return out_list 77 | 78 | def get_datum(ticker, date): 79 | ''' Return a price on that date, or the next available day ''' 80 | ''' Returns (date, price) tuple ''' 81 | # Check if file exists 82 | if os.path.isfile('stocks_dat/'+ticker+'.dat'): 83 | with open('stocks_dat/'+ticker+'.dat','r') as f: 84 | d = pickle.load(f) 85 | try: return (date, d[date]) 86 | # (exception) If date entry does not exist, try next day 87 | except KeyError: return get_datum(ticker, get_nextday(date)) 88 | # (any other exception) return None 89 | else: return (None, None) 90 | else: 91 | return (None, None) 92 | 93 | def get_nextday(date): 94 | n = datetime.date(date[0],date[1],date[2]) + datetime.timedelta(days=1) 95 | return (n.year, n.month, n.day) 96 | 97 | def get_marketcap(ticker, date): 98 | # Return market cap on closest defined day, raise an exception if not defined at all in a 12 month span 99 | ''' WORK ON THIS ''' 100 | return None 101 | -------------------------------------------------------------------------------- /parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import string, re 3 | from datetime import date 4 | import sys # For test parsing functionality 5 | from pdb import set_trace as debug 6 | 7 | class ParseError(BaseException): 8 | pass 9 | 10 | 11 | 12 | 13 | def parse_quarterly_filing(filepath): 14 | with open(filepath, 'r') as doc: 15 | rawtext = doc.read() 16 | # First partition to separate all of the header data (header + filers) 17 | filer_ptext = '\nFILER:\n' # Break filer sections at this text 18 | partitioned = rawtext.partition('') 19 | if partitioned[1] != '': 20 | partitioned = partitioned[0].partition('') 21 | if partitioned[1] != '': 22 | raise ParseError('Unable to partition header from body') 23 | 24 | header_text = partitioned[0] 25 | document_text = partitioned[2] # Text of the document 26 | 27 | header_text = header_text.partition(filer_ptext) 28 | if header_text[1] != filer_ptext: 29 | raise ParseError('Unable to partition on %s' % filer_ptext) 30 | filer_text = header_text[2].partition(filer_ptext) 31 | header_text = header_text[0] # Just the 
document header - filing date etc 32 | 33 | filers_textlist = [] # Also important 34 | 35 | while filer_text[1] == filer_ptext: 36 | filers_textlist.append(filer_text[0]) 37 | filer_text = filer_text[2].partition(filer_ptext) 38 | filers_textlist.append(filer_text[0]) 39 | 40 | #Is there a more efficient place to define these constants? 41 | header_info = ( 42 | ('DocType', 'CONFORMED SUBMISSION TYPE:' ), 43 | ('ReportingPeriod', 'CONFORMED PERIOD OF REPORT:'), 44 | ('FilingDate', 'FILED AS OF DATE:' )) 45 | 46 | filer_info = ( 47 | ('CompanyName', 'COMPANY CONFORMED NAME:' ), 48 | ('CIK', 'CENTRAL INDEX KEY:' ), 49 | ('SIC', 'STANDARD INDUSTRIAL CLASSIFICATION:')) 50 | 51 | try: 52 | header_dict = parse_fields(header_text, header_info) 53 | #header_dict['FilingDate'] = str2date(header_dict['FilingDate']) 54 | except ParseError: # Re-raise with a name 55 | raise ParseError('Unable to parse header') 56 | 57 | 58 | cik2filer = {} 59 | for filer in filers_textlist: 60 | try: 61 | filerdict = parse_fields(filer, filer_info) 62 | CIK = filerdict['CIK'] 63 | filerdict['SIC'] = force_to_int(filerdict['SIC']) 64 | cik2filer[CIK] = filerdict 65 | except ParseError: 66 | pass 67 | 68 | 69 | if cik2filer == {}: 70 | raise ParseError('No valid filers') 71 | 72 | 73 | #word_count = build_word_count(document_text) 74 | 75 | return (header_dict, cik2filer, document_text) 76 | 77 | 78 | def build_word_count(text): 79 | to_remove = string.punctuation + string.digits 80 | text = re.sub('<[^>]*>', '', text) # Remove all 81 | text = text.translate(None, (to_remove)) 82 | # Removes all punctuation and digits 83 | text = text.lower().split() 84 | # Splits the text into a list of lowercase words 85 | # Possible improvements: Strip tables 86 | num_words = len(text) 87 | word_count = {} 88 | for word in text: 89 | try: 90 | word_count[word] += 1 91 | except KeyError: 92 | word_count[word] = 1 93 | # This try/except method may be somewhat more efficient than if-then branching for unigram processing. For n-grams, perhaps better to use if-then. 94 | return (word_count, num_words) 95 | 96 | def parse_fields(text, property_info): 97 | # Defines the properties to seek in the text of the filing, and names to assign them to in the self.properties dictionary. I hope Python doesn't waste time re-creating this tuple every time parse_quarterly_filing is called. 
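    # parse_fields scans each line of `text` for the given identifier prefixes and
    # returns a {name: content} dict; it raises ParseError if a matched field is
    # empty or if any requested field is missing.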
98 | properties = {} 99 | text = text.split('\n') 100 | for line in text: 101 | line = line.strip() 102 | for (name, identifier) in property_info: 103 | if line.startswith(identifier): 104 | content = line.partition(identifier)[2].strip() 105 | if content == '': 106 | raise ParseError('Empty field') 107 | properties[name] = content 108 | break # Move on to the next line once we find a field 109 | 110 | if len(properties) != len(property_info): 111 | raise ParseError('Unable to find all fields') 112 | else: 113 | return properties 114 | 115 | def force_to_int(val): 116 | try: 117 | converted = int(val) 118 | except ValueError: 119 | to_remove = string.punctuation + string.letters + string.whitespace 120 | forced_val = val.translate(None, (to_remove)) 121 | if forced_val == '': 122 | raise ParseError('Unable to convert SIC to #: %s' % val) 123 | converted = int(forced_val) 124 | return converted 125 | 126 | def str2date(datestr): 127 | year = int(datestr[0:4]) 128 | month = int(datestr[4:6]) 129 | day = int(datestr[6:8]) 130 | return date(year, month, day) 131 | 132 | def test_parse(document): 133 | (header, filers, rawtext) = parse_quarterly_filing(document) 134 | pretty_dict(header, "header") 135 | [pretty_dict(x, "filer") for x in filers] 136 | wc = build_word_count(rawtext) 137 | pretty_dict(wc, "words") 138 | 139 | def main(): 140 | argv = sys.argv 141 | if len(argv) == 1: 142 | print "Give a document and I'll test parse it" 143 | exit(0) 144 | fpath = argv[1] 145 | test_parse(fpath) 146 | 147 | def pretty_dict(output, name): 148 | print name + ":" 149 | for key, val in output.iteritems(): 150 | print "\t" + str(key) + ": " + str(val) 151 | 152 | if __name__ == "__main__": 153 | main() 154 | 155 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | from ftplib import FTP 2 | import os 3 | import sys 4 | import zipfile 5 | import re 6 | import argparse 7 | import threading 8 | import Queue 9 | 10 | def connect_to_SEC(index): 11 | if index > 50: 12 | print "Maximum number of attempts exceeded. Try again later." 13 | else: 14 | try: 15 | return FTP('ftp.sec.gov') 16 | except EOFError: 17 | print "Connection refused on attempt {0}. 
Trying again...".format(index) 18 | return connect_to_SEC(index + 1) 19 | 20 | def download_file(serverpath, local_path): 21 | global ftp 22 | with open (local_path, 'w') as out_file: 23 | command = 'RETR ' + serverpath.strip() 24 | ftp.retrbinary(command, out_file.write) 25 | 26 | def ensure(dir): 27 | if not os.path.exists(dir): 28 | os.makedirs(dir) 29 | 30 | def extract_and_remove(zip_path, out_dir): 31 | with zipfile.ZipFile(zip_path, 'r') as outzip: 32 | outzip.extractall(out_dir) 33 | os.remove(zip_path) 34 | 35 | def download_index_files(out_dir): 36 | years = ['1993', '1994', '1995', '1996', 37 | '1997', '1998', '1999', '2000', 38 | '2001', '2002', '2003', '2004', 39 | '2005', '2006', '2007', '2008', 40 | '2009', '2010', '2011', '2012'] 41 | 42 | quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4'] 43 | 44 | # Get the current working directory so that we can change it 45 | # back when we're done 46 | old_cwd = os.getcwd() 47 | ensure(out_dir) 48 | os.chdir(out_dir) 49 | 50 | for year in years: 51 | for quarter in quarters: 52 | subdir = year + '/' + quarter 53 | ensure(subdir) 54 | path = subdir + '/form.zip' 55 | download_file(path, path) 56 | extract_and_remove(path, subdir) 57 | 58 | os.chdir(old_cwd) 59 | 60 | 61 | def split_list(xs, y, eq_func=lambda a, b: a == b): 62 | for i, x in enumerate(xs): 63 | if eq_func(x, y): 64 | return [xs[:i], xs[i + 1:]] 65 | else: 66 | return [xs] 67 | 68 | def paths_for_10ks(index_file): 69 | paths = [] 70 | lines = index_file.read().splitlines() 71 | lines = split_list(lines, '-+$', lambda a, b: re.match(b, a))[1] 72 | for line in lines: 73 | if line[:4] == '10-K' or line[:4] == '10-Q': 74 | fields = re.split('\s\s+', line) 75 | company, date, server_path = (fields[1], fields[3], fields[4]) 76 | paths.append((server_path, '{0}_{1}_{2}'.format(company.replace('/', '-'), date, fields[0].replace('/','-')))) 77 | return paths 78 | 79 | def download_forms_serially(paths): 80 | global ftp 81 | for server_path, local_path in paths: 82 | try: 83 | with open(local_path, 'w') as out_file: 84 | ftp.retrlines('RETR ' + server_path, out_file.write) 85 | print "Saved: {0}".format(local_path) 86 | except Exception as e: 87 | print e 88 | print 'Download failed on file at: {0}'.format(server_path) 89 | 90 | def download_10ks(data_directory): 91 | for root, dirs, files in os.walk(data_directory): 92 | for name in files: 93 | path = os.path.join(root, name) 94 | if path.split('.')[-1] != 'idx': 95 | continue 96 | with open(path, 'r') as index_file: 97 | form_paths = [(s, os.path.join(root, l)) for s,l in paths_for_10ks(index_file)] 98 | download_forms_serially(form_paths) 99 | 100 | # A class to facilitate multithreaded downloading of data over FTP 101 | class FTPThread(threading.Thread): 102 | """A class to download data over FTP in parallel threads""" 103 | def __init__(self, server_path, local_path): 104 | self.server_path = server_path 105 | self.local_path = local_path 106 | threading.Thread.__init__(self) 107 | def run(self): 108 | global ftp 109 | try: 110 | with open(self.local_path, 'w') as out_file: 111 | ftp.retrlines('RETR ' + self.server_path, out_file.write) 112 | print "Saved: {0}".format(self.local_path) 113 | except Exception as e: 114 | print e 115 | print 'Download failed on file at: {0}'.format(self.server_path) 116 | 117 | 118 | def download_forms(paths, max_threads): 119 | finished = [] 120 | def producer(q, paths): 121 | for server_path, local_path in paths: 122 | thread = FTPThread(server_path, local_path) 123 | thread.start() 124 | 
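            # q is bounded at max_threads, so this blocking put() throttles how many
            # FTPThread downloads can be in flight before the consumer join()s one.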
q.put(thread, True) 125 | 126 | def consumer(q, total_files): 127 | while len(finished) < total_files: 128 | thread = q.get(True) 129 | thread.join() 130 | finished.append(thread) 131 | 132 | q = Queue.Queue(max_threads) 133 | 134 | prod_thread = threading.Thread(target=producer, args=(q, paths)) 135 | cons_thread = threading.Thread(target=consumer, args=(q, len(paths))) 136 | prod_thread.start() 137 | cons_thread.start() 138 | prod_thread.join() 139 | cons_thread.join() 140 | 141 | if __name__ == '__main__': 142 | parser = argparse.ArgumentParser(description='Download either index files (i) or form files (f) to a given directory.') 143 | parser.add_argument('mode', type=str, choices=['i', 'f']) 144 | parser.add_argument('directory', type=str) 145 | 146 | args = parser.parse_args() 147 | 148 | ftp = connect_to_SEC(0) 149 | ftp.login() 150 | 151 | if args.mode == 'i': 152 | index_path = '/edgar/full-index' 153 | ftp.cwd(index_path) 154 | download_index_files(args.directory) 155 | else: 156 | download_10ks(args.directory) 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /rb_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import string, re 3 | from datetime import date 4 | import sys # For test parsing functionality 5 | from pdb import set_trace as debug 6 | 7 | class ParseError(BaseException): 8 | pass 9 | 10 | 11 | 12 | 13 | def parse_quarterly_filing(filepath): 14 | """Parse a 10-K or 10-Q. 15 | Returns (header_dict, cik2filer, document_text). 16 | header_dict = """ 17 | with open(filepath, 'r') as doc: 18 | rawtext = doc.read() 19 | # First partition to separate all of the header data (header + filers) 20 | filer_ptext = '\nFILER:\n' # Break filer sections at this text 21 | partitioned = rawtext.partition('') 22 | if partitioned[1] != '': 23 | partitioned = partitioned[0].partition('') 24 | if partitioned[1] != '': 25 | raise ParseError('Unable to partition header from body') 26 | 27 | header_text = partitioned[0] 28 | document_text = partitioned[2] # Text of the document 29 | 30 | header_text = header_text.partition(filer_ptext) 31 | if header_text[1] != filer_ptext: 32 | raise ParseError('Unable to partition on %s' % filer_ptext) 33 | filer_text = header_text[2].partition(filer_ptext) 34 | header_text = header_text[0] # Just the document header - filing date etc 35 | 36 | filers_textlist = [] # Also important 37 | 38 | while filer_text[1] == filer_ptext: 39 | filers_textlist.append(filer_text[0]) 40 | filer_text = filer_text[2].partition(filer_ptext) 41 | filers_textlist.append(filer_text[0]) 42 | 43 | #Is there a more efficient place to define these constants? 44 | # Defines the properties to seek in the text of the filing, and names to assign them to in the self.properties dictionary. I hope Python doesn't waste time re-creating this tuple every time parse_quarterly_filing is called. 
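    # Note: header_info and filer_info could be hoisted to module level to avoid
    # rebuilding the tuples on each call, though the cost of two small tuple
    # literals per filing is negligible in practice.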
45 | 46 | header_info = ( 47 | ('DocType', 'CONFORMED SUBMISSION TYPE:' ), 48 | ('ReportingPeriod', 'CONFORMED PERIOD OF REPORT:'), 49 | ('FilingDate', 'FILED AS OF DATE:' )) 50 | 51 | filer_info = ( 52 | ('CompanyName', 'COMPANY CONFORMED NAME:' ), 53 | ('CIK', 'CENTRAL INDEX KEY:' ), 54 | ('SIC', 'STANDARD INDUSTRIAL CLASSIFICATION:')) 55 | 56 | try: 57 | header_dict = parse_fields(header_text, header_info) 58 | #header_dict['FilingDate'] = str2date(header_dict['FilingDate']) 59 | except ParseError: # Re-raise with a name 60 | raise ParseError('Unable to parse header') 61 | 62 | 63 | cik2filer = {} 64 | for filer in filers_textlist: 65 | try: 66 | filerdict = parse_fields(filer, filer_info) 67 | CIK = filerdict['CIK'] 68 | filerdict['SIC'] = force_to_int(filerdict['SIC']) 69 | cik2filer[CIK] = filerdict 70 | except ParseError: 71 | pass 72 | 73 | 74 | if cik2filer == {}: 75 | raise ParseError('No valid filers') 76 | 77 | 78 | #word_count = build_word_count(document_text) 79 | 80 | return (header_dict, cik2filer, document_text) 81 | 82 | 83 | def build_word_count(text): 84 | to_remove = string.punctuation + string.digits 85 | text = re.sub('<[^>]*>', '', text) # Remove all 86 | text = text.translate(None, (to_remove)) 87 | # Removes all punctuation and digits 88 | text = text.lower().split() 89 | # Splits the text into a list of lowercase words 90 | # Possible improvements: Strip tables 91 | num_words = len(text) 92 | word_count = {} 93 | for word in text: 94 | try: 95 | word_count[word] += 1 96 | except KeyError: 97 | word_count[word] = 1 98 | # This try/except method may be somewhat more efficient than 99 | # if-then branching for unigram processing. For n-grams, 100 | # perhaps better to use if-then. 101 | return (word_count, num_words) 102 | 103 | def parse_fields(text, property_info): 104 | """Parses a text, looking for specific field information 105 | Takes raw text, and a list of (name, identifier) tuples. 
106 | Returns a dictionary mapping names to the content of the line that started with 'identifier'.""" 107 | properties = {} 108 | text = text.split('\n') 109 | for line in text: 110 | line = line.strip() 111 | for (name, identifier) in property_info: 112 | if line.startswith(identifier): 113 | content = line.partition(identifier)[2].strip() 114 | # Content = everything that followed the identifier 115 | if content == '': 116 | raise ParseError('Empty field') 117 | properties[name] = content 118 | break # Move on to the next line once we find a field 119 | 120 | if len(properties) != len(property_info): 121 | raise ParseError('Unable to find all fields') 122 | else: 123 | return properties 124 | 125 | def force_to_int(val): 126 | try: 127 | converted = int(val) 128 | except ValueError: 129 | to_remove = string.punctuation + string.letters + string.whitespace 130 | forced_val = val.translate(None, (to_remove)) 131 | if forced_val == '': 132 | raise ParseError('Unable to convert SIC to #: %s' % val) 133 | converted = int(forced_val) 134 | return converted 135 | 136 | def str2date(datestr): 137 | year = int(datestr[0:4]) 138 | month = int(datestr[4:6]) 139 | day = int(datestr[6:8]) 140 | return date(year, month, day) 141 | 142 | def test_parse(document): 143 | (header, filers, rawtext) = parse_quarterly_filing(document) 144 | pretty_dict(header, "header") 145 | [pretty_dict(x, "filer") for x in filers] 146 | wc = build_word_count(rawtext) 147 | pretty_dict(wc, "words") 148 | 149 | def main(): 150 | argv = sys.argv 151 | if len(argv) == 1: 152 | print "Give a document and I'll test parse it" 153 | exit(0) 154 | fpath = argv[1] 155 | test_parse(fpath) 156 | 157 | def pretty_dict(output, name): 158 | print name + ":" 159 | for key, val in output.iteritems(): 160 | print "\t" + str(key) + ": " + str(val) 161 | 162 | if __name__ == "__main__": 163 | main() 164 | 165 | -------------------------------------------------------------------------------- /stock_price.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | 4 | from datetime import datetime 5 | from string import lower, upper 6 | import os, csv 7 | 8 | def main(): 9 | # Input stock 10 | ticker = lower('AAPL') 11 | 12 | # If output directory doesn't exist, make it 13 | output_dir = 'stock_data' 14 | if not os.path.exists(output_dir): 15 | os.makedirs(output_dir) 16 | 17 | # If stock CSV isn't downloaded, DOWNLOAD it 18 | if os.path.isfile(output_dir+"/"+ticker+".csv"): 19 | print upper(ticker) + " data already exists in this directory.\n" 20 | else: 21 | download_csv(ticker,output_dir) 22 | 23 | # Run this code to download all S&P 500 files 24 | """ 25 | for s in snp_list(): 26 | download_csv(s,output_dir) 27 | """ 28 | 29 | # Now that the CSV data is downloaded, 30 | # arrange it into a tuple 31 | data = vectorize_csv(ticker,output_dir) 32 | 33 | # # # # TEST EXAMPLES # # # # 34 | 35 | # Test to see if the tuples have been created 36 | print "Here are the first few lins of the data tuple output:" 37 | print "Format: (DATE, OPEN, HIGH, LOW, CLOSE, VOLUME)" 38 | print data[1:5] 39 | 40 | # Retrieve price for specific date 41 | sample_date = (2012,1,18); 42 | print "\nFind price on:", sample_date, "\n", get_prices(data,sample_date) 43 | sample_date = (2012,1,21); 44 | print "\nFind price on:", sample_date, "\n", get_prices(data,sample_date) 45 | 46 | 47 | # Parses the CSV file and returns a tuple with data 48 | # as a tuple of: 49 | # (DATE, OPEN, HIGH, LOW, CLOSE, VOLUME) 50 | 
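# (Each tuple also carries a 7th element, the adjusted close from row[6], which the
# commented-out adjustment block inside vectorize_csv refers to as adj_close.)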
def vectorize_csv(ticker,output_dir): 51 | fulldata = csv.reader(open(output_dir+'/'+ticker+'.csv', 'rb')) 52 | output = [] 53 | 54 | # Throw away header 55 | fulldata.next() 56 | 57 | # Store rest of data in list of tuples 58 | for row in fulldata: 59 | s = row[0].split('-') 60 | tup = ((int(s[0]), int(s[1]), int(s[2])), float(row[1]), \ 61 | float(row[2]), float(row[3]), float(row[4]), int(row[5]), \ 62 | float(row[6])) 63 | output.append(tup) 64 | 65 | # Adjust for dividends, splits, etc. 66 | """ 67 | DATEtemp{ptr,1} = DATEvar; 68 | OPENtemp(ptr,1) = OPENvar * adj_close / CLOSEvar; 69 | HIGHtemp(ptr,1) = HIGHvar * adj_close / CLOSEvar; 70 | LOWtemp (ptr,1) = LOWvar * adj_close / CLOSEvar; 71 | CLOSEtemp(ptr,1)= CLOSEvar * adj_close / CLOSEvar; 72 | VOLtemp(ptr,1) = VOLvar; 73 | """ 74 | 75 | # Reverse to normal chronological order, so 1st entry is oldest data 76 | output.reverse() 77 | return output 78 | 79 | 80 | # Downloads CSV file and stores it in the 81 | # respective directory 82 | def download_csv(ticker,output_dir): 83 | # Build URL string 84 | start_year = '1993' 85 | #Don't need price data older than our filings 86 | now = datetime.now() 87 | 88 | url_string = 'http://ichart.finance.yahoo.com/table.csv?' 89 | url_string += '&s=' + (ticker) 90 | url_string += '&d=' + str(now.month-1) 91 | url_string += '&e=' + str(now.day) 92 | url_string += '&f=' + str(now.year) 93 | url_string += '&g=d&a=0&b=1&c=' + start_year 94 | url_string += '&ignore.csv' 95 | 96 | # Download file using system call 97 | os.system("wget \'" + url_string + "\' -O \'" + output_dir + "/" + (ticker) + ".csv\'") 98 | 99 | print "Finished downloading " + upper(ticker) + "\n" 100 | 101 | def get_prices(data,date): 102 | try: 103 | return (i for i in data if i[0] == date).next() 104 | except StopIteration: 105 | print "No data for the date", date 106 | return None 107 | 108 | 109 | # Simply gives the list of stocks on S&P 500 110 | # Kept this function at the bottom due to size 111 | def snp_list(): 112 | return ['MMM', 'ACE', 'ABT', 'ANF', 'ACN', 'ADBE', 'AMD', 'AES', 'AET', 'AFL', 'A', 'GAS', 'APD', 'ARG', 'AKAM', 'AA', 'ATI', 'AGN', 'ALL', 'ALTR', 'MO', 'AMZN', 'AEE', 'AEP', 'AXP', 'AIG', 'AMT', 'AMP', 'ABC', 'AMGN', 'APH', 'APC', 'ADI', 'AON', 'APA', 'AIV', 'APOL', 'AAPL', 'AMAT', 'ADM', 'AIZ', 'T', 'ADSK', 'ADP', 'AN', 'AZO', 'AVB', 'AVY', 'AVP', 'BHI', 'BLL', 'BAC', 'BK', 'BCR', 'BAX', 'BBT', 'BEAM', 'BDX', 'BBBY', 'BMS', 'BRK.B', 'BBY', 'BIG', 'BIIB', 'BLK', 'HRB', 'BMC', 'BA', 'BWA', 'BXP', 'BSX', 'BMY', 'BRCM', 'BF.B', 'CHRW', 'CA', 'CVC', 'COG', 'CAM', 'CPB', 'COF', 'CAH', 'CFN', 'KMX', 'CCL', 'CAT', 'CBG', 'CBS', 'CELG', 'CNP', 'CTL', 'CERN', 'CF', 'SCHW', 'CHK', 'CVX', 'CB', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CTXS', 'CLF', 'CLX', 'CME', 'CMS', 'COH', 'KO', 'CCE', 'CTSH', 'CL', 'CMCSA', 'CMA', 'CSC', 'CAG', 'COP', 'CNX', 'ED', 'STZ', 'CEG', 'GLW', 'COST', 'CVH', 'COV', 'CSX', 'CMI', 'CVS', 'DHI', 'DHR', 'DRI', 'DVA', 'DF', 'DE', 'DELL', 'DNR', 'XRAY', 'DVN', 'DV', 'DO', 'DTV', 'DFS', 'DISCA', 'DLTR', 'D', 'RRD', 'DOV', 'DOW', 'DPS', 'DTE', 'DD', 'DUK', 'DNB', 'ETFC', 'EMN', 'ETN', 'EBAY', 'ECL', 'EIX', 'EW', 'EP', 'EA', 'EMC', 'EMR', 'ETR', 'EOG', 'EQT', 'EFX', 'EQR', 'EL', 'EXC', 'EXPE', 'EXPD', 'ESRX', 'XOM', 'FFIV', 'FDO', 'FAST', 'FII', 'FDX', 'FIS', 'FITB', 'FHN', 'FSLR', 'FE', 'FISV', 'FLIR', 'FLS', 'FLR', 'FMC', 'FTI', 'F', 'FRX', 'BEN', 'FCX', 'FTR', 'GME', 'GCI', 'GPS', 'GD', 'GE', 'GIS', 'GPC', 'GNW', 'GILD', 'GS', 'GR', 'GT', 'GOOG', 'GWW', 'HAL', 'HOG', 'HAR', 'HRS', 'HIG', 'HAS', 'HCP', 
'HCN', 'HNZ', 'HP', 'HES', 'HPQ', 'HD', 'HON', 'HRL', 'HSP', 'HST', 'HCBK', 'HUM', 'HBAN', 'ITW', 'TEG', 'INTC', 'ICE', 'IBM', 'IFF', 'IGT', 'IP', 'IPG', 'INTU', 'ISRG', 'IVZ', 'IRM', 'XYL', 'JBL', 'JEC', 'CBE', 'JDSU', 'JNJ', 'JCI', 'JOY', 'JPM', 'JNPR', 'K', 'KEY', 'KMB', 'KIM', 'KLAC', 'KSS', 'KFT', 'KR', 'LLL', 'LH', 'LM', 'LEG', 'LEN', 'LUK', 'LXK', 'LIFE', 'LLY', 'LTD', 'LNC', 'LLTC', 'LMT', 'L', 'LO', 'LOW', 'LSI', 'MTB', 'M', 'MRO', 'MPC', 'MAR', 'MMC', 'MAS', 'ANR', 'MA', 'MAT', 'MKC', 'MCD', 'MHP', 'MCK', 'MJN', 'MWV', 'MHS', 'MDT', 'MRK', 'MET', 'PCS', 'MCHP', 'MU', 'MSFT', 'MOLX', 'TAP', 'MON', 'MCO', 'MS', 'MOS', 'MMI', 'MSI', 'MUR', 'MYL', 'NBR', 'NDAQ', 'NOV', 'NTAP', 'NFLX', 'NWL', 'NFX', 'NEM', 'NWSA', 'NEE', 'NKE', 'NI', 'NE', 'NBL', 'JWN', 'NSC', 'NTRS', 'NOC', 'NU', 'CMG', 'NVLS', 'NRG', 'NUE', 'NVDA', 'NYX', 'ORLY', 'OXY', 'OMC', 'OKE', 'ORCL', 'OI', 'PCAR', 'IR', 'PLL', 'PH', 'PDCO', 'PAYX', 'BTU', 'JCP', 'PBCT', 'POM', 'PEP', 'PKI', 'PRGO', 'PFE', 'PCG', 'PM', 'PNW', 'PXD', 'PBI', 'PCL', 'PNC', 'RL', 'PPG', 'PPL', 'PX', 'PCP', 'PCLN', 'PFG', 'PG', 'PGN', 'PGR', 'PLD', 'PRU', 'PEG', 'PSA', 'PHM', 'QEP', 'PWR', 'QCOM', 'DGX', 'RRC', 'RTN', 'RHT', 'RF', 'RSG', 'RAI', 'RHI', 'ROK', 'COL', 'ROP', 'ROST', 'RDC', 'R', 'SWY', 'SAI', 'CRM', 'SNDK', 'SLE', 'SCG', 'SLB', 'SNI', 'SEE', 'SHLD', 'SRE', 'SHW', 'SIAL', 'SPG', 'SLM', 'SJM', 'SNA', 'SO', 'LUV', 'SWN', 'SE', 'S', 'STJ', 'SWK', 'SPLS', 'SBUX', 'HOT', 'STT', 'SRCL', 'SYK', 'SUN', 'STI', 'SVU', 'SYMC', 'SYY', 'TROW', 'TGT', 'TEL', 'TE', 'THC', 'TDC', 'TER', 'TSO', 'TXN', 'TXT', 'HSY', 'TRV', 'TMO', 'TIF', 'TWX', 'TWC', 'TIE', 'TJX', 'TMK', 'TSS', 'TRIP', 'TSN', 'TYC', 'USB', 'UNP', 'UNH', 'UPS', 'X', 'UTX', 'UNM', 'URBN', 'VFC', 'VLO', 'VAR', 'VTR', 'VRSN', 'VZ', 'VIAB', 'V', 'VNO', 'VMC', 'WMT', 'WAG', 'DIS', 'WPO', 'WM', 'WAT', 'WPI', 'WLP', 'WFC', 'WDC', 'WU', 'WY', 'WHR', 'WFM', 'WMB', 'WIN', 'WEC', 'WPX', 'WYN', 'WYNN', 'XEL', 'XRX', 'XLNX', 'XL', 'YHOO', 'YUM', 'ZMH', 'ZION'] 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /Utilities/test_compression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''Test compression strategies, counter vs dict, etc''' 3 | import os, sys, logging, string, time, copy, gc 4 | from datetime import date, timedelta 5 | from pdb import set_trace as debug 6 | from collections import Counter 7 | # Note: Using Counter on the test dataset increased the pickled size from 1.3mb to 1.8mb 8 | try: 9 | import cPickle as pickle 10 | except: 11 | import pickle 12 | 13 | # Change this below robobuffet_directory = '/Users/danmane/Documents/Code/Git/RoboBuffett' 14 | 15 | def printandlog(msg): 16 | logging.info(msg) 17 | print msg 18 | 19 | def floatRange(minv, maxv, step): 20 | """Not reliable for heavy-duty use due to floating point oddities""" 21 | x = minv 22 | while x <= maxv+step: 23 | yield x 24 | x += step 25 | 26 | 27 | class DictDoc: 28 | def __init__(self): 29 | self.word_count = {} 30 | self.num_words = 0 31 | 32 | def makeCopy(self, original): 33 | self.num_words = original.num_words 34 | self.word_count = original.word_count.copy() 35 | 36 | def generate(self, path): 37 | with open(path, 'r') as f: 38 | text = f.read() 39 | to_remove = string.punctuation + string.digits 40 | text = text.translate(None, (to_remove)) 41 | # Removes all punctuation and digits 42 | text = text.lower() 43 | text = text.split() 44 | # Splits the text 
into a list of lowercase words 45 | # Possible improvements: Strip tables, formatting (e.g. , - 2 -) 46 | self.num_words = len(text) 47 | for word in text: 48 | try: 49 | self.word_count[word] += 1 50 | except KeyError: 51 | self.word_count[word] = 1 52 | 53 | def delete(self): 54 | del self.word_count 55 | del self.num_words 56 | 57 | class ContDoc: 58 | def __init__(self, path): 59 | with open(path, 'r') as f: 60 | text = f.read() 61 | to_remove = string.punctuation + string.digits 62 | text = text.translate(None, (to_remove)) 63 | # Removes all punctuation and digits 64 | text = text.lower() 65 | text = text.split() 66 | self.num_words = len(text) 67 | self.word_count = Counter(text) 68 | 69 | def main(data_dir): 70 | rb_dir = '/Users/danmane/Documents/Code/Git/RoboBuffett' 71 | os.chdir(rb_dir) 72 | #with open('./Utilities/compression.log', 'w') as cleanlog: 73 | # pass # Empty the log before each run 74 | 75 | logging.basicConfig(filename='./Utilities/compression.log', level=logging.INFO) 76 | files = os.listdir(data_dir) 77 | 78 | #docs = process_file_set(files, 'Utilities/dict.dat', DictDoc, 'Naive Dictionary:', data_dir) 79 | #process_file_set(files, 'Utilities/cont.dat', DictDoc, 'Naive Container:', data_dir) 80 | #print "About to load from pickle" 81 | 82 | print "About to start range" 83 | 84 | # s1 = time.time() 85 | # cheapcopy = dictlist_copy(docs) 86 | # s2 = time.time() 87 | # print "cheap: %f" % (s2-s1) 88 | # docscopy = copy.deepcopy(docs) 89 | # s3 = time.time() 90 | #print "regular: %f" % (s3-s2) 91 | 92 | for t in floatRange(.05, .95, .1): 93 | docs = load_docs_from_file() 94 | print "Finished load for threshold %f" %t 95 | test_compression(docs, t) 96 | for doc in docs: 97 | doc.delete() 98 | del docs 99 | 100 | def load_docs_from_file(): 101 | with open('Utilities/dict.dat', 'r') as f: 102 | docs = pickle.load(f) 103 | return docs 104 | 105 | def dictlist_copy(docs): 106 | outdocs = [] 107 | for doc in docs: 108 | new_doc = DictDoc() 109 | new_doc.makeCopy(doc) 110 | outdocs.append(new_doc) 111 | return outdocs 112 | 113 | 114 | def test_compression(docs, threshold): 115 | print "Starting compression for threshold %f" % threshold 116 | start = time.time() 117 | index_list_and_dict = generate_word_index(docs, threshold) 118 | with open('./Utilities/index.dat', 'w') as f: 119 | pickle.dump(index_list_and_dict, f, 2) 120 | 121 | index_dict = index_list_and_dict[1] 122 | 123 | compress_dict_set(docs, index_dict) 124 | with open('./Utilities/compressed_dict.dat', 'w') as f: 125 | pickle.dump(docs, f, 2) 126 | end = time.time() 127 | printandlog('Compressedion with threshold %f:' % threshold) 128 | printandlog('Time elapsed: %f' % (end-start)) 129 | 130 | size = os.stat('./Utilities/compressed_dict.dat').st_size 131 | size += os.stat('./Utilities/index.dat').st_size 132 | size /= float(10**6) 133 | printandlog('Size: %f' % size) 134 | 135 | def process_file_set(files, dbFile, Dtype, type_descr, data_dir): 136 | print "Processing %s" % type_descr 137 | start = time.time() 138 | docs = [] 139 | n_total = len(files) 140 | count = 0 141 | for fpath in files: 142 | if fpath[0] != '.': 143 | new_obj = Dtype() 144 | new_obj.generate((data_dir + '/' + fpath)) 145 | docs.append(new_obj) 146 | count += 1 147 | if count % 100 == 0: 148 | print "%d of %d" % (count, n_total) 149 | 150 | with open(dbFile, 'w') as f: 151 | pickle.dump(docs, f, 2) 152 | end = time.time() 153 | size = os.stat(dbFile).st_size 154 | size /= float(10**6) 155 | elapsed = end-start 156 | printandlog(type_descr) 
157 | printandlog('Time elasped: %f' % elapsed) 158 | printandlog('Pickled size: %f' % size) 159 | return docs 160 | 161 | def compress_dict_set(docs, idx_dict): 162 | for doc in docs: 163 | doc.word_list = [0] * len(idx_dict) 164 | for word, count in doc.word_count.copy().iteritems(): 165 | try: 166 | idx = idx_dict[word] 167 | doc.word_list[idx] = count 168 | del doc.word_count[word] 169 | except KeyError: 170 | pass 171 | 172 | def generate_word_index(dict_set, threshold): 173 | """Generates an index of commonly used words in the documents, so that the documents can be stored in compressed form. We can remove all instances of commonly used words from the dictionaries, and add a k-tuple of word counts, where k is the number of commonly used words. THRESHOLD determines what proportion of documents a word must be in for it to be included in the list. 174 | Creates self.index_list, an ordered list of words in the index. Creates self.index_dict which maps from element indicies back to the right word in the sequence. Sets self.indexed = 1.""" 175 | # Threshold in (0, 1) 176 | start = time.time() 177 | dict_index = {} 178 | threshold *= len(dict_set) 179 | for document in dict_set: 180 | for word in document.word_count.iterkeys(): 181 | try: 182 | dict_index[word] += 1 183 | except KeyError: 184 | dict_index[word] = 1 185 | index_list = [] 186 | for word, val in dict_index.iteritems(): 187 | if val > threshold: 188 | index_list.append(word) 189 | del dict_index 190 | index_list.sort() 191 | index_dict = {} 192 | for i in xrange(len(index_list)): 193 | index_dict[index_list[i]] = i 194 | end = time.time() 195 | #printandlog('Dict Index time elapsed: %f' % (end-start)) 196 | return (index_list, index_dict) 197 | 198 | if __name__ == "__main__": 199 | main('BigData') 200 | 201 | 202 | -------------------------------------------------------------------------------- /classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import math 3 | 4 | '''Notes: 5 | The purpose of this module is to assign classification to (company, quarter, duration) tuples. The company will be represented by a unique company identifier and quarters will be represented as year, quarter tuples e.g. (2002,3) = 3Q2002. 6 | The classifier will assign a class to each tuple according to its performance, relative to other companies in the same industry, based on its relative performance during the period [filing date + 1, filing date + 1 + duration]. 7 | 8 | Company return = ( company.close(filedate + duration) - company.open(filedate + 1)) / company.open(filedate + 1) 9 | Industry return = (industry.close(filedate + duration) - industry.open(filedate + 1)) / industry.open(filedate + 1) 10 | Classification is based on (company return - industry return) 11 | 12 | The advantage to this classification approach is that it will capture idiosyncratic outperformance by companies relative to their peers, rather than macro-level economic trends. 13 | 14 | Classification will be based on threshold return levels, which will be expressed as an ordered list [t1, t2, t3]. The (c,q,d) tuple will be assigned to the first threshold for which relative return <= threshold level, where assignment means returning the 0-based index of the threshold. If the return is greater than the maximum threshold level, then it will return the index of the max threshold + 1 (i.e. returns len(thresholds)). 
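Worked example, using the thresholds defined below ([-.4, .15]): an annualized relative return of -0.55 is assigned class 0, a return of 0.02 is assigned class 1, and a return of 0.30 exceeds every threshold and is assigned class 2 (= len(thresholds)).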
15 | ''' 16 | 17 | '''Process description: 18 | C = |thresholds| = number of different classifications 19 | For each investment horizon ('Duration'): 20 | Generate C sets, where each set contains pointers to documents which fall into this classification 21 | Then we need to convert each set of documents into a dictionary of wordcounts 22 | We want to use only 'common' terms, e.g. nix terms specific to an individual company like "Chlorox" or "Swiffer" 23 | Need to generate a set of all "common" words that we will include in classifiers 24 | 25 | 26 | 27 | 28 | 29 | ''' 30 | 31 | 32 | thresholds = [-.4, .15] # Represents 3 classes: (-Inf, -40%), (-40%, 15%), (15%, Inf) 33 | durations = [1, 20, 40] 34 | # Returns compared to threshold values are annualized returns relative to industry, rather than raw rates of return 35 | 36 | def annualize_return(rate_of_return, duration): 37 | # This attempts to account for the opportunity cost of capital, i.e. getting a 10% return on a 1-month holding is generally better than a 15% return on a 6-month holding. However, this is imperfect because the opportunity cost is properly a function of how many other documents are coming out in the near future, how likely we are to want to buy those stocks, etc. So if documents are evenly spaced throughout the year, so that on average we want to buy a stock every week, we shouldn't necessarily favor 1-day holding periods over 2-day holding periods (as this model would heavily favor). However if all documents are released in 1-week periods every quarter then we should really want short holding periods. If we invest over medium to long term periods (e.g. 2 months) then this is less of an issue. 38 | trading_days_per_year = 252 39 | return rate_of_return ** (trading_days_per_year/float(duration)) 40 | 41 | def training_classification(company, date, durations, thresholds): 42 | # Takes a company, a start date (i.e. date of filing), a list of investment durations, and a list of thresholds 43 | # Returns a classification for each duration, with classification corresponding to one of the thresholds 44 | ticker = company.ticker 45 | SIC = company.SIC 46 | #sector = company.sector 47 | start = next_trading_day(ticker, date) 48 | # Requires a next_trading_day module 49 | classifications = [] 50 | for duration in durations: 51 | try: 52 | stock_return = get_stock_return(ticker, start, end) 53 | sic_return = get_sic_return(SIC, start, end) 54 | # sector_return = get_sector_return(sector, start, end) 55 | # baseline_return = weight_sicsector(SIC, sic_return, sector, sector_return) 56 | relative_return = stock_return - sic_return 57 | ann_relative_return = annualize_return(relative_return, duration) 58 | classif = threshold_sieve(ann_relative_return, thresholds) 59 | classifications.append(classif) 60 | except StockRangeError: 61 | classifications.append(None) 62 | 63 | def threshold_sieve(val, thresholds): 64 | for i in xrange(len(thresholds)): 65 | if val <= thresholds[i]: 66 | return i 67 | return i+1 68 | 69 | 70 | def create_classification_set(manager, thresholds, durations): 71 | # Take a manager, thresholds, durations 72 | # Choose a 'training set' of Company/Date pairs (i.e. 
document references) 73 | # Generate a classification set for each duration 74 | # Classify each Company/Date pair into a threshold group for each duration 75 | # Return the d sets (d = |durations|) 76 | 77 | 78 | def generate_classification_model(TODO): 79 | # Take a classification set and the manager 80 | # Generate a group dictionary for the set 81 | # Adjust for psuedocount 82 | pass 83 | 84 | def classify_multinomial(text, groups, psuedocount): 85 | """Classifies a text into one of the provided groups, given a psuedocount. 86 | 87 | Returns a tuple containing the chosen group and the difference in log- 88 | likelihood between the chosen group and the second best option 89 | (for validation purposes and perhaps confidence estimation). 90 | 91 | """ 92 | comparisons = {} 93 | for group in groups: 94 | comparisons[group] = likelihood_comparison(text, group, psuedocount) 95 | max = float("-inf") 96 | second_max = float("-inf") 97 | 98 | #Want to find the maximum LLV (to classify the group) and the second-maximum 99 | #LLV (to report the difference) 100 | for group in comparisons: 101 | if comparisons[group] > second_max: 102 | if comparisons[group] > max: 103 | second_max = max 104 | max = comparisons[group] 105 | classification = group 106 | else: 107 | second_max = comparisons[group] 108 | 109 | diff = max - second_max 110 | assert diff > 0 111 | return (classification, diff, max) 112 | 113 | 114 | # Handling psuedocount classifications: 115 | # Generate classification groups 'pure' with word-counts rather than thetas 116 | # Generate set of all words in all documents 117 | # Ensure that each classification group has 118 | 119 | def multinomial_LLV(text, (group_dict, wordcount), psuedocount): 120 | """Generates log-likelihood that given Text came from given TextGroup. 121 | 122 | Note that likelihood function has no absolute meaning, since it is a log- 123 | likelihood with constants disregarded. Instead, the return value may be 124 | used as a basis for comparison to decide which TextGroup is more likely to 125 | contain the Text. 126 | """ 127 | #Make local copies of the dictionaries so we can alter them without causing problems 128 | theta_dict = copy.copy(group_dict) 129 | 130 | #DO psuedocount biasing beforehand 131 | 132 | numWords = float(wordcount + psuedocount * len(group_dict)) 133 | # Need to add psuedocounts since log(0) is undefined (or in orig. 
multinomial model absent the log transformation, multiplying by a 0 factor would force the result to 0) 134 | for word in theta_dict: 135 | theta_dict[word] += psuedocount 136 | for word in text.dict: 137 | if word not in theta_dict: 138 | theta_dict[word] = psuedocount 139 | numWords += psuedocount 140 | theta = {} 141 | for word in theta_dict: 142 | theta[word] = theta_dict[word] / numWords 143 | 144 | loglikelihood = 0 145 | for word in text.dict: 146 | loglikelihood += text.dict[word] * math.log(theta[word]) 147 | return loglikelihood 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /clean_scrape.py: -------------------------------------------------------------------------------- 1 | from ftplib import FTP 2 | from tempfile import NamedTemporaryFile 3 | from itertools import * 4 | import sys 5 | import os 6 | import zipfile 7 | import subprocess 8 | from contextlib import contextmanager 9 | 10 | 11 | @contextmanager 12 | def directory(path): 13 | old_dir = os.getcwd() 14 | os.chdir(path) 15 | yield 16 | os.chdir(old_dir) 17 | 18 | # run with -c for client mode 19 | 20 | # Initialize a variable called ftp so that we can access it from 21 | # any function after setting it to an FTP object in main 22 | ftp = None 23 | 24 | hosts = ['altair.cs.uchicago.edu', 'ursa.cs.uchicago.edu', 25 | 'ankaa.cs.uchicago.edu', 'antares.cs.uchicago.edu', 26 | 'arcturus.cs.uchicago.edu', 'as.cs.uchicago.edu', 27 | 'avior.cs.uchicago.edu', 'be.cs.uchicago.edu', 28 | 'betelgeuse.cs.uchicago.edu', 'canopus.cs.uchicago.edu', 29 | 'capella.cs.uchicago.edu', 'da.cs.uchicago.edu', 30 | 'deneb.cs.uchicago.edu', 'dubhe.cs.uchicago.edu', 31 | 'gacrux.cs.uchicago.edu', 'hadar.cs.uchicago.edu', 32 | 'ki.cs.uchicago.edu', 'mimosa.cs.uchicago.edu', 33 | 'naos.cs.uchicago.edu', 'polaris.cs.uchicago.edu', 34 | 'procyon.cs.uchicago.edu', 'rastaban.cs.uchicago.edu', 35 | 're.cs.uchicago.edu', 'rigel.cs.uchicago.edu', 36 | 'saiph.cs.uchicago.edu', 'sh.cs.uchicago.edu', 37 | 'sirius.cs.uchicago.edu', 'ul.cs.uchicago.edu'] 38 | 39 | def connect_to_SEC(max_attempts=50): 40 | """ Connect to the SEC ftp server, timing out after max_attempts 41 | attempts. 42 | """ 43 | for i in xrange(max_attempts): 44 | try: 45 | return FTP('ftp.sec.gov') 46 | except EOFError: 47 | pass 48 | print "Maximum number of attempts exceeded. Try again later." 49 | 50 | 51 | def download_file(server_path, local_path): 52 | """Download a file at server_path on the global ftp server object 53 | to local_path. 54 | """ 55 | global ftp 56 | with NamedTemporaryFile(delete=False) as out_file: 57 | temp_file_name = out_file.name 58 | ftp.retrbinary('RETR ' + server_path, out_file.write) 59 | os.rename(temp_file_name, local_path) 60 | print "Succesfully downloaded to {0}".format(local_path) 61 | 62 | 63 | def ensure(dir): 64 | """Create a directory if it does not exist 65 | """ 66 | if not os.path.exists(dir): 67 | os.makedirs(dir) 68 | 69 | 70 | def extract_and_remove(zip_path, out_dir): 71 | """Extract the zip file at zip_path to out_dir and then delete it 72 | """ 73 | with zipfile.ZipFile(zip_path, 'r') as outzip: 74 | outzip.extractall(out_dir) 75 | os.remove(zip_path) 76 | 77 | 78 | def download_index_files(out_dir): 79 | """Download all of the SEC index files, organizing them into a 80 | directory structure rooted at out_dir. 
81 | """ 82 | 83 | years = ['1993', '1994', '1995', '1996', 84 | '1997', '1998', '1999', '2000', 85 | '2001', '2002', '2003', '2004', 86 | '2005', '2006', '2007', '2008', 87 | '2009', '2010', '2011', '2012'] 88 | 89 | quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4'] 90 | 91 | ensure(out_dir) 92 | 93 | with directory(out_dir): 94 | for year in years: 95 | for quarter in quarters: 96 | subdir = year + '/' + quarter 97 | ensure(subdir) 98 | path = subdir + '/form.zip' 99 | download_file(path, path) 100 | extract_and_remove(path, subdir) 101 | 102 | 103 | 104 | dropuntil = lambda pred, xs: dropwhile(lambda x: not pred(x), xs) 105 | 106 | 107 | def paths_for_10ks(index_file): 108 | paths = [] 109 | # drop the header of the index file, which is seperated from the 110 | # body by a line of all '-'s 111 | lines = dropuntil(lambda a: re.match('-+$', a), index_file) 112 | lines.next() 113 | for line in lines: 114 | if line[:4] == '10-K' or line[:4] == '10-Q': 115 | fields = re.split('\s\s+', line) 116 | company, date, server_path = (fields[1], fields[3], fields[4]) 117 | paths.append((server_path, '{0}_{1}_{2}'.format(company.replace('/', '-'), date, fields[0].replace('/','-')))) 118 | return paths 119 | 120 | 121 | # Actually don't think I need this 122 | def ssh_setup(user, password): 123 | global hosts 124 | command = 'ssh-keygen -t rsa; cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys' 125 | for host in hosts: 126 | subprocess.call(['ssh', '{0}@{1}'.format(user, host), command]) 127 | 128 | 129 | def create_paths_file(data_dir, out_path): 130 | """Walk the data directory to create file with one 2-tuple per line 131 | that contains the server path and intended local path of each form 132 | in the index file. 133 | """ 134 | seperator = '!!!' 135 | with open(out_path, 'a') as out_file: 136 | for root, dirs, files in os.walk(data_dir): 137 | for name in files: 138 | path = os.path.join(root, name) 139 | if path.split('.')[-1] != 'idx': 140 | continue 141 | with open(path, 'r') as index_file: 142 | form_paths = [(s, os.path.join(root, l)) for s,l in paths_for_10ks(index_file)] 143 | outfile.write('\n'.join(str(t) for t in form_paths) + '\n') 144 | 145 | 146 | def chunkify_paths_file(paths_file_path, num_chunks, out_dir): 147 | """Split the paths files at paths_file_path into the specified number 148 | of chunks, placing the chunks in out_dir 149 | """ 150 | with open(paths_file_path, 'r') as paths_file: 151 | num_lines = sum(1 for line in paths_file) 152 | paths_file.seek(0) 153 | chunk_size = num_lines / num_chunks 154 | for i in xrange(num_chunks): 155 | with open(os.path.join(out_dir, 'paths{0}.txt'.format(i)), 'w') as p: 156 | p.write(''.join(islice(paths_file, 0, chunk_size))) 157 | with open(os.path.join(out_dir, 'paths{0}'.format(num_chunks)), 'w') as p: 158 | p.write(''.join(paths_file)) 159 | 160 | 161 | def client_procedure(chunk_number, chunks_dir): 162 | with open('paths{0}.txt'.format(chunk_number), 'r') as chunk: 163 | for line in chunk: 164 | try: 165 | s, l = eval(line) 166 | except Exception as e: 167 | sys.stderr.write(str(e) + line) 168 | else: 169 | try: 170 | download_file(s, l) 171 | except Exception as e: # Maybe add specific exceptions here but I think catching all is better 172 | sys.stderr.write(str(e) + line) 173 | 174 | 175 | # rename this function 176 | # have a variable for the pollux loop like script 177 | def start_download_on_hosts(consolidator, main_data_dir, hosts, chunks_dir, temp_data_dir, script_path, log_dir): 178 | chunk_paths = [os.path.join(chunks_dir, c) for c in 
os.listdir(chunks_dir)] 179 | # see if there isn't a less hackish way of doing this 180 | command = ('ssh {h} ' + '"nohup python {0}'.format(script_path) 181 | + ' -c {n}"' + ' >' + log_dir + '/log{n}' + ' 2>' + log_dir + '/err{n}&') 182 | 183 | # This is possibly a bad idea 184 | consolidator_loop = ('"while true; do ' 185 | 'rsync -av --remove-source-files {temp}; ' 186 | 'sleep 2; ' 187 | 'done"') 188 | 189 | subprocess.call('ssh {0} '.format(consolidator) + consolidator_loop) 190 | 191 | for i, (host, chunk_path) in enumerate(zip(hosts, chunk_paths)): 192 | subprocess.call(command.format(h=host, n=i)) 193 | 194 | def main(): 195 | global ftp 196 | usage = ('Download either index files (i) or form files (f) ' 197 | 'to a given directory, or run in client mode (c).') 198 | parser = argparse.ArgumentParser(description=usage) 199 | parser.add_argument('mode', type=str, choices=['i', 'f', 'c']) 200 | parser.add_argument('directory', type=str) 201 | 202 | args = parser.parse_args() 203 | 204 | ftp = connect_to_SEC(0) 205 | ftp.login() 206 | 207 | -------------------------------------------------------------------------------- /stocks_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | 4 | from BeautifulSoup import BeautifulSoup as bs 5 | from numpy import * 6 | from collections import OrderedDict, defaultdict 7 | from multiprocessing import Pool 8 | import urllib2, sys, os, csv, re, time, datetime, filecmp, shutil 9 | try: import cPickle as pickle 10 | except: import pickle 11 | 12 | csv_dir = 'stocks_csv' 13 | stocks_fail = 'stocks_fail.txt' 14 | stocks_CIK = 'stocks_CIK.txt' 15 | dat_dir = 'stocks_dat' 16 | 17 | def main(): 18 | global csv_dir 19 | global stocks_fail 20 | global stocks_CIK 21 | 22 | download_flag = 0 23 | getcik_flag = 0 24 | pickle_flag = 1 25 | 26 | 27 | ''' Name output directory for csv and ensure it is created ''' 28 | 29 | if not os.path.exists(csv_dir): 30 | os.makedirs(csv_dir) 31 | 32 | 33 | ''' DOWNLOAD MODULE ''' 34 | if download_flag: 35 | 36 | # Get list of all stocks on the exchange 37 | with open('stocks_list.dat','r') as f: 38 | d = pickle.load(f) 39 | lst = d.keys() 40 | 41 | # Restore file that stores failed downloads 42 | open(stocks_fail, 'w').close() 43 | 44 | pool = Pool(processes=16) 45 | pool.map(downloader, lst) 46 | 47 | # notdone = list(set(lst).difference(set([i.replace('.csv','') for i in os.listdir(csv_dir)]))) 48 | 49 | # Make sure all file names contain no spaces 50 | os.system('rename -v \'s/\ //g\' ' + csv_dir + '/*') 51 | 52 | # Delete all files that are size 0 53 | os.system('./cleaner.sh ' + csv_dir +'/') 54 | 55 | # Move all files that are duplicates into different directory 56 | lsdir = os.listdir(csv_dir); n = len(lsdir) 57 | collector = defaultdict(list); 58 | while len(lsdir) > 0: 59 | print (n-len(lsdir)), 'of', n 60 | i = lsdir.pop(0) 61 | f1 = csv_dir+'/'+i 62 | 63 | for j in lsdir: 64 | f2 = csv_dir+'/'+j 65 | if f1 != f2: 66 | if filecmp.cmp(f1,f2): 67 | collector[i].append(j) 68 | 69 | ref = [] 70 | for k,v in collector.iteritems(): 71 | if len(v) > 0: 72 | tup = [k] 73 | for i in v: 74 | tup.append(i) 75 | ref.append(tup) 76 | rmv = [] 77 | for i in ref: 78 | i.remove(min(i, key=len)) 79 | rmv = rmv + i 80 | # Make sure mv directory exists 81 | csv_mv_dir = 'stocks_csv_mv' 82 | if not os.path.exists(csv_mv_dir): 83 | os.makedirs(csv_mv_dir) 84 | for r in rmv: 85 | try: shutil.move(csv_dir+'/'+r, csv_mv_dir+'/'+r) 86 | except: pass 87 | 88 | 89 | 
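        # The pairwise filecmp.cmp scan above is O(n^2) in the number of CSV files.
        # A hedged, commented-out sketch of an O(n) alternative that groups files
        # by content hash before moving duplicates (hashlib is stdlib; the grouping
        # logic below is an illustrative assumption, not project code):
        #
        #   import hashlib
        #   by_hash = defaultdict(list)
        #   for name in os.listdir(csv_dir):
        #       path = csv_dir + '/' + name
        #       if os.path.isfile(path):
        #           with open(path, 'rb') as fh:
        #               by_hash[hashlib.md5(fh.read()).hexdigest()].append(name)
        #   # Keep the shortest name in each duplicate group, move the rest
        #   for names in by_hash.itervalues():
        #       for dup in sorted(names, key=len)[1:]:
        #           shutil.move(csv_dir + '/' + dup, csv_mv_dir + '/' + dup)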
''' Get list of directories downloaded ''' 90 | lsdir = os.listdir(csv_dir); totl = len(lsdir) 91 | 92 | 93 | ''' RECORD TICKER, CIK code pairs ''' 94 | if getcik_flag: 95 | # Restore file that stores failed downloads 96 | open(stocks_CIK, 'w').close() 97 | 98 | pool = Pool(processes=20) 99 | pool.map(cikgetter, [i.replace('.csv','') for i in lsdir]) 100 | 101 | # Combine files 102 | cpuoutputs = [] 103 | for i in os.listdir('.'): 104 | if (stocks_CIK in i) & (len(i) > len(stocks_CIK) + 2): 105 | cpuoutputs.append(i) 106 | open('stocks_CIK_full.txt', 'w').close() 107 | writefile = open('stocks_CIK_full.txt','a') 108 | for i in cpuoutputs: 109 | with open(i,'r') as fr: 110 | interim = fr.read() 111 | writefile.write(interim) 112 | writefile.close() 113 | 114 | with open('stocks_CIK_full.txt','r') as f: 115 | d = f.read().split('\n')[:-1] 116 | ciked_list = [] 117 | for i in d: 118 | ciked_list.append(i.split('\t')[0]) 119 | 120 | notciked = list(set([i.replace('.csv','') for i in lsdir]).difference(set(ciked_list))) 121 | 122 | pool = Pool(processes=8) 123 | pool.map(cikgetter, notciked) 124 | 125 | # Re-Combine files 126 | cpuoutputs = [] 127 | for i in os.listdir('.'): 128 | if (stocks_CIK in i) & (len(i) > len(stocks_CIK) + 2): 129 | cpuoutputs.append(i) 130 | open('stocks_CIK_full.txt', 'w').close() 131 | writefile = open('stocks_CIK_full.txt','a') 132 | for i in cpuoutputs: 133 | with open(i,'r') as fr: 134 | interim = fr.read() 135 | writefile.write(interim) 136 | os.remove(i) 137 | writefile.close() 138 | 139 | 140 | 141 | ''' VECTORIZE DATA (make list of tuples) and STORE AS PICKLE ''' 142 | if pickle_flag: 143 | # Name output directory for pickle and ensure it is created 144 | global dat_dir 145 | if not os.path.exists(dat_dir): 146 | os.makedirs(dat_dir) 147 | 148 | pool = Pool(processes=8) 149 | pool.map(pickler, [i.replace('.csv','') for i in lsdir]) 150 | 151 | 152 | 153 | def pickler(ticker): 154 | global dat_dir 155 | global csv_dir 156 | csv2pickle(ticker,csv_dir,dat_dir) 157 | print 'Pickled', ticker 158 | 159 | def downloader(ticker): 160 | global csv_dir 161 | global stocks_fail 162 | 163 | # Download file, and return flag indicating 164 | dflag = download_csv(ticker, csv_dir) 165 | 166 | # Take action based on what flag is showed 167 | if dflag == 0: 168 | print 'Downloaded',ticker 169 | 170 | elif dflag == 2048: 171 | print 'Can\'t find ticker',ticker,'to download' 172 | os.system('rm ' + csv_dir + '/' + ticker + '.csv') 173 | 174 | with open(stocks_fail, 'a') as fal: 175 | fal.write(ticker + '\n') 176 | 177 | else: 178 | print 'Quitting downloader: non-resolved issue occured. OS error flag:', dflag 179 | exit() 180 | 181 | 182 | def cikgetter(ticker): 183 | global stocks_CIK 184 | 185 | cik = get_CIK(ticker) # get CIK tuple 186 | #(CIK,name, {-1, if fail; 100, if traditional method; 0<=n<=99, means n words subtracted}) 187 | 188 | # Write to file (ticker, company name, code, CIK) 189 | with open(stocks_CIK+str(os.getpid()), 'a') as fn: 190 | fn.write(ticker + '\t' + cik[1] + '\t' + str(cik[2]) + '\t' + cik[0] + '\n') 191 | 192 | # Take action based on CIK tuple flag 193 | print ticker,'\t',cik[2],'\t',cik[1],'\t',cik[0] 194 | 195 | 196 | ''' DOWNLOADS CSV FROM YAHOO ''' 197 | def download_csv(ticker,csv_dir): 198 | # Build URL string 199 | start_year = '1950' 200 | now = datetime.datetime.now() 201 | 202 | url_string = 'http://ichart.finance.yahoo.com/table.csv?' 
203 | url_string += '&s=' + ticker.replace('&','%26') 204 | url_string += '&d=' + str(now.month-1) 205 | url_string += '&e=' + str(now.day) 206 | url_string += '&f=' + str(now.year) 207 | url_string += '&g=d&a=0&b=1&c=' + start_year 208 | url_string += '&ignore.csv' 209 | 210 | # Download file using system call 211 | return os.system('wget \'' + url_string + '\' -O \'' + csv_dir + '/' + ticker + '.csv\' -q') 212 | 213 | 214 | ''' Parses the CSV file and returns a tuple with data as a tuple of: 215 | (DATE, OPEN, HIGH, LOW, CLOSE, VOLUME) ''' 216 | def csv2pickle(ticker,csv_dir,dat_dir): 217 | with open(csv_dir+'/'+ticker+'.csv', 'rb') as f: 218 | fulldata = csv.reader(f) 219 | 220 | # Throw away header 221 | fulldata.next() 222 | 223 | # Temporarily store data in list to 224 | # adjust for dividends, splits, etc; 225 | DATE = []; OPEN = []; HIGH = []; LOW = []; CLOSE = []; VOL = []; ADJ = []; 226 | for row in fulldata: 227 | try: 228 | s = row[0].split('-') 229 | DATE.append( (int(s[0]), int(s[1]), int(s[2])) ) 230 | OPEN.append( float(row[1]) ) 231 | HIGH.append( float(row[2]) ) 232 | LOW.append( float(row[3]) ) 233 | CLOSE.append( float(row[4]) ) 234 | VOL.append( int(row[5]) ) 235 | ADJ.append( float(row[6]) ) 236 | except IndexError: 237 | l = min(len(DATE),len(OPEN),len(HIGH),len(LOW),len(CLOSE),len(VOL),len(ADJ)) 238 | DATE = DATE[:l] 239 | OPEN = OPEN[:l] 240 | HIGH = HIGH[:l] 241 | LOW = LOW[:l] 242 | CLOSE = CLOSE[:l] 243 | VOL = VOL[:l] 244 | ADJ = ADJ[:l] 245 | break 246 | 247 | # Carry out adjustment, then convert to our currency (mul by 100) 248 | OPENadj = 100 * array(OPEN) * array(ADJ) / array(CLOSE) 249 | HIGHadj = 100 * array(HIGH) * array(ADJ) / array(CLOSE) 250 | LOWadj = 100 * array(LOW) * array(ADJ) / array(CLOSE) 251 | CLOSEadj = 100 * array(ADJ) 252 | 253 | # Since the adjustment may divide by zero, we zero the Infs and NaNs 254 | OPENadj[ isinf(OPENadj) ] = 0.0; OPENadj[ isnan(OPENadj) ] = 0.0; 255 | HIGHadj[ isinf(HIGHadj) ] = 0.0; HIGHadj[ isnan(HIGHadj) ] = 0.0; 256 | LOWadj[ isinf(LOWadj) ] = 0.0; LOWadj[ isnan(LOWadj) ] = 0.0; 257 | 258 | # Make output list of tuples 259 | output = [] 260 | for idx in xrange(len(DATE)): 261 | tup = ( DATE[idx], ( int(OPENadj[idx]), int(HIGHadj[idx]), \ 262 | int(LOWadj[idx]), int(CLOSEadj[idx]), VOL[idx]) ) 263 | output.append(tup) 264 | 265 | # Reverse to normal chronological order, so 1st entry is oldest data 266 | output.reverse() 267 | 268 | # Convert to ordered dictionary 269 | output = OrderedDict(output) 270 | 271 | # Dump into pickle 272 | with open(dat_dir+'/'+ticker+'.dat', 'wb') as f: 273 | pickle.dump(output, f) 274 | 275 | 276 | def get_CIK(ticker): 277 | # returns (0-flag or name, cik or list of ciks) 278 | soup = bs(urllib2.urlopen('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK='+ticker+'&filenum=&State=&Country=&SIC=&owner=exclude&Find=Find+Companies&action=getcompany')) 279 | 280 | with open('stocks_list.dat','r') as f: 281 | nameF = pickle.load(f)[ticker] # full name of company 282 | 283 | try: 284 | cik = str(soup.findAll('link')[1].get('href').split('&CIK=')[1].split('&type=')[0]) 285 | return (cik, nameF, 100) 286 | 287 | except IndexError: 288 | 289 | nameR = re.findall('[a-z&.-]+', nameF.lower()) # regex name of company 290 | 291 | if nameF == 'FAIL': 292 | return('FAIL',nameF,-1) 293 | else: 294 | cik2 = get_CIK2( nameR , len(nameR) ) 295 | 296 | if cik2[0] == 1: 297 | return (cik2[1][0][0], nameF, len(nameR)-cik2[2]) 298 | elif cik2[0] == -1: 299 | return (cik2[1][0][0], nameF, -1) 300 | 
else: 301 | return (str(cik2[1]), nameF, len(nameR)-cik2[2]) 302 | #(CIK,name, {-1, if fail; 100, if traditional method; 0<=n<=99, means n words subtracted}) 303 | 304 | ''' More robust method of grabbing CIKs ''' 305 | def get_CIK2(name,ngram): 306 | # Returns (number of CIKs, list of ciks [(CIK,name)] ) 307 | if ngram > 0: 308 | 309 | soup = bs(urllib2.urlopen('http://www.sec.gov/cgi-bin/cik.pl.c?company=' + '+'.join(name[:ngram]))) 310 | 311 | # Find how many search results on Edgar 312 | try: 313 | test = int(soup.find('strong').contents[0]) 314 | except ValueError: 315 | test = int(soup.find('b').contents[0]) 316 | except: 317 | test = 0 318 | 319 | if test == 0: 320 | return get_CIK2(name,ngram-1) 321 | 322 | else: 323 | l = soup.findAll('pre')[1].contents 324 | out = []; 325 | for (c, n) in zip(l[0::2], l[1::2]): 326 | out.append( (str(c.contents[0]), str(n).strip()) ) 327 | return (len(l)/2, out, ngram) 328 | 329 | else: 330 | return (-1, [('FAIL','FAIL')],-1) 331 | 332 | ''' Return success rate for tuple (error, total) ''' 333 | def err(e,t): 334 | return '('+str(round((t-e)*100./t,2))+'%)' 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | ''' 345 | def get_prices(data,date): 346 | try: 347 | return (i for i in data if i[0] == date).next() 348 | except StopIteration: 349 | print 'No data for the date', date 350 | return None 351 | ''' 352 | 353 | 354 | if __name__ == '__main__': 355 | main() 356 | -------------------------------------------------------------------------------- /Old/data_manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | This module manages and maintains data for RoboBuffett 4 | ''' 5 | import os, sys, logging, string 6 | from datetime import date, timedelta 7 | from pdb import set_trace as debug 8 | from collections import Counter 9 | # Note: Using Counter on the test dataset increased the pickled size from 1.3mb to 1.8mb 10 | try: 11 | import cPickle as pickle 12 | except: 13 | import pickle 14 | 15 | class Financial_Universe: 16 | '''Manage all financial information for all of the stocks. This will be the object that we pickle for serialization. For memory efficiency, it will not contain the full raw text of documents, but will contain methods by which they can be loaded into memeory. 17 | Perhaps we will want to include the word frequency counts since they will require less space. Will have to see whether it's practical. 
18 | ''' 19 | def __init__(self, data_dir): 20 | self.companies = {} 21 | self.industries = {} 22 | self.documents = [] 23 | num_docs = len(os.listdir(data_dir)) 24 | print "Num docs: %d" % num_docs 25 | docs_counted = 0 26 | for doc_path in os.listdir(data_dir): 27 | if doc_path[0] == ".": continue 28 | newdoc = Document(data_dir + doc_path, 'SEC_Quarterly') 29 | self.documents.append(newdoc) 30 | docs_counted += 1 31 | 32 | for i in xrange(len(newdoc.CIK)): 33 | CIK = newdoc.CIK[i] 34 | # CIKs stored as a list since there may be several 35 | """***TODO:***: Better system for handling CIKs, integrate with stock tickers""" 36 | 37 | if CIK not in self.companies: 38 | self.companies[CIK] = Company(newdoc, i) # Create a new company entry based on the document 39 | else: 40 | self.companies[CIK].add_document(newdoc, i) 41 | if docs_counted % 100 == 0: 42 | print "Docs processed %d" % docs_counted 43 | 44 | for CIK, company in self.companies.iteritems(): 45 | if CIK not in self.industries: 46 | self.industries[CIK] = Industry(CIK, company) 47 | else: 48 | self.industries[CIK].add_company(CIK, company) 49 | self.get_counts() 50 | 51 | def get_counts(self): 52 | sum = 0 53 | for doc in self.documents: 54 | sum += doc.num_words 55 | self.num_words = sum 56 | self.num_docs = len(self.documents) 57 | self.num_companies = len(self.companies) 58 | self.num_industries = len(self.industries) 59 | 60 | def generate_word_index(self, threshold): 61 | """Generates an index of commonly used words in the documents, so that the documents can be stored in compressed form. We can remove all instances of commonly used words from the dictionaries, and add a k-tuple of word counts, where k is the number of commonly used words. THRESHOLD determines what proportion of documents a word must be in for it to be included in the list. 62 | Creates self.index_list, an ordered list of words in the index. Creates self.index_dict which maps from element indicies back to the right word in the sequence. Sets self.indexed = 1.""" 63 | # Threshold in (0, 1) 64 | dict_index = {} 65 | threshold *= self.num_docs 66 | for document in self.documents: 67 | for word in document.word_freq.iterkeys(): 68 | try: 69 | dictindex[word] += 1 70 | except KeyError: 71 | dictindex[word] = 1 72 | self.index_list = [] 73 | for word, val in dictindex.iteritems(): 74 | if val > threshold: 75 | self.index_list.append(word) 76 | del dictindex 77 | self.index_list.sort() 78 | self.index_dict = {} 79 | for i in xrange(len(self.index_list)): 80 | self.index_dict[index_list[i]] = i 81 | self.indexed = 1 82 | 83 | class Company: 84 | def __init__(self, document, idx): 85 | self.CIK = document.CIK[idx] 86 | self.SIC = document.SIC[idx] 87 | self.documents = [(document.date, document)] 88 | self.name = document.cname[idx] 89 | 90 | def __repr__(self): 91 | return "" + self.name[0] # Currently names are stored as a list as there may be multiple. 
Not a super satisfactory solution 92 | 93 | def add_document(self, document): 94 | self.documents.append((document.date, document)) 95 | if document.cname[idx] != company.name: 96 | print "Name discrepancy: %s, %s" % (company.name, document.cname) 97 | logging.debug("Name discrepancy: %s, %s" % (company.name, document.cname)) 98 | if document.SIC[idx] != company.SIC: 99 | print "SIC discrepancy: %d %d" % (company.SIC, document.SIC) 100 | logging.debug("SIC discrepancy: %d %d" % (company.SIC, document.SIC)) 101 | 102 | class Industry: 103 | def __init__(self, CIK, company): 104 | self.SIC = company.SIC 105 | self.components = {CIK: company} 106 | self.n_componenets = 1 107 | 108 | def __repr__(self): 109 | return "" + str(self.SIC[0]) 110 | 111 | def add_company(self, CIK, company): 112 | if CIK not in self.components: 113 | self.components[CIK] = company 114 | self.n_components += 1 115 | 116 | class Document: 117 | def __init__(self, docpath, doctype): 118 | '''Populate the following''' 119 | self.path = docpath 120 | self.properties = {} 121 | self.word_freq = {} 122 | self.num_words = {} 123 | try: 124 | self.docfile = open(docpath, 'r') 125 | except IOError: 126 | print "Bad file path ", docpath 127 | logging.warning('Bad doc path: %s' % docpath) 128 | return 129 | 130 | if doctype == 'SEC_Quarterly': 131 | self.parse_quarterly_filing() 132 | else: 133 | print "Document not supported: %s type %s" % (docpath, doctype) 134 | logging.warning('Unsupported doc %s type %s' % (docpath, doctype)) 135 | self.docfile.close() 136 | del self.docfile # Delete file references so Pickle won't complain 137 | 138 | def __repr__(self): 139 | return "" + self.path 140 | 141 | 142 | def parse_quarterly_filing(self): 143 | """Parse a quarterly filing. Makes a dictionary in self.properties containing all of the attributes pulled from the quarterly filing. Makes a word-frequency too.""" 144 | # The last condition 'Item 1. B' triggers when we have parsed all the header info and are into the actual document. Since the dictionaryName is '' it won't store anything, but it returns a nonzero value so that the loop will break 145 | 146 | logging.info("Parsing quarterly filing %s" % self.path) 147 | partition_text = 'PART I' 148 | text = self.docfile.read() 149 | #debug() 150 | text = text.partition(partition_text) 151 | # Currently I partition it into Header and Body by seperating at the first instance of the text 'PART I'. I consider this a placeholder 152 | if text[1] != partition_text: 153 | print "Warning: Unable to partition %s" % self.path 154 | logging.warning("ERROR: Unable to partition document.") 155 | return 156 | header = text[0].split("\n") #Consider mapping .strip for efficiency 157 | text = text[2] 158 | self.parse_quarterly_header(header) 159 | self.build_word_freq(text) 160 | 161 | def build_word_freq(self, text): 162 | to_remove = string.punctuation + string.digits 163 | text = text.translate(None, (to_remove)) 164 | # Removes all punctuation and digits 165 | text = text.lower() 166 | text = text.split() 167 | # Splits the text into a list of lowercase words 168 | # Possible improvements: Strip tables, formatting (e.g. , - 2 -) 169 | self.num_words = len(text) 170 | self.word_count = {} 171 | for word in text: 172 | try: 173 | self.word_count[word] += 1 174 | except KeyError: 175 | self.word_count[word] = 1 176 | # This try/except method may be somewhat more efficient than if-then branching for unigram processing. For n-grams, perhaps better to use if-then. 
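    # Note: build_word_freq stores its counts in self.word_count, but __init__
    # initializes self.word_freq and generate_word_index reads document.word_freq,
    # so one of the two attribute names is likely stale.
    # Since collections.Counter is already imported at the top of this module,
    # a hedged sketch of an equivalent Counter-based variant (the method name is
    # hypothetical) -- shorter, at the cost of the larger pickles noted in the
    # comment next to the import:
    def build_word_freq_counter(self, text):
        # Strip punctuation and digits, lowercase, split -- same steps as above
        text = text.translate(None, string.punctuation + string.digits).lower().split()
        self.num_words = len(text)
        self.word_freq = Counter(text)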
177 | 178 | #def compress(self, ilist, idict): 179 | 180 | def parse_quarterly_header(self, header): 181 | property_info = ( 182 | #DictionaryName, FilingText, int list 183 | ('DocType', 'CONFORMED SUBMISSION TYPE:', 0, 0), 184 | ('ReportingPeriod', 'CONFORMED PERIOD OF REPORT:', 0, 0), 185 | ('FilingDate', 'FILED AS OF DATE:', 0, 0), 186 | ('CompanyName', 'COMPANY CONFORMED NAME:', 0, 1), 187 | ('CIK', 'CENTRAL INDEX KEY:', 1, 1), 188 | ('SIC', 'STANDARD INDUSTRIAL CLASSIFICATION:', 1, 1), 189 | ('IRS_Num', 'IRS NUMBER:', 1, 1), 190 | ('FY_End', 'FISCAL YEAR END:', 1, 1), 191 | ('SEC_FileNo', 'SEC FILE NUMBER:', 1, 1)) 192 | # Defines the properties to seek in the header of the filing, and names to assign them to in the self.properties dictionary. I hope Python doesn't waste time re-creating this tuple every time parse_quarterly_filing is called. 193 | for line in header: 194 | line = line.strip() 195 | for property_tuple in property_info: 196 | self.grab_property(line, *property_tuple) 197 | 198 | if len(self.properties) < len(property_info): 199 | msg = "Found %d of %d fields" % (len(self.properties), len(property_info)) 200 | logging.warning(msg) 201 | 202 | try: 203 | self.convert_property_to_date('ReportingPeriod') 204 | except KeyError: 205 | logging.warning("Doc has no reporting period") 206 | try: 207 | self.convert_property_to_date('FilingDate') 208 | self.date = self.properties['FilingDate'] 209 | except KeyError: 210 | logging.error("Doc has no filing date!") 211 | print "Doc %s has no filing date!" % self.path 212 | try: 213 | self.type = self.properties['DocType'] 214 | self.CIK = self.properties['CIK'] # A list 215 | self.SIC = self.properties['SIC'] # A list 216 | self.cname = self.properties['CompanyName'] # A list 217 | except KeyError as e: 218 | logging.error(e) 219 | 220 | def grab_property(self, line, name, identifier, isInt=0, isList=0): 221 | """Checks LINE for IDENTIFIER. If IDENTIFIER is found in the line, then the text immediately after IDENTIFIER is saved in self.properties[PROPNAME]. If the isInt flag is set, then the content is converted to an integer value. If it doesn't convert to int cleanly, then non-digits characters are stripped, it's force converted, and a note is made in the log. In text mode, leading or trailing whitespace around the content is also removed. grab_property returns the content that it stores. If PROPNAME is "" then no value is stored, but the content is still returned.""" 222 | if line.startswith(identifier): 223 | content = line.partition(identifier)[2].strip() 224 | # Take the content after the identifier, and strip whitespace 225 | props = self.properties 226 | if isInt: 227 | try: 228 | content = int(content) 229 | except ValueError: 230 | logging.debug('''ValueError occured converting "%s" to int in line:\n%s . 
Forcing conversion.''' % (content, line)) 231 | try: 232 | to_remove = string.punctuation + string.ascii_letters + string.whitespace 233 | content = int(content.translate(None, to_remove)) 234 | except ValueError as e: 235 | logging.error('Unable to force-convert ' + str(e)) 236 | if name != '': 237 | # If propname is the empty string, nothing is stored 238 | if isList: 239 | if name in props: # Append to existing list 240 | props[name].append(content) 241 | else: # Start a new list 242 | props[name] = [content] 243 | else: # Just store a value 244 | props[name] = content 245 | return content 246 | 247 | def convert_property_to_date(self, propname): 248 | prop = self.properties[propname] 249 | yyyy = int(prop[0:4]) 250 | mm = int(prop[4:6]) 251 | dd = int(prop[6:8]) 252 | self.properties[propname] = date(yyyy, mm, dd) 253 | 254 | def main(): 255 | if len(sys.argv) == 1: 256 | data_dir = "./TestData/Docs_From_1994/" 257 | else: 258 | data_dir = argv[1] 259 | 260 | with open('./data_manager.log', 'w') as cleanlog: 261 | pass # Empty the log before each run 262 | 263 | logging.basicConfig(filename='data_manager.log', level=logging.DEBUG) 264 | universe = Financial_Universe(data_dir) 265 | 266 | print "Statistics: %d documents, %d companies %d industries %d words" % (universe.num_docs, universe.num_companies, universe.num_industries, universe.num_words) 267 | logging.info("Statistics: %d documents, %d companies %d industries %d words" % (universe.num_docs, universe.num_companies, universe.num_industries, universe.num_words)) 268 | with open('./universe.dat', 'w') as f: 269 | pickle.dump(universe, f, 0) 270 | #debug() 271 | 272 | if __name__ == "__main__": 273 | main() 274 | 275 | 276 | -------------------------------------------------------------------------------- /manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Current Status: Shift to SQL database instead of ad-hoc python class in progress. 4 | 5 | import os, sys, logging, string, time, math, rb_parser 6 | from rb_parser import ParseError 7 | from pdb import set_trace as debug 8 | from os.path import basename 9 | Path = os.path.join 10 | try: 11 | import cPickle as pickle 12 | except: 13 | import pickle 14 | #import stock 15 | 16 | 17 | def main(): 18 | # Todo: Implement better UI 19 | DataDir = os.path.expanduser('~/Documents/Code/RoboBuffett/Data/') 20 | logfile = DataDir + '../Logs/manager.log' 21 | touch(logfile) 22 | logging.basicConfig(filename=logfile, level=logging.DEBUG) 23 | manager = load_manager(DataDir) 24 | #manager.preprocess() 25 | #manager.process() 26 | [for co in manager.] 
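    # The list-comprehension fragment above is incomplete and will not parse; it
    # has to be finished or commented out before this script will run.
    # A hedged, commented-out guess at a per-company pass, using only attributes
    # that Manager/Company actually define (active_CIKs, load_company, rebuild_wordset):
    #   for CIK in manager.active_CIKs:
    #       co = manager.load_company(CIK)
    #       co.rebuild_wordset()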
27 | 28 | save_manager(manager) 29 | save_industry_dict(manager) 30 | pretty_dict(manager.industries) 31 | manager.print_stats() 32 | 33 | 34 | 35 | def load_manager(DataDir): 36 | '''Load the manager from desk if it's available, or generate a new Manager instance if not''' 37 | try: 38 | with open(DataDir + 'Pickles/manager.dat', 'r') as f: 39 | return pickle.load(f) 40 | except IOError: 41 | return Manager(DataDir) 42 | 43 | def save_manager(manager): 44 | '''Saves the manager to disk''' 45 | with open(manager.DataDir + 'Pickles/manager.dat', 'w') as f: 46 | pickle.dump(manager, f, 2) 47 | 48 | def save_industry_dict(manager): 49 | '''Saves industry dict, useful for generating price indices''' 50 | with open(manager.DataDir + 'Pickles/industrydict.dat', 'w') as f: 51 | pickle.dump(manager.industries, f, 0) 52 | 53 | class Manager(object): 54 | """Persistent object that manages the entire dataset 55 | Functionality overview: 56 | init = Basic setup 57 | preprocess = Vital step that organizes documents by CIK. Moves 58 | unparseable documents to 'Exceptions' 59 | process = Process documents in Data/Preprocessed. Create company 60 | entries containing parsed word counts. Moves documents 61 | to 'Active' or 'Inactive' 62 | print_stats = Print a bunch of statistics about the manager 63 | 64 | Terminology: 65 | CIK: Central Index Key, used as unique identifiers for companies 66 | SIC: Standard Industrial Code, SEC's industry designators 67 | good CIK: a CIK for which we have stock price information 68 | active CIK: a CIK in the document dataset which is good 69 | inactive CIK: a CIK in the document dataset which isn't good 70 | processed documents: have been run thru the pre-processor 71 | exception documents: they didn't parse 72 | active documents: 'owned' by an active CIK 73 | inactive documents: not owned by an active CIK 74 | valid documents: union of active and inactive documents 75 | 76 | 77 | """ 78 | def __init__(self, DataDir): 79 | ### Set up directory structure ### 80 | self.DataDir = DataDir 81 | os.chdir(DataDir) 82 | vital_dirs = ('Pickles/','Pickles/CIKs','Active/','Inactive/', 83 | 'Unprocessed/','Preprocessed/','Processed/','Exceptions/') 84 | map(ensure, vital_dirs) # Make sure they all exist 85 | 86 | ### Mappings ### 87 | self.industries = {} # Mapping from SIC->[CIK] 88 | self.CIK_to_Ticker = dePickle('Pickles/CIK_Ticker.dat') 89 | 90 | ### Sets ### 91 | self.good_CIKs = set(self.CIK_to_Ticker.iterkeys()) 92 | self.active_CIKs = set() 93 | self.CIK2date = {} # Map from active CIKs to documents (the dates) 94 | self.inactive_CIKs = set() 95 | self.processed_docs = set() 96 | # Original names of all documents processed by the manager. 97 | # Maintained to avoid double-counting documents. 98 | self.valid_docs = set() 99 | 100 | # Invariant: len(processed) >= len(valid) - len(exception) 101 | # This is because for every processed document, the parser 102 | # either fails and generates an exception, or succeeds and 103 | # creates 1 or more valid documents corresponding to the 104 | # number of valid filers (unique CIKs) found in the document. 105 | self.exception_docs = set() 106 | self.active_docs = set() 107 | self.inactive_docs = set() 108 | self.company_word_sets = [] 109 | 110 | 111 | def preprocess(self): 112 | """Preprocess the documents in Data/Unprocessed 113 | Finds a doc's CIKs and creates hard links in the folder 114 | Preprocessed/CIK. If a doc doesn't parse properly, it is 115 | moved to Data/Exceptions instead. 
116 | The pre-processing step allows us to consider only one CIK 117 | at a time during the processing step, for memory efficiency. 118 | """ 119 | n_proc = 0 120 | n_valid = 0 121 | n_except = 0 122 | start = time.time() 123 | os.chdir(self.DataDir + 'Unprocessed/') 124 | for (docpath, docname) in recursive_file_gen('.'): 125 | # Returns (path, filename) tuples for all files in directory 126 | # and subdirectories that don't begin with '.' or '_' 127 | if docname in self.processed_docs: continue 128 | self.processed_docs.add(docname) 129 | n_proc += 1 130 | # Code assumes that docnames are unique 131 | try: 132 | (header, cik2filers, _) = rb_parser.parse_quarterly_filing(docpath) 133 | # Returns (but doesn't process) the raw text. 134 | date = header['FilingDate'] 135 | doctype = header['DocType'] 136 | for CIK in cik2filers.iterkeys(): 137 | new_docname = CIK + '_' + date + '.txt' 138 | ensure(self.DataDir + 'Preprocessed/' + CIK) 139 | safelink(docpath, self.DataDir + 'Preprocessed/' + CIK + '/' + new_docname) 140 | if new_docname in self.valid_docs: 141 | print "Repeated doc: %s" % new_docname 142 | self.valid_docs.add(new_docname) 143 | n_valid += 1 144 | if n_valid != len(self.valid_docs): 145 | pass#debug() 146 | 147 | except ParseError as e: 148 | self.exception_docs.add(docname) 149 | n_except += 1 150 | logging.warning(docname + ": " + str(e)) 151 | safelink(docpath, self.DataDir + 'Exceptions/' + basename(docpath)) 152 | 153 | 154 | # if n_proc > n_valid + n_except: 155 | # print "Warning: proc %d, valid %d, except %d" % (n_proc, n_valid, n_except) 156 | # elif n_proc % 100 == 0: 157 | # print "Proc %d, valid %d, except %d, combined %d" % (n_proc, n_valid, n_except, n_valid + n_except) 158 | # if n_proc != len(self.processed_docs) or n_valid != len(self.valid_docs) or n_except != len(self.exception_docs): 159 | # debug() 160 | 161 | end = time.time() 162 | print "Time elapsed in preprocessing: %.1f" % (end-start) 163 | 164 | def process(self): 165 | start = time.time() 166 | os.chdir(self.DataDir + 'Preprocessed') 167 | # Iterate through all the preprocessed CIKs 168 | for CIK in os.listdir('.'): 169 | if CIK[0] == '.' or not os.path.isdir(CIK): continue 170 | 171 | if CIK in self.good_CIKs: 172 | self.active_CIKs.add(CIK) 173 | company = self.load_company(CIK) 174 | ensure(self.DataDir + 'Active/' + CIK) 175 | if CIK not in self.CIK2date: 176 | self.CIK2date[CIK] = [] 177 | for filing in os.listdir(CIK): 178 | filingpath = CIK + '/' + filing 179 | (header, filers, rawtext) = rb_parser.parse_quarterly_filing(filingpath) 180 | company.properties(filers) 181 | # Update company properties with info taken from the 'filers' part of the document 182 | date = header['FilingDate'] 183 | company.add_document(date, rawtext) 184 | # Creates a word dictionary and wordcount from the raw text returned by the parser 185 | self.CIK2date[CIK].append(date) 186 | self.active_docs.add(filing) 187 | os.rename(filingpath, self.DataDir + 'Active/' + filingpath) 188 | # Move the filing to the 'Active' directory - note this means atm all parsed data is stored in the directory structure 189 | company.build_wordset() 190 | self.company_word_sets.append(company.wordset) 191 | self.save_company(company) 192 | SIC = company.SIC 193 | 194 | try: 195 | if CIK not in self.industries[SIC]: 196 | self.industries[SIC].append(CIK) 197 | except KeyError: 198 | self.industries[SIC] = [CIK] 199 | del company # Get it out of memory. 
Probably unnecessary 200 | 201 | else: # if CIK not in self.goodCIKs 202 | self.inactive_CIKs.add(CIK) 203 | ensure(self.DataDir + 'Inactive/' + CIK) 204 | for filing in os.listdir(CIK): 205 | self.inactive_docs.add(filing) 206 | os.rename(CIK +'/'+ filing, 207 | self.DataDir + 'Inactive/' + CIK +'/'+ filing) 208 | os.removedirs(CIK) 209 | end = time.time() 210 | print "Time elapsed in processing: %.1f" % (end-start) 211 | 212 | def generic_word_set(self, proportion): 213 | self.generic_word_set = proportional_set_intersection(self.company_word_sets, proportion) 214 | 215 | def gen_training_set(self, cutoff, skipyears): 216 | self.training_set = {} 217 | for CIK, dates in self.CIK2date: 218 | if random.random() > cutoff: continue 219 | datelist = [] 220 | for date in dates: 221 | if date not in skipyears: 222 | datelist.append(date) 223 | if datelist != []: 224 | self.training_set[CIK] = datelist 225 | 226 | 227 | def load_company(self, CIK): 228 | # Look for the company in: 229 | # 1. The pickles directory 230 | # 2. Make a new company 231 | # If #2, then add to active list. If #3, then add to active list and add SIC to industries. 232 | if os.path.exists(self.DataDir + 'Pickles/CIKs/' + CIK + '.dat'): 233 | with open(self.DataDir + 'Pickles/CIKs/' + CIK + '.dat', 'r') as f: 234 | company = pickle.load(f) 235 | else: 236 | company = Company(CIK) 237 | return company 238 | 239 | def save_company(self, company): 240 | with open(self.DataDir + 'Pickles/CIKs/' + company.CIK + '.dat', 'w') as f: 241 | pickle.dump(company, f, 2) 242 | 243 | def print_stats(self): 244 | good = len(self.good_CIKs) 245 | active = len(self.active_CIKs) 246 | inactive = len(self.inactive_CIKs) 247 | sics = len(self.industries.keys()) 248 | proc = len(self.processed_docs) 249 | valid = len(self.valid_docs) 250 | exceptions = len(self.exception_docs) 251 | activeD = len(self.active_docs) 252 | inactiveD = len(self.inactive_docs) 253 | try: 254 | safeprint("%d good CIKs, %d active CIKs, %d inactive CIKs" % (good, active, inactive)) 255 | safeprint("%.2f of observed CIKs are active, %.2f of good CIKs are active" % (active / float(active + inactive), active / float(good))) 256 | safeprint("%d SICs, average of %1.2f active CIKs per SIC" % (sics, active / float(sics))) 257 | safeprint("%d processed documents, %d valid, %d exceptions" % (proc, valid, exceptions)) 258 | safeprint("Implied: %1.2f CIKs per document, %.2f exception rate" % (valid / float(proc - exceptions), exceptions / float(proc))) 259 | safeprint("%d active documents, %d inactive, %.2f activation rate" % (activeD, inactiveD, activeD / float(proc))) 260 | except ZeroDivisionError: 261 | safeprint("Please run the manager on some files before printing stats") 262 | 263 | class Company(object): 264 | """Keeps track of a single company (as identified by CIK). 265 | Contains CIK, SIC classification (if any), name, a list of filingdates, a mapping from filing dates to document parses, and a set of all words used by this company in any document.""" 266 | def __init__(self, CIK): 267 | self.CIK = CIK 268 | self.SIC = 0 269 | self.name = '' 270 | self.dates = [] 271 | self.docs = {} # (count_dict, #words) tuples are indexed by filingdate 272 | self.wordset = set() 273 | 274 | def properties(self, filers): 275 | # If company has no properties, then add them. 
If not, check for discrepancies 276 | filerdict = filers[self.CIK] 277 | newSIC = filerdict['SIC'] 278 | if self.SIC == 0: 279 | self.SIC = newSIC 280 | elif self.SIC != newSIC: 281 | logging.warning("Company switched SICs: CIK: %s orig SIC: %d new SIC: %d" % (self.CIK, self.SIC, newSIC)) 282 | cname = filerdict['CompanyName'] 283 | if self.name == '': 284 | self.name = cname 285 | elif self.name != cname: 286 | logging.warning("Company switched names: %s %s" % (self.name, cname)) 287 | 288 | 289 | def add_document(self, filing_date, raw_text): 290 | self.dates.append(filing_date) 291 | word_count, n_words = rb_parser.build_word_count(raw_text) 292 | self.docs[filing_date] = (word_count, n_words) 293 | self.wordset |= word_count.viewkeys() 294 | 295 | def rebuild_wordset(self): 296 | #Should build a set containing every word which exists in at least one filing 297 | #This is done automatically as documents are added; should only be called if you have some reason to rebuild the entire set 298 | for (word_dict, numwords) in self.docs.itervalues(): 299 | self.wordset |= word_dict.viewkeys() 300 | 301 | 302 | def proportional_set_intersection(sets, p): 303 | # Takes a list of sets: [Set1, Set2, Set3]. 304 | # s = len(sets) 305 | # Returns a set containing every element which was in at least p proportion of the sets, i.e. there were at least s * p instances in the sets 306 | count = {} 307 | for sett in sets: 308 | for element in sett: 309 | try: 310 | count[element] += 1 311 | except KeyError: 312 | count[element] = 1 313 | 314 | s = len(sets) 315 | n = math.floor(s * p) 316 | 317 | outset = set() 318 | for key,val in count.iteritems(): 319 | if val > n: 320 | outset.add(key) 321 | 322 | 323 | # Utility functions 324 | 325 | def recursive_file_gen(mydir): 326 | for root, dirs, files in os.walk(mydir): 327 | for file in files: 328 | if file[0] not in ('.', '_'): 329 | yield (os.path.join(root, file), file) 330 | 331 | def ensure(dir): 332 | if not os.path.exists(dir): 333 | os.makedirs(dir) 334 | 335 | def touch(filepath): 336 | if not os.path.exists(filepath): 337 | with open(filepath, 'w') as f: 338 | pass 339 | 340 | def safelink(source, dest): 341 | try: 342 | os.link(source, dest) 343 | except OSError: 344 | pass 345 | 346 | def dePickle(filestr): 347 | with open(filestr, 'r') as f: 348 | return pickle.load(f) 349 | 350 | def safeprint(string): 351 | try: 352 | print string 353 | except: 354 | pass 355 | 356 | def pretty_dict(output): 357 | lenlist = [] 358 | for key, val in output.iteritems(): 359 | lenlist.append((key,len(val))) 360 | lenlist = sorted(lenlist, key=lambda student: student[1]) 361 | for (sic, i) in lenlist: 362 | print str(sic) + ('*' * i) 363 | 364 | if __name__ == "__main__": 365 | main() --------------------------------------------------------------------------------
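Note on manager.py above: proportional_set_intersection builds outset but ends without
returning it, so Manager.generic_word_set always stores None. A minimal sketch of the same
helper with the return added, assuming the intended result is the set itself; the counting
logic is unchanged, and the use of collections.Counter here is an assumption, not something
manager.py itself does:

    from collections import Counter
    import math

    def proportional_set_intersection(sets, p):
        # Count how many of the input sets each element appears in
        count = Counter()
        for s in sets:
            count.update(s)
        n = math.floor(len(sets) * p)
        # Keep elements that appear in more than n of the sets
        return set(k for k, v in count.iteritems() if v > n)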