├── testclean.py
├── Data
│   └── Pickles
│       └── manager.dat
├── .gitignore
├── README
├── Utilities
│   ├── reset_manager.py
│   └── test_compression.py
├── data_reporter.py
├── cik_reader.py
├── yahoo_parser.py
├── stock.py
├── parser.py
├── scraper.py
├── rb_parser.py
├── stock_price.py
├── classifier.py
├── clean_scrape.py
├── stocks_downloader.py
├── Old
│   └── data_manager.py
└── manager.py

/testclean.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/Data/Pickles/manager.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/teamdandelion/RoboBuffett/HEAD/Data/Pickles/manager.dat
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | Data/Processed/*
2 | Data/Unprocessed/*
3 | Data/Exceptions/*
4 | Data/Active/*
5 | Data/Inactive/*
6 | Data/Pickles/CIKs*
7 | Data/2001/*
8 | testdoc
9 | *.csv
10 | *.pyc
11 | *.log
12 | TestData/*
13 | good_CIK.txt
14 | Utilities/*.dat
15 | Utilities/generate_CIKTicker_mapping.py
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Project goal: Predict future stock performance based on textual analysis of SEC filings.
2 | 
3 | Modules:
4 | 
5 | Scraper (Izaak):
6 |     Pull SEC documents from EDGAR.
7 | 
8 | Stock (Ahmad):
9 |     Get stock prices and maintain the CIK <-> ticker mapping.
10 | 
11 | Manager (Dan):
12 |     Manage and organize data; coordinates parsing, serialization, classification, and testing.
13 | 
14 | Parser (Dan):
15 |     Parse SEC filings, extracting header info and document text.
16 | 
17 | Classifier (Ahmad):
18 |     Generate training classifications for documents; calls on Stock. Called by Manager.
19 | 
20 | To be added: multinomial model generator, LLV (log-likelihood value) classifier. To be maintained by Dan.
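
A minimal usage sketch of how the modules are meant to fit together, assuming a filing has already been scraped to a local path. The path and the print handling are hypothetical; parse_quarterly_filing, build_word_count, and str2date are defined in parser.py, and CIK_to_ticker and get_close in stock.py (which expects validated_CIK.dat and stocks_dat/ to exist already):

    #!/usr/bin/env python
    # Sketch only: 'Data/Processed/example_filing.txt' is a placeholder path.
    import parser, stock

    header, filers, text = parser.parse_quarterly_filing('Data/Processed/example_filing.txt')
    word_count, num_words = parser.build_word_count(text)
    filed = parser.str2date(header['FilingDate'])   # 'FILED AS OF DATE' string -> datetime.date

    for cik in filers:
        ticker = stock.CIK_to_ticker(cik)           # None if we have no price data for this CIK
        if ticker is None:
            continue
        # get_close takes a (year, month, day) tuple and returns a [(date, close)] list
        print ticker, stock.get_close(ticker, (filed.year, filed.month, filed.day))
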
-------------------------------------------------------------------------------- /Utilities/reset_manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, shutil 4 | 5 | def clean_all(): 6 | DataDir = os.path.expanduser('~/Documents/Code/RoboBuffett/Data/') 7 | temp = DataDir + 'Temp/' 8 | ensure(temp) 9 | ensure(temp + 'Pickles/') 10 | to_delete = ('Preprocessed','Active','Inactive','Processed','Exceptions','Pickles/CIKs', 'Pickles/manager.dat') 11 | for item in to_delete: 12 | try: 13 | os.rename(DataDir + item, temp + item) 14 | except OSError as e: 15 | print str(e) +': ' + item 16 | print "Renamed, removing temp dir" 17 | shutil.rmtree(temp) 18 | 19 | def ensure(dir): 20 | if not os.path.exists(dir): 21 | os.makedirs(dir) 22 | 23 | if __name__ == "__main__": 24 | clean_all() -------------------------------------------------------------------------------- /data_reporter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import os.path 4 | from os.path import isdir 5 | import random 6 | datadir = '/Volumes/Conduit/RBData' 7 | years = map(str,range(1999,2012)) 8 | qtrs = ['QTR1','QTR2','QTR3','QTR4'] 9 | 10 | 11 | bad_years = [] 12 | bad_quarters = [] 13 | 14 | def djoin(dir1, dir2=""): 15 | return datadir + '/' + dir1 + '/' + dir2 16 | 17 | def missing_linebreaks(filee): 18 | if os.path.getsize(filee) > 0 and os.path.isfile(filee): 19 | with open(filee, 'r') as f: 20 | i = 0 21 | for line in f: 22 | i += 1 23 | if i > 2: 24 | return False 25 | print filee 26 | return True 27 | return False 28 | 29 | for year in years: 30 | if not isdir(djoin(year)): 31 | print "!Year {} not found".format(year) 32 | bad_years.append(year) 33 | continue 34 | print "----" 35 | for qtr in qtrs: 36 | dirr = djoin(year,qtr) 37 | if not isdir(dirr): 38 | print "!Quarter {} {} not found".format(qtr,year) 39 | bad_quarters.append((qtr,year)) 40 | continue 41 | 42 | os.chdir(dirr) 43 | files = len(os.listdir('.')) 44 | size = float(sum([os.path.getsize(f) for f in os.listdir('.') if os.path.isfile(f)])) / (1024**3) 45 | if files > 0: 46 | avg = size / files * (1024**2) 47 | else: 48 | avg = 0 49 | 50 | rsample = random.sample(os.listdir('.'),min(20,files)) 51 | missing_lb = any(map(missing_linebreaks, rsample)) 52 | 53 | print "Quarter {} {}: {:6d} files, {:1.2f}GB size, {:4.0f}kB avg size".format(qtr, year, files, size, avg) 54 | if missing_lb: 55 | print "Quarter {} {}: Missing linebreaks!".format(qtr,year) 56 | print zip(rsample,map(missing_linebreaks, rsample)) 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /cik_reader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from collections import defaultdict 4 | 5 | 6 | def main(): 7 | 8 | 9 | lst = open('stocks_CIK.txt', 'r').read().split('\n')[:-1] 10 | 11 | ''' Get rid of repeat CIK's : Find out a way to deal with them later''' 12 | remove_repeats = [] 13 | for s in lst: 14 | remove_repeats += [k for k in lst if s.split('\t')[3]==s.split('\t')[3]] 15 | for r in remove_repeats: 16 | try: lst.remove(r) 17 | except: pass 18 | 19 | ''' Properly parse stocks_CIK ''' 20 | 21 | collector = defaultdict(list) 22 | for s in lst: 23 | l = s.split('\t') 24 | ticker = l[0] # obtain ticker symbol 25 | name = l[1] # obtain company name 26 | flag = int(l[2]) # CIK flag 27 | cik = l[3] 28 | 
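        # Flag semantics, per the comment in stocks_downloader.cikgetter: -1 means the CIK
        # lookup failed, 100 means the CIK was found by the traditional method, and
        # 0 <= n <= 99 means n words were subtracted before a match was found.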
if flag == -1: 29 | pass 30 | elif flag == 100: 31 | collector[(flag,1)].append((ticker,cik,name)) 32 | elif (flag < 100) | (flag > -1): 33 | # If only one CIK 34 | if len(cik) == 10: 35 | collector[(flag,1)].append((ticker,cik,name)) 36 | else: 37 | cik_eval = eval(cik) 38 | collector[(flag,len(cik_eval))].append((ticker,cik,name)) 39 | else: 40 | print 'Encountered unexpected line, quit' 41 | exit() 42 | 43 | # Write good ticker, CIK pairs in here 44 | #writer = open('good_CIK.txt', 'w') 45 | #writer.write(ticker+'\t'+cik+'\n') # write pair to file 46 | #writer.close() 47 | 48 | d = dict(collector) 49 | 50 | k = d.keys() 51 | for i in k: 52 | if i[1] == 1: 53 | print i, len(collector[i]) 54 | 55 | if i[0] == 100: 56 | print i, len(collector[i]) 57 | 58 | if i[1] == 2: 59 | print i, len(collector[i]) 60 | 61 | if i[1] == 3: 62 | print i, len(collector[i]) 63 | 64 | with open('validated_CIK.txt','wb') as f: 65 | for k,v in d.iteritems(): 66 | if (k[0] == 100) | (k[1] == 1): 67 | for i in v: 68 | f.write(i[0]+'\t'+i[1]+'\t'+i[2]+'\n') 69 | 70 | ones_list = [] 71 | for k,v in d.iteritems(): 72 | if (k[0] == 2) & (k[1] == 1): 73 | for i in v: 74 | ones_list.append([i[0],i[1],i[2]]) 75 | 76 | matchcounter = 0 77 | for i in [i[1] for i in ones_list]: 78 | matchups = [(k[0],k[2]) for k in ones_list if k[1] == i] 79 | if len(matchups) > 1: 80 | print matchups 81 | matchcounter += 1 82 | 83 | print matchcounter 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /yahoo_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | 4 | from BeautifulSoup import BeautifulSoup as bs 5 | from collections import defaultdict 6 | from multiprocessing import Pool 7 | import urllib2 8 | try: import cPickle as pickle 9 | except: import pickle 10 | 11 | def main(): 12 | download_list = 0 13 | validate_list = 0 14 | compile_list = 0 15 | 16 | 17 | ''' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''' 18 | fulllistdat = 'raw_stocks_list.dat' 19 | if download_list: 20 | maximum = 23686; cnt = 1; fcnt = 1; collector = defaultdict(list); 21 | while cnt < maximum: 22 | soup = bs(urllib2.urlopen('http://screener.finance.yahoo.com/b?pr=0/&s=tk&vw=1&db=stocks&b=' + str(cnt))) 23 | table = soup.findAll("table")[1].contents[1].contents[1].contents[1] 24 | for n in range(21)[1:]: 25 | try: 26 | ticker = str(table.contents[n].find('a').string).replace(';','') 27 | name = str(table.contents[n].findAll('font')[1].string).replace('&','&') 28 | collector[ticker] = name 29 | print fcnt,'of',maximum,'\t',ticker,'\t',name 30 | fcnt += 1 31 | except: 32 | saver(collector, fulllistdat) 33 | cnt += 20 34 | saver(collector, fulllistdat) 35 | 36 | 37 | 38 | 39 | ''' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''' 40 | # Load pickled stocks data 41 | with open(fulllistdat) as f: 42 | d = pickle.load(f) 43 | 44 | if validate_list: 45 | # Clear file in which we record validation tickers 46 | open('record_stock_name_validation.txt', 'w').close() 47 | 48 | pool = Pool(processes=16) 49 | pool.map(validator, d.keys()) 50 | 51 | 52 | 53 | ''' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ''' 54 | if compile_list: 55 | with open('raw_name_validation.txt', 'r') as f: 56 | rcrd = f.read().split('\n')[:-1] 57 | collector = defaultdict(list) 58 | for i in rcrd: 59 | collector[i.split('\t')[0]] = i.split('\t')[1] 60 | notdone = 
list(set(d.keys()).difference(set(collector.keys()))) 61 | 62 | pool = Pool(processes=8) 63 | pool.map(validator, notdone) 64 | 65 | final_list = defaultdict(list) 66 | p = 0; n = 0; f = 0; r = 0; 67 | for k,v in collector.iteritems(): 68 | if v == '': 69 | n += 1 70 | elif v == 'FAIL': 71 | f += 1 72 | elif v == 'PASS': 73 | final_list[k] = d[k] 74 | p += 1 75 | else: 76 | final_list[v] = d[k] 77 | r += 1 78 | 79 | with open('stocks_list.dat','wb') as fn: 80 | pickle.dump(dict(final_list),fn) 81 | 82 | print 'Total list:',len(final_list) 83 | print 'Nothing:',n,'| Fail:',f,'| Pass:',p,'| Replace:',r 84 | 85 | 86 | def validator(ticker): 87 | soup = bs(urllib2.urlopen('http://finance.yahoo.com/q?s=' + ticker.replace('&','%26') )) 88 | 89 | outcome = '' 90 | 91 | try: 92 | if ( str(soup.find('h3').contents[0]) == 'Changed Ticker Symbol' ): 93 | outcome = str(soup.findAll('p')[1].contents[1].contents[0]) 94 | except: pass 95 | 96 | try: 97 | if ( str(soup.findAll('h2')[2].contents[0]) == 'There are no All Markets results for' ): 98 | outcome = 'FAIL' 99 | except: pass 100 | 101 | try: 102 | tname = str(soup.findAll('h2')[3].contents[0]).split('(')[-1][:-1] 103 | # tname = fname[fname.find("(")+1:fname.find(")")] 104 | if ticker in tname: 105 | if ticker == tname: 106 | outcome = 'PASS' 107 | else: 108 | outcome = tname 109 | except: pass 110 | 111 | with open('raw_name_validation.txt','a') as f: 112 | f.write(ticker+'\t'+outcome+'\n') 113 | 114 | print ticker,'\t',outcome 115 | 116 | def saver(collector, fname): 117 | with open(fname,'wb') as f: 118 | pickle.dump(dict(collector),f) 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /stock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, collections, datetime 4 | try: import cPickle as pickle 5 | except: import pickle 6 | 7 | """Handles using stock prices that have already been scraped""" 8 | #CIKs are *string* values. Considered storing as ints but CIKs are 10-digit, could potentially overflow 32-bit ints. Since we never do arithmetic on them, and only use them is index keys, better to use strings. 9 | 10 | #Tickers will be (Exchange, Ticker) tuples, e.g. ('NYSE', 'GS'). 11 | 12 | #Prices will be integer values with 100 = $1 conversion. Thus Apple's price of $558.22 becomes 55822. 13 | 14 | # 'data' outputed is in form (open, high, low, close, volume) 15 | 16 | def ticker_to_CIK(ticker): 17 | ''' Return CIK if valid ticker, return None otherwise ''' 18 | with open('validated_CIK.dat') as f: 19 | d = pickle.load(f) 20 | try: return [i[1] for i in d if i[0] == ticker][0] 21 | except IndexError: return None 22 | 23 | def CIK_to_ticker(CIK): 24 | ''' Return ticker if valid CIK, return None otherwise ''' 25 | with open('validated_CIK.dat') as f: 26 | d = pickle.load(f) 27 | try: return [i[0] for i in d if i[1] == CIK][0] 28 | except IndexError: return None 29 | 30 | def good_CIKs(): 31 | ''' Return list of all CIKs for which we have trading info on. ''' 32 | with open('validated_CIK.dat') as f: 33 | d = pickle.load(f) 34 | return [i[1] for i in d] 35 | 36 | def good_tickers(): 37 | ''' Return list of all tickers for which we have trading info on. 
''' 38 | with open('validated_CIK.dat') as f: 39 | d = pickle.load(f) 40 | return [i[0] for i in d] 41 | 42 | def get_open(ticker, dates): 43 | ''' Return a price on that date, or the next available day ''' 44 | ''' Returns list of (date, price) tuples ''' 45 | if isinstance(dates,list): 46 | return [(i[0],i[1][0]) for i in get_data(ticker,dates)] 47 | elif isinstance(dates,tuple): 48 | return [(i[0],i[1][0]) for i in [get_datum(ticker,dates)]] 49 | else: 50 | return (None,None) 51 | 52 | def get_close(ticker, dates): 53 | ''' Return a price on that date, or the next available day ''' 54 | ''' Returns list of (date, price) tuples ''' 55 | if isinstance(dates,list): 56 | return [(i[0],i[1][3]) for i in get_data(ticker,dates)] 57 | elif isinstance(dates,tuple): 58 | return [(i[0],i[1][3]) for i in [get_datum(ticker,dates)]] 59 | else: 60 | return (None,None) 61 | 62 | def get_volume(ticker, dates): 63 | ''' Return volume on that date, or the next available day ''' 64 | ''' Returns list of (date, volume) tuples ''' 65 | if isinstance(dates,list): 66 | return [(i[0],i[1][4]) for i in get_data(ticker,dates)] 67 | elif isinstance(dates,tuple): 68 | return [(i[0],i[1][4]) for i in [get_datum(ticker,dates)]] 69 | else: 70 | return (None,None) 71 | 72 | def get_data(ticker, dates): 73 | out_list = [] 74 | for d in dates: 75 | out_list.append(get_datum(ticker,d)) 76 | return out_list 77 | 78 | def get_datum(ticker, date): 79 | ''' Return a price on that date, or the next available day ''' 80 | ''' Returns (date, price) tuple ''' 81 | # Check if file exists 82 | if os.path.isfile('stocks_dat/'+ticker+'.dat'): 83 | with open('stocks_dat/'+ticker+'.dat','r') as f: 84 | d = pickle.load(f) 85 | try: return (date, d[date]) 86 | # (exception) If date entry does not exist, try next day 87 | except KeyError: return get_datum(ticker, get_nextday(date)) 88 | # (any other exception) return None 89 | else: return (None, None) 90 | else: 91 | return (None, None) 92 | 93 | def get_nextday(date): 94 | n = datetime.date(date[0],date[1],date[2]) + datetime.timedelta(days=1) 95 | return (n.year, n.month, n.day) 96 | 97 | def get_marketcap(ticker, date): 98 | # Return market cap on closest defined day, raise an exception if not defined at all in a 12 month span 99 | ''' WORK ON THIS ''' 100 | return None 101 | -------------------------------------------------------------------------------- /parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import string, re 3 | from datetime import date 4 | import sys # For test parsing functionality 5 | from pdb import set_trace as debug 6 | 7 | class ParseError(BaseException): 8 | pass 9 | 10 | 11 | 12 | 13 | def parse_quarterly_filing(filepath): 14 | with open(filepath, 'r') as doc: 15 | rawtext = doc.read() 16 | # First partition to separate all of the header data (header + filers) 17 | filer_ptext = '\nFILER:\n' # Break filer sections at this text 18 | partitioned = rawtext.partition('') 19 | if partitioned[1] != '': 20 | partitioned = partitioned[0].partition('') 21 | if partitioned[1] != '': 22 | raise ParseError('Unable to partition header from body') 23 | 24 | header_text = partitioned[0] 25 | document_text = partitioned[2] # Text of the document 26 | 27 | header_text = header_text.partition(filer_ptext) 28 | if header_text[1] != filer_ptext: 29 | raise ParseError('Unable to partition on %s' % filer_ptext) 30 | filer_text = header_text[2].partition(filer_ptext) 31 | header_text = header_text[0] # Just the 
document header - filing date etc 32 | 33 | filers_textlist = [] # Also important 34 | 35 | while filer_text[1] == filer_ptext: 36 | filers_textlist.append(filer_text[0]) 37 | filer_text = filer_text[2].partition(filer_ptext) 38 | filers_textlist.append(filer_text[0]) 39 | 40 | #Is there a more efficient place to define these constants? 41 | header_info = ( 42 | ('DocType', 'CONFORMED SUBMISSION TYPE:' ), 43 | ('ReportingPeriod', 'CONFORMED PERIOD OF REPORT:'), 44 | ('FilingDate', 'FILED AS OF DATE:' )) 45 | 46 | filer_info = ( 47 | ('CompanyName', 'COMPANY CONFORMED NAME:' ), 48 | ('CIK', 'CENTRAL INDEX KEY:' ), 49 | ('SIC', 'STANDARD INDUSTRIAL CLASSIFICATION:')) 50 | 51 | try: 52 | header_dict = parse_fields(header_text, header_info) 53 | #header_dict['FilingDate'] = str2date(header_dict['FilingDate']) 54 | except ParseError: # Re-raise with a name 55 | raise ParseError('Unable to parse header') 56 | 57 | 58 | cik2filer = {} 59 | for filer in filers_textlist: 60 | try: 61 | filerdict = parse_fields(filer, filer_info) 62 | CIK = filerdict['CIK'] 63 | filerdict['SIC'] = force_to_int(filerdict['SIC']) 64 | cik2filer[CIK] = filerdict 65 | except ParseError: 66 | pass 67 | 68 | 69 | if cik2filer == {}: 70 | raise ParseError('No valid filers') 71 | 72 | 73 | #word_count = build_word_count(document_text) 74 | 75 | return (header_dict, cik2filer, document_text) 76 | 77 | 78 | def build_word_count(text): 79 | to_remove = string.punctuation + string.digits 80 | text = re.sub('<[^>]*>', '', text) # Remove all 81 | text = text.translate(None, (to_remove)) 82 | # Removes all punctuation and digits 83 | text = text.lower().split() 84 | # Splits the text into a list of lowercase words 85 | # Possible improvements: Strip tables 86 | num_words = len(text) 87 | word_count = {} 88 | for word in text: 89 | try: 90 | word_count[word] += 1 91 | except KeyError: 92 | word_count[word] = 1 93 | # This try/except method may be somewhat more efficient than if-then branching for unigram processing. For n-grams, perhaps better to use if-then. 94 | return (word_count, num_words) 95 | 96 | def parse_fields(text, property_info): 97 | # Defines the properties to seek in the text of the filing, and names to assign them to in the self.properties dictionary. I hope Python doesn't waste time re-creating this tuple every time parse_quarterly_filing is called. 
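    # parse_fields scans each line of `text` for the given identifier prefixes and
    # returns a {name: content} dict; it raises ParseError if a matched field is
    # empty or if any requested field is missing.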
98 | properties = {} 99 | text = text.split('\n') 100 | for line in text: 101 | line = line.strip() 102 | for (name, identifier) in property_info: 103 | if line.startswith(identifier): 104 | content = line.partition(identifier)[2].strip() 105 | if content == '': 106 | raise ParseError('Empty field') 107 | properties[name] = content 108 | break # Move on to the next line once we find a field 109 | 110 | if len(properties) != len(property_info): 111 | raise ParseError('Unable to find all fields') 112 | else: 113 | return properties 114 | 115 | def force_to_int(val): 116 | try: 117 | converted = int(val) 118 | except ValueError: 119 | to_remove = string.punctuation + string.letters + string.whitespace 120 | forced_val = val.translate(None, (to_remove)) 121 | if forced_val == '': 122 | raise ParseError('Unable to convert SIC to #: %s' % val) 123 | converted = int(forced_val) 124 | return converted 125 | 126 | def str2date(datestr): 127 | year = int(datestr[0:4]) 128 | month = int(datestr[4:6]) 129 | day = int(datestr[6:8]) 130 | return date(year, month, day) 131 | 132 | def test_parse(document): 133 | (header, filers, rawtext) = parse_quarterly_filing(document) 134 | pretty_dict(header, "header") 135 | [pretty_dict(x, "filer") for x in filers] 136 | wc = build_word_count(rawtext) 137 | pretty_dict(wc, "words") 138 | 139 | def main(): 140 | argv = sys.argv 141 | if len(argv) == 1: 142 | print "Give a document and I'll test parse it" 143 | exit(0) 144 | fpath = argv[1] 145 | test_parse(fpath) 146 | 147 | def pretty_dict(output, name): 148 | print name + ":" 149 | for key, val in output.iteritems(): 150 | print "\t" + str(key) + ": " + str(val) 151 | 152 | if __name__ == "__main__": 153 | main() 154 | 155 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | from ftplib import FTP 2 | import os 3 | import sys 4 | import zipfile 5 | import re 6 | import argparse 7 | import threading 8 | import Queue 9 | 10 | def connect_to_SEC(index): 11 | if index > 50: 12 | print "Maximum number of attempts exceeded. Try again later." 13 | else: 14 | try: 15 | return FTP('ftp.sec.gov') 16 | except EOFError: 17 | print "Connection refused on attempt {0}. 
Trying again...".format(index) 18 | return connect_to_SEC(index + 1) 19 | 20 | def download_file(serverpath, local_path): 21 | global ftp 22 | with open (local_path, 'w') as out_file: 23 | command = 'RETR ' + serverpath.strip() 24 | ftp.retrbinary(command, out_file.write) 25 | 26 | def ensure(dir): 27 | if not os.path.exists(dir): 28 | os.makedirs(dir) 29 | 30 | def extract_and_remove(zip_path, out_dir): 31 | with zipfile.ZipFile(zip_path, 'r') as outzip: 32 | outzip.extractall(out_dir) 33 | os.remove(zip_path) 34 | 35 | def download_index_files(out_dir): 36 | years = ['1993', '1994', '1995', '1996', 37 | '1997', '1998', '1999', '2000', 38 | '2001', '2002', '2003', '2004', 39 | '2005', '2006', '2007', '2008', 40 | '2009', '2010', '2011', '2012'] 41 | 42 | quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4'] 43 | 44 | # Get the current working directory so that we can change it 45 | # back when we're done 46 | old_cwd = os.getcwd() 47 | ensure(out_dir) 48 | os.chdir(out_dir) 49 | 50 | for year in years: 51 | for quarter in quarters: 52 | subdir = year + '/' + quarter 53 | ensure(subdir) 54 | path = subdir + '/form.zip' 55 | download_file(path, path) 56 | extract_and_remove(path, subdir) 57 | 58 | os.chdir(old_cwd) 59 | 60 | 61 | def split_list(xs, y, eq_func=lambda a, b: a == b): 62 | for i, x in enumerate(xs): 63 | if eq_func(x, y): 64 | return [xs[:i], xs[i + 1:]] 65 | else: 66 | return [xs] 67 | 68 | def paths_for_10ks(index_file): 69 | paths = [] 70 | lines = index_file.read().splitlines() 71 | lines = split_list(lines, '-+$', lambda a, b: re.match(b, a))[1] 72 | for line in lines: 73 | if line[:4] == '10-K' or line[:4] == '10-Q': 74 | fields = re.split('\s\s+', line) 75 | company, date, server_path = (fields[1], fields[3], fields[4]) 76 | paths.append((server_path, '{0}_{1}_{2}'.format(company.replace('/', '-'), date, fields[0].replace('/','-')))) 77 | return paths 78 | 79 | def download_forms_serially(paths): 80 | global ftp 81 | for server_path, local_path in paths: 82 | try: 83 | with open(local_path, 'w') as out_file: 84 | ftp.retrlines('RETR ' + server_path, out_file.write) 85 | print "Saved: {0}".format(local_path) 86 | except Exception as e: 87 | print e 88 | print 'Download failed on file at: {0}'.format(server_path) 89 | 90 | def download_10ks(data_directory): 91 | for root, dirs, files in os.walk(data_directory): 92 | for name in files: 93 | path = os.path.join(root, name) 94 | if path.split('.')[-1] != 'idx': 95 | continue 96 | with open(path, 'r') as index_file: 97 | form_paths = [(s, os.path.join(root, l)) for s,l in paths_for_10ks(index_file)] 98 | download_forms_serially(form_paths) 99 | 100 | # A class to facilitate multithreaded downloading of data over FTP 101 | class FTPThread(threading.Thread): 102 | """A class to download data over FTP in parallel threads""" 103 | def __init__(self, server_path, local_path): 104 | self.server_path = server_path 105 | self.local_path = local_path 106 | threading.Thread.__init__(self) 107 | def run(self): 108 | global ftp 109 | try: 110 | with open(self.local_path, 'w') as out_file: 111 | ftp.retrlines('RETR ' + self.server_path, out_file.write) 112 | print "Saved: {0}".format(self.local_path) 113 | except Exception as e: 114 | print e 115 | print 'Download failed on file at: {0}'.format(self.server_path) 116 | 117 | 118 | def download_forms(paths, max_threads): 119 | finished = [] 120 | def producer(q, paths): 121 | for server_path, local_path in paths: 122 | thread = FTPThread(server_path, local_path) 123 | thread.start() 124 | 
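            # q is bounded at max_threads, so this blocking put() throttles how many
            # FTPThread downloads can be in flight before the consumer join()s one.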
q.put(thread, True) 125 | 126 | def consumer(q, total_files): 127 | while len(finished) < total_files: 128 | thread = q.get(True) 129 | thread.join() 130 | finished.append(thread) 131 | 132 | q = Queue.Queue(max_threads) 133 | 134 | prod_thread = threading.Thread(target=producer, args=(q, paths)) 135 | cons_thread = threading.Thread(target=consumer, args=(q, len(paths))) 136 | prod_thread.start() 137 | cons_thread.start() 138 | prod_thread.join() 139 | cons_thread.join() 140 | 141 | if __name__ == '__main__': 142 | parser = argparse.ArgumentParser(description='Download either index files (i) or form files (f) to a given directory.') 143 | parser.add_argument('mode', type=str, choices=['i', 'f']) 144 | parser.add_argument('directory', type=str) 145 | 146 | args = parser.parse_args() 147 | 148 | ftp = connect_to_SEC(0) 149 | ftp.login() 150 | 151 | if args.mode == 'i': 152 | index_path = '/edgar/full-index' 153 | ftp.cwd(index_path) 154 | download_index_files(args.directory) 155 | else: 156 | download_10ks(args.directory) 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /rb_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import string, re 3 | from datetime import date 4 | import sys # For test parsing functionality 5 | from pdb import set_trace as debug 6 | 7 | class ParseError(BaseException): 8 | pass 9 | 10 | 11 | 12 | 13 | def parse_quarterly_filing(filepath): 14 | """Parse a 10-K or 10-Q. 15 | Returns (header_dict, cik2filer, document_text). 16 | header_dict = """ 17 | with open(filepath, 'r') as doc: 18 | rawtext = doc.read() 19 | # First partition to separate all of the header data (header + filers) 20 | filer_ptext = '\nFILER:\n' # Break filer sections at this text 21 | partitioned = rawtext.partition('') 22 | if partitioned[1] != '': 23 | partitioned = partitioned[0].partition('') 24 | if partitioned[1] != '': 25 | raise ParseError('Unable to partition header from body') 26 | 27 | header_text = partitioned[0] 28 | document_text = partitioned[2] # Text of the document 29 | 30 | header_text = header_text.partition(filer_ptext) 31 | if header_text[1] != filer_ptext: 32 | raise ParseError('Unable to partition on %s' % filer_ptext) 33 | filer_text = header_text[2].partition(filer_ptext) 34 | header_text = header_text[0] # Just the document header - filing date etc 35 | 36 | filers_textlist = [] # Also important 37 | 38 | while filer_text[1] == filer_ptext: 39 | filers_textlist.append(filer_text[0]) 40 | filer_text = filer_text[2].partition(filer_ptext) 41 | filers_textlist.append(filer_text[0]) 42 | 43 | #Is there a more efficient place to define these constants? 44 | # Defines the properties to seek in the text of the filing, and names to assign them to in the self.properties dictionary. I hope Python doesn't waste time re-creating this tuple every time parse_quarterly_filing is called. 
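    # Note: header_info and filer_info could be hoisted to module level to avoid
    # rebuilding the tuples on each call, though the cost of two small tuple
    # literals per filing is negligible in practice.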
45 | 46 | header_info = ( 47 | ('DocType', 'CONFORMED SUBMISSION TYPE:' ), 48 | ('ReportingPeriod', 'CONFORMED PERIOD OF REPORT:'), 49 | ('FilingDate', 'FILED AS OF DATE:' )) 50 | 51 | filer_info = ( 52 | ('CompanyName', 'COMPANY CONFORMED NAME:' ), 53 | ('CIK', 'CENTRAL INDEX KEY:' ), 54 | ('SIC', 'STANDARD INDUSTRIAL CLASSIFICATION:')) 55 | 56 | try: 57 | header_dict = parse_fields(header_text, header_info) 58 | #header_dict['FilingDate'] = str2date(header_dict['FilingDate']) 59 | except ParseError: # Re-raise with a name 60 | raise ParseError('Unable to parse header') 61 | 62 | 63 | cik2filer = {} 64 | for filer in filers_textlist: 65 | try: 66 | filerdict = parse_fields(filer, filer_info) 67 | CIK = filerdict['CIK'] 68 | filerdict['SIC'] = force_to_int(filerdict['SIC']) 69 | cik2filer[CIK] = filerdict 70 | except ParseError: 71 | pass 72 | 73 | 74 | if cik2filer == {}: 75 | raise ParseError('No valid filers') 76 | 77 | 78 | #word_count = build_word_count(document_text) 79 | 80 | return (header_dict, cik2filer, document_text) 81 | 82 | 83 | def build_word_count(text): 84 | to_remove = string.punctuation + string.digits 85 | text = re.sub('<[^>]*>', '', text) # Remove all 86 | text = text.translate(None, (to_remove)) 87 | # Removes all punctuation and digits 88 | text = text.lower().split() 89 | # Splits the text into a list of lowercase words 90 | # Possible improvements: Strip tables 91 | num_words = len(text) 92 | word_count = {} 93 | for word in text: 94 | try: 95 | word_count[word] += 1 96 | except KeyError: 97 | word_count[word] = 1 98 | # This try/except method may be somewhat more efficient than 99 | # if-then branching for unigram processing. For n-grams, 100 | # perhaps better to use if-then. 101 | return (word_count, num_words) 102 | 103 | def parse_fields(text, property_info): 104 | """Parses a text, looking for specific field information 105 | Takes raw text, and a list of (name, identifier) tuples. 
106 | Returns a dictionary mapping names to the content of the line that started with 'identifier'.""" 107 | properties = {} 108 | text = text.split('\n') 109 | for line in text: 110 | line = line.strip() 111 | for (name, identifier) in property_info: 112 | if line.startswith(identifier): 113 | content = line.partition(identifier)[2].strip() 114 | # Content = everything that followed the identifier 115 | if content == '': 116 | raise ParseError('Empty field') 117 | properties[name] = content 118 | break # Move on to the next line once we find a field 119 | 120 | if len(properties) != len(property_info): 121 | raise ParseError('Unable to find all fields') 122 | else: 123 | return properties 124 | 125 | def force_to_int(val): 126 | try: 127 | converted = int(val) 128 | except ValueError: 129 | to_remove = string.punctuation + string.letters + string.whitespace 130 | forced_val = val.translate(None, (to_remove)) 131 | if forced_val == '': 132 | raise ParseError('Unable to convert SIC to #: %s' % val) 133 | converted = int(forced_val) 134 | return converted 135 | 136 | def str2date(datestr): 137 | year = int(datestr[0:4]) 138 | month = int(datestr[4:6]) 139 | day = int(datestr[6:8]) 140 | return date(year, month, day) 141 | 142 | def test_parse(document): 143 | (header, filers, rawtext) = parse_quarterly_filing(document) 144 | pretty_dict(header, "header") 145 | [pretty_dict(x, "filer") for x in filers] 146 | wc = build_word_count(rawtext) 147 | pretty_dict(wc, "words") 148 | 149 | def main(): 150 | argv = sys.argv 151 | if len(argv) == 1: 152 | print "Give a document and I'll test parse it" 153 | exit(0) 154 | fpath = argv[1] 155 | test_parse(fpath) 156 | 157 | def pretty_dict(output, name): 158 | print name + ":" 159 | for key, val in output.iteritems(): 160 | print "\t" + str(key) + ": " + str(val) 161 | 162 | if __name__ == "__main__": 163 | main() 164 | 165 | -------------------------------------------------------------------------------- /stock_price.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | 4 | from datetime import datetime 5 | from string import lower, upper 6 | import os, csv 7 | 8 | def main(): 9 | # Input stock 10 | ticker = lower('AAPL') 11 | 12 | # If output directory doesn't exist, make it 13 | output_dir = 'stock_data' 14 | if not os.path.exists(output_dir): 15 | os.makedirs(output_dir) 16 | 17 | # If stock CSV isn't downloaded, DOWNLOAD it 18 | if os.path.isfile(output_dir+"/"+ticker+".csv"): 19 | print upper(ticker) + " data already exists in this directory.\n" 20 | else: 21 | download_csv(ticker,output_dir) 22 | 23 | # Run this code to download all S&P 500 files 24 | """ 25 | for s in snp_list(): 26 | download_csv(s,output_dir) 27 | """ 28 | 29 | # Now that the CSV data is downloaded, 30 | # arrange it into a tuple 31 | data = vectorize_csv(ticker,output_dir) 32 | 33 | # # # # TEST EXAMPLES # # # # 34 | 35 | # Test to see if the tuples have been created 36 | print "Here are the first few lins of the data tuple output:" 37 | print "Format: (DATE, OPEN, HIGH, LOW, CLOSE, VOLUME)" 38 | print data[1:5] 39 | 40 | # Retrieve price for specific date 41 | sample_date = (2012,1,18); 42 | print "\nFind price on:", sample_date, "\n", get_prices(data,sample_date) 43 | sample_date = (2012,1,21); 44 | print "\nFind price on:", sample_date, "\n", get_prices(data,sample_date) 45 | 46 | 47 | # Parses the CSV file and returns a tuple with data 48 | # as a tuple of: 49 | # (DATE, OPEN, HIGH, LOW, CLOSE, VOLUME) 50 | 
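# (Each tuple also carries a 7th element, the adjusted close from row[6], which the
# commented-out adjustment block inside vectorize_csv refers to as adj_close.)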
def vectorize_csv(ticker,output_dir): 51 | fulldata = csv.reader(open(output_dir+'/'+ticker+'.csv', 'rb')) 52 | output = [] 53 | 54 | # Throw away header 55 | fulldata.next() 56 | 57 | # Store rest of data in list of tuples 58 | for row in fulldata: 59 | s = row[0].split('-') 60 | tup = ((int(s[0]), int(s[1]), int(s[2])), float(row[1]), \ 61 | float(row[2]), float(row[3]), float(row[4]), int(row[5]), \ 62 | float(row[6])) 63 | output.append(tup) 64 | 65 | # Adjust for dividends, splits, etc. 66 | """ 67 | DATEtemp{ptr,1} = DATEvar; 68 | OPENtemp(ptr,1) = OPENvar * adj_close / CLOSEvar; 69 | HIGHtemp(ptr,1) = HIGHvar * adj_close / CLOSEvar; 70 | LOWtemp (ptr,1) = LOWvar * adj_close / CLOSEvar; 71 | CLOSEtemp(ptr,1)= CLOSEvar * adj_close / CLOSEvar; 72 | VOLtemp(ptr,1) = VOLvar; 73 | """ 74 | 75 | # Reverse to normal chronological order, so 1st entry is oldest data 76 | output.reverse() 77 | return output 78 | 79 | 80 | # Downloads CSV file and stores it in the 81 | # respective directory 82 | def download_csv(ticker,output_dir): 83 | # Build URL string 84 | start_year = '1993' 85 | #Don't need price data older than our filings 86 | now = datetime.now() 87 | 88 | url_string = 'http://ichart.finance.yahoo.com/table.csv?' 89 | url_string += '&s=' + (ticker) 90 | url_string += '&d=' + str(now.month-1) 91 | url_string += '&e=' + str(now.day) 92 | url_string += '&f=' + str(now.year) 93 | url_string += '&g=d&a=0&b=1&c=' + start_year 94 | url_string += '&ignore.csv' 95 | 96 | # Download file using system call 97 | os.system("wget \'" + url_string + "\' -O \'" + output_dir + "/" + (ticker) + ".csv\'") 98 | 99 | print "Finished downloading " + upper(ticker) + "\n" 100 | 101 | def get_prices(data,date): 102 | try: 103 | return (i for i in data if i[0] == date).next() 104 | except StopIteration: 105 | print "No data for the date", date 106 | return None 107 | 108 | 109 | # Simply gives the list of stocks on S&P 500 110 | # Kept this function at the bottom due to size 111 | def snp_list(): 112 | return ['MMM', 'ACE', 'ABT', 'ANF', 'ACN', 'ADBE', 'AMD', 'AES', 'AET', 'AFL', 'A', 'GAS', 'APD', 'ARG', 'AKAM', 'AA', 'ATI', 'AGN', 'ALL', 'ALTR', 'MO', 'AMZN', 'AEE', 'AEP', 'AXP', 'AIG', 'AMT', 'AMP', 'ABC', 'AMGN', 'APH', 'APC', 'ADI', 'AON', 'APA', 'AIV', 'APOL', 'AAPL', 'AMAT', 'ADM', 'AIZ', 'T', 'ADSK', 'ADP', 'AN', 'AZO', 'AVB', 'AVY', 'AVP', 'BHI', 'BLL', 'BAC', 'BK', 'BCR', 'BAX', 'BBT', 'BEAM', 'BDX', 'BBBY', 'BMS', 'BRK.B', 'BBY', 'BIG', 'BIIB', 'BLK', 'HRB', 'BMC', 'BA', 'BWA', 'BXP', 'BSX', 'BMY', 'BRCM', 'BF.B', 'CHRW', 'CA', 'CVC', 'COG', 'CAM', 'CPB', 'COF', 'CAH', 'CFN', 'KMX', 'CCL', 'CAT', 'CBG', 'CBS', 'CELG', 'CNP', 'CTL', 'CERN', 'CF', 'SCHW', 'CHK', 'CVX', 'CB', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CTXS', 'CLF', 'CLX', 'CME', 'CMS', 'COH', 'KO', 'CCE', 'CTSH', 'CL', 'CMCSA', 'CMA', 'CSC', 'CAG', 'COP', 'CNX', 'ED', 'STZ', 'CEG', 'GLW', 'COST', 'CVH', 'COV', 'CSX', 'CMI', 'CVS', 'DHI', 'DHR', 'DRI', 'DVA', 'DF', 'DE', 'DELL', 'DNR', 'XRAY', 'DVN', 'DV', 'DO', 'DTV', 'DFS', 'DISCA', 'DLTR', 'D', 'RRD', 'DOV', 'DOW', 'DPS', 'DTE', 'DD', 'DUK', 'DNB', 'ETFC', 'EMN', 'ETN', 'EBAY', 'ECL', 'EIX', 'EW', 'EP', 'EA', 'EMC', 'EMR', 'ETR', 'EOG', 'EQT', 'EFX', 'EQR', 'EL', 'EXC', 'EXPE', 'EXPD', 'ESRX', 'XOM', 'FFIV', 'FDO', 'FAST', 'FII', 'FDX', 'FIS', 'FITB', 'FHN', 'FSLR', 'FE', 'FISV', 'FLIR', 'FLS', 'FLR', 'FMC', 'FTI', 'F', 'FRX', 'BEN', 'FCX', 'FTR', 'GME', 'GCI', 'GPS', 'GD', 'GE', 'GIS', 'GPC', 'GNW', 'GILD', 'GS', 'GR', 'GT', 'GOOG', 'GWW', 'HAL', 'HOG', 'HAR', 'HRS', 'HIG', 'HAS', 'HCP', 
'HCN', 'HNZ', 'HP', 'HES', 'HPQ', 'HD', 'HON', 'HRL', 'HSP', 'HST', 'HCBK', 'HUM', 'HBAN', 'ITW', 'TEG', 'INTC', 'ICE', 'IBM', 'IFF', 'IGT', 'IP', 'IPG', 'INTU', 'ISRG', 'IVZ', 'IRM', 'XYL', 'JBL', 'JEC', 'CBE', 'JDSU', 'JNJ', 'JCI', 'JOY', 'JPM', 'JNPR', 'K', 'KEY', 'KMB', 'KIM', 'KLAC', 'KSS', 'KFT', 'KR', 'LLL', 'LH', 'LM', 'LEG', 'LEN', 'LUK', 'LXK', 'LIFE', 'LLY', 'LTD', 'LNC', 'LLTC', 'LMT', 'L', 'LO', 'LOW', 'LSI', 'MTB', 'M', 'MRO', 'MPC', 'MAR', 'MMC', 'MAS', 'ANR', 'MA', 'MAT', 'MKC', 'MCD', 'MHP', 'MCK', 'MJN', 'MWV', 'MHS', 'MDT', 'MRK', 'MET', 'PCS', 'MCHP', 'MU', 'MSFT', 'MOLX', 'TAP', 'MON', 'MCO', 'MS', 'MOS', 'MMI', 'MSI', 'MUR', 'MYL', 'NBR', 'NDAQ', 'NOV', 'NTAP', 'NFLX', 'NWL', 'NFX', 'NEM', 'NWSA', 'NEE', 'NKE', 'NI', 'NE', 'NBL', 'JWN', 'NSC', 'NTRS', 'NOC', 'NU', 'CMG', 'NVLS', 'NRG', 'NUE', 'NVDA', 'NYX', 'ORLY', 'OXY', 'OMC', 'OKE', 'ORCL', 'OI', 'PCAR', 'IR', 'PLL', 'PH', 'PDCO', 'PAYX', 'BTU', 'JCP', 'PBCT', 'POM', 'PEP', 'PKI', 'PRGO', 'PFE', 'PCG', 'PM', 'PNW', 'PXD', 'PBI', 'PCL', 'PNC', 'RL', 'PPG', 'PPL', 'PX', 'PCP', 'PCLN', 'PFG', 'PG', 'PGN', 'PGR', 'PLD', 'PRU', 'PEG', 'PSA', 'PHM', 'QEP', 'PWR', 'QCOM', 'DGX', 'RRC', 'RTN', 'RHT', 'RF', 'RSG', 'RAI', 'RHI', 'ROK', 'COL', 'ROP', 'ROST', 'RDC', 'R', 'SWY', 'SAI', 'CRM', 'SNDK', 'SLE', 'SCG', 'SLB', 'SNI', 'SEE', 'SHLD', 'SRE', 'SHW', 'SIAL', 'SPG', 'SLM', 'SJM', 'SNA', 'SO', 'LUV', 'SWN', 'SE', 'S', 'STJ', 'SWK', 'SPLS', 'SBUX', 'HOT', 'STT', 'SRCL', 'SYK', 'SUN', 'STI', 'SVU', 'SYMC', 'SYY', 'TROW', 'TGT', 'TEL', 'TE', 'THC', 'TDC', 'TER', 'TSO', 'TXN', 'TXT', 'HSY', 'TRV', 'TMO', 'TIF', 'TWX', 'TWC', 'TIE', 'TJX', 'TMK', 'TSS', 'TRIP', 'TSN', 'TYC', 'USB', 'UNP', 'UNH', 'UPS', 'X', 'UTX', 'UNM', 'URBN', 'VFC', 'VLO', 'VAR', 'VTR', 'VRSN', 'VZ', 'VIAB', 'V', 'VNO', 'VMC', 'WMT', 'WAG', 'DIS', 'WPO', 'WM', 'WAT', 'WPI', 'WLP', 'WFC', 'WDC', 'WU', 'WY', 'WHR', 'WFM', 'WMB', 'WIN', 'WEC', 'WPX', 'WYN', 'WYNN', 'XEL', 'XRX', 'XLNX', 'XL', 'YHOO', 'YUM', 'ZMH', 'ZION'] 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /Utilities/test_compression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''Test compression strategies, counter vs dict, etc''' 3 | import os, sys, logging, string, time, copy, gc 4 | from datetime import date, timedelta 5 | from pdb import set_trace as debug 6 | from collections import Counter 7 | # Note: Using Counter on the test dataset increased the pickled size from 1.3mb to 1.8mb 8 | try: 9 | import cPickle as pickle 10 | except: 11 | import pickle 12 | 13 | # Change this below robobuffet_directory = '/Users/danmane/Documents/Code/Git/RoboBuffett' 14 | 15 | def printandlog(msg): 16 | logging.info(msg) 17 | print msg 18 | 19 | def floatRange(minv, maxv, step): 20 | """Not reliable for heavy-duty use due to floating point oddities""" 21 | x = minv 22 | while x <= maxv+step: 23 | yield x 24 | x += step 25 | 26 | 27 | class DictDoc: 28 | def __init__(self): 29 | self.word_count = {} 30 | self.num_words = 0 31 | 32 | def makeCopy(self, original): 33 | self.num_words = original.num_words 34 | self.word_count = original.word_count.copy() 35 | 36 | def generate(self, path): 37 | with open(path, 'r') as f: 38 | text = f.read() 39 | to_remove = string.punctuation + string.digits 40 | text = text.translate(None, (to_remove)) 41 | # Removes all punctuation and digits 42 | text = text.lower() 43 | text = text.split() 44 | # Splits the text 
into a list of lowercase words 45 | # Possible improvements: Strip tables, formatting (e.g. , - 2 -) 46 | self.num_words = len(text) 47 | for word in text: 48 | try: 49 | self.word_count[word] += 1 50 | except KeyError: 51 | self.word_count[word] = 1 52 | 53 | def delete(self): 54 | del self.word_count 55 | del self.num_words 56 | 57 | class ContDoc: 58 | def __init__(self, path): 59 | with open(path, 'r') as f: 60 | text = f.read() 61 | to_remove = string.punctuation + string.digits 62 | text = text.translate(None, (to_remove)) 63 | # Removes all punctuation and digits 64 | text = text.lower() 65 | text = text.split() 66 | self.num_words = len(text) 67 | self.word_count = Counter(text) 68 | 69 | def main(data_dir): 70 | rb_dir = '/Users/danmane/Documents/Code/Git/RoboBuffett' 71 | os.chdir(rb_dir) 72 | #with open('./Utilities/compression.log', 'w') as cleanlog: 73 | # pass # Empty the log before each run 74 | 75 | logging.basicConfig(filename='./Utilities/compression.log', level=logging.INFO) 76 | files = os.listdir(data_dir) 77 | 78 | #docs = process_file_set(files, 'Utilities/dict.dat', DictDoc, 'Naive Dictionary:', data_dir) 79 | #process_file_set(files, 'Utilities/cont.dat', DictDoc, 'Naive Container:', data_dir) 80 | #print "About to load from pickle" 81 | 82 | print "About to start range" 83 | 84 | # s1 = time.time() 85 | # cheapcopy = dictlist_copy(docs) 86 | # s2 = time.time() 87 | # print "cheap: %f" % (s2-s1) 88 | # docscopy = copy.deepcopy(docs) 89 | # s3 = time.time() 90 | #print "regular: %f" % (s3-s2) 91 | 92 | for t in floatRange(.05, .95, .1): 93 | docs = load_docs_from_file() 94 | print "Finished load for threshold %f" %t 95 | test_compression(docs, t) 96 | for doc in docs: 97 | doc.delete() 98 | del docs 99 | 100 | def load_docs_from_file(): 101 | with open('Utilities/dict.dat', 'r') as f: 102 | docs = pickle.load(f) 103 | return docs 104 | 105 | def dictlist_copy(docs): 106 | outdocs = [] 107 | for doc in docs: 108 | new_doc = DictDoc() 109 | new_doc.makeCopy(doc) 110 | outdocs.append(new_doc) 111 | return outdocs 112 | 113 | 114 | def test_compression(docs, threshold): 115 | print "Starting compression for threshold %f" % threshold 116 | start = time.time() 117 | index_list_and_dict = generate_word_index(docs, threshold) 118 | with open('./Utilities/index.dat', 'w') as f: 119 | pickle.dump(index_list_and_dict, f, 2) 120 | 121 | index_dict = index_list_and_dict[1] 122 | 123 | compress_dict_set(docs, index_dict) 124 | with open('./Utilities/compressed_dict.dat', 'w') as f: 125 | pickle.dump(docs, f, 2) 126 | end = time.time() 127 | printandlog('Compressedion with threshold %f:' % threshold) 128 | printandlog('Time elapsed: %f' % (end-start)) 129 | 130 | size = os.stat('./Utilities/compressed_dict.dat').st_size 131 | size += os.stat('./Utilities/index.dat').st_size 132 | size /= float(10**6) 133 | printandlog('Size: %f' % size) 134 | 135 | def process_file_set(files, dbFile, Dtype, type_descr, data_dir): 136 | print "Processing %s" % type_descr 137 | start = time.time() 138 | docs = [] 139 | n_total = len(files) 140 | count = 0 141 | for fpath in files: 142 | if fpath[0] != '.': 143 | new_obj = Dtype() 144 | new_obj.generate((data_dir + '/' + fpath)) 145 | docs.append(new_obj) 146 | count += 1 147 | if count % 100 == 0: 148 | print "%d of %d" % (count, n_total) 149 | 150 | with open(dbFile, 'w') as f: 151 | pickle.dump(docs, f, 2) 152 | end = time.time() 153 | size = os.stat(dbFile).st_size 154 | size /= float(10**6) 155 | elapsed = end-start 156 | printandlog(type_descr) 
157 | printandlog('Time elasped: %f' % elapsed) 158 | printandlog('Pickled size: %f' % size) 159 | return docs 160 | 161 | def compress_dict_set(docs, idx_dict): 162 | for doc in docs: 163 | doc.word_list = [0] * len(idx_dict) 164 | for word, count in doc.word_count.copy().iteritems(): 165 | try: 166 | idx = idx_dict[word] 167 | doc.word_list[idx] = count 168 | del doc.word_count[word] 169 | except KeyError: 170 | pass 171 | 172 | def generate_word_index(dict_set, threshold): 173 | """Generates an index of commonly used words in the documents, so that the documents can be stored in compressed form. We can remove all instances of commonly used words from the dictionaries, and add a k-tuple of word counts, where k is the number of commonly used words. THRESHOLD determines what proportion of documents a word must be in for it to be included in the list. 174 | Creates self.index_list, an ordered list of words in the index. Creates self.index_dict which maps from element indicies back to the right word in the sequence. Sets self.indexed = 1.""" 175 | # Threshold in (0, 1) 176 | start = time.time() 177 | dict_index = {} 178 | threshold *= len(dict_set) 179 | for document in dict_set: 180 | for word in document.word_count.iterkeys(): 181 | try: 182 | dict_index[word] += 1 183 | except KeyError: 184 | dict_index[word] = 1 185 | index_list = [] 186 | for word, val in dict_index.iteritems(): 187 | if val > threshold: 188 | index_list.append(word) 189 | del dict_index 190 | index_list.sort() 191 | index_dict = {} 192 | for i in xrange(len(index_list)): 193 | index_dict[index_list[i]] = i 194 | end = time.time() 195 | #printandlog('Dict Index time elapsed: %f' % (end-start)) 196 | return (index_list, index_dict) 197 | 198 | if __name__ == "__main__": 199 | main('BigData') 200 | 201 | 202 | -------------------------------------------------------------------------------- /classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import math 3 | 4 | '''Notes: 5 | The purpose of this module is to assign classification to (company, quarter, duration) tuples. The company will be represented by a unique company identifier and quarters will be represented as year, quarter tuples e.g. (2002,3) = 3Q2002. 6 | The classifier will assign a class to each tuple according to its performance, relative to other companies in the same industry, based on its relative performance during the period [filing date + 1, filing date + 1 + duration]. 7 | 8 | Company return = ( company.close(filedate + duration) - company.open(filedate + 1)) / company.open(filedate + 1) 9 | Industry return = (industry.close(filedate + duration) - industry.open(filedate + 1)) / industry.open(filedate + 1) 10 | Classification is based on (company return - industry return) 11 | 12 | The advantage to this classification approach is that it will capture idiosyncratic outperformance by companies relative to their peers, rather than macro-level economic trends. 13 | 14 | Classification will be based on threshold return levels, which will be expressed as an ordered list [t1, t2, t3]. The (c,q,d) tuple will be assigned to the first threshold for which relative return <= threshold level, where assignment means returning the 0-based index of the threshold. If the return is greater than the maximum threshold level, then it will return the index of the max threshold + 1 (i.e. returns len(thresholds)). 
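Worked example, using the thresholds defined below ([-.4, .15]): an annualized relative return of -0.55 is assigned class 0, a return of 0.02 is assigned class 1, and a return of 0.30 exceeds every threshold and is assigned class 2 (= len(thresholds)).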
15 | ''' 16 | 17 | '''Process description: 18 | C = |thresholds| = number of different classifications 19 | For each investment horizon ('Duration'): 20 | Generate C sets, where each set contains pointers to documents which fall into this classification 21 | Then we need to convert each set of documents into a dictionary of wordcounts 22 | We want to use only 'common' terms, e.g. nix terms specific to an individual company like "Chlorox" or "Swiffer" 23 | Need to generate a set of all "common" words that we will include in classifiers 24 | 25 | 26 | 27 | 28 | 29 | ''' 30 | 31 | 32 | thresholds = [-.4, .15] # Represents 3 classes: (-Inf, -40%), (-40%, 15%), (15%, Inf) 33 | durations = [1, 20, 40] 34 | # Returns compared to threshold values are annualized returns relative to industry, rather than raw rates of return 35 | 36 | def annualize_return(rate_of_return, duration): 37 | # This attempts to account for the opportunity cost of capital, i.e. getting a 10% return on a 1-month holding is generally better than a 15% return on a 6-month holding. However, this is imperfect because the opportunity cost is properly a function of how many other documents are coming out in the near future, how likely we are to want to buy those stocks, etc. So if documents are evenly spaced throughout the year, so that on average we want to buy a stock every week, we shouldn't necessarily favor 1-day holding periods over 2-day holding periods (as this model would heavily favor). However if all documents are released in 1-week periods every quarter then we should really want short holding periods. If we invest over medium to long term periods (e.g. 2 months) then this is less of an issue. 38 | trading_days_per_year = 252 39 | return rate_of_return ** (trading_days_per_year/float(duration)) 40 | 41 | def training_classification(company, date, durations, thresholds): 42 | # Takes a company, a start date (i.e. date of filing), a list of investment durations, and a list of thresholds 43 | # Returns a classification for each duration, with classification corresponding to one of the thresholds 44 | ticker = company.ticker 45 | SIC = company.SIC 46 | #sector = company.sector 47 | start = next_trading_day(ticker, date) 48 | # Requires a next_trading_day module 49 | classifications = [] 50 | for duration in durations: 51 | try: 52 | stock_return = get_stock_return(ticker, start, end) 53 | sic_return = get_sic_return(SIC, start, end) 54 | # sector_return = get_sector_return(sector, start, end) 55 | # baseline_return = weight_sicsector(SIC, sic_return, sector, sector_return) 56 | relative_return = stock_return - sic_return 57 | ann_relative_return = annualize_return(relative_return, duration) 58 | classif = threshold_sieve(ann_relative_return, thresholds) 59 | classifications.append(classif) 60 | except StockRangeError: 61 | classifications.append(None) 62 | 63 | def threshold_sieve(val, thresholds): 64 | for i in xrange(len(thresholds)): 65 | if val <= thresholds[i]: 66 | return i 67 | return i+1 68 | 69 | 70 | def create_classification_set(manager, thresholds, durations): 71 | # Take a manager, thresholds, durations 72 | # Choose a 'training set' of Company/Date pairs (i.e. 
document references) 73 | # Generate a classification set for each duration 74 | # Classify each Company/Date pair into a threshold group for each duration 75 | # Return the d sets (d = |durations|) 76 | 77 | 78 | def generate_classification_model(TODO): 79 | # Take a classification set and the manager 80 | # Generate a group dictionary for the set 81 | # Adjust for psuedocount 82 | pass 83 | 84 | def classify_multinomial(text, groups, psuedocount): 85 | """Classifies a text into one of the provided groups, given a psuedocount. 86 | 87 | Returns a tuple containing the chosen group and the difference in log- 88 | likelihood between the chosen group and the second best option 89 | (for validation purposes and perhaps confidence estimation). 90 | 91 | """ 92 | comparisons = {} 93 | for group in groups: 94 | comparisons[group] = likelihood_comparison(text, group, psuedocount) 95 | max = float("-inf") 96 | second_max = float("-inf") 97 | 98 | #Want to find the maximum LLV (to classify the group) and the second-maximum 99 | #LLV (to report the difference) 100 | for group in comparisons: 101 | if comparisons[group] > second_max: 102 | if comparisons[group] > max: 103 | second_max = max 104 | max = comparisons[group] 105 | classification = group 106 | else: 107 | second_max = comparisons[group] 108 | 109 | diff = max - second_max 110 | assert diff > 0 111 | return (classification, diff, max) 112 | 113 | 114 | # Handling psuedocount classifications: 115 | # Generate classification groups 'pure' with word-counts rather than thetas 116 | # Generate set of all words in all documents 117 | # Ensure that each classification group has 118 | 119 | def multinomial_LLV(text, (group_dict, wordcount), psuedocount): 120 | """Generates log-likelihood that given Text came from given TextGroup. 121 | 122 | Note that likelihood function has no absolute meaning, since it is a log- 123 | likelihood with constants disregarded. Instead, the return value may be 124 | used as a basis for comparison to decide which TextGroup is more likely to 125 | contain the Text. 126 | """ 127 | #Make local copies of the dictionaries so we can alter them without causing problems 128 | theta_dict = copy.copy(group_dict) 129 | 130 | #DO psuedocount biasing beforehand 131 | 132 | numWords = float(wordcount + psuedocount * len(group_dict)) 133 | # Need to add psuedocounts since log(0) is undefined (or in orig. 
multinomial model absent the log transformation, multiplying by a 0 factor would force the result to 0) 134 | for word in theta_dict: 135 | theta_dict[word] += psuedocount 136 | for word in text.dict: 137 | if word not in theta_dict: 138 | theta_dict[word] = psuedocount 139 | numWords += psuedocount 140 | theta = {} 141 | for word in theta_dict: 142 | theta[word] = theta_dict[word] / numWords 143 | 144 | loglikelihood = 0 145 | for word in text.dict: 146 | loglikelihood += text.dict[word] * math.log(theta[word]) 147 | return loglikelihood 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /clean_scrape.py: -------------------------------------------------------------------------------- 1 | from ftplib import FTP 2 | from tempfile import NamedTemporaryFile 3 | from itertools import * 4 | import sys 5 | import os 6 | import zipfile 7 | import subprocess 8 | from contextlib import contextmanager 9 | 10 | 11 | @contextmanager 12 | def directory(path): 13 | old_dir = os.getcwd() 14 | os.chdir(path) 15 | yield 16 | os.chdir(old_dir) 17 | 18 | # run with -c for client mode 19 | 20 | # Initialize a variable called ftp so that we can access it from 21 | # any function after setting it to an FTP object in main 22 | ftp = None 23 | 24 | hosts = ['altair.cs.uchicago.edu', 'ursa.cs.uchicago.edu', 25 | 'ankaa.cs.uchicago.edu', 'antares.cs.uchicago.edu', 26 | 'arcturus.cs.uchicago.edu', 'as.cs.uchicago.edu', 27 | 'avior.cs.uchicago.edu', 'be.cs.uchicago.edu', 28 | 'betelgeuse.cs.uchicago.edu', 'canopus.cs.uchicago.edu', 29 | 'capella.cs.uchicago.edu', 'da.cs.uchicago.edu', 30 | 'deneb.cs.uchicago.edu', 'dubhe.cs.uchicago.edu', 31 | 'gacrux.cs.uchicago.edu', 'hadar.cs.uchicago.edu', 32 | 'ki.cs.uchicago.edu', 'mimosa.cs.uchicago.edu', 33 | 'naos.cs.uchicago.edu', 'polaris.cs.uchicago.edu', 34 | 'procyon.cs.uchicago.edu', 'rastaban.cs.uchicago.edu', 35 | 're.cs.uchicago.edu', 'rigel.cs.uchicago.edu', 36 | 'saiph.cs.uchicago.edu', 'sh.cs.uchicago.edu', 37 | 'sirius.cs.uchicago.edu', 'ul.cs.uchicago.edu'] 38 | 39 | def connect_to_SEC(max_attempts=50): 40 | """ Connect to the SEC ftp server, timing out after max_attempts 41 | attempts. 42 | """ 43 | for i in xrange(max_attempts): 44 | try: 45 | return FTP('ftp.sec.gov') 46 | except EOFError: 47 | pass 48 | print "Maximum number of attempts exceeded. Try again later." 49 | 50 | 51 | def download_file(server_path, local_path): 52 | """Download a file at server_path on the global ftp server object 53 | to local_path. 54 | """ 55 | global ftp 56 | with NamedTemporaryFile(delete=False) as out_file: 57 | temp_file_name = out_file.name 58 | ftp.retrbinary('RETR ' + server_path, out_file.write) 59 | os.rename(temp_file_name, local_path) 60 | print "Succesfully downloaded to {0}".format(local_path) 61 | 62 | 63 | def ensure(dir): 64 | """Create a directory if it does not exist 65 | """ 66 | if not os.path.exists(dir): 67 | os.makedirs(dir) 68 | 69 | 70 | def extract_and_remove(zip_path, out_dir): 71 | """Extract the zip file at zip_path to out_dir and then delete it 72 | """ 73 | with zipfile.ZipFile(zip_path, 'r') as outzip: 74 | outzip.extractall(out_dir) 75 | os.remove(zip_path) 76 | 77 | 78 | def download_index_files(out_dir): 79 | """Download all of the SEC index files, organizing them into a 80 | directory structure rooted at out_dir. 
81 | """ 82 | 83 | years = ['1993', '1994', '1995', '1996', 84 | '1997', '1998', '1999', '2000', 85 | '2001', '2002', '2003', '2004', 86 | '2005', '2006', '2007', '2008', 87 | '2009', '2010', '2011', '2012'] 88 | 89 | quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4'] 90 | 91 | ensure(out_dir) 92 | 93 | with directory(out_dir): 94 | for year in years: 95 | for quarter in quarters: 96 | subdir = year + '/' + quarter 97 | ensure(subdir) 98 | path = subdir + '/form.zip' 99 | download_file(path, path) 100 | extract_and_remove(path, subdir) 101 | 102 | 103 | 104 | dropuntil = lambda pred, xs: dropwhile(lambda x: not pred(x), xs) 105 | 106 | 107 | def paths_for_10ks(index_file): 108 | paths = [] 109 | # drop the header of the index file, which is seperated from the 110 | # body by a line of all '-'s 111 | lines = dropuntil(lambda a: re.match('-+$', a), index_file) 112 | lines.next() 113 | for line in lines: 114 | if line[:4] == '10-K' or line[:4] == '10-Q': 115 | fields = re.split('\s\s+', line) 116 | company, date, server_path = (fields[1], fields[3], fields[4]) 117 | paths.append((server_path, '{0}_{1}_{2}'.format(company.replace('/', '-'), date, fields[0].replace('/','-')))) 118 | return paths 119 | 120 | 121 | # Actually don't think I need this 122 | def ssh_setup(user, password): 123 | global hosts 124 | command = 'ssh-keygen -t rsa; cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys' 125 | for host in hosts: 126 | subprocess.call(['ssh', '{0}@{1}'.format(user, host), command]) 127 | 128 | 129 | def create_paths_file(data_dir, out_path): 130 | """Walk the data directory to create file with one 2-tuple per line 131 | that contains the server path and intended local path of each form 132 | in the index file. 133 | """ 134 | seperator = '!!!' 135 | with open(out_path, 'a') as out_file: 136 | for root, dirs, files in os.walk(data_dir): 137 | for name in files: 138 | path = os.path.join(root, name) 139 | if path.split('.')[-1] != 'idx': 140 | continue 141 | with open(path, 'r') as index_file: 142 | form_paths = [(s, os.path.join(root, l)) for s,l in paths_for_10ks(index_file)] 143 | outfile.write('\n'.join(str(t) for t in form_paths) + '\n') 144 | 145 | 146 | def chunkify_paths_file(paths_file_path, num_chunks, out_dir): 147 | """Split the paths files at paths_file_path into the specified number 148 | of chunks, placing the chunks in out_dir 149 | """ 150 | with open(paths_file_path, 'r') as paths_file: 151 | num_lines = sum(1 for line in paths_file) 152 | paths_file.seek(0) 153 | chunk_size = num_lines / num_chunks 154 | for i in xrange(num_chunks): 155 | with open(os.path.join(out_dir, 'paths{0}.txt'.format(i)), 'w') as p: 156 | p.write(''.join(islice(paths_file, 0, chunk_size))) 157 | with open(os.path.join(out_dir, 'paths{0}'.format(num_chunks)), 'w') as p: 158 | p.write(''.join(paths_file)) 159 | 160 | 161 | def client_procedure(chunk_number, chunks_dir): 162 | with open('paths{0}.txt'.format(chunk_number), 'r') as chunk: 163 | for line in chunk: 164 | try: 165 | s, l = eval(line) 166 | except Exception as e: 167 | sys.stderr.write(str(e) + line) 168 | else: 169 | try: 170 | download_file(s, l) 171 | except Exception as e: # Maybe add specific exceptions here but I think catching all is better 172 | sys.stderr.write(str(e) + line) 173 | 174 | 175 | # rename this function 176 | # have a variable for the pollux loop like script 177 | def start_download_on_hosts(consolidator, main_data_dir, hosts, chunks_dir, temp_data_dir, script_path, log_dir): 178 | chunk_paths = [os.path.join(chunks_dir, c) for c in 
os.listdir(chunks_dir)] 179 | # see if there isn't a less hackish way of doing this 180 | command = ('ssh {h} ' + '"nohup python {0}'.format(script_path) 181 | + ' -c {n}"' + ' >' + log_dir + '/log{n}' + ' 2>' + log_dir + '/err{n}&') 182 | 183 | # This is possibly a bad idea 184 | consolidator_loop = ('"while true; do ' 185 | 'rsync -av --remove-source-files {temp}; ' 186 | 'sleep 2; ' 187 | 'done"') 188 | 189 | subprocess.call('ssh {0} '.format(consolidator) + consolidator_loop) 190 | 191 | for i, (host, chunk_path) in enumerate(zip(hosts, chunk_paths)): 192 | subprocess.call(command.format(h=host, n=i)) 193 | 194 | def main(): 195 | global ftp 196 | usage = ('Download either index files (i) or form files (f) ' 197 | 'to a given directory, or run in client mode (c).') 198 | parser = argparse.ArgumentParser(description=usage) 199 | parser.add_argument('mode', type=str, choices=['i', 'f', 'c']) 200 | parser.add_argument('directory', type=str) 201 | 202 | args = parser.parse_args() 203 | 204 | ftp = connect_to_SEC(0) 205 | ftp.login() 206 | 207 | -------------------------------------------------------------------------------- /stocks_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | 4 | from BeautifulSoup import BeautifulSoup as bs 5 | from numpy import * 6 | from collections import OrderedDict, defaultdict 7 | from multiprocessing import Pool 8 | import urllib2, sys, os, csv, re, time, datetime, filecmp, shutil 9 | try: import cPickle as pickle 10 | except: import pickle 11 | 12 | csv_dir = 'stocks_csv' 13 | stocks_fail = 'stocks_fail.txt' 14 | stocks_CIK = 'stocks_CIK.txt' 15 | dat_dir = 'stocks_dat' 16 | 17 | def main(): 18 | global csv_dir 19 | global stocks_fail 20 | global stocks_CIK 21 | 22 | download_flag = 0 23 | getcik_flag = 0 24 | pickle_flag = 1 25 | 26 | 27 | ''' Name output directory for csv and ensure it is created ''' 28 | 29 | if not os.path.exists(csv_dir): 30 | os.makedirs(csv_dir) 31 | 32 | 33 | ''' DOWNLOAD MODULE ''' 34 | if download_flag: 35 | 36 | # Get list of all stocks on the exchange 37 | with open('stocks_list.dat','r') as f: 38 | d = pickle.load(f) 39 | lst = d.keys() 40 | 41 | # Restore file that stores failed downloads 42 | open(stocks_fail, 'w').close() 43 | 44 | pool = Pool(processes=16) 45 | pool.map(downloader, lst) 46 | 47 | # notdone = list(set(lst).difference(set([i.replace('.csv','') for i in os.listdir(csv_dir)]))) 48 | 49 | # Make sure all file names contain no spaces 50 | os.system('rename -v \'s/\ //g\' ' + csv_dir + '/*') 51 | 52 | # Delete all files that are size 0 53 | os.system('./cleaner.sh ' + csv_dir +'/') 54 | 55 | # Move all files that are duplicates into different directory 56 | lsdir = os.listdir(csv_dir); n = len(lsdir) 57 | collector = defaultdict(list); 58 | while len(lsdir) > 0: 59 | print (n-len(lsdir)), 'of', n 60 | i = lsdir.pop(0) 61 | f1 = csv_dir+'/'+i 62 | 63 | for j in lsdir: 64 | f2 = csv_dir+'/'+j 65 | if f1 != f2: 66 | if filecmp.cmp(f1,f2): 67 | collector[i].append(j) 68 | 69 | ref = [] 70 | for k,v in collector.iteritems(): 71 | if len(v) > 0: 72 | tup = [k] 73 | for i in v: 74 | tup.append(i) 75 | ref.append(tup) 76 | rmv = [] 77 | for i in ref: 78 | i.remove(min(i, key=len)) 79 | rmv = rmv + i 80 | # Make sure mv directory exists 81 | csv_mv_dir = 'stocks_csv_mv' 82 | if not os.path.exists(csv_mv_dir): 83 | os.makedirs(csv_mv_dir) 84 | for r in rmv: 85 | try: shutil.move(csv_dir+'/'+r, csv_mv_dir+'/'+r) 86 | except: pass 87 | 88 | 89 | 
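        # The pairwise filecmp.cmp scan above is O(n^2) in the number of CSV files.
        # A hedged, commented-out sketch of an O(n) alternative that groups files
        # by content hash before moving duplicates (hashlib is stdlib; the grouping
        # logic below is an illustrative assumption, not project code):
        #
        #   import hashlib
        #   by_hash = defaultdict(list)
        #   for name in os.listdir(csv_dir):
        #       path = csv_dir + '/' + name
        #       if os.path.isfile(path):
        #           with open(path, 'rb') as fh:
        #               by_hash[hashlib.md5(fh.read()).hexdigest()].append(name)
        #   # Keep the shortest name in each duplicate group, move the rest
        #   for names in by_hash.itervalues():
        #       for dup in sorted(names, key=len)[1:]:
        #           shutil.move(csv_dir + '/' + dup, csv_mv_dir + '/' + dup)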
''' Get list of directories downloaded ''' 90 | lsdir = os.listdir(csv_dir); totl = len(lsdir) 91 | 92 | 93 | ''' RECORD TICKER, CIK code pairs ''' 94 | if getcik_flag: 95 | # Restore file that stores failed downloads 96 | open(stocks_CIK, 'w').close() 97 | 98 | pool = Pool(processes=20) 99 | pool.map(cikgetter, [i.replace('.csv','') for i in lsdir]) 100 | 101 | # Combine files 102 | cpuoutputs = [] 103 | for i in os.listdir('.'): 104 | if (stocks_CIK in i) & (len(i) > len(stocks_CIK) + 2): 105 | cpuoutputs.append(i) 106 | open('stocks_CIK_full.txt', 'w').close() 107 | writefile = open('stocks_CIK_full.txt','a') 108 | for i in cpuoutputs: 109 | with open(i,'r') as fr: 110 | interim = fr.read() 111 | writefile.write(interim) 112 | writefile.close() 113 | 114 | with open('stocks_CIK_full.txt','r') as f: 115 | d = f.read().split('\n')[:-1] 116 | ciked_list = [] 117 | for i in d: 118 | ciked_list.append(i.split('\t')[0]) 119 | 120 | notciked = list(set([i.replace('.csv','') for i in lsdir]).difference(set(ciked_list))) 121 | 122 | pool = Pool(processes=8) 123 | pool.map(cikgetter, notciked) 124 | 125 | # Re-Combine files 126 | cpuoutputs = [] 127 | for i in os.listdir('.'): 128 | if (stocks_CIK in i) & (len(i) > len(stocks_CIK) + 2): 129 | cpuoutputs.append(i) 130 | open('stocks_CIK_full.txt', 'w').close() 131 | writefile = open('stocks_CIK_full.txt','a') 132 | for i in cpuoutputs: 133 | with open(i,'r') as fr: 134 | interim = fr.read() 135 | writefile.write(interim) 136 | os.remove(i) 137 | writefile.close() 138 | 139 | 140 | 141 | ''' VECTORIZE DATA (make list of tuples) and STORE AS PICKLE ''' 142 | if pickle_flag: 143 | # Name output directory for pickle and ensure it is created 144 | global dat_dir 145 | if not os.path.exists(dat_dir): 146 | os.makedirs(dat_dir) 147 | 148 | pool = Pool(processes=8) 149 | pool.map(pickler, [i.replace('.csv','') for i in lsdir]) 150 | 151 | 152 | 153 | def pickler(ticker): 154 | global dat_dir 155 | global csv_dir 156 | csv2pickle(ticker,csv_dir,dat_dir) 157 | print 'Pickled', ticker 158 | 159 | def downloader(ticker): 160 | global csv_dir 161 | global stocks_fail 162 | 163 | # Download file, and return flag indicating 164 | dflag = download_csv(ticker, csv_dir) 165 | 166 | # Take action based on what flag is showed 167 | if dflag == 0: 168 | print 'Downloaded',ticker 169 | 170 | elif dflag == 2048: 171 | print 'Can\'t find ticker',ticker,'to download' 172 | os.system('rm ' + csv_dir + '/' + ticker + '.csv') 173 | 174 | with open(stocks_fail, 'a') as fal: 175 | fal.write(ticker + '\n') 176 | 177 | else: 178 | print 'Quitting downloader: non-resolved issue occured. OS error flag:', dflag 179 | exit() 180 | 181 | 182 | def cikgetter(ticker): 183 | global stocks_CIK 184 | 185 | cik = get_CIK(ticker) # get CIK tuple 186 | #(CIK,name, {-1, if fail; 100, if traditional method; 0<=n<=99, means n words subtracted}) 187 | 188 | # Write to file (ticker, company name, code, CIK) 189 | with open(stocks_CIK+str(os.getpid()), 'a') as fn: 190 | fn.write(ticker + '\t' + cik[1] + '\t' + str(cik[2]) + '\t' + cik[0] + '\n') 191 | 192 | # Take action based on CIK tuple flag 193 | print ticker,'\t',cik[2],'\t',cik[1],'\t',cik[0] 194 | 195 | 196 | ''' DOWNLOADS CSV FROM YAHOO ''' 197 | def download_csv(ticker,csv_dir): 198 | # Build URL string 199 | start_year = '1950' 200 | now = datetime.datetime.now() 201 | 202 | url_string = 'http://ichart.finance.yahoo.com/table.csv?' 
203 | url_string += '&s=' + ticker.replace('&','%26') 204 | url_string += '&d=' + str(now.month-1) 205 | url_string += '&e=' + str(now.day) 206 | url_string += '&f=' + str(now.year) 207 | url_string += '&g=d&a=0&b=1&c=' + start_year 208 | url_string += '&ignore.csv' 209 | 210 | # Download file using system call 211 | return os.system('wget \'' + url_string + '\' -O \'' + csv_dir + '/' + ticker + '.csv\' -q') 212 | 213 | 214 | ''' Parses the CSV file and returns a tuple with data as a tuple of: 215 | (DATE, OPEN, HIGH, LOW, CLOSE, VOLUME) ''' 216 | def csv2pickle(ticker,csv_dir,dat_dir): 217 | with open(csv_dir+'/'+ticker+'.csv', 'rb') as f: 218 | fulldata = csv.reader(f) 219 | 220 | # Throw away header 221 | fulldata.next() 222 | 223 | # Temporarily store data in list to 224 | # adjust for dividends, splits, etc; 225 | DATE = []; OPEN = []; HIGH = []; LOW = []; CLOSE = []; VOL = []; ADJ = []; 226 | for row in fulldata: 227 | try: 228 | s = row[0].split('-') 229 | DATE.append( (int(s[0]), int(s[1]), int(s[2])) ) 230 | OPEN.append( float(row[1]) ) 231 | HIGH.append( float(row[2]) ) 232 | LOW.append( float(row[3]) ) 233 | CLOSE.append( float(row[4]) ) 234 | VOL.append( int(row[5]) ) 235 | ADJ.append( float(row[6]) ) 236 | except IndexError: 237 | l = min(len(DATE),len(OPEN),len(HIGH),len(LOW),len(CLOSE),len(VOL),len(ADJ)) 238 | DATE = DATE[:l] 239 | OPEN = OPEN[:l] 240 | HIGH = HIGH[:l] 241 | LOW = LOW[:l] 242 | CLOSE = CLOSE[:l] 243 | VOL = VOL[:l] 244 | ADJ = ADJ[:l] 245 | break 246 | 247 | # Carry out adjustment, then convert to our currency (mul by 100) 248 | OPENadj = 100 * array(OPEN) * array(ADJ) / array(CLOSE) 249 | HIGHadj = 100 * array(HIGH) * array(ADJ) / array(CLOSE) 250 | LOWadj = 100 * array(LOW) * array(ADJ) / array(CLOSE) 251 | CLOSEadj = 100 * array(ADJ) 252 | 253 | # Since the adjustment may divide by zero, we zero the Infs and NaNs 254 | OPENadj[ isinf(OPENadj) ] = 0.0; OPENadj[ isnan(OPENadj) ] = 0.0; 255 | HIGHadj[ isinf(HIGHadj) ] = 0.0; HIGHadj[ isnan(HIGHadj) ] = 0.0; 256 | LOWadj[ isinf(LOWadj) ] = 0.0; LOWadj[ isnan(LOWadj) ] = 0.0; 257 | 258 | # Make output list of tuples 259 | output = [] 260 | for idx in xrange(len(DATE)): 261 | tup = ( DATE[idx], ( int(OPENadj[idx]), int(HIGHadj[idx]), \ 262 | int(LOWadj[idx]), int(CLOSEadj[idx]), VOL[idx]) ) 263 | output.append(tup) 264 | 265 | # Reverse to normal chronological order, so 1st entry is oldest data 266 | output.reverse() 267 | 268 | # Convert to ordered dictionary 269 | output = OrderedDict(output) 270 | 271 | # Dump into pickle 272 | with open(dat_dir+'/'+ticker+'.dat', 'wb') as f: 273 | pickle.dump(output, f) 274 | 275 | 276 | def get_CIK(ticker): 277 | # returns (0-flag or name, cik or list of ciks) 278 | soup = bs(urllib2.urlopen('http://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK='+ticker+'&filenum=&State=&Country=&SIC=&owner=exclude&Find=Find+Companies&action=getcompany')) 279 | 280 | with open('stocks_list.dat','r') as f: 281 | nameF = pickle.load(f)[ticker] # full name of company 282 | 283 | try: 284 | cik = str(soup.findAll('link')[1].get('href').split('&CIK=')[1].split('&type=')[0]) 285 | return (cik, nameF, 100) 286 | 287 | except IndexError: 288 | 289 | nameR = re.findall('[a-z&.-]+', nameF.lower()) # regex name of company 290 | 291 | if nameF == 'FAIL': 292 | return('FAIL',nameF,-1) 293 | else: 294 | cik2 = get_CIK2( nameR , len(nameR) ) 295 | 296 | if cik2[0] == 1: 297 | return (cik2[1][0][0], nameF, len(nameR)-cik2[2]) 298 | elif cik2[0] == -1: 299 | return (cik2[1][0][0], nameF, -1) 300 | 
else: 301 | return (str(cik2[1]), nameF, len(nameR)-cik2[2]) 302 | #(CIK,name, {-1, if fail; 100, if traditional method; 0<=n<=99, means n words subtracted}) 303 | 304 | ''' More robust method of grabbing CIKs ''' 305 | def get_CIK2(name,ngram): 306 | # Returns (number of CIKs, list of ciks [(CIK,name)] ) 307 | if ngram > 0: 308 | 309 | soup = bs(urllib2.urlopen('http://www.sec.gov/cgi-bin/cik.pl.c?company=' + '+'.join(name[:ngram]))) 310 | 311 | # Find how many search results on Edgar 312 | try: 313 | test = int(soup.find('strong').contents[0]) 314 | except ValueError: 315 | test = int(soup.find('b').contents[0]) 316 | except: 317 | test = 0 318 | 319 | if test == 0: 320 | return get_CIK2(name,ngram-1) 321 | 322 | else: 323 | l = soup.findAll('pre')[1].contents 324 | out = []; 325 | for (c, n) in zip(l[0::2], l[1::2]): 326 | out.append( (str(c.contents[0]), str(n).strip()) ) 327 | return (len(l)/2, out, ngram) 328 | 329 | else: 330 | return (-1, [('FAIL','FAIL')],-1) 331 | 332 | ''' Return success rate for tuple (error, total) ''' 333 | def err(e,t): 334 | return '('+str(round((t-e)*100./t,2))+'%)' 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | ''' 345 | def get_prices(data,date): 346 | try: 347 | return (i for i in data if i[0] == date).next() 348 | except StopIteration: 349 | print 'No data for the date', date 350 | return None 351 | ''' 352 | 353 | 354 | if __name__ == '__main__': 355 | main() 356 | -------------------------------------------------------------------------------- /Old/data_manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | This module manages and maintains data for RoboBuffett 4 | ''' 5 | import os, sys, logging, string 6 | from datetime import date, timedelta 7 | from pdb import set_trace as debug 8 | from collections import Counter 9 | # Note: Using Counter on the test dataset increased the pickled size from 1.3mb to 1.8mb 10 | try: 11 | import cPickle as pickle 12 | except: 13 | import pickle 14 | 15 | class Financial_Universe: 16 | '''Manage all financial information for all of the stocks. This will be the object that we pickle for serialization. For memory efficiency, it will not contain the full raw text of documents, but will contain methods by which they can be loaded into memeory. 17 | Perhaps we will want to include the word frequency counts since they will require less space. Will have to see whether it's practical. 
18 | ''' 19 | def __init__(self, data_dir): 20 | self.companies = {} 21 | self.industries = {} 22 | self.documents = [] 23 | num_docs = len(os.listdir(data_dir)) 24 | print "Num docs: %d" % num_docs 25 | docs_counted = 0 26 | for doc_path in os.listdir(data_dir): 27 | if doc_path[0] == ".": continue 28 | newdoc = Document(data_dir + doc_path, 'SEC_Quarterly') 29 | self.documents.append(newdoc) 30 | docs_counted += 1 31 | 32 | for i in xrange(len(newdoc.CIK)): 33 | CIK = newdoc.CIK[i] 34 | # CIKs stored as a list since there may be several 35 | """***TODO:***: Better system for handling CIKs, integrate with stock tickers""" 36 | 37 | if CIK not in self.companies: 38 | self.companies[CIK] = Company(newdoc, i) # Create a new company entry based on the document 39 | else: 40 | self.companies[CIK].add_document(newdoc, i) 41 | if docs_counted % 100 == 0: 42 | print "Docs processed %d" % docs_counted 43 | 44 | for CIK, company in self.companies.iteritems(): 45 | if CIK not in self.industries: 46 | self.industries[CIK] = Industry(CIK, company) 47 | else: 48 | self.industries[CIK].add_company(CIK, company) 49 | self.get_counts() 50 | 51 | def get_counts(self): 52 | sum = 0 53 | for doc in self.documents: 54 | sum += doc.num_words 55 | self.num_words = sum 56 | self.num_docs = len(self.documents) 57 | self.num_companies = len(self.companies) 58 | self.num_industries = len(self.industries) 59 | 60 | def generate_word_index(self, threshold): 61 | """Generates an index of commonly used words in the documents, so that the documents can be stored in compressed form. We can remove all instances of commonly used words from the dictionaries, and add a k-tuple of word counts, where k is the number of commonly used words. THRESHOLD determines what proportion of documents a word must be in for it to be included in the list. 62 | Creates self.index_list, an ordered list of words in the index. Creates self.index_dict which maps from element indicies back to the right word in the sequence. Sets self.indexed = 1.""" 63 | # Threshold in (0, 1) 64 | dict_index = {} 65 | threshold *= self.num_docs 66 | for document in self.documents: 67 | for word in document.word_freq.iterkeys(): 68 | try: 69 | dictindex[word] += 1 70 | except KeyError: 71 | dictindex[word] = 1 72 | self.index_list = [] 73 | for word, val in dictindex.iteritems(): 74 | if val > threshold: 75 | self.index_list.append(word) 76 | del dictindex 77 | self.index_list.sort() 78 | self.index_dict = {} 79 | for i in xrange(len(self.index_list)): 80 | self.index_dict[index_list[i]] = i 81 | self.indexed = 1 82 | 83 | class Company: 84 | def __init__(self, document, idx): 85 | self.CIK = document.CIK[idx] 86 | self.SIC = document.SIC[idx] 87 | self.documents = [(document.date, document)] 88 | self.name = document.cname[idx] 89 | 90 | def __repr__(self): 91 | return "" + self.name[0] # Currently names are stored as a list as there may be multiple. 
Not a super satisfactory solution 92 | 93 | def add_document(self, document): 94 | self.documents.append((document.date, document)) 95 | if document.cname[idx] != company.name: 96 | print "Name discrepancy: %s, %s" % (company.name, document.cname) 97 | logging.debug("Name discrepancy: %s, %s" % (company.name, document.cname)) 98 | if document.SIC[idx] != company.SIC: 99 | print "SIC discrepancy: %d %d" % (company.SIC, document.SIC) 100 | logging.debug("SIC discrepancy: %d %d" % (company.SIC, document.SIC)) 101 | 102 | class Industry: 103 | def __init__(self, CIK, company): 104 | self.SIC = company.SIC 105 | self.components = {CIK: company} 106 | self.n_componenets = 1 107 | 108 | def __repr__(self): 109 | return "" + str(self.SIC[0]) 110 | 111 | def add_company(self, CIK, company): 112 | if CIK not in self.components: 113 | self.components[CIK] = company 114 | self.n_components += 1 115 | 116 | class Document: 117 | def __init__(self, docpath, doctype): 118 | '''Populate the following''' 119 | self.path = docpath 120 | self.properties = {} 121 | self.word_freq = {} 122 | self.num_words = {} 123 | try: 124 | self.docfile = open(docpath, 'r') 125 | except IOError: 126 | print "Bad file path ", docpath 127 | logging.warning('Bad doc path: %s' % docpath) 128 | return 129 | 130 | if doctype == 'SEC_Quarterly': 131 | self.parse_quarterly_filing() 132 | else: 133 | print "Document not supported: %s type %s" % (docpath, doctype) 134 | logging.warning('Unsupported doc %s type %s' % (docpath, doctype)) 135 | self.docfile.close() 136 | del self.docfile # Delete file references so Pickle won't complain 137 | 138 | def __repr__(self): 139 | return "" + self.path 140 | 141 | 142 | def parse_quarterly_filing(self): 143 | """Parse a quarterly filing. Makes a dictionary in self.properties containing all of the attributes pulled from the quarterly filing. Makes a word-frequency too.""" 144 | # The last condition 'Item 1. B' triggers when we have parsed all the header info and are into the actual document. Since the dictionaryName is '' it won't store anything, but it returns a nonzero value so that the loop will break 145 | 146 | logging.info("Parsing quarterly filing %s" % self.path) 147 | partition_text = 'PART I' 148 | text = self.docfile.read() 149 | #debug() 150 | text = text.partition(partition_text) 151 | # Currently I partition it into Header and Body by seperating at the first instance of the text 'PART I'. I consider this a placeholder 152 | if text[1] != partition_text: 153 | print "Warning: Unable to partition %s" % self.path 154 | logging.warning("ERROR: Unable to partition document.") 155 | return 156 | header = text[0].split("\n") #Consider mapping .strip for efficiency 157 | text = text[2] 158 | self.parse_quarterly_header(header) 159 | self.build_word_freq(text) 160 | 161 | def build_word_freq(self, text): 162 | to_remove = string.punctuation + string.digits 163 | text = text.translate(None, (to_remove)) 164 | # Removes all punctuation and digits 165 | text = text.lower() 166 | text = text.split() 167 | # Splits the text into a list of lowercase words 168 | # Possible improvements: Strip tables, formatting (e.g. , - 2 -) 169 | self.num_words = len(text) 170 | self.word_count = {} 171 | for word in text: 172 | try: 173 | self.word_count[word] += 1 174 | except KeyError: 175 | self.word_count[word] = 1 176 | # This try/except method may be somewhat more efficient than if-then branching for unigram processing. For n-grams, perhaps better to use if-then. 
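    # Note: build_word_freq stores its counts in self.word_count, but __init__
    # initializes self.word_freq and generate_word_index reads document.word_freq,
    # so one of the two attribute names is likely stale.
    # Since collections.Counter is already imported at the top of this module,
    # a hedged sketch of an equivalent Counter-based variant (the method name is
    # hypothetical) -- shorter, at the cost of the larger pickles noted in the
    # comment next to the import:
    def build_word_freq_counter(self, text):
        # Strip punctuation and digits, lowercase, split -- same steps as above
        text = text.translate(None, string.punctuation + string.digits).lower().split()
        self.num_words = len(text)
        self.word_freq = Counter(text)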
177 | 178 | #def compress(self, ilist, idict): 179 | 180 | def parse_quarterly_header(self, header): 181 | property_info = ( 182 | #DictionaryName, FilingText, int list 183 | ('DocType', 'CONFORMED SUBMISSION TYPE:', 0, 0), 184 | ('ReportingPeriod', 'CONFORMED PERIOD OF REPORT:', 0, 0), 185 | ('FilingDate', 'FILED AS OF DATE:', 0, 0), 186 | ('CompanyName', 'COMPANY CONFORMED NAME:', 0, 1), 187 | ('CIK', 'CENTRAL INDEX KEY:', 1, 1), 188 | ('SIC', 'STANDARD INDUSTRIAL CLASSIFICATION:', 1, 1), 189 | ('IRS_Num', 'IRS NUMBER:', 1, 1), 190 | ('FY_End', 'FISCAL YEAR END:', 1, 1), 191 | ('SEC_FileNo', 'SEC FILE NUMBER:', 1, 1)) 192 | # Defines the properties to seek in the header of the filing, and names to assign them to in the self.properties dictionary. I hope Python doesn't waste time re-creating this tuple every time parse_quarterly_filing is called. 193 | for line in header: 194 | line = line.strip() 195 | for property_tuple in property_info: 196 | self.grab_property(line, *property_tuple) 197 | 198 | if len(self.properties) < len(property_info): 199 | msg = "Found %d of %d fields" % (len(self.properties), len(property_info)) 200 | logging.warning(msg) 201 | 202 | try: 203 | self.convert_property_to_date('ReportingPeriod') 204 | except KeyError: 205 | logging.warning("Doc has no reporting period") 206 | try: 207 | self.convert_property_to_date('FilingDate') 208 | self.date = self.properties['FilingDate'] 209 | except KeyError: 210 | logging.error("Doc has no filing date!") 211 | print "Doc %s has no filing date!" % self.path 212 | try: 213 | self.type = self.properties['DocType'] 214 | self.CIK = self.properties['CIK'] # A list 215 | self.SIC = self.properties['SIC'] # A list 216 | self.cname = self.properties['CompanyName'] # A list 217 | except KeyError as e: 218 | logging.error(e) 219 | 220 | def grab_property(self, line, name, identifier, isInt=0, isList=0): 221 | """Checks LINE for IDENTIFIER. If IDENTIFIER is found in the line, then the text immediately after IDENTIFIER is saved in self.properties[PROPNAME]. If the isInt flag is set, then the content is converted to an integer value. If it doesn't convert to int cleanly, then non-digits characters are stripped, it's force converted, and a note is made in the log. In text mode, leading or trailing whitespace around the content is also removed. grab_property returns the content that it stores. If PROPNAME is "" then no value is stored, but the content is still returned.""" 222 | if line.startswith(identifier): 223 | content = line.partition(identifier)[2].strip() 224 | # Take the content after the identifier, and strip whitespace 225 | props = self.properties 226 | if isInt: 227 | try: 228 | content = int(content) 229 | except ValueError: 230 | logging.debug('''ValueError occured converting "%s" to int in line:\n%s . 
Forcing conversion.''' % (content, line)) 231 | try: 232 | to_remove = string.punctuation + string.ascii_letters + string.whitespace 233 | content = int(content.translate(None, to_remove)) 234 | except ValueError as e: 235 | logging.error('Unable to force-convert ' + str(e)) 236 | if name != '': 237 | # If propname is the empty string, nothing is stored 238 | if isList: 239 | if name in props: # Append to existing list 240 | props[name].append(content) 241 | else: # Start a new list 242 | props[name] = [content] 243 | else: # Just store a value 244 | props[name] = content 245 | return content 246 | 247 | def convert_property_to_date(self, propname): 248 | prop = self.properties[propname] 249 | yyyy = int(prop[0:4]) 250 | mm = int(prop[4:6]) 251 | dd = int(prop[6:8]) 252 | self.properties[propname] = date(yyyy, mm, dd) 253 | 254 | def main(): 255 | if len(sys.argv) == 1: 256 | data_dir = "./TestData/Docs_From_1994/" 257 | else: 258 | data_dir = argv[1] 259 | 260 | with open('./data_manager.log', 'w') as cleanlog: 261 | pass # Empty the log before each run 262 | 263 | logging.basicConfig(filename='data_manager.log', level=logging.DEBUG) 264 | universe = Financial_Universe(data_dir) 265 | 266 | print "Statistics: %d documents, %d companies %d industries %d words" % (universe.num_docs, universe.num_companies, universe.num_industries, universe.num_words) 267 | logging.info("Statistics: %d documents, %d companies %d industries %d words" % (universe.num_docs, universe.num_companies, universe.num_industries, universe.num_words)) 268 | with open('./universe.dat', 'w') as f: 269 | pickle.dump(universe, f, 0) 270 | #debug() 271 | 272 | if __name__ == "__main__": 273 | main() 274 | 275 | 276 | -------------------------------------------------------------------------------- /manager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Current Status: Shift to SQL database instead of ad-hoc python class in progress. 4 | 5 | import os, sys, logging, string, time, math, rb_parser 6 | from rb_parser import ParseError 7 | from pdb import set_trace as debug 8 | from os.path import basename 9 | Path = os.path.join 10 | try: 11 | import cPickle as pickle 12 | except: 13 | import pickle 14 | #import stock 15 | 16 | 17 | def main(): 18 | # Todo: Implement better UI 19 | DataDir = os.path.expanduser('~/Documents/Code/RoboBuffett/Data/') 20 | logfile = DataDir + '../Logs/manager.log' 21 | touch(logfile) 22 | logging.basicConfig(filename=logfile, level=logging.DEBUG) 23 | manager = load_manager(DataDir) 24 | #manager.preprocess() 25 | #manager.process() 26 | [for co in manager.] 
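    # The list-comprehension fragment above is incomplete and will not parse; it
    # has to be finished or commented out before this script will run.
    # A hedged, commented-out guess at a per-company pass, using only attributes
    # that Manager/Company actually define (active_CIKs, load_company, rebuild_wordset):
    #   for CIK in manager.active_CIKs:
    #       co = manager.load_company(CIK)
    #       co.rebuild_wordset()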
27 | 28 | save_manager(manager) 29 | save_industry_dict(manager) 30 | pretty_dict(manager.industries) 31 | manager.print_stats() 32 | 33 | 34 | 35 | def load_manager(DataDir): 36 | '''Load the manager from desk if it's available, or generate a new Manager instance if not''' 37 | try: 38 | with open(DataDir + 'Pickles/manager.dat', 'r') as f: 39 | return pickle.load(f) 40 | except IOError: 41 | return Manager(DataDir) 42 | 43 | def save_manager(manager): 44 | '''Saves the manager to disk''' 45 | with open(manager.DataDir + 'Pickles/manager.dat', 'w') as f: 46 | pickle.dump(manager, f, 2) 47 | 48 | def save_industry_dict(manager): 49 | '''Saves industry dict, useful for generating price indices''' 50 | with open(manager.DataDir + 'Pickles/industrydict.dat', 'w') as f: 51 | pickle.dump(manager.industries, f, 0) 52 | 53 | class Manager(object): 54 | """Persistent object that manages the entire dataset 55 | Functionality overview: 56 | init = Basic setup 57 | preprocess = Vital step that organizes documents by CIK. Moves 58 | unparseable documents to 'Exceptions' 59 | process = Process documents in Data/Preprocessed. Create company 60 | entries containing parsed word counts. Moves documents 61 | to 'Active' or 'Inactive' 62 | print_stats = Print a bunch of statistics about the manager 63 | 64 | Terminology: 65 | CIK: Central Index Key, used as unique identifiers for companies 66 | SIC: Standard Industrial Code, SEC's industry designators 67 | good CIK: a CIK for which we have stock price information 68 | active CIK: a CIK in the document dataset which is good 69 | inactive CIK: a CIK in the document dataset which isn't good 70 | processed documents: have been run thru the pre-processor 71 | exception documents: they didn't parse 72 | active documents: 'owned' by an active CIK 73 | inactive documents: not owned by an active CIK 74 | valid documents: union of active and inactive documents 75 | 76 | 77 | """ 78 | def __init__(self, DataDir): 79 | ### Set up directory structure ### 80 | self.DataDir = DataDir 81 | os.chdir(DataDir) 82 | vital_dirs = ('Pickles/','Pickles/CIKs','Active/','Inactive/', 83 | 'Unprocessed/','Preprocessed/','Processed/','Exceptions/') 84 | map(ensure, vital_dirs) # Make sure they all exist 85 | 86 | ### Mappings ### 87 | self.industries = {} # Mapping from SIC->[CIK] 88 | self.CIK_to_Ticker = dePickle('Pickles/CIK_Ticker.dat') 89 | 90 | ### Sets ### 91 | self.good_CIKs = set(self.CIK_to_Ticker.iterkeys()) 92 | self.active_CIKs = set() 93 | self.CIK2date = {} # Map from active CIKs to documents (the dates) 94 | self.inactive_CIKs = set() 95 | self.processed_docs = set() 96 | # Original names of all documents processed by the manager. 97 | # Maintained to avoid double-counting documents. 98 | self.valid_docs = set() 99 | 100 | # Invariant: len(processed) >= len(valid) - len(exception) 101 | # This is because for every processed document, the parser 102 | # either fails and generates an exception, or succeeds and 103 | # creates 1 or more valid documents corresponding to the 104 | # number of valid filers (unique CIKs) found in the document. 105 | self.exception_docs = set() 106 | self.active_docs = set() 107 | self.inactive_docs = set() 108 | self.company_word_sets = [] 109 | 110 | 111 | def preprocess(self): 112 | """Preprocess the documents in Data/Unprocessed 113 | Finds a doc's CIKs and creates hard links in the folder 114 | Preprocessed/CIK. If a doc doesn't parse properly, it is 115 | moved to Data/Exceptions instead. 
116 | The pre-processing step allows us to consider only one CIK 117 | at a time during the processing step, for memory efficiency. 118 | """ 119 | n_proc = 0 120 | n_valid = 0 121 | n_except = 0 122 | start = time.time() 123 | os.chdir(self.DataDir + 'Unprocessed/') 124 | for (docpath, docname) in recursive_file_gen('.'): 125 | # Returns (path, filename) tuples for all files in directory 126 | # and subdirectories that don't begin with '.' or '_' 127 | if docname in self.processed_docs: continue 128 | self.processed_docs.add(docname) 129 | n_proc += 1 130 | # Code assumes that docnames are unique 131 | try: 132 | (header, cik2filers, _) = rb_parser.parse_quarterly_filing(docpath) 133 | # Returns (but doesn't process) the raw text. 134 | date = header['FilingDate'] 135 | doctype = header['DocType'] 136 | for CIK in cik2filers.iterkeys(): 137 | new_docname = CIK + '_' + date + '.txt' 138 | ensure(self.DataDir + 'Preprocessed/' + CIK) 139 | safelink(docpath, self.DataDir + 'Preprocessed/' + CIK + '/' + new_docname) 140 | if new_docname in self.valid_docs: 141 | print "Repeated doc: %s" % new_docname 142 | self.valid_docs.add(new_docname) 143 | n_valid += 1 144 | if n_valid != len(self.valid_docs): 145 | pass#debug() 146 | 147 | except ParseError as e: 148 | self.exception_docs.add(docname) 149 | n_except += 1 150 | logging.warning(docname + ": " + str(e)) 151 | safelink(docpath, self.DataDir + 'Exceptions/' + basename(docpath)) 152 | 153 | 154 | # if n_proc > n_valid + n_except: 155 | # print "Warning: proc %d, valid %d, except %d" % (n_proc, n_valid, n_except) 156 | # elif n_proc % 100 == 0: 157 | # print "Proc %d, valid %d, except %d, combined %d" % (n_proc, n_valid, n_except, n_valid + n_except) 158 | # if n_proc != len(self.processed_docs) or n_valid != len(self.valid_docs) or n_except != len(self.exception_docs): 159 | # debug() 160 | 161 | end = time.time() 162 | print "Time elapsed in preprocessing: %.1f" % (end-start) 163 | 164 | def process(self): 165 | start = time.time() 166 | os.chdir(self.DataDir + 'Preprocessed') 167 | # Iterate through all the preprocessed CIKs 168 | for CIK in os.listdir('.'): 169 | if CIK[0] == '.' or not os.path.isdir(CIK): continue 170 | 171 | if CIK in self.good_CIKs: 172 | self.active_CIKs.add(CIK) 173 | company = self.load_company(CIK) 174 | ensure(self.DataDir + 'Active/' + CIK) 175 | if CIK not in self.CIK2date: 176 | self.CIK2date[CIK] = [] 177 | for filing in os.listdir(CIK): 178 | filingpath = CIK + '/' + filing 179 | (header, filers, rawtext) = rb_parser.parse_quarterly_filing(filingpath) 180 | company.properties(filers) 181 | # Update company properties with info taken from the 'filers' part of the document 182 | date = header['FilingDate'] 183 | company.add_document(date, rawtext) 184 | # Creates a word dictionary and wordcount from the raw text returned by the parser 185 | self.CIK2date[CIK].append(date) 186 | self.active_docs.add(filing) 187 | os.rename(filingpath, self.DataDir + 'Active/' + filingpath) 188 | # Move the filing to the 'Active' directory - note this means atm all parsed data is stored in the directory structure 189 | company.build_wordset() 190 | self.company_word_sets.append(company.wordset) 191 | self.save_company(company) 192 | SIC = company.SIC 193 | 194 | try: 195 | if CIK not in self.industries[SIC]: 196 | self.industries[SIC].append(CIK) 197 | except KeyError: 198 | self.industries[SIC] = [CIK] 199 | del company # Get it out of memory. 
Probably unnecessary 200 | 201 | else: # if CIK not in self.goodCIKs 202 | self.inactive_CIKs.add(CIK) 203 | ensure(self.DataDir + 'Inactive/' + CIK) 204 | for filing in os.listdir(CIK): 205 | self.inactive_docs.add(filing) 206 | os.rename(CIK +'/'+ filing, 207 | self.DataDir + 'Inactive/' + CIK +'/'+ filing) 208 | os.removedirs(CIK) 209 | end = time.time() 210 | print "Time elapsed in processing: %.1f" % (end-start) 211 | 212 | def generic_word_set(self, proportion): 213 | self.generic_word_set = proportional_set_intersection(self.company_word_sets, proportion) 214 | 215 | def gen_training_set(self, cutoff, skipyears): 216 | self.training_set = {} 217 | for CIK, dates in self.CIK2date: 218 | if random.random() > cutoff: continue 219 | datelist = [] 220 | for date in dates: 221 | if date not in skipyears: 222 | datelist.append(date) 223 | if datelist != []: 224 | self.training_set[CIK] = datelist 225 | 226 | 227 | def load_company(self, CIK): 228 | # Look for the company in: 229 | # 1. The pickles directory 230 | # 2. Make a new company 231 | # If #2, then add to active list. If #3, then add to active list and add SIC to industries. 232 | if os.path.exists(self.DataDir + 'Pickles/CIKs/' + CIK + '.dat'): 233 | with open(self.DataDir + 'Pickles/CIKs/' + CIK + '.dat', 'r') as f: 234 | company = pickle.load(f) 235 | else: 236 | company = Company(CIK) 237 | return company 238 | 239 | def save_company(self, company): 240 | with open(self.DataDir + 'Pickles/CIKs/' + company.CIK + '.dat', 'w') as f: 241 | pickle.dump(company, f, 2) 242 | 243 | def print_stats(self): 244 | good = len(self.good_CIKs) 245 | active = len(self.active_CIKs) 246 | inactive = len(self.inactive_CIKs) 247 | sics = len(self.industries.keys()) 248 | proc = len(self.processed_docs) 249 | valid = len(self.valid_docs) 250 | exceptions = len(self.exception_docs) 251 | activeD = len(self.active_docs) 252 | inactiveD = len(self.inactive_docs) 253 | try: 254 | safeprint("%d good CIKs, %d active CIKs, %d inactive CIKs" % (good, active, inactive)) 255 | safeprint("%.2f of observed CIKs are active, %.2f of good CIKs are active" % (active / float(active + inactive), active / float(good))) 256 | safeprint("%d SICs, average of %1.2f active CIKs per SIC" % (sics, active / float(sics))) 257 | safeprint("%d processed documents, %d valid, %d exceptions" % (proc, valid, exceptions)) 258 | safeprint("Implied: %1.2f CIKs per document, %.2f exception rate" % (valid / float(proc - exceptions), exceptions / float(proc))) 259 | safeprint("%d active documents, %d inactive, %.2f activation rate" % (activeD, inactiveD, activeD / float(proc))) 260 | except ZeroDivisionError: 261 | safeprint("Please run the manager on some files before printing stats") 262 | 263 | class Company(object): 264 | """Keeps track of a single company (as identified by CIK). 265 | Contains CIK, SIC classification (if any), name, a list of filingdates, a mapping from filing dates to document parses, and a set of all words used by this company in any document.""" 266 | def __init__(self, CIK): 267 | self.CIK = CIK 268 | self.SIC = 0 269 | self.name = '' 270 | self.dates = [] 271 | self.docs = {} # (count_dict, #words) tuples are indexed by filingdate 272 | self.wordset = set() 273 | 274 | def properties(self, filers): 275 | # If company has no properties, then add them. 
If not, check for discrepancies 276 | filerdict = filers[self.CIK] 277 | newSIC = filerdict['SIC'] 278 | if self.SIC == 0: 279 | self.SIC = newSIC 280 | elif self.SIC != newSIC: 281 | logging.warning("Company switched SICs: CIK: %s orig SIC: %d new SIC: %d" % (self.CIK, self.SIC, newSIC)) 282 | cname = filerdict['CompanyName'] 283 | if self.name == '': 284 | self.name = cname 285 | elif self.name != cname: 286 | logging.warning("Company switched names: %s %s" % (self.name, cname)) 287 | 288 | 289 | def add_document(self, filing_date, raw_text): 290 | self.dates.append(filing_date) 291 | word_count, n_words = rb_parser.build_word_count(raw_text) 292 | self.docs[filing_date] = (word_count, n_words) 293 | self.wordset |= word_count.viewkeys() 294 | 295 | def rebuild_wordset(self): 296 | #Should build a set containing every word which exists in at least one filing 297 | #This is done automatically as documents are added; should only be called if you have some reason to rebuild the entire set 298 | for (word_dict, numwords) in self.docs.itervalues(): 299 | self.wordset |= word_dict.viewkeys() 300 | 301 | 302 | def proportional_set_intersection(sets, p): 303 | # Takes a list of sets: [Set1, Set2, Set3]. 304 | # s = len(sets) 305 | # Returns a set containing every element which was in at least p proportion of the sets, i.e. there were at least s * p instances in the sets 306 | count = {} 307 | for sett in sets: 308 | for element in sett: 309 | try: 310 | count[element] += 1 311 | except KeyError: 312 | count[element] = 1 313 | 314 | s = len(sets) 315 | n = math.floor(s * p) 316 | 317 | outset = set() 318 | for key,val in count.iteritems(): 319 | if val > n: 320 | outset.add(key) 321 | 322 | 323 | # Utility functions 324 | 325 | def recursive_file_gen(mydir): 326 | for root, dirs, files in os.walk(mydir): 327 | for file in files: 328 | if file[0] not in ('.', '_'): 329 | yield (os.path.join(root, file), file) 330 | 331 | def ensure(dir): 332 | if not os.path.exists(dir): 333 | os.makedirs(dir) 334 | 335 | def touch(filepath): 336 | if not os.path.exists(filepath): 337 | with open(filepath, 'w') as f: 338 | pass 339 | 340 | def safelink(source, dest): 341 | try: 342 | os.link(source, dest) 343 | except OSError: 344 | pass 345 | 346 | def dePickle(filestr): 347 | with open(filestr, 'r') as f: 348 | return pickle.load(f) 349 | 350 | def safeprint(string): 351 | try: 352 | print string 353 | except: 354 | pass 355 | 356 | def pretty_dict(output): 357 | lenlist = [] 358 | for key, val in output.iteritems(): 359 | lenlist.append((key,len(val))) 360 | lenlist = sorted(lenlist, key=lambda student: student[1]) 361 | for (sic, i) in lenlist: 362 | print str(sic) + ('*' * i) 363 | 364 | if __name__ == "__main__": 365 | main() --------------------------------------------------------------------------------
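Note on manager.py above: proportional_set_intersection builds outset but ends without
returning it, so Manager.generic_word_set always stores None. A minimal sketch of the same
helper with the return added, assuming the intended result is the set itself; the counting
logic is unchanged, and the use of collections.Counter here is an assumption, not something
manager.py itself does:

    from collections import Counter
    import math

    def proportional_set_intersection(sets, p):
        # Count how many of the input sets each element appears in
        count = Counter()
        for s in sets:
            count.update(s)
        n = math.floor(len(sets) * p)
        # Keep elements that appear in more than n of the sets
        return set(k for k, v in count.iteritems() if v > n)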