├── requirements.txt
├── README.md
├── project_helper.py
├── tests.py
└── project_tests.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | alphalens==0.3.2
2 | nltk==3.4.5
3 | numpy==1.13.3
4 | ratelimit==2.2.0
5 | requests==2.20.0
6 | scikit-learn==0.19.1
7 | six==1.11.0
8 | tqdm==4.19.5
9 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # nlp_10-ks
2 | Code for Project 5 from Udacity's AI for Trading nanodegree program.
3 |
4 | Original source code from https://github.com/udacity/artificial-intelligence-for-trading
--------------------------------------------------------------------------------
/project_helper.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import requests
3 |
4 | from ratelimit import limits, sleep_and_retry
5 |
6 |
7 | class SecAPI(object):
8 |     # Rate-limited wrapper around requests for hitting the SEC EDGAR site.
9 |     SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}
10 |
11 |     @staticmethod
12 |     @sleep_and_retry
13 |     # Dividing the call limit by half to avoid coming close to the limit
14 |     # (integer division: `calls` must be an int, not the float `/` produces)
15 |     @limits(calls=SEC_CALL_LIMIT['calls'] // 2, period=SEC_CALL_LIMIT['seconds'])
16 |     def _call_sec(url):
17 |         return requests.get(url)
18 |
19 |     def get(self, url):
20 |         return self._call_sec(url).text
21 |
22 |
23 | def print_ten_k_data(ten_k_data, fields, field_length_limit=50):
24 |     # Pretty-print selected fields of each 10-K dict, truncating long values.
25 |     indentation = '  '
26 |
27 |     print('[')
28 |     for ten_k in ten_k_data:
29 |         print_statement = '{}{{'.format(indentation)
30 |         for field in fields:
31 |             # Keep the raw value: wrapping it in str() here made the
32 |             # isinstance check below always true and the else branch dead.
33 |             value = ten_k[field]
34 |
35 |             # Show return lines in output
36 |             if isinstance(value, str):
37 |                 value_str = '\'{}\''.format(value.replace('\n', '\\n'))
38 |             else:
39 |                 value_str = str(value)
40 |
41 |             # Cut off the string if it gets too long
42 |             if len(value_str) > field_length_limit:
43 |                 value_str = value_str[:field_length_limit] + '...'
44 |
45 |             print_statement += '\n{}{}: {}'.format(indentation * 2, field, value_str)
46 |
47 |         print_statement += '},'
48 |         print(print_statement)
49 |     print(']')
50 |
51 |
52 | def plot_similarities(similarities_list, dates, title, labels):
53 |     # Plot one similarity series per label against the given dates.
54 |     assert len(similarities_list) == len(labels)
55 |
56 |     plt.figure(1, figsize=(10, 7))
57 |     for similarities, label in zip(similarities_list, labels):
58 |         plt.title(title)
59 |         plt.plot(dates, similarities, label=label)
60 |         plt.legend()
61 |         plt.xticks(rotation=90)
62 |
63 |     plt.show()
--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
1 | import collections
2 | from collections import OrderedDict
3 | import copy
4 | import pandas as pd
5 | import numpy as np
6 | from datetime import date, timedelta
7 |
8 |
9 | pd.options.display.float_format = '{:.8f}'.format
10 |
11 |
12 | def _generate_output_error_msg(fn_name, fn_inputs, fn_outputs, fn_expected_outputs):
13 |     # Build the failure message shown when a project function returns wrong values.
14 |     formatted_inputs = []
15 |     formatted_outputs = []
16 |     formatted_expected_outputs = []
17 |
18 |     for input_name, input_value in fn_inputs.items():
19 |         # Bug fix: inputs were being appended to formatted_outputs,
20 |         # leaving formatted_inputs empty in every error message.
21 |         formatted_inputs.append('INPUT {}:\n{}\n'.format(
22 |             input_name, str(input_value)))
23 |     for output_name, output_value in fn_outputs.items():
24 |         formatted_outputs.append('OUTPUT {}:\n{}\n'.format(
25 |             output_name, str(output_value)))
26 |     for expected_output_name, expected_output_value in fn_expected_outputs.items():
27 |         formatted_expected_outputs.append('EXPECTED OUTPUT FOR {}:\n{}\n'.format(
28 |             expected_output_name, str(expected_output_value)))
29 |
30 |     return 'Wrong value for {}.\n' \
31 |            '{}\n' \
32 |            '{}\n' \
33 |            '{}' \
34 |         .format(
35 |             fn_name,
36 |             '\n'.join(formatted_inputs),
37 |             '\n'.join(formatted_outputs),
38 |             '\n'.join(formatted_expected_outputs))
39 |
40 |
41 | def _is_equal(x, y):
42 |     # Type-aware equality used to detect in-place mutation of inputs.
43 |     is_equal = False
44 |
45 |     # Bug fix: the original tested isinstance(x, pd.DataFrame) or
46 |     # isinstance(y, pd.Series) — an asymmetric check; x should be tested
47 |     # against both pandas types.
48 |     if isinstance(x, (pd.DataFrame, pd.Series)):
49 |         is_equal = x.equals(y)
50 |     elif isinstance(x, np.ndarray):
51 |         is_equal = np.array_equal(x, y)
52 |     elif isinstance(x, list):
53 |         if len(x) == len(y):
54 |             for x_item, y_item in zip(x, y):
55 |                 if not _is_equal(x_item, y_item):
56 |                     break
57 |             else:
58 |                 is_equal = True
59 |     else:
60 |         is_equal = x == y
61 |
62 |     return is_equal
63 |
64 |
65 | def project_test(func):
66 |     # Decorator: run the wrapped test and print confirmation on success.
67 |     def func_wrapper(*args):
68 |         result = func(*args)
69 |         print('Tests Passed')
70 |         return result
71 |
72 |     return func_wrapper
73 |
74 |
75 | def generate_random_tickers(n_tickers=None):
76 |     min_ticker_len = 3
77 |     max_ticker_len = 5
78 |     tickers = []
79 |
80 |     if not n_tickers:
81 |         n_tickers = np.random.randint(8, 14)
82 |
83 |     ticker_symbol_random = np.random.randint(ord('A'), ord('Z')+1, (n_tickers, max_ticker_len))
84 |     # +1: np.random.randint's high bound is exclusive, so max_ticker_len
85 |     # could otherwise never be produced.
86 |     ticker_symbol_lengths = np.random.randint(min_ticker_len, max_ticker_len + 1, n_tickers)
87 |     for ticker_symbol_rand, ticker_symbol_length in zip(ticker_symbol_random, ticker_symbol_lengths):
88 |         ticker_symbol = ''.join([chr(c_id) for c_id in ticker_symbol_rand[:ticker_symbol_length]])
89 |         tickers.append(ticker_symbol)
90 |
91 |     return tickers
92 |
93 |
94 | def generate_random_dates(n_days=None):
95 |     if not n_days:
96 |         n_days = np.random.randint(14, 20)
97 |
98 |     start_year = np.random.randint(1999, 2017)
99 |     start_month = np.random.randint(1, 13)  # high is exclusive; 13 allows December
100 |     start_day = np.random.randint(1, 29)  # capped at 28 so the date is valid in any month
101 |     start_date = date(start_year, start_month, start_day)
102 |
103 |     dates = []
104 |     for i in range(n_days):
105 |         dates.append(start_date + timedelta(days=i))
106 |
107 |     return dates
108 |
109 |
110 | def assert_structure(received_obj, expected_obj, obj_name):
111 |     # Verify type, shape/length, columns, dtypes and indices match the expected object.
112 |     assert isinstance(received_obj, type(expected_obj)), \
113 |         'Wrong type for output {}. Got {}, expected {}'.format(obj_name, type(received_obj), type(expected_obj))
114 |
115 |     if hasattr(expected_obj, 'shape'):
116 |         assert received_obj.shape == expected_obj.shape, \
117 |             'Wrong shape for output {}. 
Got {}, expected {}'.format(obj_name, received_obj.shape, expected_obj.shape)
118 |     elif hasattr(expected_obj, '__len__'):
119 |         assert len(received_obj) == len(expected_obj), \
120 |             'Wrong len for output {}. Got {}, expected {}'.format(obj_name, len(received_obj), len(expected_obj))
121 |
122 |     if type(expected_obj) == pd.DataFrame:
123 |         assert set(received_obj.columns) == set(expected_obj.columns), \
124 |             'Incorrect columns for output {}\n' \
125 |             'COLUMNS: {}\n' \
126 |             'EXPECTED COLUMNS: {}'.format(obj_name, sorted(received_obj.columns), sorted(expected_obj.columns))
127 |
128 |         # This is to catch a case where __equal__ says it's equal between different types
129 |         assert set([type(i) for i in received_obj.columns]) == set([type(i) for i in expected_obj.columns]), \
130 |             'Incorrect types in columns for output {}\n' \
131 |             'COLUMNS: {}\n' \
132 |             'EXPECTED COLUMNS: {}'.format(obj_name, sorted(received_obj.columns), sorted(expected_obj.columns))
133 |
134 |         for column in expected_obj.columns:
135 |             assert received_obj[column].dtype == expected_obj[column].dtype, \
136 |                 'Incorrect type for output {}, column {}\n' \
137 |                 'Type: {}\n' \
138 |                 'EXPECTED Type: {}'.format(obj_name, column, received_obj[column].dtype, expected_obj[column].dtype)
139 |
140 |     if type(expected_obj) in {pd.DataFrame, pd.Series}:
141 |         assert set(received_obj.index) == set(expected_obj.index), \
142 |             'Incorrect indices for output {}\n' \
143 |             'INDICES: {}\n' \
144 |             'EXPECTED INDICES: {}'.format(obj_name, sorted(received_obj.index), sorted(expected_obj.index))
145 |
146 |         # This is to catch a case where __equal__ says it's equal between different types
147 |         assert set([type(i) for i in received_obj.index]) == set([type(i) for i in expected_obj.index]), \
148 |             'Incorrect types in indices for output {}\n' \
149 |             'INDICES: {}\n' \
150 |             'EXPECTED INDICES: {}'.format(obj_name, sorted(received_obj.index), sorted(expected_obj.index))
151 |
152 |
153 | def does_data_match(obj_a, obj_b):
154 |     # Compare two outputs value-by-value, tolerating float rounding via np.isclose.
155 |     if type(obj_a) == pd.DataFrame:
156 |         # Sort Columns (axis must be passed by keyword; positional axis is deprecated)
157 |         obj_b = obj_b.sort_index(axis=1)
158 |         obj_a = obj_a.sort_index(axis=1)
159 |
160 |     if type(obj_a) in {pd.DataFrame, pd.Series}:
161 |         # Sort Indices
162 |         obj_b = obj_b.sort_index()
163 |         obj_a = obj_a.sort_index()
164 |     try:
165 |         data_is_close = np.isclose(obj_b, obj_a, equal_nan=True)
166 |     except TypeError:
167 |         data_is_close = obj_b == obj_a
168 |     else:
169 |         # Reduce elementwise result to a single bool for iterable inputs.
170 |         # Was isinstance(obj_a, collections.Iterable), which was removed in Python 3.10.
171 |         if hasattr(obj_a, '__iter__'):
172 |             data_is_close = data_is_close.all()
173 |
174 |     return data_is_close
175 |
176 |
177 | def assert_output(fn, fn_inputs, fn_expected_outputs, check_parameter_changes=True):
178 |     # Call fn with fn_inputs and assert its output(s) match fn_expected_outputs,
179 |     # optionally also asserting fn did not mutate its input parameters.
180 |     assert type(fn_expected_outputs) == OrderedDict
181 |
182 |     if check_parameter_changes:
183 |         fn_inputs_passed_in = copy.deepcopy(fn_inputs)
184 |     else:
185 |         fn_inputs_passed_in = fn_inputs
186 |
187 |     fn_raw_out = fn(**fn_inputs_passed_in)
188 |
189 |     # Check if inputs have changed
190 |     if check_parameter_changes:
191 |         for input_name, input_value in fn_inputs.items():
192 |             passed_in_unchanged = _is_equal(input_value, fn_inputs_passed_in[input_name])
193 |
194 |             assert passed_in_unchanged, 'Input parameter "{}" has been modified inside the function. ' \
195 |                                         'The function shouldn\'t modify the function parameters.'.format(input_name)
196 |
197 |     fn_outputs = OrderedDict()
198 |     if len(fn_expected_outputs) == 1:
199 |         fn_outputs[list(fn_expected_outputs)[0]] = fn_raw_out
200 |     elif len(fn_expected_outputs) > 1:
201 |         assert type(fn_raw_out) == tuple,\
202 |             'Expecting function to return tuple, got type {}'.format(type(fn_raw_out))
203 |         assert len(fn_raw_out) == len(fn_expected_outputs),\
204 |             'Expected {} outputs in tuple, only found {} outputs'.format(len(fn_expected_outputs), len(fn_raw_out))
205 |         for key_i, output_key in enumerate(fn_expected_outputs.keys()):
206 |             fn_outputs[output_key] = fn_raw_out[key_i]
207 |
208 |     err_message = _generate_output_error_msg(
209 |         fn.__name__,
210 |         fn_inputs,
211 |         fn_outputs,
212 |         fn_expected_outputs)
213 |
214 |     for fn_out, (out_name, expected_out) in zip(fn_outputs.values(), fn_expected_outputs.items()):
215 |         assert_structure(fn_out, expected_out, out_name)
216 |         correct_data = does_data_match(expected_out, fn_out)
217 |
218 |         assert correct_data, err_message
--------------------------------------------------------------------------------
/project_tests.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from collections import OrderedDict
5 |
6 | from tests import assert_output, project_test, assert_structure
7 |
8 |
9 | @project_test
10 | def test_get_documents(fn):
11 |     # Test 1
12 |     doc = '\nThis is inside the document\n' \
13 |           'This is the text that should be copied'
14 |     # Typo fix: the original read 'shouldn\t', embedding a literal TAB
15 |     # where the apostrophe of "shouldn't" was intended.
16 |     text = 'This is before the test document{}\n' \
17 |            'This is after the document\n' \
18 |            'This shouldn\'t be included.'.format(doc)
19 |
20 |     fn_inputs = {
21 |         'text': text}
22 |     fn_correct_outputs = OrderedDict([
23 |         (
24 |             'extracted_docs', [doc])])
25 |
26 |     assert_output(fn, fn_inputs, fn_correct_outputs, check_parameter_changes=False)
27 |
28 |     # Test 2
29 |     ten_k_real_compressed_doc = 
'\n' \ 28 | '10-K\n' \ 29 | '1\n' \ 30 | 'test-20171231x10k.htm\n' \ 31 | '10-K\n' \ 32 | '\n' \ 33 | '\n' \ 34 | '\n' \ 35 | ' \n' \ 36 | ' Document\n' \ 37 | ' \n' \ 38 | ' \n' \ 39 | '...\n' \ 40 | ' Data Type:\n' \ 41 | 'xbrli:sharesItemType\n' \ 42 | '\n' \ 43 | '\n' \ 44 | ' Balance Type:\n' \ 45 | 'na\n' \ 46 | '\n' \ 47 | '\n' \ 48 | ' Period Type:\n' \ 49 | 'duration\n' \ 50 | '\n' \ 51 | '\n' \ 52 | '\n' \ 53 | '\n' \ 54 | '\n' \ 55 | '\n' \ 56 | '\n' \ 57 | '\n' 58 | excel_real_compressed_doc = '\n' \ 59 | 'EXCEL\n' \ 60 | '106\n' \ 61 | 'Financial_Report.xlsx\n' \ 62 | 'IDEA: XBRL DOCUMENT\n' \ 63 | '\n' \ 64 | 'begin 644 Financial_Report.xlsx\n' \ 65 | 'M4$L#!!0 ( %"E04P?(\\#P !," + 7W)E;,O+G)E;.MDD^+\n' \ 66 | 'MPD ,Q;]*F?L:5\#8CUYZ6U9_ )Q)OU#.Y,A$[%^>X>];+=44/ 87O+>CT?V\n' \ 67 | '...\n' \ 68 | 'M,C,Q7V1E9BYX;6Q02P$"% ,4 " !0I4%,>V7[]F0L 0!(@A %0\n' \ 69 | 'M @ %N9@, 86UZ;BTR,#$W,3(S,5]L86(N>&UL4$L! A0#% @\n' \ 70 | 'M4*5!3*U*Q:W#O0 U=\) !4 ( !!9,$ &%M>FXM,C Q-S$R\n' \ 71 | '@,S%?<)E+GAM;%!+!08 !@ & (H! 
#[4 4 !\n' \ 72 | '\n' \ 73 | 'end\n' \ 74 | '\n' 75 | real_compressed_text = '0002014754-18-050402.txt : 20180202\n' \ 76 | '00002014754-18-050402.hdr.sgml : 20180202\n' \ 77 | '20180201204115\n' \ 78 | 'ACCESSION NUMBER: 0002014754-18-050402\n' \ 79 | 'CONFORMED SUBMISSION TYPE: 10-K\n' \ 80 | 'PUBLIC DOCUMENT COUNT: 110\n' \ 81 | 'CONFORMED PERIOD OF REPORT: 20171231\n' \ 82 | 'FILED AS OF DATE: 20180202\n' \ 83 | 'DATE AS OF CHANGE: 20180201\n' \ 84 | '\n' \ 85 | 'FILER:\n' \ 86 | '\n' \ 87 | ' COMPANY DATA: \n' \ 88 | ' COMPANY CONFORMED NAME: TEST\n' \ 89 | ' CENTRAL INDEX KEY: 0001018724\n' \ 90 | ' STANDARD INDUSTRIAL CLASSIFICATION: RANDOM [2357234]\n' \ 91 | ' IRS NUMBER: 91236464620\n' \ 92 | ' STATE OF INCORPORATION: DE\n' \ 93 | ' FISCAL YEAR END: 1231\n' \ 94 | '\n' \ 95 | ' FILING VALUES:\n' \ 96 | ' FORM TYPE: 10-K\n' \ 97 | ' SEC ACT: 1934 Act\n' \ 98 | ' SEC FILE NUMBER: 000-2225413\n' \ 99 | ' FILM NUMBER: 13822526583969\n' \ 100 | '\n' \ 101 | ' BUSINESS ADDRESS: \n' \ 102 | ' STREET 1: 422320 PLACE AVENUE\n' \ 103 | ' CITY: SEATTLE\n' \ 104 | ' STATE: WA\n' \ 105 | ' ZIP: 234234\n' \ 106 | ' BUSINESS PHONE: 306234534246600\n' \ 107 | '\n' \ 108 | ' MAIL ADDRESS: \n' \ 109 | ' STREET 1: 422320 PLACE AVENUE\n' \ 110 | ' CITY: SEATTLE\n' \ 111 | ' STATE: WA\n' \ 112 | ' ZIP: 234234\n' \ 113 | '\n' \ 114 | '{}\n' \ 115 | '{}\n' \ 116 | '\n'.format(ten_k_real_compressed_doc, excel_real_compressed_doc) 117 | 118 | fn_inputs = { 119 | 'text': real_compressed_text} 120 | fn_correct_outputs = OrderedDict([ 121 | ( 122 | 'extracted_docs', [ten_k_real_compressed_doc, excel_real_compressed_doc])]) 123 | 124 | assert_output(fn, fn_inputs, fn_correct_outputs, check_parameter_changes=False) 125 | 126 | 127 | @project_test 128 | def test_get_document_type(fn): 129 | doc = '\n' \ 130 | '10-K\n' \ 131 | '1\n' \ 132 | 'test-20171231x10k.htm\n' \ 133 | '10-K\n' \ 134 | '\n' \ 135 | '\n' \ 136 | '...' 
137 |
138 |     fn_inputs = {
139 |         'doc': doc}
140 |     fn_correct_outputs = OrderedDict([
141 |         (
142 |             'doc_type', '10-k')])
143 |
144 |     assert_output(fn, fn_inputs, fn_correct_outputs, check_parameter_changes=False)
145 |
146 |
147 | @project_test
148 | def test_lemmatize_words(fn):
149 |     fn_inputs = {
150 |         'words': ['cow', 'running', 'jeep', 'swimmers', 'tackle', 'throw', 'driven']}
151 |     fn_correct_outputs = OrderedDict([
152 |         (
153 |             'lemmatized_words', ['cow', 'run', 'jeep', 'swimmers', 'tackle', 'throw', 'drive'])])
154 |
155 |     assert_output(fn, fn_inputs, fn_correct_outputs, check_parameter_changes=False)
156 |
157 |
158 | @project_test
159 | def test_get_bag_of_words(fn):
160 |     def sort_ndarray(array):  # order rows deterministically so the comparison is row-order-insensitive
161 |         hashes = [hash(str(x)) for x in array]
162 |         sorted_indices = sorted(range(len(hashes)), key=lambda k: hashes[k])  # typo fix: was "sotred_indicies"
163 |
164 |         return array[sorted_indices]
165 |
166 |     fn_inputs = {
167 |         'sentiment_words': pd.Series(['one', 'last', 'second']),
168 |         'docs': [
169 |             'this is a document',
170 |             'this document is the second document',
171 |             'last one']}
172 |     fn_correct_outputs = OrderedDict([
173 |         (
174 |             'bag_of_words', np.array([
175 |                 [0, 0, 0],
176 |                 [1, 0, 0],
177 |                 [0, 1, 1]]))])
178 |
179 |     fn_out = fn(**fn_inputs)
180 |     assert_structure(fn_out, fn_correct_outputs['bag_of_words'], 'bag_of_words')
181 |     assert np.array_equal(sort_ndarray(fn_out.T), sort_ndarray(fn_correct_outputs['bag_of_words'].T)), \
182 |         'Wrong value for bag_of_words.\n' \
183 |         'INPUT docs:\n{}\n\n' \
184 |         'OUTPUT bag_of_words:\n{}\n\n' \
185 |         'A POSSIBLE CORRECT OUTPUT FOR bag_of_words:\n{}\n'\
186 |         .format(fn_inputs['docs'], fn_out, fn_correct_outputs['bag_of_words'])
187 |
188 |
189 | @project_test
190 | def test_get_jaccard_similarity(fn):
191 |     fn_inputs = {
192 |         'bag_of_words_matrix': np.array([
193 |             [0, 1, 1, 0, 0, 0, 1],
194 |             [0, 1, 2, 0, 1, 1, 1],
195 |             [1, 0, 0, 1, 0, 0, 0]])}
196 |     fn_correct_outputs = OrderedDict([
197 |         (
198 |             'jaccard_similarities', [0.7142857142857143, 0.0])])
199 |
200 |     assert_output(fn, fn_inputs, fn_correct_outputs, check_parameter_changes=False)
201 |
202 |
203 | @project_test
204 | def test_get_tfidf(fn):
205 |     def sort_ndarray(array):  # order rows deterministically so the comparison is row-order-insensitive
206 |         hashes = [hash(str(x)) for x in array]
207 |         sorted_indices = sorted(range(len(hashes)), key=lambda k: hashes[k])  # typo fix: was "sotred_indicies"
208 |
209 |         return array[sorted_indices]
210 |
211 |     fn_inputs = {
212 |         'sentiment_words': pd.Series(['one', 'last', 'second']),
213 |         'docs': [
214 |             'this is a document',
215 |             'this document is the second document',
216 |             'last one']}
217 |     fn_correct_outputs = OrderedDict([
218 |         (
219 |             'tfidf', np.array([
220 |                 [0.0, 0.0, 0.0],
221 |                 [1.0, 0.0, 0.0],
222 |                 [0.0, 0.70710678, 0.70710678]]))])
223 |
224 |     fn_out = fn(**fn_inputs)
225 |     assert_structure(fn_out, fn_correct_outputs['tfidf'], 'tfidf')
226 |     assert np.isclose(sort_ndarray(fn_out.T), sort_ndarray(fn_correct_outputs['tfidf'].T)).all(), \
227 |         'Wrong value for tfidf.\n' \
228 |         'INPUT docs:\n{}\n\n' \
229 |         'OUTPUT tfidf:\n{}\n\n' \
230 |         'A POSSIBLE CORRECT OUTPUT FOR tfidf:\n{}\n'\
231 |         .format(fn_inputs['docs'], fn_out, fn_correct_outputs['tfidf'])
232 |
233 |
234 | @project_test
235 | def test_get_cosine_similarity(fn):
236 |     fn_inputs = {
237 |         'tfidf_matrix': np.array([
238 |             [0.0, 0.57735027, 0.57735027, 0.0, 0.0, 0.0, 0.57735027],
239 |             [0.0, 0.32516555, 0.6503311, 0.0, 0.42755362, 0.42755362, 0.32516555],
240 |             [0.70710678, 0.0, 0.0, 0.70710678, 0.0, 0.0, 0.0]])}
241 |     fn_correct_outputs = OrderedDict([
242 |         (
243 |             'cosine_similarities', [0.75093766927060945, 0.0])])
244 |
245 |     assert_output(fn, fn_inputs, fn_correct_outputs, check_parameter_changes=False)
246 |
--------------------------------------------------------------------------------