elements
187 | We reduce this risk by only seeking tables with more than 5 (nonblank)
188 | elements, the median length of which is less than 30 characters
189 | """
190 |         char_counts = [len(t) for t in html.stripped_strings if len(t) > 0]
191 |         if char_counts:
192 |             return len(char_counts) > 5 and median(char_counts) < 30
193 |         else:
194 |             # no non-blank strings were found: log the anomaly and keep the table
195 |             self.log_cache.append(('ERROR',
196 |                 "the should_remove_table function is broken"))
197 |             return False
198 | 
199 |
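# A stand-alone sketch of the heuristic above, applied to hypothetical cell
# texts rather than a parsed table (the names here are illustrative only):
from statistics import median

numeric_cells = ['Revenue', '1,234', '5,678', '9,012', '3,456', '7,890']
narrative_cells = ['This sentence of narrative prose is comfortably longer than thirty characters.'] * 6

def looks_like_data_table(cell_texts):
    # mirrors the test above: more than 5 non-blank strings with median length < 30
    char_counts = [len(t) for t in cell_texts if len(t) > 0]
    return len(char_counts) > 5 and median(char_counts) < 30

print(looks_like_data_table(numeric_cells))    # True  -> candidate for removal
print(looks_like_data_table(narrative_cells))  # False -> narrative text kept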
200 |
201 |
202 | def is_line_break(e):
203 | """Is e likely to function as a line break when document is rendered?
204 |
205 |     we are including 'HTML block-level elements' here. Note that <p> ('paragraph')
206 |     and other tags may not necessarily force the appearance of a 'line break'
207 |     on the page if they are enclosed inside other elements, notably a
208 |     table cell
209 | """
210 |
211 |
212 |     is_block_tag = e.name is not None and e.name in ['p', 'div', 'br', 'hr', 'tr',
213 |                                                      'table', 'form', 'h1', 'h2',
214 |                                                      'h3', 'h4', 'h5', 'h6']
215 |     # handle block tags inside tables: if the apparent block formatting is
216 |     # enclosed in table cell <td> tags, and if there are no other block
217 |     # elements within the cell (it's a singleton), then it will not
218 |     # necessarily appear on a new line, so we don't treat it as a line break
219 | if is_block_tag and e.parent.name == 'td':
220 | if len(e.parent.findChildren(name=e.name)) == 1:
221 | is_block_tag = False
222 | # inspect the style attribute of element e (if any) to see if it has
223 | # block style, which will appear as a line break in the document
224 | if hasattr(e, 'attrs') and 'style' in e.attrs:
225 | is_block_style = re.search('margin-(top|bottom)', e['style'])
226 | else:
227 | is_block_style = False
228 | return is_block_tag or is_block_style
229 |
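# A stand-alone sketch of the table-cell special case described above (uses
# bs4's modern find_all rather than the findChildren alias; same behaviour):
from bs4 import BeautifulSoup

cell = BeautifulSoup('<table><tr><td><p>lone paragraph</p></td></tr></table>',
                     'html.parser').find('p')
# a lone <p> whose parent is a <td>, with no sibling <p> elements in that cell,
# does not count as a line break under the logic above
print(cell.parent.name)                           # td
print(len(cell.parent.find_all(name=cell.name)))  # 1

block = BeautifulSoup('<div>standalone block</div>', 'html.parser').find('div')
print(block.parent.name)                          # [document] -> treated as a line break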
230 |
--------------------------------------------------------------------------------
/src/metadata.py:
--------------------------------------------------------------------------------
1 | """
2 | secedgartext: extract text from SEC corporate filings
3 | Copyright (C) 2017 Alexander Ions
4 |
5 | You should have received a copy of the GNU General Public License
6 | along with this program. If not, see <http://www.gnu.org/licenses/>.
7 | """
8 | import json
9 | import re
10 | from bs4 import BeautifulSoup, Tag, NavigableString
11 | import time
12 | import random
13 |
14 | from .utils import logger
15 | from .utils import args, requests_get
16 | from .utils import batch_number, batch_start_time, batch_machine_id
17 | from .utils import sql_cursor, sql_connection
18 |
19 |
20 | class Metadata(object):
21 | def __init__(self, index_url=None):
22 | self.sec_cik = ''
23 | self.sec_company_name = ''
24 | self.document_type = ''
25 | self.sec_form_header = ''
26 | self.sec_period_of_report = ''
27 | self.sec_filing_date = ''
28 | self.sec_changed_date = ''
29 | self.sec_accepted_date = ''
30 | self.sec_index_url = ''
31 | self.sec_url = ''
32 | self.metadata_file_name = ''
33 | self.original_file_name = ''
34 | self.original_file_size = ''
35 | self.document_group = ''
36 | self.section_name = ''
37 | self.section_n_characters = None
38 | self.endpoints = []
39 | self.extraction_method = ''
40 | self.warnings = []
41 | self.company_description = ''
42 | self.output_file = None
43 | self.time_elapsed = None
44 | self.batch_number = batch_number
45 | self.batch_signature = args.batch_signature or ''
46 | self.batch_start_time = str(batch_start_time)
47 | self.batch_machine_id = batch_machine_id
48 | self.section_end_time = None
49 |
50 | if index_url:
51 | index_metadata = {}
52 | attempts = 0
53 | while attempts < 5:
54 | try:
55 | ri = requests_get(index_url)
56 | logger.info('Status Code: ' + str(ri.status_code))
57 | soup = BeautifulSoup(ri.text, 'html.parser')
58 | # Parse the page to find metadata
59 | form_type = soup.find('div', {'id': 'formHeader'}). \
60 | find_next('strong').string.strip()
61 | break
62 |                 except Exception:
63 | attempts += 1
64 | logger.warning('No valid index page, attempt %i: %s'
65 | % (attempts, index_url))
66 | time.sleep(attempts*10 + random.randint(1,5))
67 |
68 | index_metadata['formHeader'] = form_type
69 | infoheads = soup.find_all('div', class_='infoHead')
70 | for i in infoheads:
71 | j = i.next_element
72 |                 while not isinstance(j, Tag) or \
73 |                         'info' not in j.attrs['class']:
74 | j = j.next_element
75 | # remove colons, spaces, hyphens from dates/times
76 | if type(j.string) is NavigableString:
77 | index_metadata[i.string] = re.sub('[: -]', '',
78 | j.string).strip()
79 | i = soup.find('span', class_='companyName')
80 | while not (isinstance(i, NavigableString)):
81 | i = i.next_element
82 | index_metadata['companyName'] = i.strip()
83 | i = soup.find(string='CIK')
84 |             while not isinstance(i, NavigableString) or not re.search(r'\d{10}', i.string):
85 |                 i = i.next_element
86 |             index_metadata['CIK'] = re.search(r'\d{5,}', i).group()
87 |
88 | for pair in [['Period of Report', 'sec_period_of_report'],
89 | ['Filing Date', 'sec_filing_date'],
90 | ['Filing Date Changed', 'sec_changed_date'],
91 | ['Accepted', 'sec_accepted_date'],
92 | ['formHeader', 'sec_form_header'],
93 | ['companyName', 'sec_company_name'],
94 | ['CIK', 'sec_cik']]:
95 | if pair[0] in index_metadata:
96 | setattr(self, pair[1], index_metadata[pair[0]])
97 |
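# A rough, stand-alone sketch of the index-page structure the parsing above
# expects; the markup below is a simplified stand-in, not a verbatim EDGAR page:
from bs4 import BeautifulSoup

sample = BeautifulSoup(
    '<div id="formHeader"><strong>10-K</strong></div>'
    '<div class="infoHead">Filing Date</div><div class="info">2017-03-01</div>',
    'html.parser')
form_type = sample.find('div', {'id': 'formHeader'}).find_next('strong').string.strip()
info_head = sample.find('div', class_='infoHead')
info_value = info_head.find_next(class_='info').string
print(form_type, '|', info_head.string, '=', info_value)   # 10-K | Filing Date = 2017-03-01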
98 | def add_data_from_filing_text(self, text):
99 | """Scrape metadata from the filing document
100 |
101 | Find key metadata fields at the start of the filing submission,
102 | if they were not already found in the SEC index page
103 | :param text: full text of the filing
104 | """
105 | for pair in [['CONFORMED PERIOD OF REPORT:', 'sec_period_of_report'],
106 | ['FILED AS OF DATE:', 'sec_filing_date'],
107 | ['DATE AS OF CHANGE:', 'sec_changed_date'],
108 |                      ['<ACCEPTANCE-DATETIME>', 'sec_accepted_date'],
109 | ['COMPANY CONFORMED NAME:', 'sec_company_name'],
110 |                      ['CENTRAL INDEX KEY:', 'sec_cik']]:
111 | srch = re.search('(?<=' + pair[0] + ').*', text)
112 | if srch and not getattr(self, pair[1]):
113 | setattr(self, pair[1], srch.group().strip())
114 |
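# Stand-alone sketch of the look-behind search used above, applied to a made-up
# fragment of the header that precedes each filing (values illustrative only):
import re

header_text = ('CONFORMED PERIOD OF REPORT:  20161231\n'
               'FILED AS OF DATE:            20170301\n')
match = re.search('(?<=FILED AS OF DATE:).*', header_text)
print(match.group().strip())   # 20170301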
115 | def save_to_json(self, file_path):
116 | """
117 | we effectively convert the Metadata object's data into a dict
118 | when we do json.dumps on it
119 | :param file_path:
120 | :return:
121 | """
122 | with open(file_path, 'w', encoding='utf-8') as json_output:
123 | # to write the backslashes in the JSON file legibly
124 | # (without duplicate backslashes), we have to
125 | # encode/decode using the 'unicode_escape' codec. This then
126 | # allows us to open the JSON file and click on the file link,
127 | # for immediate viewing in a browser.
128 | excerpt_as_json = json.dumps(self, default=lambda o: o.__dict__,
129 | sort_keys=False, indent=4)
130 | json_output.write(bytes(excerpt_as_json, "utf-8").
131 | decode("unicode_escape"))
132 |
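# Stand-alone sketch of the backslash handling described in the comment above,
# using a made-up Windows-style path:
import json

excerpt = json.dumps({'output_file': 'C:\\batch_0001\\excerpt.txt'})
print(excerpt)                                            # {"output_file": "C:\\batch_0001\\excerpt.txt"}
print(bytes(excerpt, 'utf-8').decode('unicode_escape'))   # {"output_file": "C:\batch_0001\excerpt.txt"}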
133 |
134 | def save_to_db(self):
135 | """Append metadata to sqlite database
136 |
137 | """
138 |
139 | # conn = sqlite3.connect(path.join(args.storage, 'metadata.sqlite3'))
140 | # c = conn.cursor()
141 | sql_insert = """INSERT INTO metadata (
142 | batch_number,
143 | batch_signature,
144 | batch_start_time,
145 | batch_machine_id,
146 | sec_cik,
147 | company_description,
148 | sec_company_name,
149 | sec_form_header,
150 | sec_period_of_report,
151 | sec_filing_date,
152 | sec_index_url,
153 | sec_url,
154 | metadata_file_name,
155 | document_group,
156 | section_name,
157 | section_n_characters,
158 | section_end_time,
159 | extraction_method,
160 | output_file,
161 | start_line,
162 | end_line,
163 | time_elapsed) VALUES
164 | """ + "('" + "', '".join([str(self.batch_number),
165 | str(self.batch_signature),
166 | str(self.batch_start_time)[:-3], # take only 3dp microseconds
167 | self.batch_machine_id,
168 | self.sec_cik,
169 | re.sub("[\'\"]","", self.company_description).strip(),
170 | re.sub("[\'\"]","", self.sec_company_name).strip(),
171 | self.sec_form_header, self.sec_period_of_report,
172 | self.sec_filing_date,
173 | self.sec_index_url, self.sec_url,
174 | self.metadata_file_name, self.document_group,
175 | self.section_name, str(self.section_n_characters),
176 | str(self.section_end_time)[:-3],
177 | self.extraction_method,
178 | str(self.output_file),
179 | re.sub("[\'\"]","", self.endpoints[0]).strip()[0:200],
180 | re.sub("[\'\"]","", self.endpoints[1]).strip()[0:200],
181 | str(self.time_elapsed)]) + "')"
182 | sql_insert = sql_insert.replace("'None'","NULL")
183 | sql_cursor.execute(sql_insert)
184 | sql_connection.commit()
185 |
186 |
187 | def load_from_json(file_path):
188 | metadata = Metadata()
189 | with open(file_path, 'r') as json_file:
190 | try:
191 | # data = json.loads(data_file.read().replace('\\', '\\\\'), strict=False)
192 | data = json.loads(json_file.read())
193 | metadata.sec_cik = data['sec_cik']
194 | metadata.sec_company_name = data['sec_company_name']
195 | metadata.company_description = data['company_description']
196 | metadata.document_type = data['document_type']
197 | metadata.sec_form_header = data['sec_form_header']
198 | metadata.sec_period_of_report = data['sec_period_of_report']
199 | metadata.sec_filing_date = data['sec_filing_date']
200 | metadata.sec_changed_date = data['sec_changed_date']
201 |             metadata.sec_accepted_date = data['sec_accepted_date']
203 | metadata.sec_url = data['sec_url']
204 | metadata.metadata_file_name = data['metadata_file_name']
205 | metadata.original_file_name = data['original_file_name']
206 | metadata.original_file_size = data['original_file_size']
207 |             metadata.document_group = data['document_group']
208 | metadata.section_name = data['section_name']
209 | metadata.section_n_characters = data['section_n_characters']
210 | metadata.endpoints = data['endpoints']
211 | metadata.extraction_method = data['extraction_method']
212 | metadata.warnings = data['warnings']
213 | metadata.output_file = data['output_file']
214 | metadata.time_elapsed = data['time_elapsed']
215 | metadata.batch_number = data['batch_number']
216 | metadata.batch_signature = data['batch_signature']
217 | metadata.batch_start_time = data['batch_start_time']
218 | metadata.batch_machine_id = data['batch_machine_id']
219 | metadata.section_end_time = data['section_end_time']
220 |
221 |         except (KeyError, ValueError):
222 |             logger.warning('Could not load corrupted JSON file: ' + file_path)
223 |
224 | return metadata
225 |
226 |
--------------------------------------------------------------------------------
/src/text_document.py:
--------------------------------------------------------------------------------
1 | """
2 | secedgartext: extract text from SEC corporate filings
3 | Copyright (C) 2017 Alexander Ions
4 |
5 | You should have received a copy of the GNU General Public License
6 | along with this program. If not, see <http://www.gnu.org/licenses/>.
7 | """
8 | import re
9 |
10 | from .document import Document
11 |
12 |
13 | class TextDocument(Document):
14 | def __init__(self, *args, **kwargs):
15 | super(TextDocument, self).__init__(*args, **kwargs)
16 |
17 | def search_terms_type(self):
18 | return "txt"
19 |
20 | def extract_section(self, search_pairs):
21 |         """Extract the requested document section from the plain-text filing
22 | 
23 |         :param search_pairs: list of dicts giving 'start' and 'end' regex patterns
24 |         :return: extracted text, extraction summary, start/end text, warnings
25 | """
26 | start_text = 'na'
27 | end_text = 'na'
28 | warnings = []
29 | text_extract = None
30 | for st_idx, st in enumerate(search_pairs):
31 |             # ungreedy search (note '.*?' regex expression between 'start' and 'end' patterns)
32 | # also using (?:abc|def) for a non-capturing group
33 | # st = super().search_terms_pattern_to_regex()
34 | # st = Reader.search_terms_pattern_to_regex(st)
35 | item_search = re.findall(st['start'] + '.*?' + st['end'],
36 | self.doc_text,
37 | re.DOTALL | re.IGNORECASE)
38 | if item_search:
39 |                 longest_text_length = 0
40 |                 for s in item_search:
41 |                     if len(s) > longest_text_length:
42 |                         text_extract = s.strip()
43 |                         longest_text_length = len(text_extract)
44 | # final_text_new = re.sub('^\n*', '', final_text_new)
45 | final_text_lines = text_extract.split('\n')
46 | start_text = final_text_lines[0]
47 | end_text = final_text_lines[-1]
48 | break
49 | if text_extract:
50 | # final_text = '\n'.join(final_text_lines)
51 | # text_extract = remove_table_lines(final_text)
52 | text_extract = remove_table_lines(text_extract)
53 | extraction_summary = self.extraction_method + '_document'
54 | else:
55 | warnings.append('Extraction did not work for text file')
56 | extraction_summary = self.extraction_method + '_document: failed'
57 | return text_extract, extraction_summary, start_text, end_text, warnings
58 |
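# Stand-alone sketch of the non-greedy start/end search used in extract_section,
# with a made-up search pair (real patterns come from document_group_section_search.json):
import re

doc_text = 'ITEM 1. BUSINESS\nWe make widgets.\nITEM 1A. RISK FACTORS\n...'
pair = {'start': r'ITEM\s+1\.', 'end': r'ITEM\s+1A\.'}
matches = re.findall(pair['start'] + '.*?' + pair['end'], doc_text,
                     re.DOTALL | re.IGNORECASE)
print(matches[0])   # spans from 'ITEM 1.' up to and including 'ITEM 1A.'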
59 | def remove_table_lines(input_text):
60 | """Replace lines believed to be part of numeric tables with a placeholder.
61 |
62 | :param input_text:
63 | :return:
64 | """
65 | text_lines = []
66 | table_lines = []
67 | post_table_lines = []
68 | is_in_a_table = False
69 | is_in_a_post_table = False
70 | all_lines = input_text.splitlines(True)
71 | for i, line in enumerate(all_lines, 0):
72 | if is_table_line(line):
73 | # a table line, possibly not part of an excerpt
74 | if is_in_a_post_table:
75 | # table resumes: put the inter-table lines into the table_line list
76 | table_lines = table_lines + post_table_lines
77 | post_table_lines = []
78 | is_in_a_post_table = False
79 | table_lines.append(line)
80 | is_in_a_table = True
81 | else:
82 | # not a table line
83 | if is_in_a_table:
84 | # the first post-table line
85 | is_in_a_table = False
86 | is_in_a_post_table = True
87 | post_table_lines.append(line)
88 | elif is_in_a_post_table:
89 | # 2nd and subsequent post-table lines, or final line
90 | if len(post_table_lines) >= 4:
91 | # sufficient post-table lines have accumulated now that we
92 | # revert to standard 'not a post table' mode.
93 | # We append the post-table lines to the text_lines,
94 | # and we discard the table_lines
95 | if len(table_lines) >= 3:
96 | text_lines.append(
97 | '[DATA_TABLE_REMOVED_' +
98 | str(len(table_lines)) + '_LINES]\n\n')
99 | else:
100 | # very short table, so we just leave it in
101 | # the document regardless
102 | text_lines = text_lines + table_lines
103 | text_lines = text_lines + post_table_lines
104 | table_lines = []
105 | post_table_lines = []
106 | is_in_a_post_table = False
107 | else:
108 | post_table_lines.append(line)
109 | if not (is_in_a_table) and not (is_in_a_post_table):
110 | # normal excerpt line: just append it to text_lines
111 | text_lines.append(line)
112 | # Tidy up any outstanding table_lines and post_table_lines at the end
113 | if len(table_lines) >= 3:
114 | text_lines.append(
115 | '[DATA_TABLE_REMOVED_' + str(len(table_lines)) + '_LINES]\n\n')
116 | else:
117 | text_lines = text_lines + table_lines
118 | text_lines = text_lines + post_table_lines
119 |
120 | final_text = ''.join(text_lines)
121 | return final_text
122 |
123 |
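# Stand-alone sketch of how a detected run of table lines is collapsed
# (thresholds restated from remove_table_lines above):
table_lines = ['Revenue        100     200     300\n',
               'Costs           40      80     120\n',
               'Profit          60     120     180\n']
if len(table_lines) >= 3:
    replacement = '[DATA_TABLE_REMOVED_' + str(len(table_lines)) + '_LINES]\n\n'
else:
    replacement = ''.join(table_lines)   # very short tables are left in place
print(replacement)   # [DATA_TABLE_REMOVED_3_LINES]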
124 | def is_table_line(s):
125 | """Is text line string s likely to be part of a numeric table?
126 |
127 | gaps between table 'cells' are expected to have three or more whitespaces,
128 |     and table rows are expected to have at least two such gaps, i.e. three or more columns
129 |
130 | :param s:
131 | :return:
132 | """
133 | s = s.replace('\t', ' ')
134 |     rs = re.findall(r'\S\s{3,}', s)  # \S = non-whitespace, \s = whitespace
135 |     r = re.search('(<TABLE|(-|=|_){5,})', s)  # check for <TABLE> quasi-HTML tag,
136 | # or use of lots of punctuation marks as table gridlines
137 | # Previously also looking for ^\s{10,}[a-zA-z] "lots of spaces prior to
138 | # the first (non-numeric i.e. not just a page number marker) character".
139 | # Not using this approach because risk of confusion with centre-justified
140 | # section headings in certain text documents
141 |     return len(rs) >= 2 or r is not None
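# Stand-alone worked example of the whitespace-gap test above (pattern restated
# so the snippet runs on its own):
import re

table_row = 'Revenue        100     200     300'
prose_row = 'Revenue grew by 12 per cent compared with the prior year.'
print(len(re.findall(r'\S\s{3,}', table_row)))   # 3 gaps -> classed as a table row
print(len(re.findall(r'\S\s{3,}', prose_row)))   # 0 gaps -> classed as normal text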
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | secedgartext: extract text from SEC corporate filings
3 | Copyright (C) 2017 Alexander Ions
4 |
5 | You should have received a copy of the GNU General Public License
6 | along with this program. If not, see <http://www.gnu.org/licenses/>.
7 | """
8 |
9 | import logging
10 | import os
11 | import sys
12 | import shutil
13 | import argparse
14 | import re
15 | from os import path
16 | import socket
17 | import time
18 | import datetime
19 | import json
20 | import sqlite3
21 | import multiprocessing as mp
22 | from copy import copy
23 |
24 |
25 | """Parse the command line arguments
26 | """
27 | companies_file_location = ''
28 | single_company = ''
29 | project_dir = path.dirname(path.dirname(__file__))
30 | parser = argparse.ArgumentParser()
31 | parser.add_argument('--storage', help='Specify path to storage location')
32 | parser.add_argument('--write_sql', default=True, help='Save metadata to sqlite database? (Boolean)')
33 | parser.add_argument('--company', help='CIK code specifying company for single-company download')
34 | parser.add_argument('--companies_list', help='path of text file with all company CIK codes to download')
35 | parser.add_argument('--filings', help='comma-separated list of SEC filings of interest (10-Q,10-K...)')
36 | parser.add_argument('--documents', help='comma-separated list of document types to extract (default: all types listed in the search terms JSON file)')
37 | parser.add_argument('--start', help='document start date passed to EDGAR web interface')
38 | parser.add_argument('--end', help='document end date passed to EDGAR web interface')
39 | parser.add_argument('--report_period', help='search pattern for company report dates, e.g. 2012, 201206 etc.')
40 | parser.add_argument('--batch_signature')
41 | parser.add_argument('--start_company', help='index number of first company to download from the companies_list file')
42 | parser.add_argument('--end_company', help='index number of last company to download from the companies_list file')
43 | parser.add_argument('--traffic_limit_pause_ms', help='time to pause between download attempts, to avoid overloading EDGAR server')
44 | parser.add_argument('--multiprocessing_cores', help='number of processor cores to use')
45 | args = parser.parse_args()
46 |
47 | if args.storage:
48 | if not path.isabs(args.storage):
49 | args.storage = path.join(project_dir, args.storage)
50 | else:
51 | args.storage = path.join(project_dir, 'output_files_examples')
52 |
53 | args.write_sql = str(args.write_sql).lower() not in ('false', '0', 'no')  # treat 'False'/'0'/'no' as off
54 | if args.company:
55 | single_company = args.company
56 | else:
57 | if args.companies_list:
58 | companies_file_location = os.path.join(project_dir, args.companies_list)
59 | else:
60 | companies_file_location = os.path.join(project_dir, 'companies_list.txt')
61 |
62 | args.filings = args.filings or \
63 | input('Enter filings search text (default: 10-K,10-Q): ') or \
64 | '10-K,10-Q'
65 | args.filings = re.split(',', args.filings) # ['10-K','10-Q']
66 |
67 | if '10-K' in args.filings:
68 | search_window_days = 365
69 | else:
70 | search_window_days = 91
71 | ccyymmdd_default_start = (datetime.datetime.now() - datetime.timedelta(days=
72 | search_window_days)).strftime('%Y%m%d')
73 | args.start = int(args.start or \
74 | input('Enter start date for filings search (default: ' +
75 | ccyymmdd_default_start + '): ') or \
76 | ccyymmdd_default_start)
77 | ccyymmdd_default_end = (datetime.datetime.strptime(str(args.start), '%Y%m%d') +
78 | datetime.timedelta(days=search_window_days)).strftime('%Y%m%d')
79 | args.end = int(args.end or \
80 | input('Enter end date for filings search (default: ' +
81 | ccyymmdd_default_end + '): ') or \
82 | ccyymmdd_default_end)
83 | if str(args.report_period).lower() == 'all':
84 | date_search_string = '.*'
85 | else:
86 | date_search_string = str(
87 | args.report_period or
88 | input('Enter filing report period ccyy, ccyymm etc. (default: all periods): ') or
89 | '.*')
90 |
91 |
92 | """Set up the metadata database
93 | """
94 | batch_start_time = datetime.datetime.utcnow()
95 | batch_machine_id = socket.gethostname()
96 |
97 | if args.write_sql:
98 | db_location = path.join(args.storage, 'metadata.sqlite3')
99 | sql_connection = sqlite3.connect(db_location)
100 | sql_cursor = sql_connection.cursor()
101 | sql_cursor.execute("""
102 | CREATE TABLE IF NOT EXISTS metadata (
103 | id integer PRIMARY KEY,
104 | batch_number integer NOT NULL,
105 | batch_signature text NOT NULL,
106 | batch_start_time datetime NOT NULL,
107 | batch_machine_id text,
108 | sec_cik text NOT NULL,
109 | company_description text,
110 | sec_company_name text,
111 | sec_form_header text,
112 | sec_period_of_report integer,
113 | sec_filing_date integer,
114 | sec_index_url text,
115 | sec_url text,
116 | metadata_file_name text,
117 | document_group text,
118 | section_name text,
119 | section_n_characters integer,
120 | section_end_time datetime,
121 | extraction_method text,
122 | output_file text,
123 | start_line text,
124 | end_line text,
125 | time_elapsed real)
126 | """)
127 | sql_connection.commit()
128 | query_result = sql_cursor.execute('SELECT max(batch_number) FROM metadata').fetchone()
129 | if query_result and query_result[0]:
130 | batch_number = query_result[0] + 1
131 | else:
132 | batch_number = 1
133 | # put a dummy line into the metadata table to 'reserve' a batch number:
134 | # prevents other processes running in parallel from taking the same batch_number
135 | sql_cursor.execute("""
136 | insert into metadata (batch_number, batch_signature,
137 | batch_start_time, sec_cik) values
138 | """ + " ('" + "', '".join([str(batch_number),
139 | str(args.batch_signature or ''),
140 | str(batch_start_time)[:-3], # take only 3dp microseconds
141 | 'dummy_cik_code']) + "')")
142 | sql_connection.commit()
143 | else:
144 | batch_number = 0
145 |
146 |
147 | """Set up numbered storage sub-directory for the current batch run
148 | """
149 | storage_toplevel_directory = os.path.join(args.storage,
150 | 'batch_' +
151 | format(batch_number, '04d'))
152 |
153 | # (re-)make the storage directory for the current batch. This will delete
154 | # any contents that might be left over from earlier runs, thus avoiding
155 | # any potential duplication/overlap/confusion
156 | if os.path.exists(storage_toplevel_directory):
157 | shutil.rmtree(storage_toplevel_directory)
158 | os.makedirs(storage_toplevel_directory)
159 |
160 |
161 |
162 |
163 | """Set up logging
164 | """
165 | # log_file_name = 'sec_extractor_{0}.log'.format(ts)
166 | log_file_name = 'secedgartext_batch_%s.log' % format(batch_number, '04d')
167 | log_path = path.join(args.storage, log_file_name)
168 |
169 | logger = logging.getLogger('text_analysis')
170 | # set up the logger if it hasn't already been set up earlier in the execution run
171 | logger.setLevel(logging.DEBUG) # we have to initialise this top-level setting otherwise everything defaults to logging.WARN level
172 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s',
173 | '%Y%m%d %H:%M:%S')
174 |
175 | file_handler = logging.FileHandler(log_path)
176 | file_handler.setFormatter(formatter)
177 | file_handler.setLevel(logging.DEBUG)
178 | file_handler.set_name('my_file_handler')
179 | logger.addHandler(file_handler)
180 |
181 | console_handler = logging.StreamHandler()
182 | console_handler.setFormatter(formatter)
183 | console_handler.setLevel(logging.DEBUG)
184 | console_handler.set_name('my_console_handler')
185 | logger.addHandler(console_handler)
186 |
187 |
188 | ts = time.time()
189 | logger.info('=' * 65)
190 | logger.info('Analysis started at {0}'.
191 | format(datetime.datetime.fromtimestamp(ts).
192 | strftime('%Y%m%d %H:%M:%S')))
193 | logger.info('Command line:\t{0}'.format(sys.argv[0]))
194 | logger.info('Arguments:\t\t{0}'.format(' '.join(sys.argv[:])))
195 | logger.info('=' * 65)
196 |
197 | if args.write_sql:
198 | logger.info('Opened SQL connection: %s', db_location)
199 |
200 |
201 | if not args.traffic_limit_pause_ms:
202 | # default pause after HTTP request: zero milliseconds
203 | args.traffic_limit_pause_ms = 0
204 | else:
205 | args.traffic_limit_pause_ms = int(args.traffic_limit_pause_ms)
206 | logger.info('Traffic Limit Pause (ms): %s' %
207 | str(args.traffic_limit_pause_ms))
208 |
209 |
210 | if args.multiprocessing_cores:
211 | args.multiprocessing_cores = min(mp.cpu_count()-1,
212 | int(args.multiprocessing_cores))
213 | else:
214 | args.multiprocessing_cores = 0
215 |
216 |
217 | """Create search_terms_regex, which stores the patterns that we
218 | use for identifying sections in each of EDGAR documents types
219 | """
220 | with open (path.join(project_dir, 'document_group_section_search.json'), 'r') as \
221 | f:
222 | json_text = f.read()
223 | search_terms = json.loads(json_text)
224 | if not search_terms:
225 | logger.error('Search terms file is missing or corrupted: ' +
226 | f.name)
227 | search_terms_regex = copy(search_terms)
228 | for filing in search_terms:
229 | for idx, section in enumerate(search_terms[filing]):
230 | for format in ['txt','html']:
231 | for idx2, pattern in enumerate(search_terms[filing][idx][format]):
232 | for startend in ['start','end']:
233 | regex_string = search_terms[filing][idx][format] \
234 | [idx2][startend]
235 | regex_string = regex_string.replace('_','\\s{,5}')
236 | regex_string = regex_string.replace('\n', '\\n')
237 | search_terms_regex[filing][idx][format] \
238 | [idx2][startend] = regex_string
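# Stand-alone example of the pattern rewriting performed above, using a made-up
# search term (real terms live in document_group_section_search.json):
sample_term = 'item_1a._risk_factors'
sample_regex = sample_term.replace('_', '\\s{,5}').replace('\n', '\\n')
print(sample_regex)   # item\s{,5}1a.\s{,5}risk\s{,5}factors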
239 | """identify which 'document' types are to be downloaded. If no command line
240 | argument given, then default to all of the document types listed in the
241 | JSON file"""
242 | args.documents = args.documents or ','.join(list(search_terms.keys()))
243 | args.documents = re.split(',', args.documents) # ['10-K','10-Q']
244 |
245 |
246 | def requests_get(url, params=None):
247 | """retrieve text via url, fatal error if no internet connection available
248 | :param url: source url
249 | :return: text retriieved
250 | """
251 | import requests, random
252 | retries = 0
253 | success = False
254 | hdr = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Mobile Safari/537.36'}
255 | while (not success) and (retries <= 20):
256 | # wait for an increasingly long time (up to a day) in case internet
257 | # connection is broken. Gives enough time to fix connection or SEC site
258 | try:
259 | # to test the timeout functionality, try loading this page:
260 | # http://httpstat.us/200?sleep=20000 (20 seconds delay before page loads)
261 | r = requests.get(url, headers=hdr, params=params, timeout=10)
262 | success = True
263 | # facility to add a pause to respect SEC EDGAR traffic limit
264 | # https://www.sec.gov/privacy.htm#security
265 | time.sleep(args.traffic_limit_pause_ms/1000)
266 | except requests.exceptions.RequestException as e:
267 |             wait = (retries ** 3) * 20 + random.randint(1, 5)
268 | logger.warning(e)
269 | logger.info('URL: %s' % url)
270 | logger.info(
271 | 'Waiting %s secs and re-trying...' % wait)
272 | time.sleep(wait)
273 | retries += 1
274 | if retries > 10:
275 | logger.error('Download repeatedly failed: %s',
276 | url)
277 | sys.exit('Download repeatedly failed: %s' %
278 | url)
279 | return r
280 |
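# Stand-alone sketch of the back-off schedule used above: cubic in the retry
# count, ignoring the small random jitter added on each attempt:
for retries in (1, 2, 5, 10):
    print(retries, (retries ** 3) * 20)   # 20, 160, 2500, 20000 seconds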
281 |
282 |
283 |
284 |
--------------------------------------------------------------------------------