├── ALL ├── .gitkeep ├── .~lock.1018724_Balance_sheet.csv# ├── .~lock.1018724_Cash_Flow.csv# └── .~lock.1018724_Income_Statment.csv# ├── Aggregate.py ├── Label.py ├── Sorted.xlsx ├── files └── .gitkeep ├── model ├── Kmeans.joblib ├── SVC.joblib ├── X_transform_SVC.joblib ├── X_transform_cluster.joblib ├── X_vecto_SVC.joblib └── X_vecto_cluster.joblib ├── parsed └── .gitkeep ├── readme.md ├── requirements.txt ├── scrape.py ├── scrapy.cfg └── sec ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc └── settings.cpython-37.pyc ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc └── sec.cpython-37.pyc └── sec.py /ALL/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ALL/.~lock.1018724_Balance_sheet.csv#: -------------------------------------------------------------------------------- 1 | ,DESKTOP-6PS1AE2/Nikita,,29.10.2019 12:07,file:///C:/Users/Nikita/AppData/Roaming/LibreOffice/4; -------------------------------------------------------------------------------- /ALL/.~lock.1018724_Cash_Flow.csv#: -------------------------------------------------------------------------------- 1 | ,DESKTOP-6PS1AE2/Nikita,,29.10.2019 12:08,file:///C:/Users/Nikita/AppData/Roaming/LibreOffice/4; -------------------------------------------------------------------------------- /ALL/.~lock.1018724_Income_Statment.csv#: -------------------------------------------------------------------------------- 1 | ,DESKTOP-6PS1AE2/Nikita,,29.10.2019 12:09,file:///C:/Users/Nikita/AppData/Roaming/LibreOffice/4; -------------------------------------------------------------------------------- /Aggregate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import datetime 4 | from functools import reduce 5 | from datetime import timedelta 6 | from dateutil.relativedelta import relativedelta 7 | from datetime import date 8 | import re 9 | from difflib import SequenceMatcher 10 | 11 | 12 | # TODO come up with proper merging cel process 13 | # surpivised learning model 14 | # look for proccesing language model 15 | # TODO Try implementing processing language model to see similirities between words 16 | # so the synunoms will be merged together 17 | 18 | # TODO Delete rows with more than 10-20 Nans(replace other nans using 19 | # https://www.kaggle.com/juejuewang/handle-missing-values-in-time-series-for-beginners) 20 | # TODO rename function(from CIK to Symbol) 21 | 22 | # TODO make pypi package 23 | 24 | # company = all uniqie 25 | def companies(): 26 | files = [] 27 | for file in os.listdir("./files"): 28 | if file.endswith(".csv"): 29 | pat = os.path.join("./files", file) 30 | files.append(pat) 31 | companies = [] 32 | for i in files: 33 | company = i.split(' ')[0] 34 | company = company[8:] 35 | companies.append(company) 36 | companies = list(set(companies)) 37 | return companies 38 | 39 | # print(companies()) 40 | 41 | # read files in folder with parsed documents 42 | def get_company_parsed(company): 43 | parsed_files = [] 44 | for file in os.listdir("./parsed"): 45 | if file.endswith(".csv"): 46 | pat = os.path.join("./parsed", file) 47 | if company in pat: 48 | parsed_files.append(pat) 49 | return parsed_files 50 | 51 | # Combines the files of the same type in one single spreadsheet 52 | def join_all(company): 53 | count = 0 54 | Is_count 
= [] 55 | parsed_files = get_company_parsed(company) 56 | Is = [] 57 | bs = [] 58 | cs = [] 59 | for i in parsed_files: 60 | if 'Income_Statment' in i: 61 | date = i[-14:-4] 62 | df = pd.read_csv(i, index_col= 0, usecols = [0, 2] ) 63 | df.index = df.index.astype(str) 64 | df = df.rename(columns={"2": date}) 65 | df = df[~df.index.duplicated()] 66 | df.index = df.index.map(str.lower) 67 | Is.append(df) 68 | if 'Balance_Sheet' in i: 69 | date = i[-14:-4] 70 | df = pd.read_csv(i, index_col= 0, usecols = [0, 2] ) 71 | df.index = df.index.astype(str) 72 | df = df.rename(columns={"2": date}) 73 | df = df[~df.index.duplicated()] 74 | df.index = df.index.map(str.lower) 75 | bs.append(df) 76 | if 'Cash_Flow' in i: 77 | date = i[-14:-4] 78 | df = pd.read_csv(i, index_col= 0, usecols = [0, 2] ) 79 | df.index = df.index.astype(str) 80 | df = df.rename(columns={"2": date}) 81 | df = df[~df.index.duplicated()] 82 | # print(df.index) 83 | df.index = df.index.map(str.lower) 84 | cs.append(df) 85 | # try: 86 | Is_df = pd.concat(Is, axis = 1, sort = False) 87 | for i in Is_df.index: 88 | for b in Is_df.index: 89 | rat = SequenceMatcher(None, i, b).ratio() 90 | if rat > 0.7: 91 | Is_df.rename(index = {i: b}, inplace = True) 92 | Is_df = Is_df.groupby(level=0, axis = 0, sort = False).sum() 93 | 94 | for column in Is_df: 95 | columnSeriesObj = Is_df[column] 96 | # print(Is_df) 97 | check = columnSeriesObj.str.contains('us-gaap', regex=False) 98 | if check.any() == True: 99 | for val in columnSeriesObj.values: 100 | val = str(val) 101 | val2 = val.split('us-gaap')[0] 102 | Is_df.replace({val: val2}, inplace = True) 103 | Is_df.to_csv('./ALL/' + company + '_Income_Statment.csv') 104 | 105 | # except ValueError: 106 | # pass 107 | 108 | try: 109 | Bs_df = pd.concat(bs, axis = 1, sort = False) 110 | for i in Bs_df.index: 111 | for b in Bs_df.index: 112 | rat = SequenceMatcher(None, i, b).ratio() 113 | # print(rat) 114 | if rat > 0.7: 115 | Bs_df.rename(index = {i: b}, inplace = True) 116 | Bs_df = Bs_df.groupby(level=0, axis = 0, sort = False).sum() 117 | 118 | # rename us-gaap in year 2015 119 | for column in Bs_df: 120 | columnSeriesObj = Bs_df[column] 121 | check = columnSeriesObj.str.contains('us-gaap', regex=False) 122 | if check.any() == True: 123 | for val in columnSeriesObj.values: 124 | val = str(val) 125 | val2 = val.split('us-gaap')[0] 126 | Bs_df.replace({val: val2}, inplace = True) 127 | 128 | Bs_df.to_csv('./ALL/' + company + '_Balance_sheet.csv') 129 | except ValueError: 130 | pass 131 | 132 | try: 133 | Cs_df = pd.concat(cs, axis = 1, sort = False) 134 | for i in Cs_df.index: 135 | for b in Cs_df.index: 136 | rat = SequenceMatcher(None, i, b).ratio() 137 | # print(rat) 138 | if rat > 0.7: 139 | Cs_df.rename(index = {i: b}, inplace = True) 140 | Cs_df = Cs_df.groupby(level=0, axis = 0, sort = False).sum() 141 | 142 | # rename us-gaap in year 2015 143 | for column in Cs_df: 144 | columnSeriesObj = Cs_df[column] 145 | check = columnSeriesObj.str.contains('us-gaap', regex=False) 146 | if check.any() == True: 147 | for val in columnSeriesObj.values: 148 | val = str(val) 149 | val2 = val.split('us-gaap')[0] 150 | Cs_df.replace({val: val2}, inplace = True) 151 | Cs_df.to_csv('./ALL/' + company + '_Cash_Flow.csv') 152 | except ValueError: 153 | pass 154 | 155 | # use all 156 | def main(companies = companies() ): 157 | 158 | # Combine the files of the same type in one single document 159 | for i in companies: 160 | join_all(i) 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 
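# --- Hedged sketch (not part of the original script) ---------------------------
# The TODO block at the top of this file mentions dropping rows with more than
# 10-20 NaNs and filling the remaining gaps (see the linked Kaggle notebook on
# missing values in time series). A minimal pandas version of that idea could
# look like the helper below; the name clean_statement and the max_nans
# threshold are illustrative assumptions, not code that exists in this repo yet.
def clean_statement(df, max_nans=10):
    # Drop line items that are missing in more than max_nans reporting periods.
    df = df[df.isna().sum(axis=1) <= max_nans]
    # Coerce the remaining values to numbers, then interpolate along the time
    # axis and fill the edges so no NaNs are left behind.
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.interpolate(axis=1, limit_direction='both')
    return df

# Example usage, assuming a combined statement produced by join_all():
# cleaned = clean_statement(pd.read_csv('./ALL/1018724_Income_Statment.csv', index_col=0))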
--------------------------------------------------------------------------------
/Label.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 | from sklearn.feature_extraction.text import CountVectorizer
4 | from sklearn.naive_bayes import MultinomialNB
5 | from sklearn.feature_extraction.text import TfidfTransformer
6 | import numpy as np
7 | from joblib import dump, load
8 | import time
9 |
10 | # Timing for each model, based on a small sample
11 | # KMeans time 0.4447798728942871
12 | # SVC time 0.4916868209838867
13 | # Combined models time 0.5555140972137451
14 |
15 | # TODO increase the training sample for the supervised model
16 |
17 | count_vect = CountVectorizer()
18 | tfidf_transformer = TfidfTransformer()
19 |
20 | # Load the classifiers shipped in the ./model folder of this repository
21 |
22 | # KMeans model, plus the vectorizer/transformer fitted for the SVC model
23 | Kmeans = load('./model/Kmeans.joblib')
24 | X_SVC_train_counts = load('./model/X_vecto_SVC.joblib')
25 | X_SVC_train_tfidf = load('./model/X_transform_SVC.joblib')
26 |
27 | # SVC model, plus the vectorizer/transformer fitted for the KMeans model
28 | SVC = load('./model/SVC.joblib')
29 | X_Kmean_train_counts = load('./model/X_vecto_cluster.joblib')
30 | X_Kmean_train_tfidf = load('./model/X_transform_cluster.joblib')
31 |
32 | # Read all CSV files in the folder with the raw documents
33 | def raw_files():
34 |     files = []
35 |     for file in os.listdir("./files"):
36 |         if file.endswith(".csv"):
37 |             pat = os.path.join("./files", file)
38 |             files.append(pat)
39 |     return files
40 |
41 | # Prediction function for the SVC model
42 | def SVC_predict(names, model = SVC, vectorizer = X_SVC_train_counts, transformer = X_SVC_train_tfidf):
43 |     X_new_counts = vectorizer.transform(names)
44 |     X_new_tfidf = transformer.transform(X_new_counts)
45 |
46 |     predicted = model.predict(X_new_tfidf)
47 |     probabilities = model.predict_proba(X_new_tfidf)
48 |
49 |     return predicted, probabilities
50 |
51 | # Prediction function for the KMeans model
52 | def Kmean_predict(names, model = Kmeans, vectorizer = X_Kmean_train_counts, transformer = X_Kmean_train_tfidf):
53 |     X_new_counts = vectorizer.transform(names)
54 |     X_new_tfidf = transformer.transform(X_new_counts)
55 |
56 |     predicted = model.predict(X_new_tfidf)
57 |
58 |     return predicted
59 |
60 | # Map a CIK to its ticker symbol using Sorted.xlsx
61 | def rename(company):
62 |     df = pd.read_excel('Sorted.xlsx', index_col = 0)
63 |
64 |     df2 = df.loc[df['CIK'] == int(company)]
65 |     print(df2)
66 |     TICKER = str(df2['Ticker'].values[0])
67 |     print(TICKER)
68 |     return TICKER
69 |
70 | # Decide on the document type (e.g. Balance Sheet, Income Statement, Cash Flow)
71 | def doc_type(doc, model = 'Combined_model'):
72 |     # Read the document
73 |     df = pd.read_csv(doc, index_col= 1)
74 |     # print(df)
75 |     df.index = df.index.astype(str)
76 |
77 |     # Join the index (the line-item names) into one lower-case string
78 |     names = list(df.index.values)
79 |     names = [i.lower() for i in names]
80 |     names = [', '.join(names)]
81 |
82 |     # Get results of model prediction
83 |     doc_type = 'none'
84 |
85 |     # KMeans alone
86 |     if model == 'Kmean':
87 |         result2 = Kmean_predict(names = names)
88 |         if result2[0] == 1:
89 |             doc_type = 'Balance_Sheet'
90 |         if result2[0] == 6:
91 |             doc_type = 'Income_Statment'
92 |         if result2[0] == 0:
93 |             doc_type = 'Cash_Flow'
94 |
95 |     # SVC alone
96 |     if model == 'SVC':
97 |         result, probability = SVC_predict(names = names)
98 |         if result[0] == 3 and max(probability[0]) > 0.93:
99 |             doc_type = 'Balance_Sheet'
100 |         if result[0] == 2 and max(probability[0]) > 0.93:
101 |             doc_type = 'Income_Statment'
102 |         if result[0] == 1 and max(probability[0]) > 0.93:
103 |             doc_type = 'Cash_Flow'
104 |
105 |     # Combined model: SVC and KMeans have to agree, and SVC has to be confident
106 |     if model == 'Combined_model':
107 |         result, probability = SVC_predict(names = names)
108 |         result2 = Kmean_predict(names = names)
109 |         if result[0] == 3 and max(probability[0]) > 0.93 and result2[0] == 1:
110 |             doc_type = 'Balance_Sheet'
111 |         if result[0] == 2 and max(probability[0]) > 0.93 and result2[0] == 6:
112 |             doc_type = 'Income_Statment'
113 |         if result[0] == 1 and max(probability[0]) > 0.93 and result2[0] == 0:
114 |             doc_type = 'Cash_Flow'
115 |
116 |     # Company identifier: strip the './files/' prefix (8 characters) to get the CIK, then map it to a ticker
117 |     company = doc.split(' ')[0]
118 |     company = company[8:]
119 |     company = rename(company)
120 |     # Filing date (strip the trailing 'R#.csv' part of the file name)
121 |     date = doc.split(' ')[1]
122 |     date = date[:-6]
123 |
124 |     if doc_type != 'none':
125 |         # print(df)
126 |         df.drop('Unnamed: 0', axis = 'columns', inplace = True)
127 |         # print(df)
128 |         save_name = './parsed/' + company + ' '+ doc_type + ' '+ date + '.csv'
129 |         df.to_csv(save_name)
130 |
131 | # Label all the documents and time the process
132 | def sort():
133 |     t = time.time()
134 |     for i in raw_files():
135 |         doc_type(i)
136 |     t1 = time.time() - t
137 |     print("The labeling took: " + str(t1) + " seconds")
138 |
139 | sort()
--------------------------------------------------------------------------------
/Sorted.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/Sorted.xlsx
--------------------------------------------------------------------------------
/files/.gitkeep:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/model/Kmeans.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/Kmeans.joblib
--------------------------------------------------------------------------------
/model/SVC.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/SVC.joblib
--------------------------------------------------------------------------------
/model/X_transform_SVC.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/X_transform_SVC.joblib
--------------------------------------------------------------------------------
/model/X_transform_cluster.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/X_transform_cluster.joblib
--------------------------------------------------------------------------------
/model/X_vecto_SVC.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/X_vecto_SVC.joblib
--------------------------------------------------------------------------------
/model/X_vecto_cluster.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/X_vecto_cluster.joblib
--------------------------------------------------------------------------------
/parsed/.gitkeep:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # About this project
2 |
3 | The goal of this project is an efficient fundamental-data scraper that delivers accurately sorted financial information from SEC EDGAR filings.
4 |
5 | The scraper itself is fully functional and is written with the Scrapy library.
6 | Data is scraped from 2011 onward only.
7 |
8 | The labeling script decides on each document's type and stores the result in the parsed folder.
9 |
10 | The aggregation script is still at a very early stage; I will be working on it over the coming month.
11 |
12 | ## Getting Started
13 | Clone the repository and recreate the environment from requirements.txt (it is a conda explicit package list, not a pip requirements file; see the header of that file for the conda create command).
14 |
15 | To scrape data, run scrape.py and enter the ticker symbols of the companies you want and the starting year when prompted (an example session is shown at the end of this readme):
16 | ```
17 | python scrape.py
18 | ```
19 |
20 | The scraped files are stored in the files folder.
21 |
22 | To label all of the scraped files, run
23 | ```
24 | python Label.py
25 | ```
26 |
27 | ## Contribution
28 | I am actively seeking contributors to improve efficiency, structure and functionality.
29 |
30 | ## License
31 |
32 | This project is licensed under the terms of the MIT license.
33 |
34 | "# SEC-EDGAR-python-scraper"
35 |
36 | ## A note
37 | I am a third-year finance major who has been learning programming for less than a year, so some inefficiencies and structural quirks may come from my unfamiliarity with common programming conventions.
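For reference, an end-to-end run looks roughly like the session below. The environment name, the tickers and the year are placeholders, Scrapy's own crawl logging is omitted, and the timing figure will differ; the two prompts and the final message are the ones actually printed by scrape.py and Label.py.
```
conda create --name sec-scraper --file requirements.txt
conda activate sec-scraper
python scrape.py
Enter Tickers: AMZN MSFT
Enter from which year to start: 2015
python Label.py
The labeling took: 12.3 seconds
```
The raw report tables land in the files folder (one CSV per R2-R9 report page), and the labeled copies, named by ticker, document type and filing date, land in the parsed folder.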
38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | asn1crypto=1.0.1=py37_0 5 | atomicwrites=1.3.0=py37_1 6 | attrs=19.2.0=py_0 7 | automat=0.7.0=py37_0 8 | bcrypt=3.1.7=py37he774522_0 9 | beautifulsoup4=4.8.1=pypi_0 10 | blas=1.0=mkl 11 | bs4=0.0.1=pypi_0 12 | ca-certificates=2019.8.28=0 13 | certifi=2019.9.11=py37_0 14 | cffi=1.12.3=py37h7a1dbc1_0 15 | chardet=3.0.4=pypi_0 16 | colorama=0.4.1=py37_0 17 | constantly=15.1.0=py37h28b3542_0 18 | cryptography=2.7=py37h7a1dbc1_0 19 | cssselect=1.1.0=py_0 20 | get=2019.4.13=pypi_0 21 | hyperlink=19.0.0=py_0 22 | icc_rt=2019.0.0=h0cc432a_1 23 | idna=2.8=py37_0 24 | importlib_metadata=0.23=py37_0 25 | incremental=17.5.0=py37_0 26 | intel-openmp=2019.4=245 27 | libiconv=1.15=h1df5818_7 28 | libxml2=2.9.9=h464c3ec_0 29 | libxslt=1.1.33=h579f668_0 30 | lxml=4.4.1=py37h1350720_0 31 | mkl=2019.4=245 32 | mkl-service=2.3.0=py37hb782905_0 33 | mkl_fft=1.0.14=py37h14836fe_0 34 | mkl_random=1.1.0=py37h675688f_0 35 | more-itertools=7.2.0=py37_0 36 | numpy=1.16.5=py37h19fb1c0_0 37 | numpy-base=1.16.5=py37hc3f5095_0 38 | openssl=1.1.1d=he774522_2 39 | packaging=19.2=py_0 40 | pandas=0.25.1=py37ha925a31_0 41 | parsel=1.5.2=py37_0 42 | pip=19.2.3=py37_0 43 | pluggy=0.13.0=py37_0 44 | post=2019.4.13=pypi_0 45 | public=2019.4.13=pypi_0 46 | py=1.8.0=py37_0 47 | pyasn1=0.4.7=py_0 48 | pyasn1-modules=0.2.6=py37_0 49 | pycparser=2.19=py37_0 50 | pydispatcher=2.0.5=py37_1 51 | pyhamcrest=1.9.0=py37_2 52 | pyopenssl=19.0.0=py37_0 53 | pyparsing=2.4.2=py_0 54 | pytest=5.0.1=py37_0 55 | pytest-runner=5.1=py_0 56 | python=3.7.4=h5263a28_0 57 | python-dateutil=2.8.0=py37_0 58 | pytz=2019.3=py_0 59 | pywin32=223=py37hfa6e2cd_1 60 | query-string=2019.4.13=pypi_0 61 | queuelib=1.5.0=py37_0 62 | request=2019.4.13=pypi_0 63 | requests=2.22.0=pypi_0 64 | scrapy=1.6.0=py37_0 65 | service_identity=18.1.0=py37h28b3542_0 66 | setuptools=41.4.0=py37_0 67 | six=1.12.0=py37_0 68 | soupsieve=1.9.4=pypi_0 69 | sqlite=3.30.0=he774522_0 70 | twisted=19.7.0=py37he774522_1 71 | urllib3=1.25.6=pypi_0 72 | vc=14.1=h0510ff6_4 73 | vs2015_runtime=14.16.27012=hf0eaf9b_0 74 | w3lib=1.21.0=py_0 75 | wcwidth=0.1.7=py37_0 76 | wheel=0.33.6=py37_0 77 | wincertstore=0.2=py37_0 78 | zipp=0.6.0=py_0 79 | zlib=1.2.11=h62dcd97_3 80 | zope=1.0=py37_1 81 | zope.interface=4.6.0=py37he774522_0 82 | -------------------------------------------------------------------------------- /scrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | from scrapy.crawler import CrawlerProcess 5 | from scrapy.utils.project import get_project_settings 6 | import scrapy 7 | import re, requests 8 | import bs4 as bs 9 | import urllib.request 10 | from sys import argv 11 | import pandas as pd 12 | import random 13 | import os 14 | 15 | 16 | # TODO before starting extensive scraping jobs implement a solid proxy system(Late stage dev) 17 | 18 | tickers = input("Enter Tickers: ").split() 19 | year = int(input("Enter from which year to start: ")) 20 | 21 | # Function to convert Tickers to CIK numbers 22 | def getCIK(TICKERS): 23 | URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany' 24 | CIK_RE = re.compile(r'.*CIK=(\d{10}).*') 25 | cik_dict = {} 26 | 
for ticker in TICKERS: 27 | f = requests.get(URL.format(ticker), stream = True) 28 | results = CIK_RE.findall(f.text) 29 | if len(results): 30 | results[0] = int(re.sub('\.[0]*', '.', results[0])) 31 | cik_dict[str(ticker).upper()] = str(results[0]) 32 | return cik_dict 33 | 34 | # reads files names in files directory 35 | def raw_files(): 36 | files = [] 37 | for file in os.listdir("./files"): 38 | if file.endswith(".csv"): 39 | pat = os.path.join("./files", file) 40 | files.append(pat) 41 | return files 42 | 43 | 44 | # check if provided Tickers already have CIKs and return URL 45 | def look_up(Tickers): 46 | # read existing tickers 47 | existing_df = pd.read_excel('Sorted.xlsx', index_col = 0) 48 | ex_Tickers = existing_df['Ticker'].values.tolist() 49 | # compare existing tickers and add non existing tickers 50 | for t in Tickers: 51 | if t in ex_Tickers: 52 | print(t + ' already exists') 53 | else: 54 | print(t + " doesn't exist") 55 | links = getCIK([t]) 56 | new_part = pd.DataFrame(list(links.items()), columns = ['Ticker', 'CIK']) 57 | existing_df = existing_df.append(new_part, ignore_index = True) 58 | existing_df.to_excel('Sorted.xlsx') 59 | 60 | URLs = [] 61 | # get names of already scraped CIKs 62 | names = [i.split(' ')[0] for i in raw_files() ] 63 | names = [i.split('\\')[-1] for i in names ] 64 | # get CIKs for required tickers and convert to URL 65 | for t in Tickers: 66 | df = existing_df.loc[existing_df['Ticker'] == t] 67 | CIK = str(df['CIK'].values[0]) 68 | # Check if company data already has been scraped 69 | if CIK in names: 70 | print(CIK + 'already scraped') 71 | else: 72 | URL = 'https://www.sec.gov/Archives/edgar/data/' + CIK 73 | URLs.append(URL) 74 | return URLs 75 | 76 | 77 | # Start Spider provided the links 78 | process = CrawlerProcess(get_project_settings()) 79 | 80 | def scrape(links, year): 81 | for i in links: 82 | process.crawl('sec', start_url=links, company = i, year = year) 83 | process.start() 84 | 85 | scrape(links = look_up(Tickers = tickers ), year = year) 86 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sec.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sec 12 | -------------------------------------------------------------------------------- /sec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/__init__.py -------------------------------------------------------------------------------- /sec/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /sec/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/__pycache__/settings.cpython-37.pyc 
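A short aside on scrape.py above: its getCIK() helper is what turns a ticker into the ten-digit CIK that EDGAR uses in its archive URLs. A stripped-down restatement of the same idea is sketched below; the endpoint and the CIK pattern are taken from scrape.py (lightly simplified), while the function name get_cik and the AMZN example are illustrative only.

import re
import requests

def get_cik(ticker):
    # Same EDGAR company-browse endpoint that scrape.py queries.
    url = ('http://www.sec.gov/cgi-bin/browse-edgar'
           '?CIK={}&Find=Search&owner=exclude&action=getcompany')
    # Depending on SEC's current policy, a descriptive User-Agent header may be required.
    matches = re.findall(r'CIK=(\d{10})', requests.get(url.format(ticker)).text)
    # EDGAR zero-pads CIKs to ten digits; scrape.py strips the padding before
    # building the https://www.sec.gov/Archives/edgar/data/<CIK> archive URL.
    return str(int(matches[0])) if matches else None

# e.g. get_cik('AMZN') would be expected to return '1018724', the same CIK that
# appears in the file names under ./ALL.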
-------------------------------------------------------------------------------- /sec/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SecItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /sec/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class SecSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class SecDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /sec/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class SecPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /sec/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for sec project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | # ROTATED_PROXY_ENABLED = True 13 | # PROXY_STORAGE = 'scrapy_rotated_proxy.extensions.file_storage.FileProxyStorage' 14 | 15 | # ROTATING_PROXY_LIST_PATH = './httpproxies.txt' 16 | # PROXY_MODE = 0 17 | 18 | BOT_NAME = 'sec' 19 | 20 | SPIDER_MODULES = ['sec.spiders'] 21 | NEWSPIDER_MODULE = 'sec.spiders' 22 | 23 | 24 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 25 | #USER_AGENT = 'sec (+http://www.yourdomain.com)' 26 | 27 | # Obey robots.txt rules 28 | # ROBOTSTXT_OBEY = True 29 | 30 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 31 | CONCURRENT_REQUESTS = 20 32 | 33 | # Configure a delay for requests for the same website (default: 0) 34 | # DOWNLOAD_DELAY = 0.2 35 | 36 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | # COOKIES_ENABLED = False 45 | 46 | 47 | # DOWNLOADER_MIDDLEWARES = { 48 | # 'rotating_proxies.middlewares.RotatingProxyMiddleware': 610, 49 | # 'rotating_proxies.middlewares.BanDetectionMiddleware': 620, 50 | # } 51 | 52 | # DOWNLOADER_MIDDLEWARES.update({ 53 | # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None, 54 | # 'scrapy_rotated_proxy.downloadmiddlewares.proxy.RotatedProxyMiddleware': 750, 55 | # }) 56 | # Disable Telnet Console (enabled by default) 57 | #TELNETCONSOLE_ENABLED = False 58 | 59 | # Override the default request headers: 60 | #DEFAULT_REQUEST_HEADERS = { 61 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 62 | # 'Accept-Language': 'en', 63 | #} 64 | 65 | # Enable or disable spider middlewares 66 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 67 | #SPIDER_MIDDLEWARES = { 68 | # 'sec.middlewares.SecSpiderMiddleware': 543, 69 | #} 70 | 71 | # Enable or disable downloader middlewares 72 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 73 | #DOWNLOADER_MIDDLEWARES = { 74 | # 'sec.middlewares.SecDownloaderMiddleware': 543, 75 | #} 76 | 77 | # Enable or disable extensions 78 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 79 | #EXTENSIONS = { 80 | # 'scrapy.extensions.telnet.TelnetConsole': None, 81 | #} 82 | 83 | # Configure item pipelines 84 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 85 | #ITEM_PIPELINES = { 86 | # 'sec.pipelines.SecPipeline': 300, 87 | #} 88 | 89 | # Enable and configure the AutoThrottle extension (disabled by default) 90 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 91 | #AUTOTHROTTLE_ENABLED = True 92 | # The initial download delay 93 | #AUTOTHROTTLE_START_DELAY = 5 94 | # The maximum download delay to be set in case of high latencies 95 | #AUTOTHROTTLE_MAX_DELAY = 60 96 | # The average number of requests Scrapy should be sending in parallel to 97 | # each remote server 98 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 99 | # Enable showing throttling stats for every response received: 100 | #AUTOTHROTTLE_DEBUG = False 101 | 102 | # Enable and configure HTTP caching 
(disabled by default) 103 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 104 | #HTTPCACHE_ENABLED = True 105 | #HTTPCACHE_EXPIRATION_SECS = 0 106 | #HTTPCACHE_DIR = 'httpcache' 107 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 108 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 109 | -------------------------------------------------------------------------------- /sec/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sec/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /sec/spiders/__pycache__/sec.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/spiders/__pycache__/sec.cpython-37.pyc -------------------------------------------------------------------------------- /sec/spiders/sec.py: -------------------------------------------------------------------------------- 1 | 2 | import scrapy 3 | 4 | from scrapy.utils.project import get_project_settings 5 | 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.crawler import CrawlerProcess 9 | import bs4 as bs 10 | 11 | from scrapy.selector import Selector 12 | import pandas as pd 13 | import re, requests 14 | 15 | import urllib.request 16 | 17 | class MainSpider(CrawlSpider): 18 | name = 'sec' 19 | 20 | def __init__(self, year = '', company='',start_url = '' ,*args, **kwargs): 21 | super(MainSpider, self).__init__(*args, **kwargs) 22 | self.start_urls = start_url 23 | self.company = company 24 | self.year = year 25 | 26 | def start_requests(self): 27 | for url in self.start_urls: 28 | yield scrapy.Request(url,callback= self.parse_item, dont_filter=True) 29 | 30 | # pass links to main archive not Cik values 31 | def parse_item(self, response): 32 | data = response.text 33 | soup = bs.BeautifulSoup(data, features= 'lxml') 34 | links = [] 35 | dates = [] 36 | 37 | for link in soup.select('#main-content table tr td a '): 38 | i = 'https://www.sec.gov' + link.get('href') 39 | links.append(i) 40 | 41 | for date in soup.select('#main-content table tr td:nth-of-type(3)'): 42 | d = date.get_text()[0:10] 43 | if int(d[0:4]) >= self.year: 44 | dates.append(d) 45 | 46 | length = (len(dates)) 47 | dictionary = dict(zip(links[0:length], dates)) 48 | 49 | for key, value in dictionary.items(): 50 | yield scrapy.Request(key, callback = self.parse_item1, meta={'date': value, 'link' : key}) 51 | 52 | def parse_item1(self, response): 53 | date = response.meta['date'] 54 | data = response.xpath('//td//a/@href').getall() 55 | company = response.meta['link'] 56 | company = company.split('/')[-2] 57 | 58 | R = ['R' + str(i) + '.htm' for i in range(2,10) ] 59 | 60 | Reports_links = [] 61 | Rs = [] 62 | for i in R: 63 | for d in data: 64 | if i in d : 65 | 
link = 'https://www.sec.gov' + d 66 | Rs.append(i[0:2]) 67 | Reports_links.append(link) 68 | 69 | dictionary = dict(zip(Reports_links, Rs)) 70 | 71 | if Reports_links: 72 | for key, value in dictionary.items(): 73 | yield scrapy.Request(key, callback = self.main_parse, meta={'date': date, 'R': value, 'company' : company}) 74 | 75 | def main_parse(self, response): 76 | date = response.meta['date'] 77 | R = response.meta['R'] 78 | company = response.meta['company'] 79 | source = response.text 80 | soup = bs.BeautifulSoup(source,'lxml') 81 | # find a table 82 | table = soup.table 83 | table = soup.find('table') 84 | table_rows = table.find_all('tr') 85 | 86 | # parse for rows 87 | clear = [] 88 | for tr in table_rows: 89 | try: 90 | td = tr.find_all('td') 91 | row = [i.text for i in td] 92 | clear.append(row) 93 | except: 94 | continue 95 | # put rows in dataframe and save 96 | df = pd.DataFrame(clear) 97 | name = company + ' ' + date + R + '.csv' 98 | name = name.replace('/', '') 99 | name = './files/' + name 100 | df.to_csv(name) 101 | --------------------------------------------------------------------------------
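A closing note on main_parse() in sec/spiders/sec.py: the manual BeautifulSoup walk over tr/td elements could also be expressed with pandas' built-in HTML table reader (lxml is already in requirements.txt). The sketch below is a hedged alternative, not the spider's actual implementation; the helper name first_table_to_csv is made up for illustration, and it keeps the same "first table on the page" behaviour as main_parse().

import pandas as pd

def first_table_to_csv(html_text, out_path):
    # pd.read_html returns one DataFrame per <table> found in the HTML string;
    # it raises ValueError if the page contains no table at all.
    tables = pd.read_html(html_text)
    # main_parse() only ever reads the first table, so do the same here.
    tables[0].to_csv(out_path)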