├── ALL ├── .gitkeep ├── .~lock.1018724_Balance_sheet.csv# ├── .~lock.1018724_Cash_Flow.csv# └── .~lock.1018724_Income_Statment.csv# ├── Aggregate.py ├── Label.py ├── Sorted.xlsx ├── files └── .gitkeep ├── model ├── Kmeans.joblib ├── SVC.joblib ├── X_transform_SVC.joblib ├── X_transform_cluster.joblib ├── X_vecto_SVC.joblib └── X_vecto_cluster.joblib ├── parsed └── .gitkeep ├── readme.md ├── requirements.txt ├── scrape.py ├── scrapy.cfg └── sec ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc └── settings.cpython-37.pyc ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc └── sec.cpython-37.pyc └── sec.py /ALL/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ALL/.~lock.1018724_Balance_sheet.csv#: -------------------------------------------------------------------------------- 1 | ,DESKTOP-6PS1AE2/Nikita,,29.10.2019 12:07,file:///C:/Users/Nikita/AppData/Roaming/LibreOffice/4; -------------------------------------------------------------------------------- /ALL/.~lock.1018724_Cash_Flow.csv#: -------------------------------------------------------------------------------- 1 | ,DESKTOP-6PS1AE2/Nikita,,29.10.2019 12:08,file:///C:/Users/Nikita/AppData/Roaming/LibreOffice/4; -------------------------------------------------------------------------------- /ALL/.~lock.1018724_Income_Statment.csv#: -------------------------------------------------------------------------------- 1 | ,DESKTOP-6PS1AE2/Nikita,,29.10.2019 12:09,file:///C:/Users/Nikita/AppData/Roaming/LibreOffice/4; -------------------------------------------------------------------------------- /Aggregate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import datetime 4 | from functools import reduce 5 | from datetime import timedelta 6 | from dateutil.relativedelta import relativedelta 7 | from datetime import date 8 | import re 9 | from difflib import SequenceMatcher 10 | 11 | 12 | # TODO come up with proper merging cel process 13 | # surpivised learning model 14 | # look for proccesing language model 15 | # TODO Try implementing processing language model to see similirities between words 16 | # so the synunoms will be merged together 17 | 18 | # TODO Delete rows with more than 10-20 Nans(replace other nans using 19 | # https://www.kaggle.com/juejuewang/handle-missing-values-in-time-series-for-beginners) 20 | # TODO rename function(from CIK to Symbol) 21 | 22 | # TODO make pypi package 23 | 24 | # company = all uniqie 25 | def companies(): 26 | files = [] 27 | for file in os.listdir("./files"): 28 | if file.endswith(".csv"): 29 | pat = os.path.join("./files", file) 30 | files.append(pat) 31 | companies = [] 32 | for i in files: 33 | company = i.split(' ')[0] 34 | company = company[8:] 35 | companies.append(company) 36 | companies = list(set(companies)) 37 | return companies 38 | 39 | # print(companies()) 40 | 41 | # read files in folder with parsed documents 42 | def get_company_parsed(company): 43 | parsed_files = [] 44 | for file in os.listdir("./parsed"): 45 | if file.endswith(".csv"): 46 | pat = os.path.join("./parsed", file) 47 | if company in pat: 48 | parsed_files.append(pat) 49 | return parsed_files 50 | 51 | # Combines the files of the same type in one single spreadsheet 52 | def join_all(company): 53 | count = 0 54 | Is_count 
= [] 55 | parsed_files = get_company_parsed(company) 56 | Is = [] 57 | bs = [] 58 | cs = [] 59 | for i in parsed_files: 60 | if 'Income_Statment' in i: 61 | date = i[-14:-4] 62 | df = pd.read_csv(i, index_col= 0, usecols = [0, 2] ) 63 | df.index = df.index.astype(str) 64 | df = df.rename(columns={"2": date}) 65 | df = df[~df.index.duplicated()] 66 | df.index = df.index.map(str.lower) 67 | Is.append(df) 68 | if 'Balance_Sheet' in i: 69 | date = i[-14:-4] 70 | df = pd.read_csv(i, index_col= 0, usecols = [0, 2] ) 71 | df.index = df.index.astype(str) 72 | df = df.rename(columns={"2": date}) 73 | df = df[~df.index.duplicated()] 74 | df.index = df.index.map(str.lower) 75 | bs.append(df) 76 | if 'Cash_Flow' in i: 77 | date = i[-14:-4] 78 | df = pd.read_csv(i, index_col= 0, usecols = [0, 2] ) 79 | df.index = df.index.astype(str) 80 | df = df.rename(columns={"2": date}) 81 | df = df[~df.index.duplicated()] 82 | # print(df.index) 83 | df.index = df.index.map(str.lower) 84 | cs.append(df) 85 | # try: 86 | Is_df = pd.concat(Is, axis = 1, sort = False) 87 | for i in Is_df.index: 88 | for b in Is_df.index: 89 | rat = SequenceMatcher(None, i, b).ratio() 90 | if rat > 0.7: 91 | Is_df.rename(index = {i: b}, inplace = True) 92 | Is_df = Is_df.groupby(level=0, axis = 0, sort = False).sum() 93 | 94 | for column in Is_df: 95 | columnSeriesObj = Is_df[column] 96 | # print(Is_df) 97 | check = columnSeriesObj.str.contains('us-gaap', regex=False) 98 | if check.any() == True: 99 | for val in columnSeriesObj.values: 100 | val = str(val) 101 | val2 = val.split('us-gaap')[0] 102 | Is_df.replace({val: val2}, inplace = True) 103 | Is_df.to_csv('./ALL/' + company + '_Income_Statment.csv') 104 | 105 | # except ValueError: 106 | # pass 107 | 108 | try: 109 | Bs_df = pd.concat(bs, axis = 1, sort = False) 110 | for i in Bs_df.index: 111 | for b in Bs_df.index: 112 | rat = SequenceMatcher(None, i, b).ratio() 113 | # print(rat) 114 | if rat > 0.7: 115 | Bs_df.rename(index = {i: b}, inplace = True) 116 | Bs_df = Bs_df.groupby(level=0, axis = 0, sort = False).sum() 117 | 118 | # rename us-gaap in year 2015 119 | for column in Bs_df: 120 | columnSeriesObj = Bs_df[column] 121 | check = columnSeriesObj.str.contains('us-gaap', regex=False) 122 | if check.any() == True: 123 | for val in columnSeriesObj.values: 124 | val = str(val) 125 | val2 = val.split('us-gaap')[0] 126 | Bs_df.replace({val: val2}, inplace = True) 127 | 128 | Bs_df.to_csv('./ALL/' + company + '_Balance_sheet.csv') 129 | except ValueError: 130 | pass 131 | 132 | try: 133 | Cs_df = pd.concat(cs, axis = 1, sort = False) 134 | for i in Cs_df.index: 135 | for b in Cs_df.index: 136 | rat = SequenceMatcher(None, i, b).ratio() 137 | # print(rat) 138 | if rat > 0.7: 139 | Cs_df.rename(index = {i: b}, inplace = True) 140 | Cs_df = Cs_df.groupby(level=0, axis = 0, sort = False).sum() 141 | 142 | # rename us-gaap in year 2015 143 | for column in Cs_df: 144 | columnSeriesObj = Cs_df[column] 145 | check = columnSeriesObj.str.contains('us-gaap', regex=False) 146 | if check.any() == True: 147 | for val in columnSeriesObj.values: 148 | val = str(val) 149 | val2 = val.split('us-gaap')[0] 150 | Cs_df.replace({val: val2}, inplace = True) 151 | Cs_df.to_csv('./ALL/' + company + '_Cash_Flow.csv') 152 | except ValueError: 153 | pass 154 | 155 | # use all 156 | def main(companies = companies() ): 157 | 158 | # Combine the files of the same type in one single document 159 | for i in companies: 160 | join_all(i) 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 
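# --- Hedged sketch (not part of the original script) ---------------------------
# The TODO block at the top of this file mentions dropping rows with more than
# 10-20 NaNs and filling the remaining gaps (see the linked Kaggle notebook on
# missing values in time series). A minimal pandas version of that idea could
# look like the helper below; the name clean_statement and the max_nans
# threshold are illustrative assumptions, not code that exists in this repo yet.
def clean_statement(df, max_nans=10):
    # Drop line items that are missing in more than max_nans reporting periods.
    df = df[df.isna().sum(axis=1) <= max_nans]
    # Coerce the remaining values to numbers, then interpolate along the time
    # axis and fill the edges so no NaNs are left behind.
    df = df.apply(pd.to_numeric, errors='coerce')
    df = df.interpolate(axis=1, limit_direction='both')
    return df

# Example usage, assuming a combined statement produced by join_all():
# cleaned = clean_statement(pd.read_csv('./ALL/1018724_Income_Statment.csv', index_col=0))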
--------------------------------------------------------------------------------
/Label.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 | from sklearn.feature_extraction.text import CountVectorizer
4 | from sklearn.naive_bayes import MultinomialNB
5 | from sklearn.feature_extraction.text import TfidfTransformer
6 | import numpy as np
7 | from joblib import dump, load
8 | import time
9 |
10 | # Timing for each model, based on a small sample
11 | # KMeans time 0.4447798728942871
12 | # SVC time 0.4916868209838867
13 | # Combined models time 0.5555140972137451
14 |
15 | # TODO increase the training sample for the supervised model
16 |
17 | count_vect = CountVectorizer()
18 | tfidf_transformer = TfidfTransformer()
19 |
20 | # Load the classifiers shipped in the ./model folder of this repository
21 |
22 | # KMeans model, plus the vectorizer/transformer fitted for the SVC model
23 | Kmeans = load('./model/Kmeans.joblib')
24 | X_SVC_train_counts = load('./model/X_vecto_SVC.joblib')
25 | X_SVC_train_tfidf = load('./model/X_transform_SVC.joblib')
26 |
27 | # SVC model, plus the vectorizer/transformer fitted for the KMeans model
28 | SVC = load('./model/SVC.joblib')
29 | X_Kmean_train_counts = load('./model/X_vecto_cluster.joblib')
30 | X_Kmean_train_tfidf = load('./model/X_transform_cluster.joblib')
31 |
32 | # Read all CSV files in the folder with the raw documents
33 | def raw_files():
34 |     files = []
35 |     for file in os.listdir("./files"):
36 |         if file.endswith(".csv"):
37 |             pat = os.path.join("./files", file)
38 |             files.append(pat)
39 |     return files
40 |
41 | # Prediction function for the SVC model
42 | def SVC_predict(names, model = SVC, vectorizer = X_SVC_train_counts, transformer = X_SVC_train_tfidf):
43 |     X_new_counts = vectorizer.transform(names)
44 |     X_new_tfidf = transformer.transform(X_new_counts)
45 |
46 |     predicted = model.predict(X_new_tfidf)
47 |     probabilities = model.predict_proba(X_new_tfidf)
48 |
49 |     return predicted, probabilities
50 |
51 | # Prediction function for the KMeans model
52 | def Kmean_predict(names, model = Kmeans, vectorizer = X_Kmean_train_counts, transformer = X_Kmean_train_tfidf):
53 |     X_new_counts = vectorizer.transform(names)
54 |     X_new_tfidf = transformer.transform(X_new_counts)
55 |
56 |     predicted = model.predict(X_new_tfidf)
57 |
58 |     return predicted
59 |
60 | # Map a CIK to its ticker symbol using Sorted.xlsx
61 | def rename(company):
62 |     df = pd.read_excel('Sorted.xlsx', index_col = 0)
63 |
64 |     df2 = df.loc[df['CIK'] == int(company)]
65 |     print(df2)
66 |     TICKER = str(df2['Ticker'].values[0])
67 |     print(TICKER)
68 |     return TICKER
69 |
70 | # Decide on the document type (e.g. Balance Sheet, Income Statement, Cash Flow)
71 | def doc_type(doc, model = 'Combined_model'):
72 |     # Read the document
73 |     df = pd.read_csv(doc, index_col= 1)
74 |     # print(df)
75 |     df.index = df.index.astype(str)
76 |
77 |     # Join the index (the line-item names) into one lower-case string
78 |     names = list(df.index.values)
79 |     names = [i.lower() for i in names]
80 |     names = [', '.join(names)]
81 |
82 |     # Get results of model prediction
83 |     doc_type = 'none'
84 |
85 |     # KMeans alone
86 |     if model == 'Kmean':
87 |         result2 = Kmean_predict(names = names)
88 |         if result2[0] == 1:
89 |             doc_type = 'Balance_Sheet'
90 |         if result2[0] == 6:
91 |             doc_type = 'Income_Statment'
92 |         if result2[0] == 0:
93 |             doc_type = 'Cash_Flow'
94 |
95 |     # SVC alone
96 |     if model == 'SVC':
97 |         result, probability = SVC_predict(names = names)
98 |         if result[0] == 3 and max(probability[0]) > 0.93:
99 |             doc_type = 'Balance_Sheet'
100 |         if result[0] == 2 and max(probability[0]) > 0.93:
101 |             doc_type = 'Income_Statment'
102 |         if result[0] == 1 and max(probability[0]) > 0.93:
103 |             doc_type = 'Cash_Flow'
104 |
105 |     # Combined model: SVC and KMeans have to agree, and SVC has to be confident
106 |     if model == 'Combined_model':
107 |         result, probability = SVC_predict(names = names)
108 |         result2 = Kmean_predict(names = names)
109 |         if result[0] == 3 and max(probability[0]) > 0.93 and result2[0] == 1:
110 |             doc_type = 'Balance_Sheet'
111 |         if result[0] == 2 and max(probability[0]) > 0.93 and result2[0] == 6:
112 |             doc_type = 'Income_Statment'
113 |         if result[0] == 1 and max(probability[0]) > 0.93 and result2[0] == 0:
114 |             doc_type = 'Cash_Flow'
115 |
116 |     # Company identifier: strip the './files/' prefix (8 characters) to get the CIK, then map it to a ticker
117 |     company = doc.split(' ')[0]
118 |     company = company[8:]
119 |     company = rename(company)
120 |     # Filing date (strip the trailing 'R#.csv' part of the file name)
121 |     date = doc.split(' ')[1]
122 |     date = date[:-6]
123 |
124 |     if doc_type != 'none':
125 |         # print(df)
126 |         df.drop('Unnamed: 0', axis = 'columns', inplace = True)
127 |         # print(df)
128 |         save_name = './parsed/' + company + ' '+ doc_type + ' '+ date + '.csv'
129 |         df.to_csv(save_name)
130 |
131 | # Label all the documents and time the process
132 | def sort():
133 |     t = time.time()
134 |     for i in raw_files():
135 |         doc_type(i)
136 |     t1 = time.time() - t
137 |     print("The labeling took: " + str(t1) + " seconds")
138 |
139 | sort()
--------------------------------------------------------------------------------
/Sorted.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/Sorted.xlsx
--------------------------------------------------------------------------------
/files/.gitkeep:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/model/Kmeans.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/Kmeans.joblib
--------------------------------------------------------------------------------
/model/SVC.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/SVC.joblib
--------------------------------------------------------------------------------
/model/X_transform_SVC.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/X_transform_SVC.joblib
--------------------------------------------------------------------------------
/model/X_transform_cluster.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/X_transform_cluster.joblib
--------------------------------------------------------------------------------
/model/X_vecto_SVC.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/X_vecto_SVC.joblib
--------------------------------------------------------------------------------
/model/X_vecto_cluster.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/model/X_vecto_cluster.joblib
--------------------------------------------------------------------------------
/parsed/.gitkeep:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # About this project
2 |
3 | The goal of this project is an efficient fundamental-data scraper that delivers accurately sorted financial information from SEC EDGAR filings.
4 |
5 | The scraper itself is fully functional and is written with the Scrapy library.
6 | Data is scraped from 2011 onward only.
7 |
8 | The labeling script decides on each document's type and stores the result in the parsed folder.
9 |
10 | The aggregation script is still at a very early stage; I will be working on it over the coming month.
11 |
12 | ## Getting Started
13 | Clone the repository and recreate the environment from requirements.txt (it is a conda explicit package list, not a pip requirements file; see the header of that file for the conda create command).
14 |
15 | To scrape data, run scrape.py and enter the ticker symbols of the companies you want and the starting year when prompted (an example session is shown at the end of this readme):
16 | ```
17 | python scrape.py
18 | ```
19 |
20 | The scraped files are stored in the files folder.
21 |
22 | To label all of the scraped files, run
23 | ```
24 | python Label.py
25 | ```
26 |
27 | ## Contribution
28 | I am actively seeking contributors to improve efficiency, structure and functionality.
29 |
30 | ## License
31 |
32 | This project is licensed under the terms of the MIT license.
33 |
34 | "# SEC-EDGAR-python-scraper"
35 |
36 | ## A note
37 | I am a third-year finance major who has been learning programming for less than a year, so some inefficiencies and structural quirks may come from my unfamiliarity with common programming conventions.
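For reference, an end-to-end run looks roughly like the session below. The environment name, the tickers and the year are placeholders, Scrapy's own crawl logging is omitted, and the timing figure will differ; the two prompts and the final message are the ones actually printed by scrape.py and Label.py.
```
conda create --name sec-scraper --file requirements.txt
conda activate sec-scraper
python scrape.py
Enter Tickers: AMZN MSFT
Enter from which year to start: 2015
python Label.py
The labeling took: 12.3 seconds
```
The raw report tables land in the files folder (one CSV per R2-R9 report page), and the labeled copies, named by ticker, document type and filing date, land in the parsed folder.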
38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | asn1crypto=1.0.1=py37_0 5 | atomicwrites=1.3.0=py37_1 6 | attrs=19.2.0=py_0 7 | automat=0.7.0=py37_0 8 | bcrypt=3.1.7=py37he774522_0 9 | beautifulsoup4=4.8.1=pypi_0 10 | blas=1.0=mkl 11 | bs4=0.0.1=pypi_0 12 | ca-certificates=2019.8.28=0 13 | certifi=2019.9.11=py37_0 14 | cffi=1.12.3=py37h7a1dbc1_0 15 | chardet=3.0.4=pypi_0 16 | colorama=0.4.1=py37_0 17 | constantly=15.1.0=py37h28b3542_0 18 | cryptography=2.7=py37h7a1dbc1_0 19 | cssselect=1.1.0=py_0 20 | get=2019.4.13=pypi_0 21 | hyperlink=19.0.0=py_0 22 | icc_rt=2019.0.0=h0cc432a_1 23 | idna=2.8=py37_0 24 | importlib_metadata=0.23=py37_0 25 | incremental=17.5.0=py37_0 26 | intel-openmp=2019.4=245 27 | libiconv=1.15=h1df5818_7 28 | libxml2=2.9.9=h464c3ec_0 29 | libxslt=1.1.33=h579f668_0 30 | lxml=4.4.1=py37h1350720_0 31 | mkl=2019.4=245 32 | mkl-service=2.3.0=py37hb782905_0 33 | mkl_fft=1.0.14=py37h14836fe_0 34 | mkl_random=1.1.0=py37h675688f_0 35 | more-itertools=7.2.0=py37_0 36 | numpy=1.16.5=py37h19fb1c0_0 37 | numpy-base=1.16.5=py37hc3f5095_0 38 | openssl=1.1.1d=he774522_2 39 | packaging=19.2=py_0 40 | pandas=0.25.1=py37ha925a31_0 41 | parsel=1.5.2=py37_0 42 | pip=19.2.3=py37_0 43 | pluggy=0.13.0=py37_0 44 | post=2019.4.13=pypi_0 45 | public=2019.4.13=pypi_0 46 | py=1.8.0=py37_0 47 | pyasn1=0.4.7=py_0 48 | pyasn1-modules=0.2.6=py37_0 49 | pycparser=2.19=py37_0 50 | pydispatcher=2.0.5=py37_1 51 | pyhamcrest=1.9.0=py37_2 52 | pyopenssl=19.0.0=py37_0 53 | pyparsing=2.4.2=py_0 54 | pytest=5.0.1=py37_0 55 | pytest-runner=5.1=py_0 56 | python=3.7.4=h5263a28_0 57 | python-dateutil=2.8.0=py37_0 58 | pytz=2019.3=py_0 59 | pywin32=223=py37hfa6e2cd_1 60 | query-string=2019.4.13=pypi_0 61 | queuelib=1.5.0=py37_0 62 | request=2019.4.13=pypi_0 63 | requests=2.22.0=pypi_0 64 | scrapy=1.6.0=py37_0 65 | service_identity=18.1.0=py37h28b3542_0 66 | setuptools=41.4.0=py37_0 67 | six=1.12.0=py37_0 68 | soupsieve=1.9.4=pypi_0 69 | sqlite=3.30.0=he774522_0 70 | twisted=19.7.0=py37he774522_1 71 | urllib3=1.25.6=pypi_0 72 | vc=14.1=h0510ff6_4 73 | vs2015_runtime=14.16.27012=hf0eaf9b_0 74 | w3lib=1.21.0=py_0 75 | wcwidth=0.1.7=py37_0 76 | wheel=0.33.6=py37_0 77 | wincertstore=0.2=py37_0 78 | zipp=0.6.0=py_0 79 | zlib=1.2.11=h62dcd97_3 80 | zope=1.0=py37_1 81 | zope.interface=4.6.0=py37he774522_0 82 | -------------------------------------------------------------------------------- /scrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | from scrapy.crawler import CrawlerProcess 5 | from scrapy.utils.project import get_project_settings 6 | import scrapy 7 | import re, requests 8 | import bs4 as bs 9 | import urllib.request 10 | from sys import argv 11 | import pandas as pd 12 | import random 13 | import os 14 | 15 | 16 | # TODO before starting extensive scraping jobs implement a solid proxy system(Late stage dev) 17 | 18 | tickers = input("Enter Tickers: ").split() 19 | year = int(input("Enter from which year to start: ")) 20 | 21 | # Function to convert Tickers to CIK numbers 22 | def getCIK(TICKERS): 23 | URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany' 24 | CIK_RE = re.compile(r'.*CIK=(\d{10}).*') 25 | cik_dict = {} 26 | 
for ticker in TICKERS: 27 | f = requests.get(URL.format(ticker), stream = True) 28 | results = CIK_RE.findall(f.text) 29 | if len(results): 30 | results[0] = int(re.sub('\.[0]*', '.', results[0])) 31 | cik_dict[str(ticker).upper()] = str(results[0]) 32 | return cik_dict 33 | 34 | # reads files names in files directory 35 | def raw_files(): 36 | files = [] 37 | for file in os.listdir("./files"): 38 | if file.endswith(".csv"): 39 | pat = os.path.join("./files", file) 40 | files.append(pat) 41 | return files 42 | 43 | 44 | # check if provided Tickers already have CIKs and return URL 45 | def look_up(Tickers): 46 | # read existing tickers 47 | existing_df = pd.read_excel('Sorted.xlsx', index_col = 0) 48 | ex_Tickers = existing_df['Ticker'].values.tolist() 49 | # compare existing tickers and add non existing tickers 50 | for t in Tickers: 51 | if t in ex_Tickers: 52 | print(t + ' already exists') 53 | else: 54 | print(t + " doesn't exist") 55 | links = getCIK([t]) 56 | new_part = pd.DataFrame(list(links.items()), columns = ['Ticker', 'CIK']) 57 | existing_df = existing_df.append(new_part, ignore_index = True) 58 | existing_df.to_excel('Sorted.xlsx') 59 | 60 | URLs = [] 61 | # get names of already scraped CIKs 62 | names = [i.split(' ')[0] for i in raw_files() ] 63 | names = [i.split('\\')[-1] for i in names ] 64 | # get CIKs for required tickers and convert to URL 65 | for t in Tickers: 66 | df = existing_df.loc[existing_df['Ticker'] == t] 67 | CIK = str(df['CIK'].values[0]) 68 | # Check if company data already has been scraped 69 | if CIK in names: 70 | print(CIK + 'already scraped') 71 | else: 72 | URL = 'https://www.sec.gov/Archives/edgar/data/' + CIK 73 | URLs.append(URL) 74 | return URLs 75 | 76 | 77 | # Start Spider provided the links 78 | process = CrawlerProcess(get_project_settings()) 79 | 80 | def scrape(links, year): 81 | for i in links: 82 | process.crawl('sec', start_url=links, company = i, year = year) 83 | process.start() 84 | 85 | scrape(links = look_up(Tickers = tickers ), year = year) 86 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sec.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sec 12 | -------------------------------------------------------------------------------- /sec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/__init__.py -------------------------------------------------------------------------------- /sec/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /sec/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/__pycache__/settings.cpython-37.pyc 
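A short aside on scrape.py above: its getCIK() helper is what turns a ticker into the ten-digit CIK that EDGAR uses in its archive URLs. A stripped-down restatement of the same idea is sketched below; the endpoint and the CIK pattern are taken from scrape.py (lightly simplified), while the function name get_cik and the AMZN example are illustrative only.

import re
import requests

def get_cik(ticker):
    # Same EDGAR company-browse endpoint that scrape.py queries.
    url = ('http://www.sec.gov/cgi-bin/browse-edgar'
           '?CIK={}&Find=Search&owner=exclude&action=getcompany')
    # Depending on SEC's current policy, a descriptive User-Agent header may be required.
    matches = re.findall(r'CIK=(\d{10})', requests.get(url.format(ticker)).text)
    # EDGAR zero-pads CIKs to ten digits; scrape.py strips the padding before
    # building the https://www.sec.gov/Archives/edgar/data/<CIK> archive URL.
    return str(int(matches[0])) if matches else None

# e.g. get_cik('AMZN') would be expected to return '1018724', the same CIK that
# appears in the file names under ./ALL.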
-------------------------------------------------------------------------------- /sec/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SecItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /sec/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class SecSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class SecDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /sec/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class SecPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /sec/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for sec project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | # ROTATED_PROXY_ENABLED = True 13 | # PROXY_STORAGE = 'scrapy_rotated_proxy.extensions.file_storage.FileProxyStorage' 14 | 15 | # ROTATING_PROXY_LIST_PATH = './httpproxies.txt' 16 | # PROXY_MODE = 0 17 | 18 | BOT_NAME = 'sec' 19 | 20 | SPIDER_MODULES = ['sec.spiders'] 21 | NEWSPIDER_MODULE = 'sec.spiders' 22 | 23 | 24 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 25 | #USER_AGENT = 'sec (+http://www.yourdomain.com)' 26 | 27 | # Obey robots.txt rules 28 | # ROBOTSTXT_OBEY = True 29 | 30 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 31 | CONCURRENT_REQUESTS = 20 32 | 33 | # Configure a delay for requests for the same website (default: 0) 34 | # DOWNLOAD_DELAY = 0.2 35 | 36 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | # COOKIES_ENABLED = False 45 | 46 | 47 | # DOWNLOADER_MIDDLEWARES = { 48 | # 'rotating_proxies.middlewares.RotatingProxyMiddleware': 610, 49 | # 'rotating_proxies.middlewares.BanDetectionMiddleware': 620, 50 | # } 51 | 52 | # DOWNLOADER_MIDDLEWARES.update({ 53 | # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None, 54 | # 'scrapy_rotated_proxy.downloadmiddlewares.proxy.RotatedProxyMiddleware': 750, 55 | # }) 56 | # Disable Telnet Console (enabled by default) 57 | #TELNETCONSOLE_ENABLED = False 58 | 59 | # Override the default request headers: 60 | #DEFAULT_REQUEST_HEADERS = { 61 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 62 | # 'Accept-Language': 'en', 63 | #} 64 | 65 | # Enable or disable spider middlewares 66 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 67 | #SPIDER_MIDDLEWARES = { 68 | # 'sec.middlewares.SecSpiderMiddleware': 543, 69 | #} 70 | 71 | # Enable or disable downloader middlewares 72 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 73 | #DOWNLOADER_MIDDLEWARES = { 74 | # 'sec.middlewares.SecDownloaderMiddleware': 543, 75 | #} 76 | 77 | # Enable or disable extensions 78 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 79 | #EXTENSIONS = { 80 | # 'scrapy.extensions.telnet.TelnetConsole': None, 81 | #} 82 | 83 | # Configure item pipelines 84 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 85 | #ITEM_PIPELINES = { 86 | # 'sec.pipelines.SecPipeline': 300, 87 | #} 88 | 89 | # Enable and configure the AutoThrottle extension (disabled by default) 90 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 91 | #AUTOTHROTTLE_ENABLED = True 92 | # The initial download delay 93 | #AUTOTHROTTLE_START_DELAY = 5 94 | # The maximum download delay to be set in case of high latencies 95 | #AUTOTHROTTLE_MAX_DELAY = 60 96 | # The average number of requests Scrapy should be sending in parallel to 97 | # each remote server 98 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 99 | # Enable showing throttling stats for every response received: 100 | #AUTOTHROTTLE_DEBUG = False 101 | 102 | # Enable and configure HTTP caching 
(disabled by default) 103 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 104 | #HTTPCACHE_ENABLED = True 105 | #HTTPCACHE_EXPIRATION_SECS = 0 106 | #HTTPCACHE_DIR = 'httpcache' 107 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 108 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 109 | -------------------------------------------------------------------------------- /sec/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sec/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /sec/spiders/__pycache__/sec.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galibin24/SEC-EDGAR-python-scraper/a1147d475c4685b3112ca68dee37eb9b2b1e2722/sec/spiders/__pycache__/sec.cpython-37.pyc -------------------------------------------------------------------------------- /sec/spiders/sec.py: -------------------------------------------------------------------------------- 1 | 2 | import scrapy 3 | 4 | from scrapy.utils.project import get_project_settings 5 | 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from scrapy.crawler import CrawlerProcess 9 | import bs4 as bs 10 | 11 | from scrapy.selector import Selector 12 | import pandas as pd 13 | import re, requests 14 | 15 | import urllib.request 16 | 17 | class MainSpider(CrawlSpider): 18 | name = 'sec' 19 | 20 | def __init__(self, year = '', company='',start_url = '' ,*args, **kwargs): 21 | super(MainSpider, self).__init__(*args, **kwargs) 22 | self.start_urls = start_url 23 | self.company = company 24 | self.year = year 25 | 26 | def start_requests(self): 27 | for url in self.start_urls: 28 | yield scrapy.Request(url,callback= self.parse_item, dont_filter=True) 29 | 30 | # pass links to main archive not Cik values 31 | def parse_item(self, response): 32 | data = response.text 33 | soup = bs.BeautifulSoup(data, features= 'lxml') 34 | links = [] 35 | dates = [] 36 | 37 | for link in soup.select('#main-content table tr td a '): 38 | i = 'https://www.sec.gov' + link.get('href') 39 | links.append(i) 40 | 41 | for date in soup.select('#main-content table tr td:nth-of-type(3)'): 42 | d = date.get_text()[0:10] 43 | if int(d[0:4]) >= self.year: 44 | dates.append(d) 45 | 46 | length = (len(dates)) 47 | dictionary = dict(zip(links[0:length], dates)) 48 | 49 | for key, value in dictionary.items(): 50 | yield scrapy.Request(key, callback = self.parse_item1, meta={'date': value, 'link' : key}) 51 | 52 | def parse_item1(self, response): 53 | date = response.meta['date'] 54 | data = response.xpath('//td//a/@href').getall() 55 | company = response.meta['link'] 56 | company = company.split('/')[-2] 57 | 58 | R = ['R' + str(i) + '.htm' for i in range(2,10) ] 59 | 60 | Reports_links = [] 61 | Rs = [] 62 | for i in R: 63 | for d in data: 64 | if i in d : 65 | 
link = 'https://www.sec.gov' + d 66 | Rs.append(i[0:2]) 67 | Reports_links.append(link) 68 | 69 | dictionary = dict(zip(Reports_links, Rs)) 70 | 71 | if Reports_links: 72 | for key, value in dictionary.items(): 73 | yield scrapy.Request(key, callback = self.main_parse, meta={'date': date, 'R': value, 'company' : company}) 74 | 75 | def main_parse(self, response): 76 | date = response.meta['date'] 77 | R = response.meta['R'] 78 | company = response.meta['company'] 79 | source = response.text 80 | soup = bs.BeautifulSoup(source,'lxml') 81 | # find a table 82 | table = soup.table 83 | table = soup.find('table') 84 | table_rows = table.find_all('tr') 85 | 86 | # parse for rows 87 | clear = [] 88 | for tr in table_rows: 89 | try: 90 | td = tr.find_all('td') 91 | row = [i.text for i in td] 92 | clear.append(row) 93 | except: 94 | continue 95 | # put rows in dataframe and save 96 | df = pd.DataFrame(clear) 97 | name = company + ' ' + date + R + '.csv' 98 | name = name.replace('/', '') 99 | name = './files/' + name 100 | df.to_csv(name) 101 | --------------------------------------------------------------------------------
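A closing note on main_parse() in sec/spiders/sec.py: the manual BeautifulSoup walk over tr/td elements could also be expressed with pandas' built-in HTML table reader (lxml is already in requirements.txt). The sketch below is a hedged alternative, not the spider's actual implementation; the helper name first_table_to_csv is made up for illustration, and it keeps the same "first table on the page" behaviour as main_parse().

import pandas as pd

def first_table_to_csv(html_text, out_path):
    # pd.read_html returns one DataFrame per <table> found in the HTML string;
    # it raises ValueError if the page contains no table at all.
    tables = pd.read_html(html_text)
    # main_parse() only ever reads the first table, so do the same here.
    tables[0].to_csv(out_path)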