├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── bin └── pystock-crawler ├── pystock_crawler ├── __init__.py ├── exporters.py ├── items.py ├── loaders.py ├── settings.py ├── spiders │ ├── __init__.py │ ├── edgar.py │ ├── nasdaq.py │ └── yahoo.py ├── tests │ ├── __init__.py │ ├── base.py │ ├── test_cmdline.py │ ├── test_loaders.py │ ├── test_spiders_edgar.py │ ├── test_spiders_nasdaq.py │ ├── test_spiders_yahoo.py │ └── test_utils.py ├── throttle.py └── utils.py ├── pytest.ini ├── requirements-test.txt ├── requirements.txt ├── scrapy.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.log 3 | *.pyc 4 | .coverage 5 | .scrapy/ 6 | .~* 7 | build/ 8 | dist/ 9 | pystock_crawler.egg-info/ 10 | pystock_crawler/tests/sample_data/ 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | branches: 5 | only: 6 | - master 7 | install: 8 | - pip install -r requirements.txt 9 | - pip install -r requirements-test.txt 10 | script: 11 | - py.test 12 | after_success: 13 | - pip install python-coveralls 14 | - coveralls 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Chang-Hung Liang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst LICENSE requirements.txt -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | pystock-crawler 2 | =============== 3 | 4 | .. image:: https://badge.fury.io/py/pystock-crawler.png 5 | :target: http://badge.fury.io/py/pystock-crawler 6 | 7 | .. image:: https://travis-ci.org/eliangcs/pystock-crawler.png?branch=master 8 | :target: https://travis-ci.org/eliangcs/pystock-crawler 9 | 10 | .. 
image:: https://coveralls.io/repos/eliangcs/pystock-crawler/badge.png?branch=master 11 | :target: https://coveralls.io/r/eliangcs/pystock-crawler 12 | 13 | ``pystock-crawler`` is a utility for crawling historical data of US stocks, 14 | including: 15 | 16 | * Ticker symbols listed in NYSE, NASDAQ or AMEX from `NASDAQ.com`_ 17 | * Daily prices from `Yahoo Finance`_ 18 | * Fundamentals from 10-Q and 10-K filings (XBRL) on `SEC EDGAR`_ 19 | 20 | 21 | Example Output 22 | -------------- 23 | 24 | NYSE ticker symbols:: 25 | 26 | DDD 3D Systems Corporation 27 | MMM 3M Company 28 | WBAI 500.com Limited 29 | ... 30 | 31 | Apple's daily prices:: 32 | 33 | symbol,date,open,high,low,close,volume,adj_close 34 | AAPL,2014-04-28,572.80,595.75,572.55,594.09,23890900,594.09 35 | AAPL,2014-04-25,564.53,571.99,563.96,571.94,13922800,571.94 36 | AAPL,2014-04-24,568.21,570.00,560.73,567.77,27092600,567.77 37 | ... 38 | 39 | Google's fundamentals:: 40 | 41 | symbol,end_date,amend,period_focus,fiscal_year,doc_type,revenues,op_income,net_income,eps_basic,eps_diluted,dividend,assets,cur_assets,cur_liab,cash,equity,cash_flow_op,cash_flow_inv,cash_flow_fin 42 | GOOG,2009-06-30,False,Q2,2009,10-Q,5522897000.0,1873894000.0,1484545000.0,4.7,4.66,0.0,35158760000.0,23834853000.0,2000962000.0,11911351000.0,31594856000.0,3858684000.0,-635974000.0,46354000.0 43 | GOOG,2009-09-30,False,Q3,2009,10-Q,5944851000.0,2073718000.0,1638975000.0,5.18,5.13,0.0,37702845000.0,26353544000.0,2321774000.0,12087115000.0,33721753000.0,6584667000.0,-3245963000.0,74851000.0 44 | GOOG,2009-12-31,False,FY,2009,10-K,23650563000.0,8312186000.0,6520448000.0,20.62,20.41,0.0,40496778000.0,29166958000.0,2747467000.0,10197588000.0,36004224000.0,9316198000.0,-8019205000.0,233412000.0 45 | ... 46 | 47 | 48 | Installation 49 | ------------ 50 | 51 | Prerequisites: 52 | 53 | * Python 2.7 54 | 55 | ``pystock-crawler`` is based on Scrapy_, so you will also need to install 56 | prerequisites such as lxml_ and libffi_ for Scrapy and its dependencies. On 57 | Ubuntu, for example, you can install them like this:: 58 | 59 | sudo apt-get update 60 | sudo apt-get install -y gcc python-dev libffi-dev libssl-dev libxml2-dev libxslt1-dev build-essential 61 | 62 | See `Scrapy's installation guide`_ for more details. 
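
If you prefer to keep ``pystock-crawler`` and its dependencies isolated from
your system Python, you can install everything inside a virtualenv_ (optionally
managed with virtualenvwrapper_). For example, assuming ``virtualenv`` is
already installed::

    virtualenv pystock-env
    source pystock-env/bin/activate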
63 | 
64 | After installing the prerequisites, you can then install ``pystock-crawler``
65 | with ``pip``::
66 | 
67 |     (sudo) pip install pystock-crawler
68 | 
69 | 
70 | Quickstart
71 | ----------
72 | 
73 | **Example 1.** Fetch Google's and Yahoo's daily prices ordered by date::
74 | 
75 |     pystock-crawler prices GOOG,YHOO -o out.csv --sort
76 | 
77 | **Example 2.** Fetch daily prices of all companies listed in
78 | ``./symbols.txt``::
79 | 
80 |     pystock-crawler prices ./symbols.txt -o out.csv
81 | 
82 | **Example 3.** Fetch Facebook's fundamentals during 2013::
83 | 
84 |     pystock-crawler reports FB -o out.csv -s 20130101 -e 20131231
85 | 
86 | **Example 4.** Fetch fundamentals of all companies in ``./nyse.txt`` and direct
87 | the log to ``./crawling.log``::
88 | 
89 |     pystock-crawler reports ./nyse.txt -o out.csv -l ./crawling.log
90 | 
91 | **Example 5.** Fetch all ticker symbols in NYSE, NASDAQ and AMEX::
92 | 
93 |     pystock-crawler symbols NYSE,NASDAQ,AMEX -o out.txt
94 | 
95 | 
96 | Usage
97 | -----
98 | 
99 | Type ``pystock-crawler -h`` to see command help::
100 | 
101 |     Usage:
102 |       pystock-crawler symbols <exchanges> (-o OUTPUT) [-l LOGFILE] [-w WORKING_DIR]
103 |                                           [--sort]
104 |       pystock-crawler prices <symbols> (-o OUTPUT) [-s YYYYMMDD] [-e YYYYMMDD]
105 |                                        [-l LOGFILE] [-w WORKING_DIR] [--sort]
106 |       pystock-crawler reports <symbols> (-o OUTPUT) [-s YYYYMMDD] [-e YYYYMMDD]
107 |                                         [-l LOGFILE] [-w WORKING_DIR]
108 |                                         [-b BATCH_SIZE] [--sort]
109 |       pystock-crawler (-h | --help)
110 |       pystock-crawler (-v | --version)
111 | 
112 |     Options:
113 |       -h --help       Show this screen
114 |       -o OUTPUT       Output file
115 |       -s YYYYMMDD     Start date [default: ]
116 |       -e YYYYMMDD     End date [default: ]
117 |       -l LOGFILE      Log output [default: ]
118 |       -w WORKING_DIR  Working directory [default: .]
119 |       -b BATCH_SIZE   Batch size [default: 500]
120 |       --sort          Sort the result
121 | 
122 | There are three commands available:
123 | 
124 | * ``pystock-crawler symbols`` grabs ticker symbol lists
125 | * ``pystock-crawler prices`` grabs daily prices
126 | * ``pystock-crawler reports`` grabs fundamentals
127 | 
128 | ``<exchanges>`` is a comma-separated string that specifies the stock exchanges
129 | you want to include. Currently, NYSE, NASDAQ and AMEX are supported.
130 | 
131 | The output file of ``pystock-crawler symbols`` can be used as the ``<symbols>``
132 | argument of the ``pystock-crawler prices`` and ``pystock-crawler reports``
133 | commands.
134 | 
135 | ``<symbols>`` can be an inline, comma-separated string or a text file that
136 | lists symbols line by line. For example, the inline string can be something
137 | like ``AAPL,GOOG,FB``, and the text file may look like this::
138 | 
139 |     # This line is a comment
140 |     AAPL    Put anything you want here
141 |     GOOG    Since the text here is ignored
142 |     FB
143 | 
144 | Use ``-o`` to specify the output file. For the ``pystock-crawler symbols``
145 | command, the output format is a simple text file. For
146 | ``pystock-crawler prices`` and ``pystock-crawler reports``, the output format
147 | is CSV.
148 | 
149 | ``-l`` specifies where the crawling logs go. If not specified, the logs go to
150 | stdout.
151 | 
152 | By default, the crawler uses the current directory as the working directory.
153 | If you don't want to use the current directory, you can specify it with the
154 | ``-w`` option. The crawler keeps its HTTP cache in a directory named ``.scrapy``
155 | under the working directory. The cache can save you time by avoiding repeated
156 | downloads of the same web pages. However, the cache can be quite huge. If you
157 | don't need it, just delete the ``.scrapy`` directory after you're done crawling.
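
For example, assuming a directory named ``./cache`` already exists, the
following command keeps the ``.scrapy`` cache under it instead of under the
current directory (a relative ``-o`` path is still resolved against the
directory you run the command from)::

    pystock-crawler prices AAPL,GOOG -o out.csv -w ./cache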
158 | 
159 | The ``-b`` option is only available to the ``pystock-crawler reports`` command.
160 | It allows you to split a large symbol list into smaller batches. This is
161 | actually a workaround for an unresolved bug (#2). Normally you don't have to
162 | specify this option; the default value (500) works just fine.
163 | 
164 | The rows in the output file are in an arbitrary order by default. Use the
165 | ``--sort`` option to sort them by symbol and date. But if you have a large output
166 | file, don't use ``--sort`` because it will be slow and eat a lot of memory.
167 | 
168 | 
169 | Developer Guide
170 | ---------------
171 | 
172 | Installing Dependencies
173 | ~~~~~~~~~~~~~~~~~~~~~~~
174 | ::
175 | 
176 |     pip install -r requirements.txt
177 | 
178 | 
179 | Running Tests
180 | ~~~~~~~~~~~~~
181 | 
182 | Install the test requirements::
183 | 
184 |     pip install -r requirements-test.txt
185 | 
186 | Then run the tests::
187 | 
188 |     py.test
189 | 
190 | This will download the test data (a lot of XML/XBRL files) from
191 | `SEC EDGAR`_ on the fly, so it will take some time and disk space. The test
192 | data is saved to the ``pystock_crawler/tests/sample_data`` directory and can
193 | be reused the next time you run the tests. If you don't need it, just delete
194 | the ``sample_data`` directory.
195 | 
196 | 
197 | .. _libffi: https://sourceware.org/libffi/
198 | .. _lxml: http://lxml.de/
199 | .. _NASDAQ.com: http://www.nasdaq.com/
200 | .. _Scrapy: http://scrapy.org/
201 | .. _Scrapy's installation guide: http://doc.scrapy.org/en/latest/intro/install.html
202 | .. _SEC EDGAR: http://www.sec.gov/edgar/searchedgar/companysearch.html
203 | .. _virtualenv: http://www.virtualenv.org/
204 | .. _virtualenvwrapper: http://virtualenvwrapper.readthedocs.org/
205 | .. _Yahoo Finance: http://finance.yahoo.com/
206 | 
--------------------------------------------------------------------------------
/bin/pystock-crawler:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''
3 | Usage:
4 |   pystock-crawler symbols <exchanges> (-o OUTPUT) [-l LOGFILE] [-w WORKING_DIR]
5 |                                       [--sort]
6 |   pystock-crawler prices <symbols> (-o OUTPUT) [-s YYYYMMDD] [-e YYYYMMDD]
7 |                                    [-l LOGFILE] [-w WORKING_DIR] [--sort]
8 |   pystock-crawler reports <symbols> (-o OUTPUT) [-s YYYYMMDD] [-e YYYYMMDD]
9 |                                     [-l LOGFILE] [-w WORKING_DIR]
10 |                                     [-b BATCH_SIZE] [--sort]
11 |   pystock-crawler (-h | --help)
12 |   pystock-crawler (-v | --version)
13 | 
14 | Options:
15 |   -h --help       Show this screen
16 |   -o OUTPUT       Output file
17 |   -s YYYYMMDD     Start date [default: ]
18 |   -e YYYYMMDD     End date [default: ]
19 |   -l LOGFILE      Log output [default: ]
20 |   -w WORKING_DIR  Working directory [default: .]
21 | -b BATCH_SIZE Batch size [default: 500] 22 | --sort Sort the result 23 | 24 | ''' 25 | import codecs 26 | import math 27 | import os 28 | import sys 29 | import uuid 30 | 31 | from contextlib import contextmanager 32 | from docopt import docopt 33 | from scrapy import log 34 | 35 | try: 36 | import pystock_crawler 37 | except ImportError: 38 | # For development environment 39 | sys.path.append(os.getcwd()) 40 | import pystock_crawler 41 | 42 | 43 | def random_string(length=5): 44 | return uuid.uuid4().get_hex()[0:5] 45 | 46 | 47 | @contextmanager 48 | def tmp_scrapy_cfg(): 49 | content = '''# pystock_crawler scrapy.cfg 50 | [settings] 51 | default = pystock_crawler.settings 52 | 53 | [deploy] 54 | #url = http://localhost:6800/ 55 | project = pystock_crawler 56 | ''' 57 | filename = os.path.abspath('./scrapy.cfg') 58 | filename_bak = os.path.abspath('./scrapy-%s.cfg' % random_string()) 59 | if os.path.exists(filename): 60 | log.msg(u'Renaming %s -> %s' % (filename, filename_bak)) 61 | os.rename(filename, filename_bak) 62 | assert not os.path.exists(filename) 63 | log.msg(u'Creating temporary config: %s' % filename) 64 | with open(filename, 'w') as f: 65 | f.write(content) 66 | 67 | yield 68 | 69 | if os.path.exists(filename): 70 | log.msg(u'Deleting %s' % filename) 71 | os.remove(filename) 72 | if os.path.exists(filename_bak): 73 | log.msg(u'Renaming %s -> %s' % (filename_bak, filename)) 74 | os.rename(filename_bak, filename) 75 | 76 | 77 | def run_scrapy_command(cmd): 78 | log.msg('Command: %s' % cmd) 79 | with tmp_scrapy_cfg(): 80 | os.system(cmd) 81 | 82 | 83 | def count_symbols(symbols): 84 | if os.path.exists(symbols): 85 | # If `symbols` is a file 86 | with open(symbols) as f: 87 | count = 0 88 | for line in f: 89 | line = line.rstrip() 90 | if line and not line.startswith('#'): 91 | count += 1 92 | return count 93 | 94 | # If `symbols` is a comma-separated string 95 | return len(symbols.split(',')) 96 | 97 | 98 | def merge_files(target, sources, ignore_header=False): 99 | log.msg(u'Merging files to %s' % target) 100 | with codecs.open(target, 'w', 'utf-8') as out: 101 | for i, source in enumerate(sources): 102 | with codecs.open(source, 'r', 'utf-8') as f: 103 | if ignore_header and i > 0: 104 | try: 105 | f.next() # Ignore CSV header 106 | except StopIteration: 107 | break # Empty file 108 | out.write(f.read()) 109 | 110 | # Delete source files 111 | for filename in sources: 112 | log.msg(u'Deleting %s' % filename) 113 | os.remove(filename) 114 | 115 | 116 | def crawl_symbols(exchanges, output, log_file): 117 | command = 'scrapy crawl nasdaq -a exchanges="%s" -t symbollist' % exchanges 118 | 119 | if output: 120 | command += ' -o "%s"' % output 121 | if log_file: 122 | command += ' -s LOG_FILE="%s"' % log_file 123 | 124 | run_scrapy_command(command) 125 | 126 | 127 | def crawl(spider, symbols, start_date, end_date, output, log_file, batch_size): 128 | command = 'scrapy crawl %s -a symbols="%s" -t csv' % (spider, symbols) 129 | 130 | if start_date: 131 | command += ' -a startdate=%s' % start_date 132 | if end_date: 133 | command += ' -a enddate=%s' % end_date 134 | if log_file: 135 | command += ' -s LOG_FILE="%s"' % log_file 136 | 137 | if spider == 'edgar': 138 | # When crawling edgar filings, run the scrapy command batch by batch to 139 | # work around issue #2 140 | num_symbols = count_symbols(symbols) 141 | num_batches = int(math.ceil(num_symbols / float(batch_size))) 142 | 143 | # Store sub-files so we can merge them later 144 | output_files = [] 145 | 146 | for i in 
xrange(num_batches): 147 | start = i * batch_size 148 | batch_cmd = command + ' -a limit=%d,%d' % (start, batch_size) 149 | if output: 150 | filename = '%s.%d' % (output, i + 1) 151 | batch_cmd += ' -o "%s"' % filename 152 | output_files.append(filename) 153 | 154 | run_scrapy_command(batch_cmd) 155 | 156 | merge_files(output, output_files, ignore_header=True) 157 | else: 158 | if output: 159 | command += ' -o "%s"' % output 160 | run_scrapy_command(command) 161 | 162 | 163 | def sort_symbols(filename): 164 | log.msg(u'Sorting: %s' % filename) 165 | 166 | with codecs.open(filename, 'r', 'utf-8') as f: 167 | lines = [line for line in f] 168 | 169 | lines = sorted(lines) 170 | 171 | with codecs.open(filename, 'w', 'utf-8') as f: 172 | f.writelines(lines) 173 | 174 | log.msg(u'Sorted: %s' % filename) 175 | 176 | 177 | def sort_csv(filename): 178 | log.msg(u'Sorting: %s' % filename) 179 | 180 | with codecs.open(filename, 'r', 'utf-8') as f: 181 | try: 182 | headers = f.next() 183 | except StopIteration: 184 | log.msg(u'No need to sort empty file: %s' % filename) 185 | return 186 | lines = [line for line in f] 187 | 188 | def line_cmp(line1, line2): 189 | a = line1.split(',') 190 | b = line2.split(',') 191 | length = min(len(a), len(b)) 192 | i = 0 193 | while 1: 194 | result = cmp(a[i], b[i]) 195 | if result or i >= length: 196 | return result 197 | i += 1 198 | 199 | lines = sorted(lines, cmp=line_cmp) 200 | 201 | with codecs.open(filename, 'w', 'utf-8') as f: 202 | f.write(headers) 203 | f.writelines(lines) 204 | 205 | log.msg(u'Sorted: %s' % filename) 206 | 207 | 208 | def print_version(): 209 | print 'pystock-crawler %s' % pystock_crawler.__version__ 210 | 211 | 212 | def main(): 213 | args = docopt(__doc__) 214 | 215 | symbols = args.get('') 216 | start_date = args.get('-s') 217 | end_date = args.get('-e') 218 | output = args.get('-o') 219 | log_file = args.get('-l') 220 | batch_size = args.get('-b') 221 | sorting = args.get('--sort') 222 | working_dir = args.get('-w') 223 | 224 | if args['prices']: 225 | spider = 'yahoo' 226 | elif args['reports']: 227 | spider = 'edgar' 228 | else: 229 | spider = None 230 | 231 | if symbols and os.path.exists(symbols): 232 | symbols = os.path.abspath(symbols) 233 | if output: 234 | output = os.path.abspath(output) 235 | if log_file: 236 | log_file = os.path.abspath(log_file) 237 | 238 | try: 239 | batch_size = int(batch_size) 240 | if batch_size <= 0: 241 | raise ValueError 242 | except ValueError: 243 | raise ValueError("BATCH_SIZE must be a positive integer, input is '%s'" % batch_size) 244 | 245 | try: 246 | os.chdir(working_dir) 247 | except OSError as err: 248 | sys.stderr.write('%s\n' % err) 249 | return 250 | 251 | if spider: 252 | log.start(logfile=log_file) 253 | crawl(spider, symbols, start_date, end_date, output, log_file, batch_size) 254 | if sorting and output: 255 | sort_csv(output) 256 | elif args['symbols']: 257 | log.start(logfile=log_file) 258 | exchanges = args.get('') 259 | crawl_symbols(exchanges, output, log_file) 260 | if sorting and output: 261 | sort_symbols(output) 262 | elif args['-v'] or args['--version']: 263 | print_version() 264 | 265 | 266 | if __name__ == '__main__': 267 | main() 268 | -------------------------------------------------------------------------------- /pystock_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.8.2' 2 | -------------------------------------------------------------------------------- /pystock_crawler/exporters.py: 
-------------------------------------------------------------------------------- 1 | from scrapy.conf import settings 2 | from scrapy.contrib.exporter import BaseItemExporter, CsvItemExporter 3 | 4 | 5 | class CsvItemExporter2(CsvItemExporter): 6 | ''' 7 | The standard CsvItemExporter class does not pass the kwargs through to the 8 | CSV writer, resulting in EXPORT_FIELDS and EXPORT_ENCODING being ignored 9 | (EXPORT_EMPTY is not used by CSV). 10 | 11 | http://stackoverflow.com/questions/6943778/python-scrapy-how-to-get-csvitemexporter-to-write-columns-in-a-specific-order 12 | 13 | ''' 14 | def __init__(self, *args, **kwargs): 15 | kwargs['fields_to_export'] = settings.getlist('EXPORT_FIELDS') or None 16 | kwargs['encoding'] = settings.get('EXPORT_ENCODING', 'utf-8') 17 | 18 | super(CsvItemExporter2, self).__init__(*args, **kwargs) 19 | 20 | def _write_headers_and_set_fields_to_export(self, item): 21 | # HACK: Override this private method to filter fields that are in 22 | # fields_to_export but not in item 23 | if self.include_headers_line: 24 | item_fields = item.fields.keys() 25 | if self.fields_to_export: 26 | self.fields_to_export = filter(lambda a: a in item_fields, self.fields_to_export) 27 | else: 28 | self.fields_to_export = item_fields 29 | self.csv_writer.writerow(self.fields_to_export) 30 | 31 | 32 | class SymbolListExporter(BaseItemExporter): 33 | 34 | def __init__(self, file, **kwargs): 35 | self._configure(kwargs, dont_fail=True) 36 | self.file = file 37 | 38 | def export_item(self, item): 39 | self.file.write('%s\t%s\n' % (item['symbol'], item['name'])) 40 | -------------------------------------------------------------------------------- /pystock_crawler/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | 9 | class ReportItem(Item): 10 | # Trading symbol 11 | symbol = Field() 12 | 13 | # If this doc is an amendment to previously filed doc 14 | amend = Field() 15 | 16 | # Quarterly (10-Q) or annual (10-K) report 17 | doc_type = Field() 18 | 19 | # Q1, Q2, Q3, or FY for annual report 20 | period_focus = Field() 21 | 22 | fiscal_year = Field() 23 | end_date = Field() 24 | 25 | revenues = Field() 26 | op_income = Field() 27 | net_income = Field() 28 | 29 | eps_basic = Field() 30 | eps_diluted = Field() 31 | 32 | dividend = Field() 33 | 34 | # Balance sheet stuffs 35 | assets = Field() 36 | cur_assets = Field() 37 | cur_liab = Field() 38 | equity = Field() 39 | cash = Field() 40 | 41 | # Cash flow from operating, investing, and financing 42 | cash_flow_op = Field() 43 | cash_flow_inv = Field() 44 | cash_flow_fin = Field() 45 | 46 | 47 | class PriceItem(Item): 48 | # Trading symbol 49 | symbol = Field() 50 | 51 | # YYYY-MM-DD 52 | date = Field() 53 | 54 | open = Field() 55 | close = Field() 56 | high = Field() 57 | low = Field() 58 | adj_close = Field() 59 | volume = Field() 60 | 61 | 62 | class SymbolItem(Item): 63 | symbol = Field() 64 | name = Field() 65 | -------------------------------------------------------------------------------- /pystock_crawler/loaders.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from datetime import datetime, timedelta 4 | from scrapy import log 5 | from scrapy.contrib.loader import ItemLoader 6 | from scrapy.contrib.loader.processor import Compose, MapCompose, 
TakeFirst 7 | from scrapy.utils.misc import arg_to_iter 8 | from scrapy.utils.python import flatten 9 | 10 | from pystock_crawler.items import ReportItem 11 | 12 | 13 | DATE_FORMAT = '%Y-%m-%d' 14 | 15 | MAX_PER_SHARE_VALUE = 1000.0 16 | 17 | # If number of characters of response body exceeds this value, 18 | # remove some useless text defined by RE_XML_GARBAGE to reduce memory usage 19 | THRESHOLD_TO_CLEAN = 20000000 20 | 21 | # Used to get rid of "LONG STRING..." 22 | RE_XML_GARBAGE = re.compile(r'>([^<]{100,})<') 23 | 24 | 25 | class IntermediateValue(object): 26 | ''' 27 | Intermediate data that serves as output of input processors, i.e., input 28 | of output processors. "Intermediate" is shorten as "imd" in later naming. 29 | 30 | ''' 31 | def __init__(self, local_name, value, text, context, node=None, start_date=None, 32 | end_date=None, instant=None): 33 | self.local_name = local_name 34 | self.value = value 35 | self.text = text 36 | self.context = context 37 | self.node = node 38 | self.start_date = start_date 39 | self.end_date = end_date 40 | self.instant = instant 41 | 42 | def __cmp__(self, other): 43 | if self.value < other.value: 44 | return -1 45 | elif self.value > other.value: 46 | return 1 47 | return 0 48 | 49 | def __repr__(self): 50 | context_id = None 51 | if self.context: 52 | context_id = self.context.xpath('@id')[0].extract() 53 | return '(%s, %s, %s)' % (self.local_name, self.value, context_id) 54 | 55 | def is_member(self): 56 | return is_member(self.context) 57 | 58 | 59 | class ExtractText(object): 60 | 61 | def __call__(self, value): 62 | if hasattr(value, 'select'): 63 | try: 64 | return value.xpath('./text()')[0].extract() 65 | except IndexError: 66 | return '' 67 | return unicode(value) 68 | 69 | 70 | class MatchEndDate(object): 71 | 72 | def __init__(self, data_type=str, ignore_date_range=False): 73 | self.data_type = data_type 74 | self.ignore_date_range = ignore_date_range 75 | 76 | def __call__(self, value, loader_context): 77 | if not hasattr(value, 'select'): 78 | return IntermediateValue('', 0.0, '0', None) 79 | 80 | doc_end_date_str = loader_context['end_date'] 81 | doc_type = loader_context['doc_type'] 82 | selector = loader_context['selector'] 83 | 84 | context_id = value.xpath('@contextRef')[0].extract() 85 | try: 86 | context = selector.xpath('//*[@id="%s"]' % context_id)[0] 87 | except IndexError: 88 | try: 89 | url = loader_context['response'].url 90 | except KeyError: 91 | url = None 92 | log.msg(u'Cannot find context: %s in %s' % (context_id, url), log.WARNING) 93 | return None 94 | 95 | date = instant = start_date = end_date = None 96 | try: 97 | instant = context.xpath('.//*[local-name()="instant"]/text()')[0].extract().strip() 98 | except (IndexError, ValueError): 99 | try: 100 | end_date_str = context.xpath('.//*[local-name()="endDate"]/text()')[0].extract().strip() 101 | end_date = datetime.strptime(end_date_str, DATE_FORMAT) 102 | 103 | start_date_str = context.xpath('.//*[local-name()="startDate"]/text()')[0].extract().strip() 104 | start_date = datetime.strptime(start_date_str, DATE_FORMAT) 105 | 106 | if self.ignore_date_range or date_range_matches_doc_type(doc_type, start_date, end_date): 107 | date = end_date 108 | except (IndexError, ValueError): 109 | pass 110 | else: 111 | try: 112 | instant = datetime.strptime(instant, DATE_FORMAT) 113 | except ValueError: 114 | pass 115 | else: 116 | date = instant 117 | 118 | if date: 119 | doc_end_date = datetime.strptime(doc_end_date_str, DATE_FORMAT) 120 | delta_days = (doc_end_date - 
date).days 121 | if abs(delta_days) < 30: 122 | try: 123 | text = value.xpath('./text()')[0].extract() 124 | val = self.data_type(text) 125 | except (IndexError, ValueError): 126 | pass 127 | else: 128 | local_name = value.xpath('local-name()')[0].extract() 129 | return IntermediateValue( 130 | local_name, val, text, context, value, 131 | start_date=start_date, end_date=end_date, instant=instant) 132 | 133 | return None 134 | 135 | 136 | class ImdSumMembersOr(object): 137 | 138 | def __init__(self, second_func=None): 139 | self.second_func = second_func 140 | 141 | def __call__(self, imd_values): 142 | members = [] 143 | non_members = [] 144 | for imd_value in imd_values: 145 | if imd_value.is_member(): 146 | members.append(imd_value) 147 | else: 148 | non_members.append(imd_value) 149 | 150 | if members and len(members) == len(imd_values): 151 | return imd_sum(members) 152 | 153 | if imd_values: 154 | return self.second_func(non_members) 155 | return None 156 | 157 | 158 | def date_range_matches_doc_type(doc_type, start_date, end_date): 159 | delta_days = (end_date - start_date).days 160 | return ((doc_type == '10-Q' and delta_days < 120 and delta_days > 60) or 161 | (doc_type == '10-K' and delta_days < 380 and delta_days > 350)) 162 | 163 | 164 | def get_amend(values): 165 | if values: 166 | return values[0] 167 | return False 168 | 169 | 170 | def get_symbol(values): 171 | if values: 172 | symbols = map(lambda s: s.strip(), values[0].split(',')) 173 | return '/'.join(symbols) 174 | return False 175 | 176 | 177 | def imd_max(imd_values): 178 | if imd_values: 179 | imd_value = max(imd_values) 180 | return imd_value.value 181 | return None 182 | 183 | 184 | def imd_min(imd_values): 185 | if imd_values: 186 | imd_value = min(imd_values) 187 | return imd_value.value 188 | return None 189 | 190 | 191 | def imd_sum(imd_values): 192 | return sum([v.value for v in imd_values]) 193 | 194 | 195 | def imd_get_revenues(imd_values): 196 | interest_elems = filter(lambda v: 'interest' in v.local_name.lower(), imd_values) 197 | if len(interest_elems) == len(imd_values): 198 | # HACK: An exceptional case for BBT 199 | # Revenues = InterestIncome + NoninterestIncome 200 | return imd_sum(imd_values) 201 | 202 | return imd_max(imd_values) 203 | 204 | 205 | def imd_get_net_income(imd_values): 206 | return imd_min(imd_values) 207 | 208 | 209 | def imd_get_op_income(imd_values): 210 | imd_values = filter(lambda v: memberness(v.context) < 2, imd_values) 211 | return imd_min(imd_values) 212 | 213 | 214 | def imd_get_cash_flow(imd_values, loader_context): 215 | if len(imd_values) == 1: 216 | return imd_values[0].value 217 | 218 | doc_type = loader_context['doc_type'] 219 | 220 | within_date_range = [] 221 | for imd_value in imd_values: 222 | if imd_value.start_date and imd_value.end_date: 223 | if date_range_matches_doc_type(doc_type, imd_value.start_date, imd_value.end_date): 224 | within_date_range.append(imd_value) 225 | 226 | if within_date_range: 227 | return imd_max(within_date_range) 228 | 229 | return imd_max(imd_values) 230 | 231 | 232 | def imd_get_per_share_value(imd_values): 233 | if not imd_values: 234 | return None 235 | 236 | v = imd_values[0] 237 | value = v.value 238 | if abs(value) > MAX_PER_SHARE_VALUE: 239 | try: 240 | decimals = int(v.node.xpath('@decimals')[0].extract()) 241 | except (AttributeError, IndexError, ValueError): 242 | return None 243 | else: 244 | # HACK: some of LTD's reports have unreasonablely large per share value, such as 245 | # 320000 EPS (and it should be 0.32), so use 
decimals attribute to scale it down, 246 | # note that this is NOT a correct way to interpret decimals attribute 247 | value *= pow(10, decimals - 2) 248 | return value if abs(value) <= MAX_PER_SHARE_VALUE else None 249 | 250 | 251 | def imd_get_equity(imd_values): 252 | if not imd_values: 253 | return None 254 | 255 | values = filter(lambda v: v.local_name == 'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest', imd_values) 256 | if values: 257 | return values[0].value 258 | 259 | values = filter(lambda v: v.local_name == 'StockholdersEquity', imd_values) 260 | if values: 261 | return values[0].value 262 | 263 | return imd_values[0].value 264 | 265 | 266 | def imd_filter_member(imd_values): 267 | if imd_values: 268 | with_memberness = [(v, memberness(v.context)) for v in imd_values] 269 | with_memberness = sorted(with_memberness, cmp=lambda a, b: a[1] - b[1]) 270 | 271 | m0 = with_memberness[0][1] 272 | non_members = [] 273 | 274 | for v in with_memberness: 275 | if v[1] == m0: 276 | non_members.append(v[0]) 277 | 278 | return non_members 279 | 280 | return imd_values 281 | 282 | 283 | def imd_mult(imd_values): 284 | for v in imd_values: 285 | try: 286 | node_id = v.node.xpath('@id')[0].extract().lower() 287 | except (AttributeError, IndexError): 288 | pass 289 | else: 290 | # HACK: some of LUV's reports have unreasonablely small numbers such as 291 | # 4136 in revenues which should be 4136 millions, this hack uses id attribute 292 | # to determine if it should be scaled up 293 | if 'inmillions' in node_id and abs(v.value) < 100000.0: 294 | v.value *= 1000000.0 295 | elif 'inthousands' in node_id and abs(v.value) < 100000000.0: 296 | v.value *= 1000.0 297 | return imd_values 298 | 299 | 300 | def memberness(context): 301 | '''The likelihood that the context is a "member".''' 302 | if context: 303 | texts = context.xpath('.//*[local-name()="explicitMember"]/text()').extract() 304 | text = str(texts).lower() 305 | 306 | if len(texts) > 1: 307 | return 2 308 | elif 'country' in text: 309 | return 2 310 | elif 'member' not in text: 311 | return 0 312 | elif 'successor' in text: 313 | # 'SuccessorMember' is a rare case that shouldn't be treated as member 314 | return 1 315 | elif 'parent' in text: 316 | return 2 317 | return 3 318 | 319 | 320 | def is_member(context): 321 | if context: 322 | texts = context.xpath('.//*[local-name()="explicitMember"]/text()').extract() 323 | text = str(texts).lower() 324 | 325 | # 'SuccessorMember' is a rare case that shouldn't be treated as member 326 | if 'member' not in text or 'successor' in text or 'parent' in text: 327 | return False 328 | return True 329 | 330 | 331 | def str_to_bool(value): 332 | if hasattr(value, 'lower'): 333 | value = value.lower() 334 | return bool(value) and value != 'false' and value != '0' 335 | return bool(value) 336 | 337 | 338 | def find_namespace(xxs, name): 339 | name_re = name.replace('-', '\-') 340 | if not name_re.startswith('xmlns'): 341 | name_re = 'xmlns:' + name_re 342 | return xxs.re('%s=\"([^\"]+)\"' % name_re)[0] 343 | 344 | 345 | def register_namespace(xxs, name): 346 | ns = find_namespace(xxs, name) 347 | xxs.register_namespace(name, ns) 348 | 349 | 350 | def register_namespaces(xxs): 351 | names = ('xmlns', 'xbrli', 'dei', 'us-gaap') 352 | for name in names: 353 | try: 354 | register_namespace(xxs, name) 355 | except IndexError: 356 | pass 357 | 358 | 359 | class XmlXPathItemLoader(ItemLoader): 360 | 361 | def __init__(self, *args, **kwargs): 362 | super(XmlXPathItemLoader, 
self).__init__(*args, **kwargs) 363 | register_namespaces(self.selector) 364 | 365 | def add_xpath(self, field_name, xpath, *processors, **kw): 366 | values = self._get_values(xpath, **kw) 367 | self.add_value(field_name, values, *processors, **kw) 368 | return len(self._values[field_name]) 369 | 370 | def add_xpaths(self, name, paths): 371 | for path in paths: 372 | match_count = self.add_xpath(name, path) 373 | if match_count > 0: 374 | return match_count 375 | 376 | return 0 377 | 378 | def _get_values(self, xpaths, **kw): 379 | xpaths = arg_to_iter(xpaths) 380 | return flatten([self.selector.xpath(xpath) for xpath in xpaths]) 381 | 382 | 383 | class ReportItemLoader(XmlXPathItemLoader): 384 | 385 | default_item_class = ReportItem 386 | default_output_processor = TakeFirst() 387 | 388 | symbol_in = MapCompose(ExtractText(), unicode.upper) 389 | symbol_out = Compose(get_symbol) 390 | 391 | amend_in = MapCompose(ExtractText(), str_to_bool) 392 | amend_out = Compose(get_amend) 393 | 394 | period_focus_in = MapCompose(ExtractText(), unicode.upper) 395 | period_focus_out = TakeFirst() 396 | 397 | revenues_in = MapCompose(MatchEndDate(float)) 398 | revenues_out = Compose(imd_filter_member, imd_mult, ImdSumMembersOr(imd_get_revenues)) 399 | 400 | net_income_in = MapCompose(MatchEndDate(float)) 401 | net_income_out = Compose(imd_filter_member, imd_mult, imd_get_net_income) 402 | 403 | op_income_in = MapCompose(MatchEndDate(float)) 404 | op_income_out = Compose(imd_filter_member, imd_mult, imd_get_op_income) 405 | 406 | eps_basic_in = MapCompose(MatchEndDate(float)) 407 | eps_basic_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None) 408 | 409 | eps_diluted_in = MapCompose(MatchEndDate(float)) 410 | eps_diluted_out = Compose(ImdSumMembersOr(imd_get_per_share_value), lambda x: x if x < MAX_PER_SHARE_VALUE else None) 411 | 412 | dividend_in = MapCompose(MatchEndDate(float)) 413 | dividend_out = Compose(imd_get_per_share_value, lambda x: x if x < MAX_PER_SHARE_VALUE and x > 0.0 else 0.0) 414 | 415 | assets_in = MapCompose(MatchEndDate(float)) 416 | assets_out = Compose(imd_filter_member, imd_mult, imd_max) 417 | 418 | cur_assets_in = MapCompose(MatchEndDate(float)) 419 | cur_assets_out = Compose(imd_filter_member, imd_mult, imd_max) 420 | 421 | cur_liab_in = MapCompose(MatchEndDate(float)) 422 | cur_liab_out = Compose(imd_filter_member, imd_mult, imd_max) 423 | 424 | equity_in = MapCompose(MatchEndDate(float)) 425 | equity_out = Compose(imd_filter_member, imd_mult, imd_get_equity) 426 | 427 | cash_in = MapCompose(MatchEndDate(float)) 428 | cash_out = Compose(imd_filter_member, imd_mult, imd_max) 429 | 430 | cash_flow_op_in = MapCompose(MatchEndDate(float, True)) 431 | cash_flow_op_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow) 432 | 433 | cash_flow_inv_in = MapCompose(MatchEndDate(float, True)) 434 | cash_flow_inv_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow) 435 | 436 | cash_flow_fin_in = MapCompose(MatchEndDate(float, True)) 437 | cash_flow_fin_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow) 438 | 439 | def __init__(self, *args, **kwargs): 440 | response = kwargs.get('response') 441 | if len(response.body) > THRESHOLD_TO_CLEAN: 442 | # Remove some useless text to reduce memory usage 443 | body, __ = RE_XML_GARBAGE.subn(lambda m: '><', response.body) 444 | response = response.replace(body=body) 445 | kwargs['response'] = response 446 | 447 | super(ReportItemLoader, self).__init__(*args, **kwargs) 448 
| 449 | symbol = self._get_symbol() 450 | end_date = self._get_doc_end_date() 451 | fiscal_year = self._get_doc_fiscal_year() 452 | doc_type = self._get_doc_type() 453 | 454 | # ignore document that is not 10-Q or 10-K 455 | if not (doc_type and doc_type.split('/')[0] in ('10-Q', '10-K')): 456 | return 457 | 458 | # some documents set their amendment flag in DocumentType, e.g., '10-Q/A', 459 | # instead of setting it in AmendmentFlag 460 | amend = None 461 | if doc_type.endswith('/A'): 462 | amend = True 463 | doc_type = doc_type[0:-2] 464 | 465 | self.context.update({ 466 | 'end_date': end_date, 467 | 'doc_type': doc_type 468 | }) 469 | 470 | self.add_xpath('symbol', '//dei:TradingSymbol') 471 | self.add_value('symbol', symbol) 472 | 473 | if amend: 474 | self.add_value('amend', True) 475 | else: 476 | self.add_xpath('amend', '//dei:AmendmentFlag') 477 | 478 | if doc_type == '10-K': 479 | period_focus = 'FY' 480 | else: 481 | period_focus = self._get_period_focus(end_date) 482 | 483 | if not fiscal_year and period_focus: 484 | fiscal_year = self._guess_fiscal_year(end_date, period_focus) 485 | 486 | self.add_value('period_focus', period_focus) 487 | self.add_value('fiscal_year', fiscal_year) 488 | self.add_value('end_date', end_date) 489 | self.add_value('doc_type', doc_type) 490 | 491 | self.add_xpaths('revenues', [ 492 | '//us-gaap:SalesRevenueNet', 493 | '//us-gaap:Revenues', 494 | '//us-gaap:SalesRevenueGoodsNet', 495 | '//us-gaap:SalesRevenueServicesNet', 496 | '//us-gaap:RealEstateRevenueNet', 497 | '//*[local-name()="NetRevenuesIncludingNetInterestIncome"]', 498 | '//*[contains(local-name(), "TotalRevenues") and contains(local-name(), "After")]', 499 | '//*[contains(local-name(), "TotalRevenues")]', 500 | '//*[local-name()="InterestAndDividendIncomeOperating" or local-name()="NoninterestIncome"]', 501 | '//*[contains(local-name(), "Revenue")]' 502 | ]) 503 | self.add_xpath('revenues', '//us-gaap:FinancialServicesRevenue') 504 | 505 | self.add_xpaths('net_income', [ 506 | '//*[contains(local-name(), "NetLossIncome") and contains(local-name(), "Corporation")]', 507 | '//*[local-name()="NetIncomeLossAvailableToCommonStockholdersBasic" or local-name()="NetIncomeLoss"]', 508 | '//us-gaap:ProfitLoss', 509 | '//us-gaap:IncomeLossFromContinuingOperations', 510 | '//*[contains(local-name(), "IncomeLossFromContinuingOperations") and not(contains(local-name(), "Per"))]', 511 | '//*[contains(local-name(), "NetIncomeLoss")]', 512 | '//*[starts-with(local-name(), "NetIncomeAttributableTo")]' 513 | ]) 514 | 515 | self.add_xpaths('op_income', [ 516 | '//us-gaap:OperatingIncomeLoss' 517 | ]) 518 | 519 | self.add_xpaths('eps_basic', [ 520 | '//us-gaap:EarningsPerShareBasic', 521 | '//us-gaap:IncomeLossFromContinuingOperationsPerBasicShare', 522 | '//us-gaap:IncomeLossFromContinuingOperationsPerBasicAndDilutedShare', 523 | '//*[contains(local-name(), "NetIncomeLoss") and contains(local-name(), "Per") and contains(local-name(), "Common")]', 524 | '//*[contains(local-name(), "Earnings") and contains(local-name(), "Per") and contains(local-name(), "Basic")]', 525 | '//*[local-name()="IncomePerShareFromContinuingOperationsAvailableToCompanyStockholdersBasicAndDiluted"]', 526 | '//*[contains(local-name(), "NetLossPerShare")]', 527 | '//*[contains(local-name(), "NetIncome") and contains(local-name(), "Per") and contains(local-name(), "Basic")]', 528 | '//*[local-name()="BasicEarningsAttributableToStockholdersPerCommonShare"]', 529 | '//*[local-name()="Earningspersharebasicanddiluted"]', 530 | 
'//*[contains(local-name(), "PerCommonShareBasicAndDiluted")]', 531 | '//*[local-name()="NetIncomeLossAttributableToCommonStockholdersBasicAndDiluted"]', 532 | '//us-gaap:NetIncomeLossAvailableToCommonStockholdersBasic', 533 | '//*[local-name()="NetIncomeLossEPS"]', 534 | '//*[local-name()="NetLoss"]' 535 | ]) 536 | 537 | self.add_xpaths('eps_diluted', [ 538 | '//us-gaap:EarningsPerShareDiluted', 539 | '//us-gaap:IncomeLossFromContinuingOperationsPerDilutedShare', 540 | '//us-gaap:IncomeLossFromContinuingOperationsPerBasicAndDilutedShare', 541 | '//*[contains(local-name(), "Earnings") and contains(local-name(), "Per") and contains(local-name(), "Diluted")]', 542 | '//*[local-name()="IncomePerShareFromContinuingOperationsAvailableToCompanyStockholdersBasicAndDiluted"]', 543 | '//*[contains(local-name(), "NetLossPerShare")]', 544 | '//*[contains(local-name(), "NetIncome") and contains(local-name(), "Per") and contains(local-name(), "Diluted")]', 545 | '//*[local-name()="DilutedEarningsAttributableToStockholdersPerCommonShare"]', 546 | '//us-gaap:NetIncomeLossAvailableToCommonStockholdersDiluted', 547 | '//*[contains(local-name(), "PerCommonShareBasicAndDiluted")]', 548 | '//*[local-name()="NetIncomeLossAttributableToCommonStockholdersBasicAndDiluted"]', 549 | '//us-gaap:EarningsPerShareBasic', 550 | '//*[local-name()="NetIncomeLossEPS"]', 551 | '//*[local-name()="NetLoss"]' 552 | ]) 553 | 554 | self.add_xpaths('dividend', [ 555 | '//us-gaap:CommonStockDividendsPerShareDeclared', 556 | '//us-gaap:CommonStockDividendsPerShareCashPaid' 557 | ]) 558 | 559 | # if dividend isn't found in doc, assume it's 0 560 | self.add_value('dividend', 0.0) 561 | 562 | self.add_xpaths('assets', [ 563 | '//us-gaap:Assets', 564 | '//us-gaap:AssetsNet', 565 | '//us-gaap:LiabilitiesAndStockholdersEquity' 566 | ]) 567 | 568 | self.add_xpaths('cur_assets', [ 569 | '//us-gaap:AssetsCurrent' 570 | ]) 571 | 572 | self.add_xpaths('cur_liab', [ 573 | '//us-gaap:LiabilitiesCurrent' 574 | ]) 575 | 576 | self.add_xpaths('equity', [ 577 | '//*[local-name()="StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest" or local-name()="StockholdersEquity"]', 578 | '//*[local-name()="TotalCommonShareholdersEquity"]', 579 | '//*[local-name()="CommonShareholdersEquity"]', 580 | '//*[local-name()="CommonStockEquity"]', 581 | '//*[local-name()="TotalEquity"]', 582 | '//us-gaap:RetainedEarningsAccumulatedDeficit', 583 | '//*[contains(local-name(), "MembersEquityIncludingPortionAttributableToNoncontrollingInterest")]', 584 | '//us-gaap:CapitalizationLongtermDebtAndEquity', 585 | '//*[local-name()="TotalCapitalization"]' 586 | ]) 587 | 588 | self.add_xpaths('cash', [ 589 | '//us-gaap:CashCashEquivalentsAndFederalFundsSold', 590 | '//us-gaap:CashAndDueFromBanks', 591 | '//us-gaap:CashAndCashEquivalentsAtCarryingValue', 592 | '//us-gaap:Cash', 593 | '//*[local-name()="CashAndCashEquivalents"]', 594 | '//*[contains(local-name(), "CarryingValueOfCashAndCashEquivalents")]', 595 | '//*[contains(local-name(), "CashCashEquivalents")]', 596 | '//*[contains(local-name(), "CashAndCashEquivalents")]' 597 | ]) 598 | 599 | self.add_xpaths('cash_flow_op', [ 600 | '//us-gaap:NetCashProvidedByUsedInOperatingActivities', 601 | '//us-gaap:NetCashProvidedByUsedInOperatingActivitiesContinuingOperations' 602 | ]) 603 | 604 | self.add_xpaths('cash_flow_inv', [ 605 | '//us-gaap:NetCashProvidedByUsedInInvestingActivities', 606 | '//us-gaap:NetCashProvidedByUsedInInvestingActivitiesContinuingOperations' 607 | ]) 608 | 609 | 
self.add_xpaths('cash_flow_fin', [ 610 | '//us-gaap:NetCashProvidedByUsedInFinancingActivities', 611 | '//us-gaap:NetCashProvidedByUsedInFinancingActivitiesContinuingOperations' 612 | ]) 613 | 614 | def _get_symbol(self): 615 | try: 616 | filename = self.context['response'].url.split('/')[-1] 617 | return filename.split('-')[0].upper() 618 | except IndexError: 619 | return None 620 | 621 | def _get_doc_fiscal_year(self): 622 | try: 623 | fiscal_year = self.selector.xpath('//dei:DocumentFiscalYearFocus/text()')[0].extract() 624 | return int(fiscal_year) 625 | except (IndexError, ValueError): 626 | return None 627 | 628 | def _guess_fiscal_year(self, end_date, period_focus): 629 | # Guess fiscal_year based on document end_date and period_focus 630 | date = datetime.strptime(end_date, DATE_FORMAT) 631 | month_ranges = { 632 | 'Q1': (2, 3, 4), 633 | 'Q2': (5, 6, 7), 634 | 'Q3': (8, 9, 10), 635 | 'FY': (11, 12, 1) 636 | } 637 | month_range = month_ranges.get(period_focus) 638 | 639 | # Case 1: release Q1 around March, Q2 around June, ... 640 | # This is what most companies do 641 | if date.month in month_range: 642 | if period_focus == 'FY' and date.month == 1: 643 | return date.year - 1 644 | return date.year 645 | 646 | # How many days left before 10-K's release? 647 | days_left_table = { 648 | 'Q1': 270, 649 | 'Q2': 180, 650 | 'Q3': 90, 651 | 'FY': 0 652 | } 653 | days_left = days_left_table.get(period_focus) 654 | 655 | # Other cases, assume end_date.year of its FY report equals to 656 | # its fiscal_year 657 | if days_left is not None: 658 | fy_date = date + timedelta(days=days_left) 659 | return fy_date.year 660 | 661 | return None 662 | 663 | def _get_doc_end_date(self): 664 | # the document end date could come from URL or document content 665 | # we need to guess which one is correct 666 | url_date_str = self.context['response'].url.split('-')[-1].split('.')[0] 667 | url_date = datetime.strptime(url_date_str, '%Y%m%d') 668 | url_date_str = url_date.strftime(DATE_FORMAT) 669 | 670 | try: 671 | doc_date_str = self.selector.xpath('//dei:DocumentPeriodEndDate/text()')[0].extract() 672 | doc_date = datetime.strptime(doc_date_str, DATE_FORMAT) 673 | except (IndexError, ValueError): 674 | return url_date.strftime(DATE_FORMAT) 675 | 676 | context_date_strs = set(self.selector.xpath('//*[local-name()="context"]//*[local-name()="endDate"]/text()').extract()) 677 | 678 | date = url_date 679 | if doc_date_str in context_date_strs: 680 | date = doc_date 681 | 682 | return date.strftime(DATE_FORMAT) 683 | 684 | def _get_doc_type(self): 685 | try: 686 | return self.selector.xpath('//dei:DocumentType/text()')[0].extract().upper() 687 | except (IndexError, ValueError): 688 | return None 689 | 690 | def _get_period_focus(self, doc_end_date): 691 | try: 692 | return self.selector.xpath('//dei:DocumentFiscalPeriodFocus/text()')[0].extract().strip().upper() 693 | except IndexError: 694 | pass 695 | 696 | try: 697 | doc_yr = doc_end_date.split('-')[0] 698 | yr_end_date = self.selector.xpath('//dei:CurrentFiscalYearEndDate/text()')[0].extract() 699 | yr_end_date = yr_end_date.replace('--', doc_yr + '-') 700 | except IndexError: 701 | return None 702 | 703 | doc_end_date = datetime.strptime(doc_end_date, '%Y-%m-%d') 704 | yr_end_date = datetime.strptime(yr_end_date, '%Y-%m-%d') 705 | delta_days = (yr_end_date - doc_end_date).days 706 | 707 | if delta_days > -45 and delta_days < 45: 708 | return 'FY' 709 | elif (delta_days <= -45 and delta_days > -135) or delta_days > 225: 710 | return 'Q1' 711 | elif 
(delta_days <= -135 and delta_days > -225) or (delta_days > 135 and delta_days <= 225): 712 | return 'Q2' 713 | elif delta_days <= -225 or (delta_days > 45 and delta_days <= 135): 714 | return 'Q3' 715 | 716 | return 'FY' 717 | -------------------------------------------------------------------------------- /pystock_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for pystock-crawler project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'pystock-crawler' 10 | 11 | EXPORT_FIELDS = ( 12 | # Price columns 13 | 'symbol', 'date', 'open', 'high', 'low', 'close', 'volume', 'adj_close', 14 | 15 | # Report columns 16 | 'end_date', 'amend', 'period_focus', 'fiscal_year', 'doc_type', 'revenues', 'op_income', 'net_income', 17 | 'eps_basic', 'eps_diluted', 'dividend', 'assets', 'cur_assets', 'cur_liab', 'cash', 'equity', 18 | 'cash_flow_op', 'cash_flow_inv', 'cash_flow_fin', 19 | ) 20 | 21 | FEED_EXPORTERS = { 22 | 'csv': 'pystock_crawler.exporters.CsvItemExporter2', 23 | 'symbollist': 'pystock_crawler.exporters.SymbolListExporter' 24 | } 25 | 26 | HTTPCACHE_ENABLED = True 27 | 28 | HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.RFC2616Policy' 29 | 30 | HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.LeveldbCacheStorage' 31 | 32 | LOG_LEVEL = 'INFO' 33 | 34 | NEWSPIDER_MODULE = 'pystock_crawler.spiders' 35 | 36 | SPIDER_MODULES = ['pystock_crawler.spiders'] 37 | 38 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 39 | #USER_AGENT = 'pystock-crawler (+http://www.yourdomain.com)' 40 | 41 | CONCURRENT_REQUESTS_PER_DOMAIN = 8 42 | 43 | COOKIES_ENABLED = False 44 | 45 | #AUTOTHROTTLE_ENABLED = True 46 | 47 | RETRY_TIMES = 4 48 | 49 | EXTENSIONS = { 50 | 'scrapy.contrib.throttle.AutoThrottle': None, 51 | 'pystock_crawler.throttle.PassiveThrottle': 0 52 | } 53 | 54 | PASSIVETHROTTLE_ENABLED = True 55 | #PASSIVETHROTTLE_DEBUG = True 56 | 57 | DEPTH_STATS_VERBOSE = True 58 | -------------------------------------------------------------------------------- /pystock_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /pystock_crawler/spiders/edgar.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 4 | from scrapy.contrib.spiders import CrawlSpider, Rule 5 | 6 | from pystock_crawler import utils 7 | from pystock_crawler.loaders import ReportItemLoader 8 | 9 | 10 | class URLGenerator(object): 11 | 12 | def __init__(self, symbols, start_date='', end_date='', start=0, count=None): 13 | end = start + count if count is not None else None 14 | self.symbols = symbols[start:end] 15 | self.start_date = start_date 16 | self.end_date = end_date 17 | 18 | def __iter__(self): 19 | url = 'http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-&dateb=%s&datea=%s&owner=exclude&count=300' 20 | for symbol in self.symbols: 21 | yield (url % (symbol, self.end_date, self.start_date)) 22 | 23 | 24 | class EdgarSpider(CrawlSpider): 25 | 26 | name = 'edgar' 27 | allowed_domains = ['sec.gov'] 28 | 29 | rules = ( 30 | Rule(SgmlLinkExtractor(allow=('/Archives/edgar/data/[^\"]+\-index\.htm',))), 31 | Rule(SgmlLinkExtractor(allow=('/Archives/edgar/data/[^\"]+/[A-Za-z]+\-\d{8}\.xml',)), callback='parse_10qk'), 32 | ) 33 | 34 | def __init__(self, **kwargs): 35 | super(EdgarSpider, self).__init__(**kwargs) 36 | 37 | symbols_arg = kwargs.get('symbols') 38 | start_date = kwargs.get('startdate', '') 39 | end_date = kwargs.get('enddate', '') 40 | limit_arg = kwargs.get('limit', '') 41 | 42 | utils.check_date_arg(start_date, 'startdate') 43 | utils.check_date_arg(end_date, 'enddate') 44 | start, count = utils.parse_limit_arg(limit_arg) 45 | 46 | if symbols_arg: 47 | if os.path.exists(symbols_arg): 48 | # get symbols from a text file 49 | symbols = utils.load_symbols(symbols_arg) 50 | else: 51 | # inline symbols in command 52 | symbols = symbols_arg.split(',') 53 | self.start_urls = URLGenerator(symbols, start_date, end_date, start, count) 54 | else: 55 | self.start_urls = [] 56 | 57 | def parse_10qk(self, response): 58 | '''Parse 10-Q or 10-K XML report.''' 59 | loader = ReportItemLoader(response=response) 60 | item = loader.load_item() 61 | 62 | if 'doc_type' in item: 63 | doc_type = item['doc_type'] 64 | if doc_type in ('10-Q', '10-K'): 65 | return item 66 | 67 | return None 68 | -------------------------------------------------------------------------------- /pystock_crawler/spiders/nasdaq.py: -------------------------------------------------------------------------------- 1 | import cStringIO 2 | import re 3 | 4 | from scrapy.spider import Spider 5 | 6 | from pystock_crawler.items import SymbolItem 7 | 8 | 9 | RE_SYMBOL = re.compile(r'^[A-Z]+$') 10 | 11 | 12 | def generate_urls(exchanges): 13 | for exchange in exchanges: 14 | yield 'http://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=%s&render=download' % exchange 15 | 16 | 17 | class NasdaqSpider(Spider): 18 | 19 | name = 'nasdaq' 20 | allowed_domains = ['www.nasdaq.com'] 21 | 22 | def __init__(self, **kwargs): 23 | super(NasdaqSpider, self).__init__(**kwargs) 24 | 25 | exchanges = kwargs.get('exchanges', '').split(',') 26 | self.start_urls = generate_urls(exchanges) 27 | 28 | def parse(self, response): 29 | try: 30 | file_like = cStringIO.StringIO(response.body) 31 | 32 | # Ignore first row 33 | file_like.next() 34 | 35 | for line in file_like: 36 | tokens = line.split(',') 37 | symbol = tokens[0].strip('"') 38 | if RE_SYMBOL.match(symbol): 39 | 
name = tokens[1].strip('"') 40 | yield SymbolItem(symbol=symbol, name=name) 41 | finally: 42 | file_like.close() 43 | -------------------------------------------------------------------------------- /pystock_crawler/spiders/yahoo.py: -------------------------------------------------------------------------------- 1 | import cStringIO 2 | import os 3 | import re 4 | 5 | from datetime import datetime 6 | from scrapy.spider import Spider 7 | 8 | from pystock_crawler import utils 9 | from pystock_crawler.items import PriceItem 10 | 11 | 12 | def parse_date(date_str): 13 | if date_str: 14 | date = datetime.strptime(date_str, '%Y%m%d') 15 | return date.year, date.month - 1, date.day 16 | return '', '', '' 17 | 18 | 19 | def make_url(symbol, start_date=None, end_date=None): 20 | url = ('http://ichart.finance.yahoo.com/table.csv?' 21 | 's=%(symbol)s&d=%(end_month)s&e=%(end_day)s&f=%(end_year)s&g=d&' 22 | 'a=%(start_month)s&b=%(start_day)s&c=%(start_year)s&ignore=.csv') 23 | 24 | start_date = parse_date(start_date) 25 | end_date = parse_date(end_date) 26 | 27 | return url % { 28 | 'symbol': symbol, 29 | 'start_year': start_date[0], 30 | 'start_month': start_date[1], 31 | 'start_day': start_date[2], 32 | 'end_year': end_date[0], 33 | 'end_month': end_date[1], 34 | 'end_day': end_date[2] 35 | } 36 | 37 | 38 | def generate_urls(symbols, start_date=None, end_date=None): 39 | for symbol in symbols: 40 | yield make_url(symbol, start_date, end_date) 41 | 42 | 43 | class YahooSpider(Spider): 44 | 45 | name = 'yahoo' 46 | allowed_domains = ['finance.yahoo.com'] 47 | 48 | def __init__(self, **kwargs): 49 | super(YahooSpider, self).__init__(**kwargs) 50 | 51 | symbols_arg = kwargs.get('symbols') 52 | start_date = kwargs.get('startdate', '') 53 | end_date = kwargs.get('enddate', '') 54 | 55 | utils.check_date_arg(start_date, 'startdate') 56 | utils.check_date_arg(end_date, 'enddate') 57 | 58 | if symbols_arg: 59 | if os.path.exists(symbols_arg): 60 | # get symbols from a text file 61 | symbols = utils.load_symbols(symbols_arg) 62 | else: 63 | # inline symbols in command 64 | symbols = symbols_arg.split(',') 65 | self.start_urls = generate_urls(symbols, start_date, end_date) 66 | else: 67 | self.start_urls = [] 68 | 69 | def parse(self, response): 70 | symbol = self._get_symbol_from_url(response.url) 71 | try: 72 | file_like = cStringIO.StringIO(response.body) 73 | rows = utils.parse_csv(file_like) 74 | for row in rows: 75 | item = PriceItem(symbol=symbol) 76 | for k, v in row.iteritems(): 77 | item[k.replace(' ', '_').lower()] = v 78 | yield item 79 | finally: 80 | file_like.close() 81 | 82 | def _get_symbol_from_url(self, url): 83 | match = re.search(r'[\?&]s=([^&]*)', url) 84 | if match: 85 | return match.group(1) 86 | return '' 87 | -------------------------------------------------------------------------------- /pystock_crawler/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eliangcs/pystock-crawler/8b803c8944f36af46daf04c6767a74132e37a101/pystock_crawler/tests/__init__.py -------------------------------------------------------------------------------- /pystock_crawler/tests/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | 5 | # Stores temporary test data 6 | SAMPLE_DATA_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data') 7 | 8 | 9 | class TestCaseBase(unittest.TestCase): 10 | ''' 11 | Provides utility functions 
for test cases. 12 | 13 | ''' 14 | def assert_none_or_almost_equal(self, value, expected_value): 15 | if expected_value is None: 16 | self.assertIsNone(value) 17 | else: 18 | self.assertAlmostEqual(value, expected_value) 19 | 20 | def assert_item(self, item, expected): 21 | self.assertEqual(item.get('symbol'), expected.get('symbol')) 22 | self.assertEqual(item.get('name'), expected.get('name')) 23 | self.assertEqual(item.get('amend'), expected.get('amend')) 24 | self.assertEqual(item.get('doc_type'), expected.get('doc_type')) 25 | self.assertEqual(item.get('period_focus'), expected.get('period_focus')) 26 | self.assertEqual(item.get('fiscal_year'), expected.get('fiscal_year')) 27 | self.assertEqual(item.get('end_date'), expected.get('end_date')) 28 | self.assert_none_or_almost_equal(item.get('revenues'), expected.get('revenues')) 29 | self.assert_none_or_almost_equal(item.get('net_income'), expected.get('net_income')) 30 | self.assert_none_or_almost_equal(item.get('eps_basic'), expected.get('eps_basic')) 31 | self.assert_none_or_almost_equal(item.get('eps_diluted'), expected.get('eps_diluted')) 32 | self.assertAlmostEqual(item.get('dividend'), expected.get('dividend')) 33 | self.assert_none_or_almost_equal(item.get('assets'), expected.get('assets')) 34 | self.assert_none_or_almost_equal(item.get('equity'), expected.get('equity')) 35 | self.assert_none_or_almost_equal(item.get('cash'), expected.get('cash')) 36 | self.assert_none_or_almost_equal(item.get('op_income'), expected.get('op_income')) 37 | self.assert_none_or_almost_equal(item.get('cur_assets'), expected.get('cur_assets')) 38 | self.assert_none_or_almost_equal(item.get('cur_liab'), expected.get('cur_liab')) 39 | self.assert_none_or_almost_equal(item.get('cash_flow_op'), expected.get('cash_flow_op')) 40 | self.assert_none_or_almost_equal(item.get('cash_flow_inv'), expected.get('cash_flow_inv')) 41 | self.assert_none_or_almost_equal(item.get('cash_flow_fin'), expected.get('cash_flow_fin')) 42 | 43 | 44 | def _create_sample_data_dir(): 45 | if not os.path.exists(SAMPLE_DATA_DIR): 46 | try: 47 | os.makedirs(SAMPLE_DATA_DIR) 48 | except OSError: 49 | pass 50 | 51 | assert os.path.exists(SAMPLE_DATA_DIR) 52 | 53 | _create_sample_data_dir() 54 | -------------------------------------------------------------------------------- /pystock_crawler/tests/test_cmdline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import pystock_crawler 6 | 7 | from envoy import run 8 | 9 | 10 | TEST_DIR = './test_data' 11 | 12 | 13 | # Scrapy runs on another process where working directory may be different with 14 | # the process running the test. So we have to explicitly set PYTHONPATH to 15 | # the absolute path of the current working directory for Scrapy process to be 16 | # able to locate pystock_crawler module. 
17 | os.environ['PYTHONPATH'] = os.getcwd() 18 | 19 | 20 | class PrintTest(unittest.TestCase): 21 | 22 | def test_no_args(self): 23 | r = run('./bin/pystock-crawler') 24 | self.assertIn('Usage:', r.std_err) 25 | 26 | def test_print_help(self): 27 | r = run('./bin/pystock-crawler -h') 28 | self.assertIn('Usage:', r.std_out) 29 | 30 | r2 = run('./bin/pystock-crawler --help') 31 | self.assertEqual(r.std_out, r2.std_out) 32 | 33 | def test_print_version(self): 34 | r = run('./bin/pystock-crawler -v') 35 | self.assertEqual(r.std_out, 'pystock-crawler %s\n' % pystock_crawler.__version__) 36 | 37 | r2 = run('./bin/pystock-crawler --version') 38 | self.assertEqual(r.std_out, r2.std_out) 39 | 40 | 41 | class CrawlTest(unittest.TestCase): 42 | '''Base class for crawl test cases.''' 43 | def setUp(self): 44 | if os.path.isdir(TEST_DIR): 45 | shutil.rmtree(TEST_DIR) 46 | os.mkdir(TEST_DIR) 47 | 48 | self.args = { 49 | 'output': os.path.join(TEST_DIR, '%s.out' % self.filename), 50 | 'log_file': os.path.join(TEST_DIR, '%s.log' % self.filename), 51 | 'working_dir': TEST_DIR 52 | } 53 | 54 | def tearDown(self): 55 | shutil.rmtree(TEST_DIR) 56 | 57 | def assert_cache(self): 58 | # Check if cache is there 59 | cache_dir = os.path.join(TEST_DIR, '.scrapy', 'httpcache', '%s.leveldb' % self.spider) 60 | self.assertTrue(os.path.isdir(cache_dir)) 61 | 62 | def assert_log(self): 63 | # Check if log file is there 64 | log_path = self.args['log_file'] 65 | self.assertTrue(os.path.isfile(log_path)) 66 | 67 | def get_output_content(self): 68 | output_path = self.args['output'] 69 | self.assertTrue(os.path.isfile(output_path)) 70 | 71 | with open(output_path) as f: 72 | content = f.read() 73 | return content 74 | 75 | 76 | class CrawlSymbolsTest(CrawlTest): 77 | 78 | filename = 'symbols' 79 | spider = 'nasdaq' 80 | 81 | def assert_nyse_output(self): 82 | # Check if some common NYSE symbols are in output 83 | content = self.get_output_content() 84 | self.assertIn('JPM', content) 85 | self.assertIn('KO', content) 86 | self.assertIn('WMT', content) 87 | 88 | # NASDAQ symbols shouldn't be 89 | self.assertNotIn('AAPL', content) 90 | self.assertNotIn('GOOG', content) 91 | self.assertNotIn('YHOO', content) 92 | 93 | def assert_nyse_and_nasdaq_output(self): 94 | # Check if some common NYSE symbols are in output 95 | content = self.get_output_content() 96 | self.assertIn('JPM', content) 97 | self.assertIn('KO', content) 98 | self.assertIn('WMT', content) 99 | 100 | # Check if some common NASDAQ symbols are in output 101 | self.assertIn('AAPL', content) 102 | self.assertIn('GOOG', content) 103 | self.assertIn('YHOO', content) 104 | 105 | def test_crawl_nyse(self): 106 | r = run('./bin/pystock-crawler symbols NYSE -o %(output)s -l %(log_file)s -w %(working_dir)s' % self.args) 107 | self.assertEqual(r.status_code, 0) 108 | self.assert_nyse_output() 109 | self.assert_log() 110 | self.assert_cache() 111 | 112 | def test_crawl_nyse_and_nasdaq(self): 113 | r = run('./bin/pystock-crawler symbols NYSE,NASDAQ -o %(output)s -l %(log_file)s -w %(working_dir)s --sort' % self.args) 114 | self.assertEqual(r.status_code, 0) 115 | self.assert_nyse_and_nasdaq_output() 116 | self.assert_log() 117 | self.assert_cache() 118 | 119 | 120 | class CrawlPricesTest(CrawlTest): 121 | 122 | filename = 'prices' 123 | spider = 'yahoo' 124 | 125 | def test_crawl_inline_symbols(self): 126 | r = run('./bin/pystock-crawler prices GOOG,IBM -o %(output)s -l %(log_file)s -w %(working_dir)s' % self.args) 127 | self.assertEqual(r.status_code, 0) 128 | 129 | content 
= self.get_output_content() 130 | self.assertIn('GOOG', content) 131 | self.assertIn('IBM', content) 132 | self.assert_log() 133 | self.assert_cache() 134 | 135 | def test_crawl_symbol_file(self): 136 | # Create a sample symbol file 137 | symbol_file = os.path.join(TEST_DIR, 'symbols.txt') 138 | with open(symbol_file, 'w') as f: 139 | f.write('WMT\nJPM') 140 | self.args['symbol_file'] = symbol_file 141 | 142 | r = run('./bin/pystock-crawler prices %(symbol_file)s -o %(output)s -l %(log_file)s -w %(working_dir)s --sort' % self.args) 143 | self.assertEqual(r.status_code, 0) 144 | 145 | content = self.get_output_content() 146 | self.assertIn('WMT', content) 147 | self.assertIn('JPM', content) 148 | self.assert_log() 149 | self.assert_cache() 150 | 151 | 152 | class CrawlReportsTest(CrawlTest): 153 | 154 | filename = 'reports' 155 | spider = 'edgar' 156 | 157 | def test_crawl_inline_symbols(self): 158 | r = run('./bin/pystock-crawler reports KO,MCD -o %(output)s -l %(log_file)s -w %(working_dir)s ' 159 | '-s 20130401 -e 20130531' % self.args) 160 | self.assertEqual(r.status_code, 0) 161 | 162 | content = self.get_output_content() 163 | self.assertIn('KO', content) 164 | self.assertIn('MCD', content) 165 | self.assert_log() 166 | self.assert_cache() 167 | 168 | def test_crawl_symbol_file(self): 169 | # Create a sample symbol file 170 | symbol_file = os.path.join(TEST_DIR, 'symbols.txt') 171 | with open(symbol_file, 'w') as f: 172 | f.write('KO\nMCD') 173 | self.args['symbol_file'] = symbol_file 174 | 175 | r = run('./bin/pystock-crawler reports %(symbol_file)s -o %(output)s -l %(log_file)s -w %(working_dir)s ' 176 | '-s 20130401 -e 20130531 --sort' % self.args) 177 | self.assertEqual(r.status_code, 0) 178 | 179 | content = self.get_output_content() 180 | self.assertIn('KO', content) 181 | self.assertIn('MCD', content) 182 | self.assert_log() 183 | self.assert_cache() 184 | 185 | # Check CSV header 186 | expected_header = [ 187 | 'symbol', 'end_date', 'amend', 'period_focus', 'fiscal_year', 'doc_type', 188 | 'revenues', 'op_income', 'net_income', 'eps_basic', 'eps_diluted', 'dividend', 189 | 'assets', 'cur_assets', 'cur_liab', 'cash', 'equity', 'cash_flow_op', 190 | 'cash_flow_inv', 'cash_flow_fin' 191 | ] 192 | head_line = content.split('\n')[0].rstrip() 193 | self.assertEqual(head_line.split(','), expected_header) 194 | 195 | def test_merge_empty_results(self): 196 | # Ridiculous date range (1800/1/1) -> empty result 197 | r = run('./bin/pystock-crawler reports KO,MCD -o %(output)s -l %(log_file)s -w %(working_dir)s ' 198 | '-s 18000101 -e 18000101 -b 1' % self.args) 199 | self.assertEqual(r.status_code, 0) 200 | 201 | content = self.get_output_content() 202 | self.assertFalse(content) 203 | 204 | # Make sure subfiles are deleted 205 | filename = self.args['output'] 206 | self.assertFalse(os.path.exists(os.path.join('%s.1' % filename))) 207 | self.assertFalse(os.path.exists(os.path.join('%s.2' % filename))) 208 | -------------------------------------------------------------------------------- /pystock_crawler/tests/test_spiders_edgar.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from scrapy.http import HtmlResponse, XmlResponse 5 | 6 | from pystock_crawler.spiders.edgar import EdgarSpider, URLGenerator 7 | from pystock_crawler.tests.base import TestCaseBase 8 | 9 | 10 | def make_url(symbol, start_date='', end_date=''): 11 | '''A URL that lists all 10-Q and 10-K filings of a company.''' 12 | return 
'http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=10-&dateb=%s&datea=%s&owner=exclude&count=300' \ 13 | % (symbol, end_date, start_date) 14 | 15 | 16 | def make_link_html(href, text=u'Link'): 17 | return u'%s' % (href, text) 18 | 19 | 20 | class URLGeneratorTest(TestCaseBase): 21 | 22 | def test_no_dates(self): 23 | urls = URLGenerator(('FB', 'GOOG')) 24 | self.assertEqual(list(urls), [ 25 | make_url('FB'), make_url('GOOG') 26 | ]) 27 | 28 | def test_with_start_date(self): 29 | urls = URLGenerator(('AAPL', 'AMZN', 'GLD'), start_date='20120215') 30 | self.assertEqual(list(urls), [ 31 | make_url('AAPL', start_date='20120215'), 32 | make_url('AMZN', start_date='20120215'), 33 | make_url('GLD', start_date='20120215') 34 | ]) 35 | 36 | def test_with_end_date(self): 37 | urls = URLGenerator(('TSLA', 'USO', 'MMM'), end_date='20110530') 38 | self.assertEqual(list(urls), [ 39 | make_url('TSLA', end_date='20110530'), 40 | make_url('USO', end_date='20110530'), 41 | make_url('MMM', end_date='20110530') 42 | ]) 43 | 44 | def test_with_start_and_end_dates(self): 45 | urls = URLGenerator(('DDD', 'AXP', 'KO'), start_date='20111230', end_date='20121230') 46 | self.assertEqual(list(urls), [ 47 | make_url('DDD', '20111230', '20121230'), 48 | make_url('AXP', '20111230', '20121230'), 49 | make_url('KO', '20111230', '20121230') 50 | ]) 51 | 52 | 53 | class EdgarSpiderTest(TestCaseBase): 54 | 55 | def test_empty_creation(self): 56 | spider = EdgarSpider() 57 | self.assertEqual(spider.start_urls, []) 58 | 59 | def test_symbol_file(self): 60 | # create a mock file of a list of symbols 61 | f = tempfile.NamedTemporaryFile('w', delete=False) 62 | f.write('# Comment\nGOOG\nADBE\nLNKD\n#comment\nJPM\n') 63 | f.close() 64 | 65 | spider = EdgarSpider(symbols=f.name) 66 | urls = list(spider.start_urls) 67 | 68 | self.assertEqual(urls, [ 69 | make_url('GOOG'), make_url('ADBE'), 70 | make_url('LNKD'), make_url('JPM') 71 | ]) 72 | 73 | os.remove(f.name) 74 | 75 | def test_invalid_dates(self): 76 | with self.assertRaises(ValueError): 77 | EdgarSpider(startdate='12345678') 78 | 79 | with self.assertRaises(ValueError): 80 | EdgarSpider(enddate='12345678') 81 | 82 | def test_symbol_file_and_dates(self): 83 | # create a mock file of a list of symbols 84 | f = tempfile.NamedTemporaryFile('w', delete=False) 85 | f.write('# Comment\nT\nCBS\nWMT\n') 86 | f.close() 87 | 88 | spider = EdgarSpider(symbols=f.name, startdate='20110101', enddate='20130630') 89 | urls = list(spider.start_urls) 90 | 91 | self.assertEqual(urls, [ 92 | make_url('T', '20110101', '20130630'), 93 | make_url('CBS', '20110101', '20130630'), 94 | make_url('WMT', '20110101', '20130630') 95 | ]) 96 | 97 | os.remove(f.name) 98 | 99 | def test_parse_company_filing_page(self): 100 | ''' 101 | Parse the page that lists all filings of a company. 
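Only links whose href matches the spider's first crawl rule (/Archives/edgar/data/...-index.htm) should be followed; the other links on the page are ignored.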
102 | 103 | Example: 104 | http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001288776&type=10-&dateb=&owner=exclude&count=40 105 | 106 | ''' 107 | spider = EdgarSpider() 108 | spider._follow_links = True # HACK 109 | 110 | body = ''' 111 | 112 | Useless Link 113 | Link 114 | Link 115 | Useless Link 116 | Link 117 | Link 118 | Uselss Link 119 | Link 120 | Useless Link 121 | 122 | ''' 123 | 124 | response = HtmlResponse('http://sec.gov/mock', body=body) 125 | requests = spider.parse(response) 126 | urls = [r.url for r in requests] 127 | 128 | self.assertEqual(urls, [ 129 | 'http://sec.gov/Archives/edgar/data/abc-index.htm', 130 | 'http://sec.gov/Archives/edgar/data/123-index.htm', 131 | 'http://sec.gov/Archives/edgar/data/123/abc-index.htm', 132 | 'http://sec.gov/Archives/edgar/data/123/456/abc123-index.htm', 133 | 'http://sec.gov/Archives/edgar/data/123/456/789/HELLO-index.htm' 134 | ]) 135 | 136 | def test_parse_quarter_or_annual_page(self): 137 | ''' 138 | Parse the page that lists filings of a quater or a year of a company. 139 | 140 | Example: 141 | http://www.sec.gov/Archives/edgar/data/1288776/000128877613000055/0001288776-13-000055-index.htm 142 | 143 | ''' 144 | spider = EdgarSpider() 145 | spider._follow_links = True # HACK 146 | 147 | body = ''' 148 | 149 | Useless Link 150 | Link 151 | Useless Link 152 | Link 153 | Useless Link 154 | Useless Link 155 | Useless Link 156 | Link 157 | 158 | ''' 159 | 160 | response = HtmlResponse('http://sec.gov/mock', body=body) 161 | requests = spider.parse(response) 162 | urls = [r.url for r in requests] 163 | 164 | self.assertEqual(urls, [ 165 | 'http://sec.gov/Archives/edgar/data/123/abc-20130630.xml', 166 | 'http://sec.gov/Archives/edgar/data/456/789/hello-20130630.xml' 167 | ]) 168 | 169 | def test_parse_xml_report(self): 170 | '''Parse XML 10-Q or 10-K report.''' 171 | spider = EdgarSpider() 172 | spider._follow_links = True # HACK 173 | 174 | body = ''' 175 | 176 | 180 | 181 | 182 | 2013-03-31 183 | 2013-06-28 184 | 185 | 186 | false 187 | 10-Q 188 | Q2 189 | 2013-06-28 190 | 2013 191 | 192 | 100 193 | 200 194 | 0.2 195 | 0.19 196 | 0.07 197 | 198 | 1600 199 | 300 200 | 150 201 | 202 | ''' 203 | 204 | response = XmlResponse('http://sec.gov/Archives/edgar/data/123/abc-20130720.xml', body=body) 205 | item = spider.parse_10qk(response) 206 | 207 | self.assert_item(item, { 208 | 'symbol': 'ABC', 209 | 'amend': False, 210 | 'doc_type': '10-Q', 211 | 'period_focus': 'Q2', 212 | 'fiscal_year': 2013, 213 | 'end_date': '2013-06-28', 214 | 'revenues': 100.0, 215 | 'net_income': 200.0, 216 | 'eps_basic': 0.2, 217 | 'eps_diluted': 0.19, 218 | 'dividend': 0.07, 219 | 'assets': 1600.0, 220 | 'equity': 300.0, 221 | 'cash': 150.0 222 | }) 223 | -------------------------------------------------------------------------------- /pystock_crawler/tests/test_spiders_nasdaq.py: -------------------------------------------------------------------------------- 1 | from scrapy.http import TextResponse 2 | 3 | from pystock_crawler.spiders.nasdaq import NasdaqSpider 4 | from pystock_crawler.tests.base import TestCaseBase 5 | 6 | 7 | class NasdaqSpiderTest(TestCaseBase): 8 | 9 | def test_parse(self): 10 | spider = NasdaqSpider() 11 | 12 | body = ('"Symbol","Name","Doesnt Matter",\n' 13 | '"DDD","3D Systems Corporation","50.5",\n' 14 | '"VNO","Vornado Realty Trust","103.5",\n' 15 | '"VNO^G","Vornado Realty Trust","25.21",\n' 16 | '"WBS","Webster Financial Corporation","29.71",\n' 17 | '"WBS/WS","Webster Financial Corporation","13.07",\n' 18 | '"AAA-A","Some 
Fake Company","1234.0",') 19 | response = TextResponse('http://www.nasdaq.com/dummy_url', body=body) 20 | items = list(spider.parse(response)) 21 | 22 | self.assertEqual(len(items), 3) 23 | self.assert_item(items[0], { 24 | 'symbol': 'DDD', 25 | 'name': '3D Systems Corporation' 26 | }) 27 | self.assert_item(items[1], { 28 | 'symbol': 'VNO', 29 | 'name': 'Vornado Realty Trust' 30 | }) 31 | self.assert_item(items[2], { 32 | 'symbol': 'WBS', 33 | 'name': 'Webster Financial Corporation' 34 | }) 35 | -------------------------------------------------------------------------------- /pystock_crawler/tests/test_spiders_yahoo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from scrapy.http import TextResponse 5 | 6 | from pystock_crawler.spiders.yahoo import make_url, YahooSpider 7 | from pystock_crawler.tests.base import TestCaseBase 8 | 9 | 10 | class MakeURLTest(TestCaseBase): 11 | 12 | def test_no_dates(self): 13 | self.assertEqual(make_url('YHOO'), ( 14 | 'http://ichart.finance.yahoo.com/table.csv?' 15 | 's=YHOO&d=&e=&f=&g=d&a=&b=&c=&ignore=.csv' 16 | )) 17 | 18 | def test_only_start_date(self): 19 | self.assertEqual(make_url('GOOG', start_date='20131122'), ( 20 | 'http://ichart.finance.yahoo.com/table.csv?' 21 | 's=GOOG&d=&e=&f=&g=d&a=10&b=22&c=2013&ignore=.csv' 22 | )) 23 | 24 | def test_only_end_date(self): 25 | self.assertEqual(make_url('AAPL', end_date='20131122'), ( 26 | 'http://ichart.finance.yahoo.com/table.csv?' 27 | 's=AAPL&d=10&e=22&f=2013&g=d&a=&b=&c=&ignore=.csv' 28 | )) 29 | 30 | def test_start_and_end_dates(self): 31 | self.assertEqual(make_url('TSLA', start_date='20120305', end_date='20131122'), ( 32 | 'http://ichart.finance.yahoo.com/table.csv?' 33 | 's=TSLA&d=10&e=22&f=2013&g=d&a=2&b=5&c=2012&ignore=.csv' 34 | )) 35 | 36 | 37 | class YahooSpiderTest(TestCaseBase): 38 | 39 | def test_empty_creation(self): 40 | spider = YahooSpider() 41 | self.assertEqual(list(spider.start_urls), []) 42 | 43 | def test_inline_symbols(self): 44 | spider = YahooSpider(symbols='C') 45 | self.assertEqual(list(spider.start_urls), [make_url('C')]) 46 | 47 | spider = YahooSpider(symbols='KO,DIS,ATVI') 48 | self.assertEqual(list(spider.start_urls), [ 49 | make_url(symbol) for symbol in ('KO', 'DIS', 'ATVI') 50 | ]) 51 | 52 | def test_symbol_file(self): 53 | try: 54 | # Create a mock file of a list of symbols 55 | with tempfile.NamedTemporaryFile('w', delete=False) as f: 56 | f.write('# Comment\nGOOG\tGoogle Inc.\nAAPL\nFB Facebook.com\n#comment\nAMZN\n') 57 | 58 | spider = YahooSpider(symbols=f.name) 59 | self.assertEqual(list(spider.start_urls), [ 60 | make_url(symbol) for symbol in ('GOOG', 'AAPL', 'FB', 'AMZN') 61 | ]) 62 | finally: 63 | os.remove(f.name) 64 | 65 | def test_illegal_dates(self): 66 | with self.assertRaises(ValueError): 67 | YahooSpider(startdate='12345678') 68 | 69 | with self.assertRaises(ValueError): 70 | YahooSpider(enddate='12345678') 71 | 72 | def test_parse(self): 73 | spider = YahooSpider() 74 | 75 | body = ('Date,Open,High,Low,Close,Volume,Adj Close\n' 76 | '2013-11-22,121.58,122.75,117.93,121.38,11096700,121.38\n' 77 | '2013-09-06,168.57,169.70,165.15,166.97,8619700,166.97\n' 78 | '2013-06-26,103.80,105.87,102.66,105.72,6602600,105.72\n') 79 | response = TextResponse(make_url('YHOO'), body=body) 80 | items = list(spider.parse(response)) 81 | 82 | self.assertEqual(len(items), 3) 83 | self.assert_item(items[0], { 84 | 'symbol': 'YHOO', 85 | 'date': '2013-11-22', 86 | 'open': 121.58, 87 | 'high': 
122.75, 88 | 'low': 117.93, 89 | 'close': 121.38, 90 | 'volume': 11096700, 91 | 'adj_close': 121.38 92 | }) 93 | self.assert_item(items[1], { 94 | 'symbol': 'YHOO', 95 | 'date': '2013-09-06', 96 | 'open': 168.57, 97 | 'high': 169.70, 98 | 'low': 165.15, 99 | 'close': 166.97, 100 | 'volume': 8619700, 101 | 'adj_close': 166.97 102 | }) 103 | self.assert_item(items[2], { 104 | 'symbol': 'YHOO', 105 | 'date': '2013-06-26', 106 | 'open': 103.80, 107 | 'high': 105.87, 108 | 'low': 102.66, 109 | 'close': 105.72, 110 | 'volume': 6602600, 111 | 'adj_close': 105.72 112 | }) 113 | -------------------------------------------------------------------------------- /pystock_crawler/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import cStringIO 2 | import os 3 | 4 | from pystock_crawler import utils 5 | from pystock_crawler.tests.base import SAMPLE_DATA_DIR, TestCaseBase 6 | 7 | 8 | class UtilsTest(TestCaseBase): 9 | 10 | def test_check_date_arg(self): 11 | utils.check_date_arg('19830305') 12 | utils.check_date_arg('19851122') 13 | utils.check_date_arg('19980720') 14 | utils.check_date_arg('20140212') 15 | 16 | # OK to pass an empty argument 17 | utils.check_date_arg('') 18 | 19 | with self.assertRaises(ValueError): 20 | utils.check_date_arg('1234') 21 | 22 | with self.assertRaises(ValueError): 23 | utils.check_date_arg('2014111') 24 | 25 | with self.assertRaises(ValueError): 26 | utils.check_date_arg('20141301') 27 | 28 | with self.assertRaises(ValueError): 29 | utils.check_date_arg('20140132') 30 | 31 | def test_parse_limit_arg(self): 32 | self.assertEqual(utils.parse_limit_arg(''), (0, None)) 33 | self.assertEqual(utils.parse_limit_arg('11,22'), (11, 22)) 34 | 35 | with self.assertRaises(ValueError): 36 | utils.parse_limit_arg('11,22,33') 37 | 38 | with self.assertRaises(ValueError): 39 | utils.parse_limit_arg('abc') 40 | 41 | def test_load_symbols(self): 42 | try: 43 | filename = os.path.join(SAMPLE_DATA_DIR, 'test_symbols.txt') 44 | with open(filename, 'w') as f: 45 | f.write('AAPL Apple Inc.\nGOOG\tGoogle Inc.\n# Comment\nFB\nTWTR\nAMZN\nSPY\n\nYHOO\n# The end\n') 46 | 47 | symbols = list(utils.load_symbols(filename)) 48 | self.assertEqual(symbols, ['AAPL', 'GOOG', 'FB', 'TWTR', 'AMZN', 'SPY', 'YHOO']) 49 | finally: 50 | os.remove(filename) 51 | 52 | def test_parse_csv(self): 53 | f = cStringIO.StringIO('name,age\nAvon,30\nOmar,29\nJoe,45\n') 54 | items = list(utils.parse_csv(f)) 55 | self.assertEqual(items, [ 56 | { 'name': 'Avon', 'age': '30' }, 57 | { 'name': 'Omar', 'age': '29' }, 58 | { 'name': 'Joe', 'age': '45' } 59 | ]) 60 | -------------------------------------------------------------------------------- /pystock_crawler/throttle.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from scrapy.exceptions import NotConfigured 4 | from scrapy import signals 5 | 6 | 7 | class PassiveThrottle(object): 8 | ''' 9 | Scrapy's AutoThrottle adds too much download delay on edgar spider, making 10 | it too slow. 11 | 12 | PassiveThrottle takes a more "passive" approach. It adds download delay 13 | only if there is an error response. 
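Concretely, a response whose status is in RETRY_HTTP_CODES multiplies the slot delay by four (never below four seconds, capped at PASSIVETHROTTLE_MAX_DELAY), while a 200 response halves it back toward the configured minimum delay.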
14 | 15 | ''' 16 | def __init__(self, crawler): 17 | self.crawler = crawler 18 | if not crawler.settings.getbool('PASSIVETHROTTLE_ENABLED'): 19 | raise NotConfigured 20 | 21 | self.debug = crawler.settings.getbool("PASSIVETHROTTLE_DEBUG") 22 | self.stats = crawler.stats 23 | crawler.signals.connect(self._spider_opened, signal=signals.spider_opened) 24 | crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded) 25 | 26 | @classmethod 27 | def from_crawler(cls, crawler): 28 | return cls(crawler) 29 | 30 | def _spider_opened(self, spider): 31 | self.mindelay = self._min_delay(spider) 32 | self.maxdelay = self._max_delay(spider) 33 | self.retry_http_codes = self._retry_http_codes() 34 | 35 | self.stats.set_value('delay_count', 0) 36 | 37 | def _min_delay(self, spider): 38 | s = self.crawler.settings 39 | return getattr(spider, 'download_delay', 0.0) or \ 40 | s.getfloat('DOWNLOAD_DELAY') 41 | 42 | def _max_delay(self, spider): 43 | return self.crawler.settings.getfloat('PASSIVETHROTTLE_MAX_DELAY', 60.0) 44 | 45 | def _retry_http_codes(self): 46 | return self.crawler.settings.getlist('RETRY_HTTP_CODES', []) 47 | 48 | def _response_downloaded(self, response, request, spider): 49 | key, slot = self._get_slot(request, spider) 50 | if slot is None: 51 | return 52 | 53 | olddelay = slot.delay 54 | self._adjust_delay(slot, response) 55 | if self.debug: 56 | diff = slot.delay - olddelay 57 | conc = len(slot.transferring) 58 | msg = "slot: %s | conc:%2d | delay:%5d ms (%+d)" % \ 59 | (key, conc, slot.delay * 1000, diff * 1000) 60 | spider.log(msg, level=logging.INFO) 61 | 62 | def _get_slot(self, request, spider): 63 | key = request.meta.get('download_slot') 64 | return key, self.crawler.engine.downloader.slots.get(key) 65 | 66 | def _adjust_delay(self, slot, response): 67 | """Define delay adjustment policy""" 68 | if response.status in self.retry_http_codes: 69 | new_delay = max(slot.delay, 1) * 4 70 | new_delay = max(new_delay, self.mindelay) 71 | new_delay = min(new_delay, self.maxdelay) 72 | slot.delay = new_delay 73 | self.stats.inc_value('delay_count') 74 | elif response.status == 200: 75 | new_delay = max(slot.delay / 2, self.mindelay) 76 | if new_delay < 0.01: 77 | new_delay = 0 78 | slot.delay = new_delay 79 | -------------------------------------------------------------------------------- /pystock_crawler/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | from datetime import datetime 4 | 5 | 6 | def check_date_arg(value, arg_name=None): 7 | if value: 8 | try: 9 | if len(value) != 8: 10 | raise ValueError 11 | datetime.strptime(value, '%Y%m%d') 12 | except ValueError: 13 | raise ValueError("Option '%s' must be in YYYYMMDD format, input is '%s'" % (arg_name, value)) 14 | 15 | 16 | def parse_limit_arg(value): 17 | if value: 18 | tokens = value.split(',') 19 | try: 20 | if len(tokens) != 2: 21 | raise ValueError 22 | return int(tokens[0]), int(tokens[1]) 23 | except ValueError: 24 | raise ValueError("Option 'limit' must be in START,COUNT format, input is '%s'" % value) 25 | return 0, None 26 | 27 | 28 | def load_symbols(file_path): 29 | symbols = [] 30 | with open(file_path) as f: 31 | for line in f: 32 | line = line.strip() 33 | if line and not line.startswith('#'): 34 | symbol = line.split()[0] 35 | symbols.append(symbol) 36 | return symbols 37 | 38 | 39 | def parse_csv(file_like): 40 | reader = csv.reader(file_like) 41 | headers = reader.next() 42 | for row in reader: 43 | item = {} 44 | for i, value 
in enumerate(row): 45 | header = headers[i] 46 | item[header] = value 47 | yield item 48 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --cov-report term-missing --cov pystock_crawler --cov bin pystock_crawler/tests/ 3 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | envoy 2 | pytest 3 | pytest-cov 4 | requests 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | docopt==0.6.2 2 | leveldb==0.193 3 | Scrapy==0.24.4 4 | service-identity==1.0.0 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = pystock_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = pystock_crawler 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | import codecs 7 | import os 8 | import re 9 | 10 | 11 | here = os.path.abspath(os.path.dirname(__file__)) 12 | 13 | 14 | # Read the version number from a source file. 15 | # Why read it, and not import? 16 | # see https://groups.google.com/d/topic/pypa-dev/0PkjVpcxTzQ/discussion 17 | def find_version(*file_paths): 18 | # Open in Latin-1 so that we avoid encoding errors. 
19 | # Use codecs.open for Python 2 compatibility 20 | with codecs.open(os.path.join(here, *file_paths), 'r', 'latin1') as f: 21 | version_file = f.read() 22 | 23 | # The version line must have the form 24 | # __version__ = 'ver' 25 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) 26 | if version_match: 27 | return version_match.group(1) 28 | raise RuntimeError('Unable to find version string') 29 | 30 | 31 | def read_description(filename): 32 | with codecs.open(filename, encoding='utf-8') as f: 33 | return f.read() 34 | 35 | 36 | def parse_requirements(filename): 37 | with open(filename) as f: 38 | content = f.read() 39 | return filter(lambda x: x and not x.startswith('#'), content.splitlines()) 40 | 41 | 42 | setup( 43 | name='pystock-crawler', 44 | version=find_version('pystock_crawler', '__init__.py'), 45 | url='https://github.com/eliangcs/pystock-crawler', 46 | description='Crawl and parse stock historical data', 47 | long_description=read_description('README.rst'), 48 | author='Chang-Hung Liang', 49 | author_email='eliang.cs@gmail.com', 50 | license='MIT', 51 | packages=['pystock_crawler', 'pystock_crawler.spiders'], 52 | scripts=['bin/pystock-crawler'], 53 | install_requires=parse_requirements('requirements.txt'), 54 | classifiers=[ 55 | 'Development Status :: 3 - Alpha', 56 | 'Environment :: Console', 57 | 'Intended Audience :: Developers', 58 | 'Intended Audience :: Financial and Insurance Industry', 59 | 'License :: OSI Approved :: MIT License', 60 | 'Operating System :: OS Independent', 61 | 'Programming Language :: Python', 62 | 'Programming Language :: Python :: 2.7', 63 | 'Topic :: Internet :: WWW/HTTP', 64 | 'Topic :: Office/Business :: Financial :: Investment', 65 | 'Topic :: Software Development :: Libraries :: Python Modules' 66 | ] 67 | ) 68 | --------------------------------------------------------------------------------