├── .gitignore
├── README.md
├── options_scraper
│   ├── __init__.py
│   ├── cli.py
│   ├── scraper.py
│   ├── serializer.py
│   └── utils.py
├── requirements.txt
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
.venv
.idea
*.pyc
options_scraper.egg-info
dist
build
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# NASDAQ Options Chain Scraper

Python options chain scraper for the old NASDAQ website: https://old.nasdaq.com

## Install

```bash
pip install options-scraper
```

## API

Use the API if you want to access the scraped data records directly as Python objects.

### Usage

```python
from options_scraper.scraper import NASDAQOptionsScraper
from options_scraper.utils import batched

scraper = NASDAQOptionsScraper()
ticker_symbol = 'XOM'
kwargs = {"money": 'all',
          "expir": 'week',
          "excode": None,
          "callput": None}

records_generator = scraper(ticker_symbol, **kwargs)

# Either access each record individually as shown below
for item in records_generator:
    print(item)

# Or use the batched util to get a list of items
for items in batched(records_generator, batch_size=100):
    print(items)
```

### Output

Each scraped record has the following structure:

```python
{'Ask': '23.20',
 'Bid': '18.50',
 'Calls': 'Apr 24, 2020',
 'Chg': '',
 'Last': '19.40',
 'Open Int': '15',
 'Puts': 'Apr 24, 2020',
 'Root': 'XOM',
 'Strike': '60',
 'Vol': '0'}

{'Ask': '28.20',
 'Bid': '23.50',
 'Calls': 'Apr 24, 2020',
 'Chg': '',
 'Last': '29.67',
 'Open Int': '3',
 'Puts': 'Apr 24, 2020',
 'Root': 'XOM',
 'Strike': '65',
 'Vol': '0'}
```
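All field values are scraped as raw strings, exactly as they appear in the HTML table. If you want to work with the records numerically, you need to convert them yourself. A minimal sketch (the `to_number` helper is illustrative and not part of the package; field names follow the sample records above):

```python
def to_number(value):
    """Convert a scraped string to a float; return None for empty strings."""
    try:
        return float(value.replace(',', ''))
    except ValueError:
        return None


numeric_fields = ('Ask', 'Bid', 'Chg', 'Last', 'Open Int', 'Strike', 'Vol')

record = {'Ask': '23.20', 'Bid': '18.50', 'Calls': 'Apr 24, 2020', 'Chg': '',
          'Last': '19.40', 'Open Int': '15', 'Puts': 'Apr 24, 2020',
          'Root': 'XOM', 'Strike': '60', 'Vol': '0'}

parsed = {key: to_number(value) if key in numeric_fields else value
          for key, value in record.items()}
# parsed -> Ask/Bid/Last/Strike/Vol/Open Int as floats, Chg as None, dates unchanged
```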
## Console Script

Use this script to scrape records and save them in either CSV or JSON format.

```bash
options-scraper --help
```

```text
usage: options-scraper [-h]
                       [-l {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}]
                       [-t TICKER]
                       [-o ODIR]
                       [-b BATCH_SIZE]
                       [-c {call,put}]
                       [-m {all,in,out,near}]
                       [-e EXCODE]
                       [-x {week,stan,quart,cebo}]
                       [-s {json,csv}]

optional arguments:
  -h, --help            show this help message and exit
  -l {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}, --log-level {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}
  -t TICKER, --ticker TICKER                 Ticker Symbol
  -o ODIR, --odir ODIR                       Output directory
  -b BATCH_SIZE, --batch_size BATCH_SIZE     Batch Size
  -c {call,put}, --callput {call,put}
  -m {all,in,out,near}, --money {all,in,out,near}
  -e EXCODE, --excode EXCODE                 excode
  -x {week,stan,quart,cebo}, --expir {week,stan,quart,cebo}
  -s {json,csv}, --serialize {json,csv}      Serialization format
```

#### Serialization format (-s)
You can output the data either as CSV files or as JSON files. The default format is CSV.

#### Batch size (-b)
Defines how many records each CSV or JSON file should contain.

### Examples

1. Get the full option chain for XOM in batches of 1000 records, serialized as CSV.
   Each CSV file will contain 1000 records; the last file holds the remaining records.

```bash
options-scraper -t XOM -o /Users/abhishek/options_data -b 1000 -s csv
```

2. Get the full option chain for MSFT in batches of 10 records, serialized as JSON.

```bash
options-scraper -t MSFT -o /Users/abhishek/options_data -b 10 -s json
```

3. Get all `put` options with weekly expiry.

```bash
options-scraper -t XOM -e cbo -c put -x week -o /Users/abhishek/options_data
```

4. Get all `call` options with `cebo` expiry.

```bash
options-scraper -t XOM -c call -x cebo -o /Users/abhishek/options_data
```
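The console script is a thin wrapper around `NASDAQOptionsSerializer`, so you can also drive serialization from Python. A minimal sketch (the output directory is a placeholder and must already exist):

```python
from options_scraper.serializer import NASDAQOptionsSerializer

serializer = NASDAQOptionsSerializer(ticker="XOM",
                                     root_dir="/Users/abhishek/options_data",
                                     serialization_format="csv",
                                     batch_size=500)

# Same query parameters as the CLI flags -m, -x, -e and -c
serializer.serialize(money="all", expir="week", excode=None, callput=None)
```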
--------------------------------------------------------------------------------
/options_scraper/__init__.py:
--------------------------------------------------------------------------------
__version__ = "0.9.91"
--------------------------------------------------------------------------------
/options_scraper/cli.py:
--------------------------------------------------------------------------------
import argparse
import logging
import os
from pprint import pformat

from options_scraper.serializer import NASDAQOptionsSerializer

LOG = logging.getLogger(__name__)


def main():
    """
    Description:
        Entry point to the options scraper.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l",
                        "--log-level",
                        default="INFO",
                        choices=list(logging._nameToLevel.keys()))
    parser.add_argument("-t", "--ticker", help="Ticker Symbol")
    parser.add_argument("-o", "--odir", help="Output directory")
    parser.add_argument("-b",
                        "--batch_size",
                        help="Batch Size",
                        default=100,
                        type=int)
    parser.add_argument("-c", "--callput", choices=["call", "put"])
    parser.add_argument("-m",
                        "--money",
                        default="all",
                        choices=["all", "in", "out", "near"])
    parser.add_argument("-e", "--excode", help="excode")
    parser.add_argument("-x",
                        "--expir",
                        choices=["week", "stan", "quart", "cebo"])
    parser.add_argument("-s",
                        "--serialize",
                        help="Serialization format",
                        default="csv",
                        choices=["json", "csv"])
    args = parser.parse_args()

    logging.basicConfig(
        level=logging._nameToLevel[args.log_level],
        format="%(asctime)s :: [%(levelname)s] :: [%(name)s] :: %(message)s",
    )

    if args.ticker is None:
        raise ValueError("Ticker symbol not passed")

    if args.odir is None:
        raise ValueError("Output directory not passed. "
                         "Provide the complete path where you want to save the files")

    if not os.path.exists(args.odir):
        raise IOError("Path {0} does not exist".format(args.odir))

    kwargs = {
        "money": args.money.lower(),
        "expir": args.expir.lower() if args.expir else None,
        "excode": args.excode.lower() if args.excode else None,
        "callput": args.callput.lower() if args.callput else None,
    }

    LOG.info("VERIFY: arguments passed %s", pformat(kwargs))
    LOG.info("Serialization format is %s", args.serialize.upper())
    LOG.info("Batch Size is %s", args.batch_size)

    serializer = NASDAQOptionsSerializer(
        ticker=args.ticker,
        root_dir=args.odir,
        serialization_format=args.serialize.lower(),
        batch_size=args.batch_size,
    )
    serializer.serialize(**kwargs)
    LOG.info("Finished Scraping")
--------------------------------------------------------------------------------
/options_scraper/scraper.py:
--------------------------------------------------------------------------------
import logging
import re
import urllib.parse

import requests
from lxml import etree

from options_scraper.utils import batched, get_text

LOG = logging.getLogger(__name__)

__all__ = ['NASDAQOptionsScraper']

last_number_pattern = re.compile(r"(?<=&page=)\d+")
nasdaq_base_url = "https://old.nasdaq.com"


class NASDAQOptionsScraper:

    @staticmethod
    def gen_pages(url):
        """
        Description:
            If the results for a given query are paginated, we should
            traverse the extra pages too. This function does exactly that.

        Args:
            url: The main URL

        Returns:
            Generator: all the other pages in the search results, if present.
        """
        response = requests.get(url)
        tree = etree.HTML(response.content)
        for element in tree.xpath("//*[@id='quotes_content_left_lb_LastPage']"):
            if element is not None:
                last_url = element.attrib["href"]
                page_numbers = re.findall(last_number_pattern, last_url)
                if page_numbers:
                    last_page = int(page_numbers[0])
                    for i in range(2, last_page + 1):
                        url_to_scrape = "{0}&page={1}".format(url, i)
                        yield url_to_scrape

    @staticmethod
    def gen_page_records(url):
        """
        Description:
            Scrape options data from the given URL.
            This is a 2-step process:
            1. First, extract the table headers.
            2. Then, extract the data rows.

        Args:
            url: NASDAQ URL to scrape

        Returns:
            Generator: data records, each as a dictionary
        """
        response = requests.get(url)
        tree = etree.HTML(response.content)
        headers = []
        # First, we will extract the table headers.
        for element in tree.xpath(
                "//div[@class='OptionsChain-chart borderAll thin']"):
            for thead_element in element.xpath("table/thead/tr/th"):
                a_element = thead_element.find("a")
                if a_element is not None:
                    headers.append(a_element.text.strip())
                else:
                    headers.append(thead_element.text.strip())

        # Then, the data rows.
        for element in tree.xpath(
                "//div[@class='OptionsChain-chart borderAll thin']"):
            for trow_elem in element.xpath("//tr"):
                data_row = [get_text(x) for x in trow_elem.findall("td")]
                if len(headers) == len(data_row):
                    yield dict(zip(headers, data_row))
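    # The query parameters accepted by __call__ below mirror the CLI flags:
    # 'money' (all/in/out/near), 'expir' (week/stan/quart/cebo), 'excode' and
    # 'callput' (call/put). Parameters whose value is None are dropped before
    # the URL is built, so, for example (illustrative values):
    #
    #     NASDAQOptionsScraper()("XOM", money="all", expir="week",
    #                            excode=None, callput=None)
    #
    # requests https://old.nasdaq.com/symbol/xom/option-chain?money=all&expir=week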
    def __call__(self, ticker, **kwargs):
        """
        Description:
            Constructs a NASDAQ-specific URL for the given ticker symbol and options,
            then traverses the option data found at that URL. If there are more pages,
            the data records on those pages are scraped too.

        Args:
            ticker: A valid ticker symbol
            **kwargs: Mapping of query parameters that should be passed to the NASDAQ URL

        Returns:
            Generator: each options data record as a Python dictionary, until
            the last page is reached.
        """
        params = urllib.parse.urlencode(
            dict((k, v) for k, v in kwargs.items() if v is not None))
        url = f"{nasdaq_base_url}/symbol/{ticker.lower()}/option-chain?{params}"

        LOG.info("Scraping data from URL %s", url)
        for rec in self.gen_page_records(url):
            yield rec

        for url in self.gen_pages(url):
            LOG.info("Scraping data from URL %s", url)
            for rec in self.gen_page_records(url):
                yield rec
--------------------------------------------------------------------------------
/options_scraper/serializer.py:
--------------------------------------------------------------------------------
import csv
import datetime
import json
import logging
import os

from typing import List, Mapping

from options_scraper.scraper import NASDAQOptionsScraper
from options_scraper.utils import batched


__all__ = ['NASDAQOptionsSerializer']

LOG = logging.getLogger(__name__)


class NASDAQOptionsSerializer:
    def __init__(self,
                 ticker: str,
                 root_dir: str,
                 serialization_format: str = "csv",
                 batch_size: int = 100):

        self.ticker = ticker
        self.serialization_format = serialization_format
        self.serializer = (self._to_json
                           if serialization_format == "json" else self._to_csv)
        self.output_file_date_fmt = "%Y-%m-%dT%H-%M-%S-%f"

        output_path = os.path.join(root_dir, ticker)
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        self.output_path = output_path

        self.batch_size = batch_size
        self._scraped_records = 0
        self._scraper = NASDAQOptionsScraper()

    def serialize(self, **kwargs):
        records_generator = self._scraper(self.ticker, **kwargs)
        for items in batched(records_generator, batch_size=self.batch_size):

            if items:
                timestamp = datetime.datetime.utcnow().strftime(
                    self.output_file_date_fmt)
                file_name = f"{self.ticker}_{timestamp}.{self.serialization_format}"
                self.serializer(items, os.path.join(self.output_path,
                                                    file_name))
                LOG.info("Scraped a batch of %s records", len(items))

            self._scraped_records += len(items)

        LOG.info("Scraped a total of %s records for %s", self._scraped_records, self.ticker)

    @staticmethod
    def _to_json(items: List[Mapping], file_path: str):
        items_to_serialize = {"items": items}
        with open(file_path, "w") as output_file:
            json.dump(items_to_serialize, output_file, indent=4)

    @staticmethod
    def _to_csv(items: List[Mapping], file_path: str):
        with open(file_path, "a") as csv_file:
            headers = list(items[0])
            writer = csv.DictWriter(csv_file,
                                    delimiter=",",
                                    lineterminator="\n",
                                    fieldnames=headers)
            writer.writeheader()  # the file doesn't exist yet, so write the header row first
            for item in items:
                writer.writerow(item)
--------------------------------------------------------------------------------
/options_scraper/utils.py:
--------------------------------------------------------------------------------
from itertools import islice
from typing import Generator

from lxml import etree


def get_text(elt):
    """
    Description:
        Returns the text from tags.

    Args:
        elt: An lxml etree element

    Returns:
        Text within the element.
    """
    return etree.tostring(elt, method="text", encoding="unicode").strip()


def batched(gen: Generator,
            batch_size: int):
    """
    Description:
        A util to slice a generator into batches of batch_size.
        The consumer can then consume the generator in batches of the given size.

    Args:
        gen: The generator to be consumed.
        batch_size: The size of each batch.

    Returns:
        Generator: lists of at most batch_size items.
    """
    while True:
        batch = list(islice(gen, 0, batch_size))
        if len(batch) == 0:
            return
        yield batch
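# Illustrative usage of batched() (values assumed, not from the package):
#
#     >>> gen = (n for n in range(5))
#     >>> list(batched(gen, batch_size=2))
#     [[0, 1], [2, 3], [4]]
#
# The final batch simply holds whatever records remain.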
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
certifi==2018.11.29
chardet==3.0.4
idna==2.8
lxml==4.5.0
requests==2.23.0
urllib3==1.25.8
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from os import path

from setuptools import find_packages, setup
from options_scraper import __version__


this_directory = path.abspath(path.dirname(__file__))

with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

VERSION = __version__
DISTNAME = 'options_scraper'
LICENSE = 'GNU GPLv3'
AUTHOR = 'Abhishek Singh'
MAINTAINER = 'Abhishek Singh'
MAINTAINER_EMAIL = 'aosingh@asu.edu'
DESCRIPTION = 'NASDAQ Options chain scraper for https://old.nasdaq.com'
URL = 'https://github.com/aosingh/options_scraper'

PACKAGES = ['options_scraper']

DEPENDENCIES = ['lxml', 'requests', 'urllib3']

classifiers = [
    'Development Status :: 4 - Beta',
    'Intended Audience :: Education',
    'Intended Audience :: Developers',
    'Intended Audience :: Financial and Insurance Industry',
    'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
    'Programming Language :: Python :: 3.7',
    'Programming Language :: Python :: 3.8',
    'Topic :: Office/Business :: Financial',
    'Topic :: Text Processing :: Markup :: HTML',
    'Operating System :: POSIX :: Linux',
    'Operating System :: Unix',
    'Operating System :: Microsoft :: Windows',
    'Operating System :: MacOS'
]
keywords = 'nasdaq options chain scraper'


setup(
    name=DISTNAME,
    long_description=long_description,
    long_description_content_type='text/markdown',
    author=AUTHOR,
    author_email=MAINTAINER_EMAIL,
    maintainer=MAINTAINER,
    maintainer_email=MAINTAINER_EMAIL,
    description=DESCRIPTION,
    license=LICENSE,
    url=URL,
    version=VERSION,
    entry_points={
        'console_scripts': [
            'options-scraper=options_scraper.cli:main'
        ]
    },
    packages=find_packages(exclude=("tests",)),
    package_dir={'options_scraper': 'options_scraper'},
    install_requires=DEPENDENCIES,
    include_package_data=True,
    classifiers=classifiers,
    keywords=keywords,
)
--------------------------------------------------------------------------------