├── .gitignore
├── README.md
├── options_scraper
│   ├── __init__.py
│   ├── cli.py
│   ├── scraper.py
│   ├── serializer.py
│   └── utils.py
├── requirements.txt
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
.venv
.idea
*.pyc
options_scraper.egg-info
dist
build
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# NASDAQ Options Chain Scraper

Python options chain scraper for the old NASDAQ website: https://old.nasdaq.com

## Install

```bash
pip install options-scraper
```

## API

Use the API if you want to access the scraped data records directly as Python objects.

### Usage

```python
from options_scraper.scraper import NASDAQOptionsScraper
from options_scraper.utils import batched

scraper = NASDAQOptionsScraper()
ticker_symbol = 'XOM'
kwargs = {"money": 'all',
          "expir": 'week',
          "excode": None,
          "callput": None}

records_generator = scraper(ticker_symbol, **kwargs)

# Either access each record individually as shown below
for item in records_generator:
    print(item)

# Or use the batched util to get a list of items
for items in batched(records_generator, batch_size=100):
    print(items)
```

### Output

Each scraped record has the following structure:

```python
{'Ask': '23.20',
 'Bid': '18.50',
 'Calls': 'Apr 24, 2020',
 'Chg': '',
 'Last': '19.40',
 'Open Int': '15',
 'Puts': 'Apr 24, 2020',
 'Root': 'XOM',
 'Strike': '60',
 'Vol': '0'}

{'Ask': '28.20',
 'Bid': '23.50',
 'Calls': 'Apr 24, 2020',
 'Chg': '',
 'Last': '29.67',
 'Open Int': '3',
 'Puts': 'Apr 24, 2020',
 'Root': 'XOM',
 'Strike': '65',
 'Vol': '0'}
```
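All field values are scraped as raw strings, exactly as they appear in the HTML table. If you want to work with the records numerically, you need to convert them yourself. A minimal sketch (the `to_number` helper is illustrative and not part of the package; field names follow the sample records above):

```python
def to_number(value):
    """Convert a scraped string to a float; return None for empty strings."""
    try:
        return float(value.replace(',', ''))
    except ValueError:
        return None


numeric_fields = ('Ask', 'Bid', 'Chg', 'Last', 'Open Int', 'Strike', 'Vol')

record = {'Ask': '23.20', 'Bid': '18.50', 'Calls': 'Apr 24, 2020', 'Chg': '',
          'Last': '19.40', 'Open Int': '15', 'Puts': 'Apr 24, 2020',
          'Root': 'XOM', 'Strike': '60', 'Vol': '0'}

parsed = {key: to_number(value) if key in numeric_fields else value
          for key, value in record.items()}
# parsed -> Ask/Bid/Last/Strike/Vol/Open Int as floats, Chg as None, dates unchanged
```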
## Console Script

Use this script to scrape records and save them in either CSV or JSON format.

```bash
options-scraper --help
```

```text
usage: options-scraper [-h]
                       [-l {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}]
                       [-t TICKER]
                       [-o ODIR]
                       [-b BATCH_SIZE]
                       [-c {call,put}]
                       [-m {all,in,out,near}]
                       [-e EXCODE]
                       [-x {week,stan,quart,cebo}]
                       [-s {json,csv}]

optional arguments:
  -h, --help            show this help message and exit
  -l {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}, --log-level {CRITICAL,FATAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET}
  -t TICKER, --ticker TICKER                 Ticker Symbol
  -o ODIR, --odir ODIR                       Output directory
  -b BATCH_SIZE, --batch_size BATCH_SIZE     Batch Size
  -c {call,put}, --callput {call,put}
  -m {all,in,out,near}, --money {all,in,out,near}
  -e EXCODE, --excode EXCODE                 excode
  -x {week,stan,quart,cebo}, --expir {week,stan,quart,cebo}
  -s {json,csv}, --serialize {json,csv}      Serialization format
```

#### Serialization format (-s)
You can output the data either as CSV files or as JSON files. The default format is CSV.

#### Batch size (-b)
Defines how many records each CSV or JSON file should contain.

### Examples

1. Get the full option chain for XOM in batches of 1000 records, serialized as CSV.
   Each CSV file will contain 1000 records; the last file holds the remaining records.

```bash
options-scraper -t XOM -o /Users/abhishek/options_data -b 1000 -s csv
```

2. Get the full option chain for MSFT in batches of 10 records, serialized as JSON.

```bash
options-scraper -t MSFT -o /Users/abhishek/options_data -b 10 -s json
```

3. Get all `put` options with weekly expiry.

```bash
options-scraper -t XOM -e cbo -c put -x week -o /Users/abhishek/options_data
```

4. Get all `call` options with `cebo` expiry.

```bash
options-scraper -t XOM -c call -x cebo -o /Users/abhishek/options_data
```
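The console script is a thin wrapper around `NASDAQOptionsSerializer`, so you can also drive serialization from Python. A minimal sketch (the output directory is a placeholder and must already exist):

```python
from options_scraper.serializer import NASDAQOptionsSerializer

serializer = NASDAQOptionsSerializer(ticker="XOM",
                                     root_dir="/Users/abhishek/options_data",
                                     serialization_format="csv",
                                     batch_size=500)

# Same query parameters as the CLI flags -m, -x, -e and -c
serializer.serialize(money="all", expir="week", excode=None, callput=None)
```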
--------------------------------------------------------------------------------
/options_scraper/__init__.py:
--------------------------------------------------------------------------------
__version__ = "0.9.91"
--------------------------------------------------------------------------------
/options_scraper/cli.py:
--------------------------------------------------------------------------------
import argparse
import logging
import os
from pprint import pformat

from options_scraper.serializer import NASDAQOptionsSerializer

LOG = logging.getLogger(__name__)


def main():
    """
    Description:
        Entry point to the options scraper.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l",
                        "--log-level",
                        default="INFO",
                        choices=list(logging._nameToLevel.keys()))
    parser.add_argument("-t", "--ticker", help="Ticker Symbol")
    parser.add_argument("-o", "--odir", help="Output directory")
    parser.add_argument("-b",
                        "--batch_size",
                        help="Batch Size",
                        default=100,
                        type=int)
    parser.add_argument("-c", "--callput", choices=["call", "put"])
    parser.add_argument("-m",
                        "--money",
                        default="all",
                        choices=["all", "in", "out", "near"])
    parser.add_argument("-e", "--excode", help="excode")
    parser.add_argument("-x",
                        "--expir",
                        choices=["week", "stan", "quart", "cebo"])
    parser.add_argument("-s",
                        "--serialize",
                        help="Serialization format",
                        default="csv",
                        choices=["json", "csv"])
    args = parser.parse_args()

    logging.basicConfig(
        level=logging._nameToLevel[args.log_level],
        format="%(asctime)s :: [%(levelname)s] :: [%(name)s] :: %(message)s",
    )

    if args.ticker is None:
        raise ValueError("Ticker symbol not passed")

    if args.odir is None:
        raise ValueError("Output directory not passed. "
                         "Provide the complete path where you want to save the files")

    if not os.path.exists(args.odir):
        raise IOError("Path {0} does not exist".format(args.odir))

    kwargs = {
        "money": args.money.lower(),
        "expir": args.expir.lower() if args.expir else None,
        "excode": args.excode.lower() if args.excode else None,
        "callput": args.callput.lower() if args.callput else None,
    }

    LOG.info("VERIFY: arguments passed %s", pformat(kwargs))
    LOG.info("Serialization format is %s", args.serialize.upper())
    LOG.info("Batch Size is %s", args.batch_size)

    serializer = NASDAQOptionsSerializer(
        ticker=args.ticker,
        root_dir=args.odir,
        serialization_format=args.serialize.lower(),
        batch_size=args.batch_size,
    )
    serializer.serialize(**kwargs)
    LOG.info("Finished Scraping")
--------------------------------------------------------------------------------
/options_scraper/scraper.py:
--------------------------------------------------------------------------------
import logging
import re
import urllib.parse

import requests
from lxml import etree

from options_scraper.utils import batched, get_text

LOG = logging.getLogger(__name__)

__all__ = ['NASDAQOptionsScraper']

last_number_pattern = re.compile(r"(?<=&page=)\d+")
nasdaq_base_url = "https://old.nasdaq.com"


class NASDAQOptionsScraper:

    @staticmethod
    def gen_pages(url):
        """
        Description:
            If the results for a given query are paginated, we should
            traverse the extra pages too. This function does exactly that.

        Args:
            url: The main URL

        Returns:
            Generator: all the other pages in the search results, if present.
        """
        response = requests.get(url)
        tree = etree.HTML(response.content)
        for element in tree.xpath("//*[@id='quotes_content_left_lb_LastPage']"):
            if element is not None:
                last_url = element.attrib["href"]
                page_numbers = re.findall(last_number_pattern, last_url)
                if page_numbers:
                    last_page = int(page_numbers[0])
                    for i in range(2, last_page + 1):
                        url_to_scrape = "{0}&page={1}".format(url, i)
                        yield url_to_scrape

    @staticmethod
    def gen_page_records(url):
        """
        Description:
            Scrape options data from the given URL.
            This is a 2-step process:
            1. First, extract the table headers.
            2. Then, extract the data rows.

        Args:
            url: NASDAQ URL to scrape

        Returns:
            Generator: data records, each as a dictionary
        """
        response = requests.get(url)
        tree = etree.HTML(response.content)
        headers = []
        # First, we will extract the table headers.
        for element in tree.xpath(
                "//div[@class='OptionsChain-chart borderAll thin']"):
            for thead_element in element.xpath("table/thead/tr/th"):
                a_element = thead_element.find("a")
                if a_element is not None:
                    headers.append(a_element.text.strip())
                else:
                    headers.append(thead_element.text.strip())

        # Then, the data rows.
        for element in tree.xpath(
                "//div[@class='OptionsChain-chart borderAll thin']"):
            for trow_elem in element.xpath("//tr"):
                data_row = [get_text(x) for x in trow_elem.findall("td")]
                if len(headers) == len(data_row):
                    yield dict(zip(headers, data_row))
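    # The query parameters accepted by __call__ below mirror the CLI flags:
    # 'money' (all/in/out/near), 'expir' (week/stan/quart/cebo), 'excode' and
    # 'callput' (call/put). Parameters whose value is None are dropped before
    # the URL is built, so, for example (illustrative values):
    #
    #     NASDAQOptionsScraper()("XOM", money="all", expir="week",
    #                            excode=None, callput=None)
    #
    # requests https://old.nasdaq.com/symbol/xom/option-chain?money=all&expir=week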
    def __call__(self, ticker, **kwargs):
        """
        Description:
            Constructs a NASDAQ-specific URL for the given ticker symbol and options,
            then traverses the option data found at that URL. If there are more pages,
            the data records on those pages are scraped too.

        Args:
            ticker: A valid ticker symbol
            **kwargs: Mapping of query parameters that should be passed to the NASDAQ URL

        Returns:
            Generator: each options data record as a Python dictionary, until
            the last page is reached.
        """
        params = urllib.parse.urlencode(
            dict((k, v) for k, v in kwargs.items() if v is not None))
        url = f"{nasdaq_base_url}/symbol/{ticker.lower()}/option-chain?{params}"

        LOG.info("Scraping data from URL %s", url)
        for rec in self.gen_page_records(url):
            yield rec

        for url in self.gen_pages(url):
            LOG.info("Scraping data from URL %s", url)
            for rec in self.gen_page_records(url):
                yield rec
--------------------------------------------------------------------------------
/options_scraper/serializer.py:
--------------------------------------------------------------------------------
import csv
import datetime
import json
import logging
import os

from typing import List, Mapping

from options_scraper.scraper import NASDAQOptionsScraper
from options_scraper.utils import batched


__all__ = ['NASDAQOptionsSerializer']

LOG = logging.getLogger(__name__)


class NASDAQOptionsSerializer:
    def __init__(self,
                 ticker: str,
                 root_dir: str,
                 serialization_format: str = "csv",
                 batch_size: int = 100):

        self.ticker = ticker
        self.serialization_format = serialization_format
        self.serializer = (self._to_json
                           if serialization_format == "json" else self._to_csv)
        self.output_file_date_fmt = "%Y-%m-%dT%H-%M-%S-%f"

        output_path = os.path.join(root_dir, ticker)
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        self.output_path = output_path

        self.batch_size = batch_size
        self._scraped_records = 0
        self._scraper = NASDAQOptionsScraper()

    def serialize(self, **kwargs):
        records_generator = self._scraper(self.ticker, **kwargs)
        for items in batched(records_generator, batch_size=self.batch_size):

            if items:
                timestamp = datetime.datetime.utcnow().strftime(
                    self.output_file_date_fmt)
                file_name = f"{self.ticker}_{timestamp}.{self.serialization_format}"
                self.serializer(items, os.path.join(self.output_path,
                                                    file_name))
                LOG.info("Scraped a batch of %s records", len(items))

            self._scraped_records += len(items)

        LOG.info("Scraped a total of %s records for %s", self._scraped_records, self.ticker)

    @staticmethod
    def _to_json(items: List[Mapping], file_path: str):
        items_to_serialize = {"items": items}
        with open(file_path, "w") as output_file:
            json.dump(items_to_serialize, output_file, indent=4)

    @staticmethod
    def _to_csv(items: List[Mapping], file_path: str):
        with open(file_path, "a") as csv_file:
            headers = list(items[0])
            writer = csv.DictWriter(csv_file,
                                    delimiter=",",
                                    lineterminator="\n",
                                    fieldnames=headers)
            writer.writeheader()  # the file doesn't exist yet, so write the header row first
            for item in items:
                writer.writerow(item)
--------------------------------------------------------------------------------
/options_scraper/utils.py:
--------------------------------------------------------------------------------
from itertools import islice
from typing import Generator

from lxml import etree


def get_text(elt):
    """
    Description:
        Returns the text from tags.

    Args:
        elt: An lxml etree element

    Returns:
        Text within the element.
    """
    return etree.tostring(elt, method="text", encoding="unicode").strip()


def batched(gen: Generator,
            batch_size: int):
    """
    Description:
        A util to slice a generator into batches of batch_size.
        The consumer can then consume the generator in batches of the given size.

    Args:
        gen: The generator to be consumed.
        batch_size: The size of each batch.

    Returns:
        Generator: lists of at most batch_size items.
    """
    while True:
        batch = list(islice(gen, 0, batch_size))
        if len(batch) == 0:
            return
        yield batch
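# Illustrative usage of batched() (values assumed, not from the package):
#
#     >>> gen = (n for n in range(5))
#     >>> list(batched(gen, batch_size=2))
#     [[0, 1], [2, 3], [4]]
#
# The final batch simply holds whatever records remain.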
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
certifi==2018.11.29
chardet==3.0.4
idna==2.8
lxml==4.5.0
requests==2.23.0
urllib3==1.25.8
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from os import path

from setuptools import find_packages, setup
from options_scraper import __version__


this_directory = path.abspath(path.dirname(__file__))

with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

VERSION = __version__
DISTNAME = 'options_scraper'
LICENSE = 'GNU GPLv3'
AUTHOR = 'Abhishek Singh'
MAINTAINER = 'Abhishek Singh'
MAINTAINER_EMAIL = 'aosingh@asu.edu'
DESCRIPTION = 'NASDAQ Options chain scraper for https://old.nasdaq.com'
URL = 'https://github.com/aosingh/options_scraper'

PACKAGES = ['options_scraper']

DEPENDENCIES = ['lxml', 'requests', 'urllib3']

classifiers = [
    'Development Status :: 4 - Beta',
    'Intended Audience :: Education',
    'Intended Audience :: Developers',
    'Intended Audience :: Financial and Insurance Industry',
    'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
    'Programming Language :: Python :: 3.7',
    'Programming Language :: Python :: 3.8',
    'Topic :: Office/Business :: Financial',
    'Topic :: Text Processing :: Markup :: HTML',
    'Operating System :: POSIX :: Linux',
    'Operating System :: Unix',
    'Operating System :: Microsoft :: Windows',
    'Operating System :: MacOS'
]
keywords = 'nasdaq options chain scraper'


setup(
    name=DISTNAME,
    long_description=long_description,
    long_description_content_type='text/markdown',
    author=AUTHOR,
    author_email=MAINTAINER_EMAIL,
    maintainer=MAINTAINER,
    maintainer_email=MAINTAINER_EMAIL,
    description=DESCRIPTION,
    license=LICENSE,
    url=URL,
    version=VERSION,
    entry_points={
        'console_scripts': [
            'options-scraper=options_scraper.cli:main'
        ]
    },
    packages=find_packages(exclude=("tests",)),
    package_dir={'options_scraper': 'options_scraper'},
    install_requires=DEPENDENCIES,
    include_package_data=True,
    classifiers=classifiers,
    keywords=keywords,
)
--------------------------------------------------------------------------------