├── .gitignore
├── README.md
├── list.py
├── load.py
└── main.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__
config.py
.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Stock Data Pipeline with Python and Google Cloud

> [!TIP]
> This project is now archived. The visualization still works, but the data has not been updated since March 30th, 2022. The project was archived because I no longer wanted to pay for API usage.

> [!TIP]
> Any data in this project or on my website is for informational purposes only and should not be taken as investment advice.

## Overview
* Extracts and transforms S&P 500 stock data with Python from a financial API.
* Data is loaded into Cloud Storage, then transferred to BigQuery and rendered on my webpage.
* The Python code runs as a scheduled cron job on a GCP Compute Engine virtual machine.

### Important Links
* [Visualization](https://www.digitalghost.dev/stock-data-pipeline)
* [Documentation](https://github.com/digitalghost-dev/stock-data-pipeline/wiki/Stock-Data-Pipeline-Documentation)

## How the Pipeline Works

### Data Pipeline
1. A cron job triggers `main.py` to run.
2. `main.py` calls the IEX Cloud API.
3. The data is processed and cleaned by removing commas, hyphens, and other extra characters from the **company name** column.
4. `main.py` creates a `csv` file with the prepared data.
5. `load.py` copies the `csv` file to a Cloud Storage bucket.
6. The `csv` file is loaded into BigQuery.
7. When the [webpage](https://www.digitalghost.dev/projects/stock-data-pipeline) loads, the data is queried through the [BigQuery API](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-client-libraries) and displayed.

### CI/CD
* None

### Notes:
* The file that connects to BigQuery to pull the data when the page loads is located in my [website repository](https://github.com/digitalghost-dev/website/) since that repository renders the frontend.
* The pipeline does not account for holidays.
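* `main.py` and `load.py` read their settings (API key, table ID, and Cloud Storage path) from `config.py`, which is listed in `.gitignore` and is therefore not included in this repository.

As a rough guide, a `config.py` matching the imports in `main.py` and `load.py` might look like the sketch below; every value shown is a placeholder, not the real key, table, or bucket:

```python
# config.py (gitignored) -- placeholder values only.
api_key = "pk_xxxxxxxxxxxxxxxxxxxxxxxx"        # IEX Cloud API token.
table_name = "my-project.stocks.sp500_quotes"  # BigQuery table ID (project.dataset.table).
# load.py passes this same value to `gsutil cp` and to load_table_from_uri(),
# so it is assumed here to be the full object URI rather than just the bucket name.
cloud_storage_bucket = "gs://my-stock-bucket/output.csv"
```

The query that feeds the webpage lives in the website repository, not here. Assuming the Flask app calls the BigQuery client library directly, that step might look roughly like this sketch (the route, template name, and table ID are all placeholders):

```python
from flask import Flask, render_template
from google.cloud import bigquery

app = Flask(__name__)
client = bigquery.Client()

@app.route("/stock-data-pipeline")
def stock_data():
    # Pull the rows that load.py wrote to BigQuery; the table ID is a placeholder.
    query = """
        SELECT Ticker, Company, Price, Change, PE_Ratio
        FROM `my-project.stocks.sp500_quotes`
        ORDER BY Ticker
    """
    rows = list(client.query(query).result())
    return render_template("stocks.html", rows=rows)
```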

### Pipeline Flowchart
![stock-data-flowchart](https://storage.googleapis.com/pipeline-flowcharts/stock-data-pipeline-flowchart.png)

## Services Used
* **APIs:** [IEX Cloud](https://www.iexcloud.io)
* **Google Cloud Services:**
    * **Virtual Machine:** [Compute Engine](https://cloud.google.com/compute)
    * **Object Storage:** [Cloud Storage](https://cloud.google.com/storage)
    * **Data Warehouse:** [BigQuery](https://cloud.google.com/bigquery/)
* **Scheduler:** [cron](https://en.wikipedia.org/wiki/Cron)
* **Visualization:** [Flask](https://flask.palletsprojects.com/en/2.2.x/) and HTML
--------------------------------------------------------------------------------
/list.py:
--------------------------------------------------------------------------------
ticker_list = ['mmm', 'aos', 'abt', 'abbv', 'abmd', 'acn', 'atvi', 'adm', 'adbe', 'adp', 'aap', 'aes', 'afl', 'a', 'apd', 'akam', 'alk', 'alb', 'are', 'algn', 'alle', 'lnt', 'all', 'goog', 'mo', 'amzn', 'amcr', 'amd', 'aee', 'aal', 'aep', 'axp', 'aig', 'amt', 'awk', 'amp', 'abc', 'ame', 'amgn', 'aph', 'adi', 'anss', 'aon', 'apa', 'aapl', 'amat', 'aptv', 'anet', 'ajg', 'aiz', 't', 'ato', 'adsk', 'azo', 'avb', 'avy', 'bkr', 'ball', 'bac', 'bbwi', 'bax', 'bdx', 'wrb', 'brk.b', 'bby', 'bio', 'tech', 'biib', 'blk', 'bk', 'ba', 'bkng', 'bwa', 'bxp', 'bsx', 'bmy', 'avgo', 'br', 'bro', 'bf.b', 'chrw', 'cdns', 'czr', 'cpt', 'cpb', 'cof', 'cah', 'kmx', 'ccl', 'carr', 'ctlt', 'cat', 'cboe', 'cbre', 'cdw', 'ce', 'cnc', 'cnp', 'cday', 'cf', 'crl', 'schw', 'chtr', 'cvx', 'cmg', 'cb', 'chd', 'ci', 'cinf', 'ctas', 'csco', 'c', 'cfg', 'ctxs', 'clx', 'cme', 'cms', 'ko', 'ctsh', 'cl', 'cmcsa', 'cma', 'cag', 'cop', 'ed', 'stz', 'ceg', 'coo', 'cprt', 'glw', 'ctva', 'cost', 'ctra', 'cci', 'csx', 'cmi', 'cvs', 'dhi', 'dhr', 'dri', 'dva', 'de', 'dal', 'xray', 'dvn', 'dxcm', 'fang', 'dlr', 'dfs', 'dish', 'dis', 'dg', 'dltr', 'd', 'dpz', 'dov', 'dow', 'dte', 'duk', 'dre', 'dd', 'dxc', 'emn', 'etn', 'ebay', 'ecl', 'eix', 'ew', 'ea', 'elv', 'lly', 'emr', 'enph', 'etr', 'eog', 'epam', 'efx', 'eqix', 'eqr', 'ess', 'el', 'etsy', 're', 'evrg', 'es', 'exc', 'expe', 'expd', 'exr', 'xom', 'ffiv', 'fds', 'fast', 'frt', 'fdx', 'fitb', 'frc', 'fe', 'fis', 'fisv', 'flt', 'fmc', 'f', 'ftnt', 'ftv', 'fbhs', 'foxa', 'fox', 'ben', 'fcx', 'grmn', 'it', 'gnrc', 'gd', 'ge', 'gis', 'gm', 'gpc', 'gild', 'gl', 'gpn', 'gs', 'hal', 'hig', 'has', 'hca', 'peak', 'hsic', 'hsy', 'hes', 'hpe', 'hlt', 'holx', 'hd', 'hon', 'hrl', 'hst', 'hwm', 'hpq', 'hum', 'hban', 'hii', 'ibm', 'iex', 'idxx', 'itw', 'ilmn', 'incy', 'ir', 'intc', 'ice', 'ip', 'ipg', 'iff', 'intu', 'isrg', 'ivz', 'iqv', 'irm', 'jbht', 'jkhy', 'j', 'jnj', 'jci', 'jpm', 'jnpr', 'k', 'kdp', 'key', 'keys', 'kmb', 'kim', 'kmi', 'klac', 'khc', 'kr', 'lhx', 'lh', 'lrcx', 'lw', 'lvs', 'ldos', 'len', 'lnc', 'lin', 'lyv', 'lkq', 'lmt', 'l', 'low', 'lumn', 'lyb', 'mtb', 'mro', 'mpc', 'mktx', 'mar', 'mmc', 'mlm', 'mas', 'ma', 'mtch', 'mkc', 'mcd', 'mck', 'mdt', 'mrk', 'meta', 'met', 'mtd', 'mgm', 'mchp', 'mu', 'msft', 'maa', 'mrna', 'mhk', 'moh', 'tap', 'mdlz', 'mpwr', 'mnst', 'mco', 'ms', 'mos', 'msi', 'msci', 'ndaq', 'ntap', 'nflx', 'nwl', 'nem', 'nwsa', 'nws', 'nee', 'nlsn', 'nke', 'ni', 'ndsn', 'nsc', 'ntrs', 'noc', 'nlok', 'nclh', 'nrg', 'nue', 'nvda', 'nvr', 'nxpi', 'orly', 'oxy', 'odfl', 'omc', 'on', 'oke', 'orcl', 'ogn', 'otis', 'pcar', 'pkg', 'para', 'ph', 'payx', 'payc', 'pypl', 'penn', 'pnr', 'pep', 'pki', 'pfe', 'pm', 'psx', 'pnw', 'pxd', 'pnc', 'pool', 'ppg', 'ppl', 'pfg', 'pg', 'pgr', 'pld', 'pru', 'peg', 'ptc', 'psa', 'phm', 'pvh', 'qrvo', 'pwr', 'qcom', 'dgx', 'rl', 'rjf', 'rtx', 'o', 'reg', 'regn', 'rf', 'rsg', 'rmd', 'rhi', 'rok', 'rol', 'rop', 'rost', 'rcl', 'spgi', 'crm', 'sbac', 'slb', 'stx', 'see', 'sre', 'now', 'shw', 'sbny', 'spg', 'swks', 'sjm', 'sna', 'sedg', 'so', 'luv', 'swk', 'sbux', 'stt', 'ste', 'syk', 'sivb', 'syf', 'snps', 'syy', 'tmus', 'trow', 'ttwo', 'tpr', 'tgt', 'tel', 'tdy', 'tfx', 'ter', 'tsla', 'txn', 'txt', 'tmo', 'tjx', 'tsco', 'tt', 'tdg', 'trv', 'trmb', 'tfc', 'twtr', 'tyl', 'tsn', 'usb', 'udr', 'ulta', 'unp', 'ual', 'ups', 'uri', 'unh', 'uhs', 'vlo', 'vtr', 'vrsn', 'vrsk', 'vz', 'vrtx', 'vfc', 'vtrs', 'vici', 'v', 'vno', 'vmc', 'wab', 'wba', 'wmt', 'wbd', 'wm', 'wat', 'wec', 'wfc', 'well', 'wst', 'wdc', 'wrk', 'wy', 'whr', 'wmb', 'wtw', 'gww', 'wynn', 'xel', 'xyl', 'yum', 'zbra', 'zbh', 'zion', 'zts']
--------------------------------------------------------------------------------
/load.py:
--------------------------------------------------------------------------------
from config import table_name, cloud_storage_bucket
from google.cloud import bigquery
import subprocess

def csv_file():

    # Running a shell command to upload to Cloud Storage.
    subprocess.run(["gsutil cp *.csv " + cloud_storage_bucket], shell=True)

def bigquery_upload():

    # Construct a BigQuery client object.
    client = bigquery.Client()
    table_id = table_name

    # Creating table schema
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        schema=[
            bigquery.SchemaField("Ticker", "STRING"),
            bigquery.SchemaField("Company", "STRING"),
            bigquery.SchemaField("Price", "FLOAT"),
            bigquery.SchemaField("Change", "FLOAT"),
            bigquery.SchemaField("PE_Ratio", "FLOAT")
        ],
    )

    # Make an API request.
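    # Assumption: load_table_from_uri() expects the gs:// URI of the uploaded CSV
    # object itself, so the gitignored cloud_storage_bucket value presumably points
    # at the file (e.g. "gs://<bucket>/output.csv") rather than only the bucket.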
    load_job = client.load_table_from_uri(
        cloud_storage_bucket, table_id, job_config=job_config
    )

    # Waits for the job to complete.
    load_job.result()

    destination_table = client.get_table(table_id)
    print(f"Loaded {destination_table.num_rows} rows.")
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Importing needed modules.
from load import csv_file, bigquery_upload
from urllib.request import urlopen
from list import ticker_list
from config import api_key
import pandas as pd
import json

# URLs needed to build the API endpoint connection.
base_url = 'https://cloud.iexapis.com/stable/stock/'
quote_url = '/quote/?token='
company_url = '/company?token='

def company_info():
    header_list = ['ticker', 'company', 'price', 'change', 'peRatio']
    sep = " -"
    sep2 = " ("

    # Iterating through all of the tickers to get the data.
    master_list = []
    count = 0
    while count < len(ticker_list):
        ticker = ticker_list[count]

        # Opening and reading the API endpoint.
        with urlopen(f"{base_url}{ticker}{quote_url}{api_key}") as response:
            source = response.read()
            data = json.loads(source)

        # Fields used from the quote endpoint.
        symbol = data["symbol"]
        company = data["companyName"]
        company_stripped = company.split(sep, 1)[0]
        company_stripped = company_stripped.split(sep2, 1)[0]
        price = data["latestPrice"]
        change = data["change"]
        peRatio = data["peRatio"]

        # Appending the endpoint data to the master list.
        info = [symbol, company_stripped, price, change, peRatio]
        master_list.append(info)
        count += 1

    # Removing commas in company names, outputting a CSV file.
    dataframe = pd.DataFrame(master_list)
    dataframe = dataframe.replace(",", "", regex=True)
    dataframe.to_csv('output.csv', header=header_list, index=False)

if __name__ == "__main__":
    company_info()
    # Running the functions from the load.py file.
    csv_file()
    bigquery_upload()
--------------------------------------------------------------------------------