├── .editorconfig ├── .gitignore ├── .travis.yml ├── 01-basics.ipynb ├── 01-presentation-example ├── 01_simple_open.py ├── 02_open_file.py ├── 03_data_manipulation.py ├── 04_perform_calculation.py ├── 04_perform_calculation_no_comments.py ├── 05_storing_data.py ├── 06_pandas.py └── 06_pandas_no_comments.py ├── 02-selenium-examples └── download_calendar.py ├── 02-selenium-safari ├── create_names.py ├── gather_links_for_processing.py ├── html_to_pdf.py ├── main.py ├── merge_pdf_files.py ├── process_html_remove_junk.py ├── requirements.txt └── zip_folder.py ├── 02-webscrape-celery ├── __init__.py ├── basic_consumer.py ├── basic_producer.py ├── consumer.py ├── data.html ├── producer.py └── urls.txt ├── 02-webscraping.ipynb ├── 03-tidy-data.ipynb ├── 04-other-analysis ├── Autoregression_retail_sales.ipynb ├── Dynamic Linear Regression Models in Python.ipynb ├── example_pandas.py └── read_sec.py ├── 04-pandas-other └── pandas-selecting-rows.ipynb ├── 04-pandas.ipynb ├── 05-data-analysis.ipynb ├── 05-other-visualizations ├── Visualization.ipynb └── visualize-football-stadiums.ipynb ├── 06-data-visualizations.ipynb ├── 06-flask └── flask-rss │ ├── README.md │ ├── main.py │ ├── static │ ├── css │ │ ├── bootstrap-theme.css │ │ ├── bootstrap-theme.css.map │ │ ├── bootstrap-theme.min.css │ │ ├── bootstrap-theme.min.css.map │ │ ├── bootstrap.css │ │ ├── bootstrap.css.map │ │ ├── bootstrap.min.css │ │ ├── bootstrap.min.css.map │ │ ├── reader.css │ │ └── style.css │ ├── fonts │ │ ├── glyphicons-halflings-regular.eot │ │ ├── glyphicons-halflings-regular.svg │ │ ├── glyphicons-halflings-regular.ttf │ │ ├── glyphicons-halflings-regular.woff │ │ └── glyphicons-halflings-regular.woff2 │ └── js │ │ ├── bootstrap.js │ │ ├── bootstrap.min.js │ │ ├── jquery-2.2.0.js │ │ └── npm.js │ └── templates │ ├── index.html │ ├── layout.html │ ├── notfound.html │ ├── reader.html │ └── table.html ├── 07-airflow ├── README.md ├── dags │ ├── example_postgres.py │ ├── sql │ │ ├── stock_insert.sql │ │ └── stock_schema.sql │ ├── stock_analysis_dag.py │ └── stocks.py └── docker-compose.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── data ├── 20180806_ALL_EQUITY_meetup.csv ├── WA_Fn-UseC_-HR-Employee-Attrition.xlsx ├── WMT_US.csv ├── WMT_US_pandas.csv ├── WMT_US_updated.csv ├── billboard.csv ├── country_timeseries.csv ├── fortune_1000.csv ├── gapminder.tsv ├── global_equity_historic_sales_1999_2018_usd_all_meetup.csv ├── linkedin_industries.html ├── msft_stock_key_data.csv ├── pew.csv ├── portfolio.csv ├── pycon_sponsor_levels.csv ├── pycon_sponsors.csv ├── retail_sales.csv ├── sponsors_vlookup.csv ├── stl.csv ├── stlcom_larget_employers.xlsx ├── stlregionalchamber_largest_employers_.xlsx ├── stock_data_simple.csv ├── stock_data_simple.xlsx ├── stock_description.csv ├── table1.csv ├── table2.csv ├── table3.csv ├── table4a.csv ├── table4b.csv └── weather.csv ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── history.rst ├── index.rst ├── installation.rst ├── make.bat ├── readme.rst └── usage.rst ├── img ├── basics │ ├── basic_python_style.png │ ├── built-in_data_structures.png │ ├── built-in_functions.png │ ├── built-in_len.png │ ├── calculations.png │ ├── cell.png │ ├── cell_ex.png │ ├── cell_types.png │ ├── cells.png │ ├── comments.png │ ├── data-types.png │ ├── data_collections.png │ ├── excel-built-in-string.png │ ├── excel-built-in.png │ ├── excel-pre-installed-add-ins.png │ ├── jupyter-method.png │ ├── pycharm-function-pop.png │ ├── 
pycharm-function-popup.png │ ├── pycharm-methods.png │ ├── pycon-files.png │ ├── pycon_sponsor_levels.png │ ├── pycon_sponsors.png │ ├── python-pre-installed-add-ins.png │ ├── reserved_words.png │ ├── standard-library-import.png │ ├── standard-library.png │ └── vscode-method.png ├── dataframe.png ├── dataframe_components.png ├── excel_table.png ├── pandas_dataframe.png └── split_apply_combine.png ├── requirements_dev.txt ├── section1-01-basics_but_important_stuff.ipynb ├── section1-02-files_lists_dictionaries.ipynb ├── section1_challenge_1.py ├── section1_challenge_1_answer.py ├── section1_challenge_2.py ├── section1_challenge_2_answer.py ├── section1_challenge_3.py ├── section1_challenge_3_answer.py ├── section2-01-real-world-example.py ├── section2-02-real-world-example-refactored.py ├── section2_challenge.rst ├── setup.cfg ├── setup.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | venv/ 104 | stock_algo/ 105 | sec.gov.zip 106 | .ipynb_checkpoints/ 107 | .idea/ 108 | zip-data/ 109 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Config file for automatic testing at travis-ci.org 2 | 3 | language: python 4 | python: 5 | - 3.7 6 | - 3.6 7 | - 3.5 8 | - 2.7 9 | 10 | # Command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors 11 | install: pip install -U tox-travis 12 | 13 | # Command to run tests, e.g. python setup.py test 14 | script: tox 15 | 16 | 17 | -------------------------------------------------------------------------------- /01-basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Intro to Python\n", 8 | "\n", 9 | "## string" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "pycharm": { 17 | "is_executing": false, 18 | "name": "#%%\n" 19 | }, 20 | "scrolled": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "tickers = \"GOOG MSFT IBM TSLA\"" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Save String to File" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "pycharm": { 39 | "name": "#%%\n" 40 | } 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "# write to file\n", 45 | "f = open('tickers.txt', 'wt')\n", 46 | "f.write(tickers)\n", 47 | "f.close()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Tuple" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": { 61 | "pycharm": { 62 | "name": "#%%\n" 63 | } 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "# tuple\n", 68 | "tickers = (\"GOOG\",\n", 69 | " \"MSFT\",\n", 70 | " \"IBM\",\n", 71 | " \"TSLA\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### For-Loop over tuple and sum values" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": { 85 | "pycharm": { 86 | "name": "#%%\n" 87 | } 88 | }, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "(1, 1)\n", 95 | 
"The sum of the tuple:\t2\n", 96 | "(2, 2)\n", 97 | "The sum of the tuple:\t4\n", 98 | "(3, 3)\n", 99 | "The sum of the tuple:\t6\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "list_of_tuples = [(1,1),\n", 105 | " (2,2),\n", 106 | " (3,3)]\n", 107 | "\n", 108 | "for values in list_of_tuples:\n", 109 | " print(f\"{values}\")\n", 110 | " total = sum(values)\n", 111 | " print(f\"The sum of the tuple:\\t{total}\")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## For-Loop Over tickers" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": { 125 | "pycharm": { 126 | "name": "#%%\n" 127 | } 128 | }, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "GOOG\n", 135 | "MSFT\n", 136 | "IBM\n", 137 | "TSLA\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "# loolist\n", 143 | "tickers = [\"GOOG\",\n", 144 | " \"MSFT\",\n", 145 | " \"IBM\",\n", 146 | " \"TSLA\"]\n", 147 | "\n", 148 | "for ticker in tickers:\n", 149 | " print(ticker)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## For Loop - String Formatting" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 6, 162 | "metadata": { 163 | "pycharm": { 164 | "name": "#%%\n" 165 | } 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "Ticker: GOOG\n", 173 | "Ticker: MSFT\n", 174 | "Ticker: IBM\n", 175 | "Ticker: TSLA\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "# loop through list\n", 181 | "tickers = [\"GOOG\",\"MSFT\",\"IBM\",\"TSLA\"]\n", 182 | "\n", 183 | "for ticker in tickers:\n", 184 | " print(f\"Ticker: {ticker}\")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 7, 195 | "metadata": { 196 | "pycharm": { 197 | "name": "#%%\n" 198 | } 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "tickers = set([\"GOOG\",\n", 203 | " \"MSFT\",\n", 204 | " \"IBM\",\n", 205 | " \"TSLA\"])\n", 206 | "\n", 207 | "tickers = (\"GOOG\",\n", 208 | " \"MSFT\",\n", 209 | " \"IBM\",\n", 210 | " \"TSLA\")" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "# Tuple" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 9, 223 | "metadata": { 224 | "pycharm": { 225 | "name": "#%%\n" 226 | } 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "tickers = tuple([\"GOOG\",\n", 231 | " \"MSFT\",\n", 232 | " \"IBM\",\n", 233 | " \"TSLA\"])" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "# Dictionary" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "metadata": { 247 | "pycharm": { 248 | "name": "#%%\n" 249 | } 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "tickers = {1: \"GOOG\",\n", 254 | " 2: \"MSFT\",\n", 255 | " 3: \"IBM\",\n", 256 | " 4: \"TSLA\"}" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "# Opening Files" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 12, 269 | "metadata": { 270 | "pycharm": { 271 | "name": "#%%\n" 272 | } 273 | }, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "Ticker,Date,Shares,Price\n", 280 | "\n", 281 
| "GOOG,2019-10-01,100,1\n", 282 | "\n", 283 | "MSFT,2019-10-01,200,1\n", 284 | "\n", 285 | "IBM,2019-10-01,500,1\n", 286 | "\n", 287 | "TSLA,2019-10-01,300,1\n", 288 | "\n", 289 | "\n", 290 | "\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "import os\n", 296 | "# Contents of portfolio.csv:\n", 297 | "\"\"\"\n", 298 | "Ticker,Date,Shares,Price\n", 299 | "GOOG,2019-10-01,100,1\n", 300 | "MSFT,2019-10-01,200,1\n", 301 | "IBM,2019-10-01,500,1\n", 302 | "TSLA,2019-10-01,300,1\n", 303 | "\"\"\"\n", 304 | "\n", 305 | "# basic - open a file\n", 306 | "file = open('data/portfolio.csv', 'r')\n", 307 | "\n", 308 | "# print each line\n", 309 | "for line in file:\n", 310 | " print(line)\n", 311 | "\n", 312 | "# don't forget to close the file\n", 313 | "file.close()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "# Opening Files - Preferred Way" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 13, 326 | "metadata": { 327 | "pycharm": { 328 | "is_executing": false, 329 | "name": "#%%\n" 330 | } 331 | }, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "['GOOG', '2019-10-01', '100', '1']\n", 338 | "['MSFT', '2019-10-01', '200', '1']\n", 339 | "['IBM', '2019-10-01', '500', '1']\n", 340 | "['TSLA', '2019-10-01', '300', '1']\n", 341 | "['']\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "### Better way to a file\n", 347 | "### with automatically closes the file for you\n", 348 | "\n", 349 | "with open('data/portfolio.csv', 'r') as f:\n", 350 | " headers = next(f) # skip a single of input\n", 351 | " for line in f:\n", 352 | " line = line.strip() #strip the whitespace\n", 353 | " parts = line.split(\",\")\n", 354 | " print(parts)\n" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "# Example of indexing into lists and if statement" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 14, 367 | "metadata": { 368 | "pycharm": { 369 | "is_executing": false, 370 | "name": "#%%\n" 371 | } 372 | }, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "Ticker:GOOG\tDate: 2019-10-01\tShares: 100\tPrice: 1\n", 379 | "\n", 380 | "Ticker:MSFT\tDate: 2019-10-01\tShares: 200\tPrice: 1\n", 381 | "\n", 382 | "Ticker:IBM\tDate: 2019-10-01\tShares: 500\tPrice: 1\n", 383 | "\n", 384 | "Ticker:TSLA\tDate: 2019-10-01\tShares: 300\tPrice: 1\n", 385 | "\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "\n", 391 | "with open(r'data/portfolio.csv', 'r') as f:\n", 392 | " headers = next(f) # skip a single line of input, or skip header\n", 393 | " for line in f:\n", 394 | " parts = line.split(\",\")\n", 395 | " # check if number of items in list greater than 1\n", 396 | " # this will skip lines with only 1 element\n", 397 | " if len(parts) > 1:\n", 398 | " ticker = parts[0] # take the first item in the list\n", 399 | " date = parts[1] # take the second item\n", 400 | " shares = parts[2]\n", 401 | " price = parts[3]\n", 402 | " # f-strings formatting\n", 403 | " print(f\"Ticker:{ticker}\\tDate: {date}\\tShares: {shares}\\tPrice: {price}\")" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "# \"Batteries included\" with csv module" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 15, 416 | "metadata": { 417 | "pycharm": { 418 | "is_executing": false, 419 | "name": "#%%\n" 420 | } 
421 | }, 422 | "outputs": [ 423 | { 424 | "name": "stdout", 425 | "output_type": "stream", 426 | "text": [ 427 | "[{'date': '2019-10-01', 'price': '1', 'shares': '100', 'ticker': 'GOOG'},\n", 428 | " {'date': '2019-10-01', 'price': '1', 'shares': '200', 'ticker': 'MSFT'},\n", 429 | " {'date': '2019-10-01', 'price': '1', 'shares': '500', 'ticker': 'IBM'},\n", 430 | " {'date': '2019-10-01', 'price': '1', 'shares': '300', 'ticker': 'TSLA'}]\n" 431 | ] 432 | } 433 | ], 434 | "source": [ 435 | "import csv\n", 436 | "from pprint import pprint\n", 437 | "\n", 438 | "portfolio = list() # need to create a list before you try using it\n", 439 | "# or, more common way to create list\n", 440 | "portfolio = [] # create a list to store tickers\n", 441 | "\n", 442 | "with open(r'data/portfolio.csv', 'r') as f:\n", 443 | " rows = csv.reader(f)\n", 444 | " headers = next(f) # skip a single of input\n", 445 | " for row in rows:\n", 446 | " if len(row) > 1:\n", 447 | " record = {\n", 448 | " 'ticker' : row[0],\n", 449 | " 'date' : row[1],\n", 450 | " 'shares' : row[2],\n", 451 | " 'price': row[3]\n", 452 | " }\n", 453 | " portfolio.append(record)\n", 454 | "\n", 455 | "pprint(portfolio)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "# Create a function that takes a filename and returns the contents" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 16, 468 | "metadata": { 469 | "pycharm": { 470 | "is_executing": false, 471 | "name": "#%%\n" 472 | } 473 | }, 474 | "outputs": [ 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "[{'date': '2019-10-01', 'price': '1', 'shares': '100', 'ticker': 'GOOG'},\n", 480 | " {'date': '2019-10-01', 'price': '1', 'shares': '200', 'ticker': 'MSFT'},\n", 481 | " {'date': '2019-10-01', 'price': '1', 'shares': '500', 'ticker': 'IBM'},\n", 482 | " {'date': '2019-10-01', 'price': '1', 'shares': '300', 'ticker': 'TSLA'}]\n" 483 | ] 484 | } 485 | ], 486 | "source": [ 487 | "import csv\n", 488 | "from pprint import pprint\n", 489 | "\n", 490 | "def read_portfolio(filename):\n", 491 | "\n", 492 | " portfolio = list() # create a list to store tickers\n", 493 | "\n", 494 | " with open(filename, 'r') as f:\n", 495 | " rows = csv.reader(f)\n", 496 | " headers = next(f) # skip a single of input\n", 497 | " for row in rows:\n", 498 | " if len(row) > 1:\n", 499 | " record = {\n", 500 | " 'ticker' : row[0],\n", 501 | " 'date' : row[1],\n", 502 | " 'shares' : row[2],\n", 503 | " 'price': row[3]\n", 504 | " }\n", 505 | " portfolio.append(record)\n", 506 | " return portfolio\n", 507 | "\n", 508 | "portfolio = read_portfolio(r'data/portfolio.csv')\n", 509 | "\n", 510 | "pprint(portfolio)\n" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": { 516 | "pycharm": { 517 | "name": "#%% md\n" 518 | } 519 | }, 520 | "source": [ 521 | "# SQL Connectivity" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 17, 527 | "metadata": { 528 | "pycharm": { 529 | "name": "#%%\n" 530 | } 531 | }, 532 | "outputs": [ 533 | { 534 | "name": "stdout", 535 | "output_type": "stream", 536 | "text": [ 537 | "2020-05-14 14:47:03.454172\n", 538 | "2019-11-01 00:00:00\n", 539 | "2020-05-14 14:47:03.455172\n" 540 | ] 541 | } 542 | ], 543 | "source": [ 544 | "# $ pip install sqlalchemy\n", 545 | "# sqlalchemy_uri = \"dialect+driver://user:password@host:port/dbname\"\n", 546 | "import sqlalchemy as sa\n", 547 | "\n", 548 | "conn = sa.create_engine('sqlite://')\n", 
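"# aside: with this 1.x-style engine, the zoo table created below can\n",
"# be populated and queried the same way, for example:\n",
"#   conn.execute(\"INSERT INTO zoo VALUES ('duck', 10, 0.0)\")\n",
"#   rows = conn.execute('SELECT * FROM zoo').fetchall()\n",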
549 | "\n", 550 | "conn.execute('''CREATE TABLE zoo\n", 551 | " (critter VARCHAR(20) PRIMARY KEY,\n", 552 | " count INT,\n", 553 | " damages FLOAT)''')\n", 554 | "\n", 555 | "import datetime\n", 556 | "\n", 557 | "today = datetime.datetime.today()\n", 558 | "print(today)\n", 559 | "\n", 560 | "today = datetime.datetime.strptime(\"11/01/2019\",\"%m/%d/%Y\" )\n", 561 | "print(today)\n", 562 | "\n", 563 | "today = datetime.datetime.now()\n", 564 | "print(today)" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "outputs": [], 571 | "source": [], 572 | "metadata": { 573 | "collapsed": false, 574 | "pycharm": { 575 | "name": "#%%\n" 576 | } 577 | } 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [] 585 | } 586 | ], 587 | "metadata": { 588 | "kernelspec": { 589 | "display_name": "Python 3", 590 | "language": "python", 591 | "name": "python3" 592 | }, 593 | "language_info": { 594 | "codemirror_mode": { 595 | "name": "ipython", 596 | "version": 3 597 | }, 598 | "file_extension": ".py", 599 | "mimetype": "text/x-python", 600 | "name": "python", 601 | "nbconvert_exporter": "python", 602 | "pygments_lexer": "ipython3", 603 | "version": "3.7.7" 604 | } 605 | }, 606 | "nbformat": 4, 607 | "nbformat_minor": 1 608 | } -------------------------------------------------------------------------------- /01-presentation-example/01_simple_open.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | filename = r'data\WMT_US.csv' 4 | 5 | f = open(filename, 'r') 6 | 7 | print(f) 8 | 9 | data = f.read() 10 | 11 | print(data) 12 | 13 | f.close() 14 | 15 | f = open(filename, 'r') # open file 16 | 17 | for line in f: 18 | print(line) 19 | 20 | f.close() # close file 21 | -------------------------------------------------------------------------------- /01-presentation-example/02_open_file.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | 4 | filename = r'data\WMT_US.csv' 5 | 6 | total = 0.0 7 | 8 | with open(filename, 'r') as f: 9 | rows = csv.reader(f) 10 | 11 | # save header row 12 | header = next(f) 13 | # and skip to next row 14 | 15 | for row in rows: 16 | print(row) 17 | 18 | -------------------------------------------------------------------------------- /01-presentation-example/03_data_manipulation.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | from datetime import datetime 4 | 5 | filename = r'data\WMT_US.csv' 6 | 7 | with open(filename, 'r') as f: 8 | rows = csv.reader(f) 9 | 10 | # skip header row 11 | header = next(f) 12 | 13 | for row in rows: 14 | row[2] = datetime.strptime(row[2], "%m/%d/%Y") 15 | 16 | # convert string to integer 17 | row[3] = int(row[3]) 18 | row[4] = int(row[4]) 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /01-presentation-example/04_perform_calculation.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | from datetime import datetime 4 | 5 | filename = r'data\WMT_US.csv' 6 | 7 | records = [] 8 | 9 | with open(filename, 'r') as f: 10 | rows = csv.reader(f) 11 | 12 | # skip header row 13 | header = next(f) 14 | 15 | for row in rows: 16 | # print(row) 17 | # ['WMT US', 'WAL-MART STORES INC', '12/31/2014', '476293988352', '460271988736'] 18 | 19 | # convert string to date object 20 | row_date = 
datetime.strptime(row[2], "%m/%d/%Y") 21 | # print(row_date) 22 | # 2003-12-31 00:00:00 23 | # gives us ability to ask for year 24 | row_date_year = row_date.year 25 | 26 | # need to convert sales and expenses values from string to integer 27 | # so can perform mathmatical operations 28 | row_sales = int(row[3]) 29 | row_expenses = int(row[4]) 30 | 31 | # perform profit calculation 32 | profit = row_sales - row_expenses 33 | 34 | print(f"{row_date_year} Profit = {profit:,}") 35 | 36 | """ 37 | Output: 38 | 39 | 2014 Profit = 16,021,999,616 40 | 2013 Profit = 16,999,000,064 41 | 2012 Profit = 15,699,000,320 42 | 2011 Profit = 16,389,000,192 43 | 2010 Profit = 14,334,999,552 44 | 2009 Profit = 13,400,000,512 45 | 2008 Profit = 12,730,999,808 46 | 2007 Profit = 11,283,999,744 47 | 2006 Profit = 11,230,999,552 48 | 2005 Profit = 10,266,999,808 49 | 2004 Profit = 9,054,000,128 50 | 2003 Profit = 7,954,999,808 51 | """ 52 | 53 | -------------------------------------------------------------------------------- /01-presentation-example/04_perform_calculation_no_comments.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | from datetime import datetime 4 | 5 | filename = r'data\WMT_US.csv' 6 | 7 | with open(filename, 'r') as f: 8 | rows = csv.reader(f) 9 | header = next(f) 10 | 11 | for row in rows: 12 | row_date_year = datetime.strptime(row[2], "%m/%d/%Y").year 13 | 14 | row_sales = int(row[3]) 15 | row_expenses = int(row[4]) 16 | 17 | profit = row_sales - row_expenses 18 | 19 | print(f"{row_date_year} Profit = {profit:,}") 20 | 21 | """ 22 | Output: 23 | 24 | 2014 Profit = 16,021,999,616 25 | 2013 Profit = 16,999,000,064 26 | 2012 Profit = 15,699,000,320 27 | 2011 Profit = 16,389,000,192 28 | 2010 Profit = 14,334,999,552 29 | 2009 Profit = 13,400,000,512 30 | 2008 Profit = 12,730,999,808 31 | 2007 Profit = 11,283,999,744 32 | 2006 Profit = 11,230,999,552 33 | 2005 Profit = 10,266,999,808 34 | 2004 Profit = 9,054,000,128 35 | 2003 Profit = 7,954,999,808 36 | """ 37 | 38 | -------------------------------------------------------------------------------- /01-presentation-example/05_storing_data.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | from datetime import datetime 4 | from pprint import pprint 5 | 6 | filename = r'data\WMT_US.csv' 7 | 8 | records = [] 9 | 10 | with open(filename, 'r') as f: 11 | rows = csv.reader(f) 12 | 13 | # skip header row 14 | header = next(f) 15 | 16 | for row in rows: 17 | row[2] = datetime.strptime(row[2], "%m/%d/%Y") 18 | row[3] = int(row[3]) 19 | row[4] = int(row[4]) 20 | # perform calculation 21 | profit = row[3] - row[4] 22 | 23 | record = { 24 | "ticker": row[0], 25 | "name": row[1], 26 | "date": row[2], 27 | "sales": row[3], 28 | "expenses": row[4], 29 | "profit": profit 30 | } 31 | 32 | records.append(record) 33 | 34 | pprint(records) 35 | -------------------------------------------------------------------------------- /01-presentation-example/06_pandas.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | 4 | pd.set_option('display.float_format', lambda x: f'{x:.5f}') 5 | pd.set_option('display.max_columns', 100) 6 | pd.set_option('display.max_rows', 100) 7 | pd.set_option('display.width', 600) 8 | 9 | filename = r'data\WMT_US.csv' 10 | 11 | df = pd.read_csv(filename) 12 | 13 | # check the data types of each columns 14 | print(df.dtypes) 15 | 16 | """ 17 | Ticker object 18 | Company Name 
object 19 | Year End object 20 | Total Sales int64 21 | Total Expenses int64 22 | dtype: object 23 | """ 24 | 25 | # convert the date column to python date object 26 | # which makes it easier to work with 27 | df['Year End'] = pd.to_datetime(df['Year End']) 28 | 29 | # print(df.dtypes) 30 | """ 31 | Ticker object 32 | Company Name object 33 | Year End datetime64[ns] 34 | Total Sales int64 35 | Total Expenses int64 36 | dtype: object 37 | 38 | """ 39 | 40 | # calculate total profit 41 | df['Total Profit'] = df['Total Sales'] - df['Total Expenses'] 42 | 43 | # print(df) 44 | """ 45 | Ticker Company Name Year End Total Sales Total Expenses Total Profit 46 | 0 WMT US WAL-MART STORES INC 2014-12-31 476293988352 460271988736 16021999616 47 | 1 WMT US WAL-MART STORES INC 2013-12-31 469162000384 452163000320 16999000064 48 | 2 WMT US WAL-MART STORES INC 2012-12-31 446950014976 431251014656 15699000320 49 | 3 WMT US WAL-MART STORES INC 2011-12-31 421849006080 405460005888 16389000192 50 | 4 WMT US WAL-MART STORES INC 2010-12-31 408214011904 393879012352 14334999552 51 | 5 WMT US WAL-MART STORES INC 2009-12-31 405606989824 392206989312 13400000512 52 | 6 WMT US WAL-MART STORES INC 2008-12-31 378798997504 366067997696 12730999808 53 | 7 WMT US WAL-MART STORES INC 2007-12-31 348650012672 337366012928 11283999744 54 | 8 WMT US WAL-MART STORES INC 2006-12-31 312426987520 301195987968 11230999552 55 | 9 WMT US WAL-MART STORES INC 2005-12-31 287989006336 277722006528 10266999808 56 | 10 WMT US WAL-MART STORES INC 2004-12-31 256329007104 247275006976 9054000128 57 | 11 WMT US WAL-MART STORES INC 2003-12-31 229615992832 221660993024 7954999808 58 | """ 59 | 60 | df['Profit Margin'] = (df['Total Profit'] / df['Total Sales']) * 100 61 | 62 | # print(df) 63 | """ 64 | Ticker Company Name Year End Total Sales Total Expenses Total Profit Profit Margin 65 | 0 WMT US WAL-MART STORES INC 2014-12-31 476293988352 460271988736 16021999616 3.36389 66 | 1 WMT US WAL-MART STORES INC 2013-12-31 469162000384 452163000320 16999000064 3.62327 67 | 2 WMT US WAL-MART STORES INC 2012-12-31 446950014976 431251014656 15699000320 3.51247 68 | 3 WMT US WAL-MART STORES INC 2011-12-31 421849006080 405460005888 16389000192 3.88504 69 | 4 WMT US WAL-MART STORES INC 2010-12-31 408214011904 393879012352 14334999552 3.51164 70 | 5 WMT US WAL-MART STORES INC 2009-12-31 405606989824 392206989312 13400000512 3.30369 71 | 6 WMT US WAL-MART STORES INC 2008-12-31 378798997504 366067997696 12730999808 3.36089 72 | 7 WMT US WAL-MART STORES INC 2007-12-31 348650012672 337366012928 11283999744 3.23648 73 | 8 WMT US WAL-MART STORES INC 2006-12-31 312426987520 301195987968 11230999552 3.59476 74 | 9 WMT US WAL-MART STORES INC 2005-12-31 287989006336 277722006528 10266999808 3.56507 75 | 10 WMT US WAL-MART STORES INC 2004-12-31 256329007104 247275006976 9054000128 3.53218 76 | 11 WMT US WAL-MART STORES INC 2003-12-31 229615992832 221660993024 7954999808 3.46448 77 | """ 78 | 79 | # percent change needs to be ascending dates 80 | df.sort_values("Year End", inplace=True) 81 | df['Sales Growth YoY %'] = df['Total Sales'].pct_change() * 100 82 | 83 | # print(df) 84 | """ 85 | Ticker Company Name Year End Total Sales Total Expenses Total Profit Profit Margin Sales Growth YoY % 86 | 11 WMT US WAL-MART STORES INC 2003-12-31 229615992832 221660993024 7954999808 3.46448 nan 87 | 10 WMT US WAL-MART STORES INC 2004-12-31 256329007104 247275006976 9054000128 3.53218 11.63378 88 | 9 WMT US WAL-MART STORES INC 2005-12-31 287989006336 277722006528 10266999808 
3.56507 12.35131 89 | 8 WMT US WAL-MART STORES INC 2006-12-31 312426987520 301195987968 11230999552 3.59476 8.48573 90 | 7 WMT US WAL-MART STORES INC 2007-12-31 348650012672 337366012928 11283999744 3.23648 11.59408 91 | 6 WMT US WAL-MART STORES INC 2008-12-31 378798997504 366067997696 12730999808 3.36089 8.64735 92 | 5 WMT US WAL-MART STORES INC 2009-12-31 405606989824 392206989312 13400000512 3.30369 7.07710 93 | 4 WMT US WAL-MART STORES INC 2010-12-31 408214011904 393879012352 14334999552 3.51164 0.64275 94 | 3 WMT US WAL-MART STORES INC 2011-12-31 421849006080 405460005888 16389000192 3.88504 3.34016 95 | 2 WMT US WAL-MART STORES INC 2012-12-31 446950014976 431251014656 15699000320 3.51247 5.95024 96 | 1 WMT US WAL-MART STORES INC 2013-12-31 469162000384 452163000320 16999000064 3.62327 4.96968 97 | 0 WMT US WAL-MART STORES INC 2014-12-31 476293988352 460271988736 16021999616 3.36389 1.52015 98 | """ 99 | 100 | new_filename = filename.replace(".csv", "_pandas.csv") 101 | 102 | df.to_csv(new_filename) 103 | -------------------------------------------------------------------------------- /01-presentation-example/06_pandas_no_comments.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | 4 | input_filename = r'data\WMT_US.csv' 5 | output_filename = r'data\WMT_US_output.csv' 6 | 7 | df = pd.read_csv(input_filename) 8 | 9 | df['Total Profit'] = df['Total Sales'] - df['Total Expenses'] 10 | 11 | df.to_csv(output_filename) 12 | 13 | print(df) 14 | """ 15 | Ticker Company Name Year End Total Sales Total Expenses Total Profit 16 | 0 WMT US WAL-MART STORES INC 12/31/2014 476293988352 460271988736 16021999616 17 | 1 WMT US WAL-MART STORES INC 12/31/2013 469162000384 452163000320 16999000064 18 | 2 WMT US WAL-MART STORES INC 12/31/2012 446950014976 431251014656 15699000320 19 | 3 WMT US WAL-MART STORES INC 12/31/2011 421849006080 405460005888 16389000192 20 | 4 WMT US WAL-MART STORES INC 12/31/2010 408214011904 393879012352 14334999552 21 | 5 WMT US WAL-MART STORES INC 12/31/2009 405606989824 392206989312 13400000512 22 | 6 WMT US WAL-MART STORES INC 12/31/2008 378798997504 366067997696 12730999808 23 | 7 WMT US WAL-MART STORES INC 12/31/2007 348650012672 337366012928 11283999744 24 | 8 WMT US WAL-MART STORES INC 12/31/2006 312426987520 301195987968 11230999552 25 | 9 WMT US WAL-MART STORES INC 12/31/2005 287989006336 277722006528 10266999808 26 | 10 WMT US WAL-MART STORES INC 12/31/2004 256329007104 247275006976 9054000128 27 | 11 WMT US WAL-MART STORES INC 12/31/2003 229615992832 221660993024 7954999808 28 | """ 29 | -------------------------------------------------------------------------------- /02-selenium-examples/download_calendar.py: -------------------------------------------------------------------------------- 1 | #! 
py27w 2 | import os, time 3 | from datetime import datetime 4 | from datetime import date 5 | from datetime import timedelta 6 | from selenium import webdriver 7 | from selenium.webdriver.firefox.firefox_profile import FirefoxProfile 8 | from selenium.common.exceptions import NoSuchElementException 9 | from selenium.webdriver.common.by import By 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | from selenium.webdriver.support import expected_conditions as EC 12 | 13 | fp = webdriver.FirefoxProfile() 14 | fp.set_preference('browser.download.folderList', 2) 15 | fp.set_preference("browser.download.manager.showWhenStarting", False) 16 | fp.set_preference('browser.download.dir', os.getcwd()) 17 | fp.set_preference("browser.helperApps.neverAsk.saveToDisk", 'application/vnd.ms-excel') 18 | fp.set_preference("browser.download.dir", "c:\\tmp"); 19 | driver = webdriver.Firefox(firefox_profile=fp) 20 | driver.get('https://www.zacks.com/earnings/earnings-reports') 21 | 22 | 23 | def click_calendar(): 24 | try: 25 | element_xpath = '//*[@id="earnings_release"]/div[1]/p/a' 26 | element = WebDriverWait(driver, 10).until( 27 | lambda driver: driver.find_element_by_xpath(element_xpath).click() 28 | ) 29 | finally: 30 | print("clicked calendar") 31 | 32 | 33 | def click_prev_day(x): 34 | s = 'datespan_%d' % (x) 35 | try: 36 | WebDriverWait(driver, 10).until( 37 | lambda driver: driver.find_element_by_id(s).click() 38 | ) 39 | except: 40 | result = False 41 | else: 42 | result = True 43 | return result 44 | 45 | 46 | def click_export(): 47 | try: 48 | element = WebDriverWait(driver, 10).until( 49 | lambda driver: driver.find_element_by_id('export_excel').click() 50 | ) 51 | except: 52 | result = False 53 | else: 54 | result = True 55 | return result 56 | 57 | 58 | def click_prev_month(): 59 | try: 60 | driver.find_element_by_id('prevCal').click() 61 | except: 62 | result = False 63 | else: 64 | result = True 65 | i = 31 66 | while i > 27: 67 | try: 68 | click_prev_day(i) 69 | return False 70 | except: 71 | print('could not find %s in prev month' % (i)) 72 | i -= 1 73 | 74 | 75 | def subtract_day(n): 76 | y = n - 1 77 | return y 78 | 79 | 80 | def start_date(): 81 | return datetime(2016, 2, 29) 82 | 83 | 84 | def click_to_start_date(): 85 | start_date = datetime(2016, 2, 28) 86 | a = date.today() 87 | b = start_date 88 | c = a.month - b.month 89 | if c > 0: 90 | click_calendar() 91 | while c > 0: 92 | click_prev_month() 93 | c -= 1 94 | try: 95 | click_prev_day(31) 96 | except: 97 | click_prev_day(30) 98 | 99 | 100 | def main(): 101 | # click_to_start_date() 102 | # sdate = start_date() 103 | m = 12 104 | while m > 0: 105 | m -= 1 106 | for x in range(31, 0, -1): 107 | click_calendar() 108 | click_prev_day(x) 109 | click_export() 110 | 111 | click_calendar() 112 | click_prev_month() 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /02-selenium-safari/create_names.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import urllib.parse 5 | from pyquery import PyQuery as pq 6 | from bs4 import BeautifulSoup 7 | import configparser 8 | 9 | 10 | def create_filenames_for_conversion(filepath, filename, file_extention): 11 | print(filepath, filename) 12 | timestr = time.strftime("%Y%m%d-%H%M%S",time.localtime(os.path.getmtime(os.path.join(filepath, filename)))) 13 | filename = filename.replace(file_extention,"") 14 | filename 
= filename.translate(dict((ord(char), None) for char in '\/*?:"<>|,.'))
15 |     filename_html, filename_pdf = os.path.join(timestr + '_' + filename + '(clean)' + file_extention), os.path.join(timestr+'_'+filename + '(clean).pdf')
16 |     print('starting creation of: ' + filename_html)
17 |     return filename_html, filename_pdf
18 | 
19 | 
20 | def create_filename_from_url(url):
21 |     url, fragment = urllib.parse.urldefrag(url)
22 |     parsed = urllib.parse.urlsplit(url)
23 |     stripped = parsed.path.replace(URL_REPLACE, '')  # URL_REPLACE: module-level constant expected from configuration
24 |     filename = stripped.translate(dict((ord(char), None) for char in '\/*?:"<>|'))
25 |     print(filename)
26 |     return filename
27 | 
28 | def create_folder_path_from_url(base_dir, url):
29 |     path = os.path.join(base_dir, str(url.split("/")[5]+"_"+url.split("/")[6]).translate(dict((ord(char), None) for char in '\/*?:"<>|')))
30 |     if not os.path.exists(path):
31 |         os.makedirs(path)
32 |     print(path)
33 |     return path
34 | 
35 | 
36 | def create_file(filename, w_page_source, URL_WEBSITE):
37 |     d = pq(w_page_source, parser='html')
38 |     ab = d.make_links_absolute(URL_WEBSITE)
39 |     soup = BeautifulSoup(ab.html(), "html.parser")
40 |     try:
41 |         with open(filename, "w", encoding='utf-8') as f:
42 |             f.write(soup.decode_contents())
43 |     except Exception:
44 |         print('something broke: ', filename)
45 |     return filename
--------------------------------------------------------------------------------
/02-selenium-safari/gather_links_for_processing.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import configparser
3 | import time
4 | from create_names import create_file
5 | #sys.stdout = codecs.getwriter('utf8')(sys.stdout)
6 | import urllib.parse
7 | import lxml
8 | import lxml.html
9 | from lxml.html import parse, tostring, open_in_browser, fromstring
10 | 
11 | 
12 | def get_toc_links(filename, w_page_source, URL_WEBSITE, toc_xpath=None):
13 |     create_file(filename, w_page_source, URL_WEBSITE)
14 |     html = lxml.html.fromstring(w_page_source)
15 |     html.make_links_absolute(URL_WEBSITE)
16 |     ab = lxml.html.tostring(html, pretty_print=True, method="html")
17 |     soup = BeautifulSoup(ab, 'lxml')
18 |     links = []
19 |     for link in soup.find_all('a'):
20 |         if 'href' in link.attrs:
21 |             links.append(str(link.attrs['href']))
22 |     urls = []
23 |     for i in links:
24 |         url, fragment = urllib.parse.urldefrag(i)
25 |         urls.append(url)
26 |     urls = f7(urls)
27 |     newurls = []
28 |     for i in urls:
29 |         if 'htm' in i:
30 |             newurls.append(i)
31 |     return newurls
32 | 
33 | def f7(seq):
34 |     seen = set()
35 |     seen_add = seen.add
36 |     return [x for x in seq if not (x in seen or seen_add(x))]
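
# Aside: f7 above is the classic order-preserving de-duplication recipe.
# On Python 3.7+, where dicts preserve insertion order, an equivalent
# one-liner is possible (f7_ordered is an illustrative name, not part of
# the original module):
def f7_ordered(seq):
    return list(dict.fromkeys(seq))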
--------------------------------------------------------------------------------
/02-selenium-safari/html_to_pdf.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # try importing scandir and if found, use it as it's a few orders of magnitude faster than stock os.walk
4 | 
5 | import sys
6 | import os
7 | # Generate type library so that we can access constants
8 | 
9 | import process_html_remove_junk
10 | 
11 | def convertHTML2PDF(htmlPath, pdfPath):
12 |     'Convert an HTML document to PDF format'
13 |     import win32com.client.makepy
14 |     import win32com.client
15 |     from win32com.client import Dispatch
16 |     from win32com.client.dynamic import ERRORS_BAD_CONTEXT
17 |     import winerror
18 |     win32com.client.makepy.GenerateFromTypeLibSpec('Acrobat')
19 |     # Connect to Adobe Acrobat
20 |     avDoc = win32com.client.DispatchEx('AcroExch.AVDoc')
21 |     avDoc.Open(os.path.abspath(htmlPath), 'html2pdf')
22 |     # Save in PDF format
23 |     pdDoc = avDoc.GetPDDoc()
24 |     pdDoc.Save(win32com.client.constants.PDSaveFull, os.path.abspath(pdfPath))
25 |     pdDoc.Close()
26 |     # Close HTML document without prompting to save
27 |     avDoc.Close(True)
28 | 
29 | def file_conversion(folder):
30 |     #folder = os.path.normpath(sys.argv[1])
31 |     if folder is None:
32 |         directory = 'C:\\HTML'
33 |         files = process_html_remove_junk.walk_dir_fullfilename(directory)
34 |     else:
35 |         nfolder = os.path.join(folder, 'clean')
36 |         files = [os.path.join(nfolder, x) for x in os.listdir(nfolder)]
37 |     for filename in files:
38 |         basename = os.path.basename(filename)
39 |         extname = os.path.splitext(basename)
40 |         dirname = os.path.dirname(filename)
41 |         pdf = os.path.join(folder, 'pdf', extname[0]+'.pdf')
42 |         try:
43 |             print(pdf)
44 |             convertHTML2PDF(filename, pdf)
45 |         except Exception:
46 |             print('problem with: ' + filename)
--------------------------------------------------------------------------------
/02-selenium-safari/main.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | import os
4 | import sys
5 | import selenium
6 | from selenium import webdriver
7 | import create_names
8 | from create_names import create_folder_path_from_url, create_filename_from_url, create_file
9 | from zip_folder import zip_directory
10 | from merge_pdf_files import pyMerger
11 | import time
12 | import gather_links_for_processing
13 | from gather_links_for_processing import get_toc_links
14 | import random
15 | import process_html_remove_junk
16 | import html_to_pdf
17 | 
18 | # URL_WEBSITE, URL_LOGIN, USERNAME, PASSWORD, INPUT_FILE and BASE_DIR are
19 | # module-level settings; see the configuration sketch at the end of this module.
20 | 
21 | def file_merge(directory):
22 |     for path, dirnames, files in os.walk(directory):
23 |         pyMerger(path)
24 |         print(path)
25 | 
26 | def post_process():
27 |     file_merge(BASE_DIR)
28 |     zip_directory(BASE_DIR)
29 | 
30 | def pause_for_random_time():
31 |     time.sleep(random.randint(3,7))
32 | 
33 | def process_html_files(directory=None):
34 |     files_processed = process_html_remove_junk.process_html_files_removing_junk(directory)
35 |     return files_processed
36 | 
37 | def process_cleaned_files_into_pdf(directory):
38 |     files_processed_pdf = html_to_pdf.file_conversion(directory)
39 |     return files_processed_pdf
40 | 
41 | def grab_urls_from_file(INPUT_FILE):
42 |     with open(INPUT_FILE, 'r') as f:
43 |         urls = f.read().splitlines()
44 |     if len(urls) < 1:
45 |         urls = [sys.argv[1]]
46 |     print(urls)
47 |     return urls
48 | 
49 | def main():
50 |     '''
51 |     Read the file of URLs and process each one: save the pages, clean them, and convert them to PDF.
52 |     '''
53 |     if len(sys.argv) < 2:
54 |         sys.exit(0)
55 | 
56 |     w = webdriver.Chrome()
57 | 
58 |     domain_url, base_login = URL_WEBSITE, URL_LOGIN
59 |     w.get(domain_url + base_login)
60 |     loginElem = w.find_element_by_name('email')
61 |     loginElem.send_keys(USERNAME)
62 |     loginPass = w.find_element_by_name('password1')
63 |     loginPass.send_keys(PASSWORD)
64 |     time.sleep(3)
65 |     loginPass.submit()
66 |     time.sleep(3)
67 | 
68 |     urls = grab_urls_from_file(INPUT_FILE)
69 | 
70 |     for url in urls:
71 |         w.get(url)
72 |         base_dir = os.path.abspath(os.sep)
73 |         path = create_names.create_folder_path_from_url(BASE_DIR, url)
74 |         filename = os.path.join(path, create_names.create_filename_from_url(url) + '(t).html')
75 |         page_source = w.page_source
76 |         toc_table_only = page_source
77 |         toc = gather_links_for_processing.get_toc_links(filename, w.page_source, URL_WEBSITE)
78 |         for webpage_url in toc:
79 |             try:
80 |                 w.get(webpage_url)
81 |                 filename = create_names.create_filename_from_url(w.current_url)
82 |                 fout = create_names.create_file(os.path.join(path, filename + '.html'), w.page_source, URL_WEBSITE)
83 |             except Exception:
84 |                 print('something broke: ', filename)
85 |             pause_for_random_time()
86 |         list_of_list_of_filenames = process_html_files(path)
87 |         process_cleaned_files_into_pdf(path)
88 |         #pyMerger(directory)
89 | 
90 | 
91 | if __name__ == '__main__':
92 |     main()
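
# The functions above expect URL_WEBSITE, URL_LOGIN, USERNAME, PASSWORD,
# INPUT_FILE and BASE_DIR to exist at module level, but nothing defines them.
# A minimal sketch of one way to supply them with configparser (already
# imported elsewhere in this package); the settings.ini filename, section
# and keys are illustrative assumptions, and in practice this block would
# sit near the top of the module:
import configparser

_config = configparser.ConfigParser()
_config.read('settings.ini')  # assumes a settings.ini with a [site] section
URL_WEBSITE = _config.get('site', 'url_website', fallback='')
URL_LOGIN = _config.get('site', 'url_login', fallback='/login/')
USERNAME = _config.get('site', 'username', fallback='')
PASSWORD = _config.get('site', 'password', fallback='')
INPUT_FILE = _config.get('site', 'input_file', fallback='urls.txt')
BASE_DIR = _config.get('site', 'base_dir', fallback='.')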
--------------------------------------------------------------------------------
/02-selenium-safari/merge_pdf_files.py:
--------------------------------------------------------------------------------
1 | import os
2 | from PyPDF2 import PdfFileMerger, PdfFileReader
3 | 
4 | def pyMerger(directory):
5 |     pdfFiles = [f for f in os.listdir(directory) if f.lower().endswith("pdf")]
6 |     merger = PdfFileMerger()
7 | 
8 |     if pdfFiles != []:  # check if directory has pdf files in it
9 |         for filename in pdfFiles:
10 |             if filename != "_mergedFull.pdf":  # skip the merged file if it already exists
11 |                 merger.append(PdfFileReader(os.path.join(directory, filename), "rb"))
12 | 
13 |         # write the merged file next to its sources; overwrites any existing file
14 |         outputFile = os.path.join(directory, "_mergedFull.pdf")
15 |         merger.write(outputFile)
16 |     else:
17 |         print(directory + " has no pdf files in it.")
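
# A minimal usage sketch, mirroring how main.py drives this module
# (the directory path is illustrative only):
#
# for path, dirnames, files in os.walk(r'c:\pdf\some_book'):
#     pyMerger(path)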
--------------------------------------------------------------------------------
/02-selenium-safari/process_html_remove_junk.py:
--------------------------------------------------------------------------------
1 | 
2 | import os
3 | from bs4 import BeautifulSoup
4 | import create_names
5 | from create_names import create_filenames_for_conversion
6 | 
7 | 
8 | def grab_junk_tag(file):
9 |     with open(file, 'r', encoding='utf-8') as f:
10 |         data = f.read()
11 |     bsObj = BeautifulSoup(data, "html.parser")
12 |     head_elements_blacklist = ['topbar t-topbar']
13 |     body_elements_blacklist = ['expanded', 'annotator-modal-wrapper annotator-editor-modal annotator-editor annotator-hide', 'annotator-modal-wrapper annotator-delete-confirm-modal', 'annotator-adder','sbo-reading-menu sbo-menu-top', 'interface-controls interface-controls-top', 'sample-message', 'font-flyout','t-sbo-next sbo-next sbo-nav-bottom', 't-sbo-next sbo-next sbo-nav-top', 't-sbo-prev sbo-prev sbo-nav-bottom', 't-sbo-prev sbo-prev sbo-nav-top', 'reading-controls-bottom']
14 |     footer_elements_blacklist = ['pagefoot t-pagefoot']
15 |     html_elements_blacklists = [{'header': head_elements_blacklist}, {'div': body_elements_blacklist}, {'footer': footer_elements_blacklist}]
16 | 
17 |     for elements in html_elements_blacklists:
18 |         for element, tags in elements.items():
19 |             for tag in tags:
20 |                 try:
21 |                     temp = bsObj.find(element, {'class': tag})
22 |                     temp.decompose()
23 |                     #print('processed: ' + element + ' ' + tag)
24 |                 except Exception:
25 |                     print('error: ' + tag)
26 |                     continue
27 |     return bsObj
28 | 
29 | def check_for_folder_and_create(destfolder, additional=None):
30 |     if additional is not None:
31 |         new_folders = []
32 |         for folder in additional:
33 |             newfolder = os.path.join(destfolder, folder)
34 |             if not os.path.isdir(newfolder):
35 |                 os.makedirs(newfolder)
36 |             new_folders.append(newfolder)
37 |         return new_folders
38 |     if not os.path.isdir(destfolder):
39 |         os.makedirs(destfolder)
40 |     return destfolder
41 | 
42 | 
43 | def get_fullfilepaths_files_in_folder(folder_to_process, extfilter=None):
44 |     # extfilter=None means "match everything"
45 |     files_in_folder = [os.path.join(folder_to_process, x) for x in os.listdir(folder_to_process) if extfilter is None or extfilter in x]
46 |     return files_in_folder
47 | 
48 | def walk_dir_fullfilename(directory, extfilter=None):
49 |     all_files = []
50 |     for path, dirnames, files in os.walk(directory):
51 |         for file in files:
52 |             fullfilepath = os.path.join(path, file)
53 |             if extfilter is not None:
54 |                 if extfilter in fullfilepath and '(clean)' not in fullfilepath:
55 |                     all_files.append(fullfilepath)
56 |             else:
57 |                 all_files.append(fullfilepath)
58 |     return all_files
59 | 
60 | #walk_test=walk_dir_fullfilename(directory, extfilter='htm')
61 | 
62 | def process_html_files_removing_junk(directory):
63 |     #folder = os.path.normpath(sys.argv[1])
64 |     if directory is None:
65 |         directory = 'C:\\HTML'
66 |         files = walk_dir_fullfilename(directory)
67 |     else:
68 |         files = get_fullfilepaths_files_in_folder(directory, extfilter='htm')
69 |     list_of_list_of_filenames = []  # accumulated across every processed file
70 |     for filename in files:
71 |         try:
72 |             basename = os.path.basename(filename)
73 |             extname = os.path.splitext(basename)
74 |             dirname = os.path.dirname(filename)
75 |             destfolder = directory
76 |             filename_html, filename_pdf = create_names.create_filenames_for_conversion(destfolder, filename, extname[1])
77 |             filepath = check_for_folder_and_create(destfolder, additional=["clean","pdf","html"])
78 |             pdf = os.path.join(filepath[1], filename_pdf)
79 |             html_clean = os.path.join(filepath[0], filename_html)
80 |             try:
81 |                 bsObj = grab_junk_tag(filename)
82 |             except Exception:
83 |                 print('error: ' + filename)
84 |                 continue
85 |             with open(html_clean, "w", encoding='utf-8') as file:
86 |                 file.write(bsObj.decode_contents())
87 |             list_of_list_of_filenames.append([filename, html_clean, pdf])
88 |         except Exception:
89 |             print('problem with: ' + filename)
90 |     return list_of_list_of_filenames
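
# Aside: walk_dir_fullfilename above amounts to a recursive, filtered file
# search; on Python 3.4+ pathlib expresses the same idea more directly.
# A minimal equivalent sketch (walk_dir_pathlib is an illustrative name):
def walk_dir_pathlib(directory, extfilter=None):
    from pathlib import Path
    pattern = f"*{extfilter}*" if extfilter else "*"
    return [str(p) for p in Path(directory).rglob(pattern)
            if p.is_file() and '(clean)' not in str(p)]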
--------------------------------------------------------------------------------
/02-selenium-safari/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/02-selenium-safari/requirements.txt
--------------------------------------------------------------------------------
/02-selenium-safari/zip_folder.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import zipfile
4 | import shutil
5 | 
6 | def zip_folder(folder_path, output_path):
7 |     parent_folder = os.path.dirname(folder_path)
8 |     # Retrieve the paths of the folder contents.
9 |     contents = os.walk(folder_path)
10 |     zip_file = zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED)
11 |     try:
12 |         for root, folders, files in contents:
13 |             # Include all subfolders, including empty ones.
14 |             for folder_name in folders:
15 |                 absolute_path = os.path.join(root, folder_name)
16 |                 relative_path = absolute_path.replace(parent_folder + '\\', '')
17 |                 print("Adding '{:s}' to archive.".format(absolute_path))
18 |                 zip_file.write(absolute_path, relative_path)
19 |             for file_name in files:
20 |                 absolute_path = os.path.join(root, file_name)
21 |                 relative_path = absolute_path.replace(parent_folder + '\\', '')
22 |                 print("Adding '{:s}' to archive.".format(absolute_path))
23 |                 zip_file.write(absolute_path, relative_path)
24 |         print("'{:s}' created successfully.".format(output_path))
25 |     except (OSError, zipfile.BadZipfile) as message:
26 |         print(message)
27 |         sys.exit(1)
28 |     finally:
29 |         zip_file.close()
30 |     shutil.rmtree(folder_path)
31 | 
32 | def zip_directory(directory):
33 |     folders = [ name for name in os.listdir(directory) if os.path.isdir(os.path.join(directory, name)) ]
34 |     for i in folders:
35 |         zip_folder(os.path.join(directory, i), os.path.join(directory, i + ".zip"))
--------------------------------------------------------------------------------
/02-webscrape-celery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/02-webscrape-celery/__init__.py
--------------------------------------------------------------------------------
/02-webscrape-celery/basic_consumer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import pika
3 | import requests
4 | from bs4 import BeautifulSoup
5 | # python3 -m venv venv
6 | 
7 | # activates the virtualenv
8 | # source venv/bin/activate
9 | # pip3 install bs4 requests celery pika
10 | # python basic_consumer.py
11 | 
12 | def on_message(channel, method_frame, header_frame, body):
13 |     print(f"-> Starting: [{body}]")
14 |     r = requests.get(body)
15 |     soup = BeautifulSoup(r.text, "html.parser")
16 |     print(f"-> Extracted: {soup.html.head.title}")
17 |     print(f"-> Done: [{body}]")
18 |     channel.basic_ack(delivery_tag=method_frame.delivery_tag)
19 | 
20 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
21 | channel = connection.channel()
22 | print('* Handling messages.')
23 | 
24 | channel.basic_consume('pages', on_message)
25 | 
26 | try:
27 |     channel.start_consuming()
28 | except KeyboardInterrupt:
29 |     channel.stop_consuming()
30 | 
31 | connection.close()
--------------------------------------------------------------------------------
/02-webscrape-celery/basic_producer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import pika
3 | # activate spreadsheets_to_dataframes
4 | # python basic_producer.py
5 | 
6 | print("* Connecting to RabbitMQ broker")
7 | 
8 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
9 | channel = connection.channel()
10 | channel.queue_declare(queue='pages')
11 | 
12 | def produce():
13 |     # urls.txt sits alongside this script
14 |     with open('urls.txt', 'r') as f:
15 |         urls = f.read().splitlines()
16 | 
17 |     for url in urls:
18 |         print(f"* Pushed: [{url}]")
19 |         channel.basic_publish(exchange='', routing_key='pages', body=url)
20 | 
21 | produce()
22 | 
23 | connection.close()
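
# A hedged extension of the producer above: by default RabbitMQ keeps the
# 'pages' queue and its messages in memory only. To survive a broker
# restart, both the queue and each message must be marked durable /
# persistent (delivery_mode=2), along the lines of:
#
# channel.queue_declare(queue='pages', durable=True)
# channel.basic_publish(exchange='', routing_key='pages', body=url,
#                       properties=pika.BasicProperties(delivery_mode=2))
--------------------------------------------------------------------------------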
/02-webscrape-celery/consumer.py: -------------------------------------------------------------------------------- 1 | 2 | ##################### 3 | # using docker for both broker and backend 4 | # $ docker run -d -p 5672:5672 -p 15672:15672 --name url-rabbit rabbitmq:management 5 | # $ celery -A consumer worker --loglevel=info 6 | 7 | 8 | import requests 9 | from celery import Celery 10 | # pip install celery==3.1.21 11 | # ^ windows 12 | 13 | app = Celery('tasks', broker='amqp://localhost/') 14 | 15 | @app.task 16 | def download_url(url): 17 | print(f"-> Starting: [{url}]") 18 | try: 19 | req = requests.get(url) 20 | if req.status_code == 200: 21 | 22 | print(f"-> Success Download: [{url}]") 23 | except: 24 | print(f'error: {url}') 25 | 26 | 27 | # celery -A consumer worker --loglevel=info 28 | # ^ run above celery command in terminal while situated in same folder as current file 29 | 30 | # from celery.task.control import discard_all 31 | # discard_all() 32 | # ^ use above to clear celery queue 33 | 34 | 35 | -------------------------------------------------------------------------------- /02-webscrape-celery/data.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/02-webscrape-celery/data.html -------------------------------------------------------------------------------- /02-webscrape-celery/producer.py: -------------------------------------------------------------------------------- 1 | import time 2 | from celery import Celery 3 | import consumer 4 | 5 | app = Celery('tasks', broker='amqp://localhost//') 6 | 7 | def produce(): 8 | with open(f'urls.txt', 'r') as f: 9 | urls = f.read().splitlines() 10 | 11 | for url in urls: 12 | consumer.download_url.delay(url) 13 | print(f"* Submitted: [{url}]") 14 | 15 | produce() 16 | 17 | ##################### 18 | 19 | # urls.txt # example 20 | 21 | """ 22 | http://www.apple.com 23 | http://www.amazon.com 24 | http://www.abc.xyz 25 | http://www.microsoft.com 26 | http://www.facebook.com 27 | http://www.alibabagroup.com 28 | http://www.tencent.com 29 | http://www.berkshirehathaway.com 30 | http://www.jpmorganchase.com 31 | http://www.exxonmobil.com 32 | http://www.jnj.com 33 | http://usa.visa.com 34 | http://www.shell.com 35 | http://www.samsung.com 36 | http://www.bankofamerica.com 37 | http://www.icbc.com.cn 38 | http://www.wellsfargo.com 39 | http://corporate.walmart.com 40 | http://www.nestle.com 41 | http://www.unitedhealthgroup.com 42 | http://www.intel.com 43 | http://www.att.com 44 | http://www.chevron.com 45 | http://www.ccb.com 46 | http://www.homedepot.com 47 | http://www.pfizer.com 48 | http://www.verizon.com 49 | http://www.toyota.co.jp 50 | http://www.ab-inbev.com 51 | http://www.mastercard.com 52 | """ 53 | -------------------------------------------------------------------------------- /02-webscrape-celery/urls.txt: -------------------------------------------------------------------------------- 1 | http://www.apple.com 2 | http://www.amazon.com 3 | http://www.abc.xyz 4 | http://www.microsoft.com 5 | http://www.facebook.com 6 | http://www.alibabagroup.com 7 | http://www.tencent.com 8 | http://www.berkshirehathaway.com 9 | http://www.jpmorganchase.com 10 | http://www.exxonmobil.com 11 | http://www.jnj.com 12 | http://usa.visa.com 13 | http://www.shell.com 14 | http://www.samsung.com 15 | http://www.bankofamerica.com 16 | http://www.icbc.com.cn 17 | http://www.wellsfargo.com 18 | 
http://corporate.walmart.com 19 | http://www.nestle.com 20 | http://www.unitedhealthgroup.com 21 | http://www.intel.com 22 | http://www.att.com 23 | http://www.chevron.com 24 | http://www.ccb.com 25 | http://www.homedepot.com 26 | http://www.pfizer.com 27 | http://www.verizon.com 28 | http://www.toyota.co.jp 29 | http://www.ab-inbev.com 30 | http://www.mastercard.com 31 | http://www.cisco.com 32 | http://www.pg.com 33 | http://www.novartis.com 34 | http://www.petrochina.com.cn 35 | http://www.roche.com 36 | http://www.boeing.com 37 | http://www.coca-colacompany.com 38 | http://www.hsbc.com 39 | http://www.tsmc.com 40 | http://www.chinamobileltd.com 41 | http://www.oracle.com 42 | http://www.abchina.com 43 | http://www.netflix.com 44 | http://www.citigroup.com 45 | http://www.lvmh.com 46 | http://www.merck.com 47 | http://www.total.com 48 | http://www.pingan.com 49 | http://www.thewaltdisneycompany.com 50 | http://www.pepsico.com 51 | -------------------------------------------------------------------------------- /04-other-analysis/example_pandas.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import pandas as pd 5 | import tldextract 6 | 7 | filepath = r'2019-11-04T00-02-57_stlregionalchamber_largest_employers.csv' 8 | 9 | df = pd.read_csv(filepath) 10 | df.columns = df.columns.str.strip().to_list() 11 | df.columns = df.columns.str.replace(" ", "_") 12 | df.columns = df.columns.str.replace("(", "_", regex=False).str.replace(")", "_", regex=False) 13 | df.columns = df.columns.str.lower() 14 | 15 | df_nan = df[df.website.isna()] 16 | df = df.dropna(subset=['website']) 17 | 18 | df['website'] = df.website.str.replace('www.', '', regex=False) 19 | df['website'] = df['website'].apply(lambda x: "https://" + x) 20 | 21 | folderpath = r'D:\PROJECTS\presentations\stl_data' 22 | 23 | files = glob.glob(folderpath + "\\*.csv") 24 | 25 | df_list = [] 26 | for file in files: 27 | df = pd.read_csv(file) 28 | df['year'] = os.path.basename(file).split("_")[0] 29 | df_list.append(df) 30 | 31 | df = pd.concat(df_list) 32 | 33 | df.columns = df.columns.str.strip().to_list() 34 | df.columns = df.columns.str.replace(" ", "_") 35 | df.columns = df.columns.str.replace("(", "_", regex=False).str.replace(")", "_", regex=False) 36 | df.columns = df.columns.str.lower() 37 | df.columns = df.columns.str.replace(".", "", regex=False) 38 | 39 | df = df.dropna(subset=['website']) 40 | df['website'] = df.website.str.replace('www.', '', regex=False) 41 | df['website'] = df['website'].apply(lambda x: "https://" + x) 42 | df = df.sort_values('st_louis_employees', ascending=False) 43 | df['st_louis_employees'] = df.st_louis_employees.astype(int) 44 | df['website_domain'] = df['website'].apply(lambda x: tldextract.extract(x).domain) 45 | 46 | df_groups = [] 47 | 48 | for i, df_group in df.groupby('website_domain'): 49 | if df_group.index.size > 1: 50 | df_group = df_group.sort_values('year') 51 | df_group['pct_chg'] = df_group['st_louis_employees'].pct_change() 52 | else: 53 | df_group['pct_chg'] = None 54 | 55 | df_groups.append(df_group) 56 | 57 | df_all = pd.concat(df_groups) 58 | df_all = df_all.sort_values('st_louis_employees', ascending=False) 59 | -------------------------------------------------------------------------------- /04-other-analysis/read_sec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | import pandas as pd 5 | 6 | # assumption: folder_path holds the downloaded quarterly SEC financial-statement ZIP archives 7 | folder_path = r'.' 8 | files = sorted(f for f in os.listdir(folder_path) if f.endswith('.zip')) 9 | 10 | subs = [] 11 | for filename in files: 12 | print(f"Companies in {filename}") 13 | zip_filepath =
os.path.join(folder_path, filename) 14 | data_file = zipfile.ZipFile(zip_filepath) 15 | 16 | df_sub = pd.read_csv(data_file.open('sub.txt'), sep='\t', error_bad_lines=False) 17 | 18 | subs.append(df_sub) 19 | 20 | revenues = [] 21 | for filename in files: 22 | print(f"Companies in {filename}") 23 | zip_filepath = os.path.join(folder_path, filename) 24 | data_file = zipfile.ZipFile(zip_filepath) 25 | 26 | df_num = pd.read_csv(data_file.open('num.txt'), encoding="latin1", sep='\t', error_bad_lines=False) 27 | df_revenues = df_num[df_num['tag'].str.contains('Revenues', regex=True)] 28 | revenues.append(df_revenues) 29 | 30 | df_revs = pd.concat(revenues) 31 | 32 | df_revs.sort_values('ddate', inplace=True) 33 | df_revs['cik'] = df_revs['adsh'].apply(lambda x: x.split("-")[0]) 34 | 35 | for i, df_group in df_revs.groupby('cik'): 36 | print(df_group) 37 | 38 | # df_sub_nodupes = df_sub.drop_duplicates(subset='name') 39 | # df_sub_nodupes.head(100) 40 | 41 | # df['stprinc'].drop_duplicates() 42 | 43 | df_missouri = df_sub[(df_sub['stprinc'].isin(['MO']) | df_sub['stprma'].isin(['MO'])) ] 44 | print(df_missouri[df_missouri['form'].isin(['10-K'])].sort_values('name')) 45 | 46 | df_sec_symbols = pd.read_json(r'D:\PROJECTS\presentations\meetup-2019-spreadsheets-to-dataframes\company_tickers.json').T 47 | 48 | """ 49 | curl "https://api-global.morningstar.com/sal-service/v1/stock/newfinancials/0P0000014I/incomeStatement/detail?dataType=A^&reportType=A^&locale=en^&^&operation=export" -H "Sec-Fetch-Mode: cors" -H "Origin: https://www.morningstar.com" -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36" -H "Accept: application/json, text/plain, */*" -H "Referer: https://www.morningstar.com/stocks/xnys/cat/financials" -H "X-API-RequestId: 52a823bc-0d1f-6c2a-51a9-fb553553a192" -H "ApiKey: lstzFDEOhfFNMLikKa0am9mgEKLBl49T" -H "X-API-REALTIME-E: eyJlbmMiOiJBMTI4R0NNIiwiYWxnIjoiUlNBLU9BRVAifQ.XmuAS3x5r-0MJuwLDdD4jNC6zjsY7HAFNo2VdvGg6jGcj4hZ4NaJgH20ez313H8An9UJrsUj8ERH0R8UyjQu2UGMUnJ5B1ooXFPla0LQEbN_Em3-IG84YPFcWVmEgcs1Fl2jjlKHVqZp04D21UvtgQ4xyPwQ-QDdTxHqyvSCpcE.ACRnQsNuTh1K_C9R.xpLNZ8Cc9faKoOYhss1CD0A4hG4m0M7-LZQ0fISw7NUHwzQs2AEo9ZXfwOvAj1fCbcE96mbKQo8gr7Oq1a2-piYXM1X5yNMcCxEaYyGinpnf6PGqbdr6zbYZdqyJk0KrxWVhKSQchLJaLGJOts4GlpqujSqJObJQcWWbkJQYKG9K7oKsdtMAKsHIVo5-0BCUbjKVnHJNsYwTsI7xn2Om8zGm4A.nBOuiEDssVFHC_N68tDjVA" -H "X-SAL-ContentType: e7FDDltrTy+tA2HnLovvGL0LFMwT+KkEptGju5wXVTU=" -H "DNT: 1" --compressed 50 | 51 | """ 52 | 53 | df_missouri_qtr_ann = df_missouri[df_missouri['form'].isin(['10-Q', '10-K'])] 54 | 55 | df_missouri['instance'].apply(lambda x: x.split("-")) 56 | 57 | df_cocacola = df_sub[df_sub['name'].str.contains('COCA COLA CO', regex=True)] 58 | 59 | cocacola_adsh = df_cocacola.adsh.to_list()[0] 60 | 61 | df_pre = pd.read_csv(data_file.open('pre.txt'), sep='\t', error_bad_lines=False) 62 | df_num = pd.read_csv(data_file.open('num.txt'), sep='\t', error_bad_lines=False) 63 | df_pre.head(100) 64 | 65 | df_ko_num = df_num[df_num['adsh'].isin(['0000021344-19-000034'])] 66 | df_revenues = df_num[df_num['tag'].str.contains('Revenues', regex=True)] 67 | df_revenues.sort_values('adsh').head(100) 68 | 69 | df_ko_num = df_ko_num.sort_values("tag").drop_duplicates(subset=['tag']) 70 | # note: the filing form (10-Q/10-K) lives in sub.txt, so filtering num rows by form requires a merge with df_sub 71 | 72 | 73 | # df_ = df_.iloc[:, 0:len(df_head.columns.tolist())] 74 | # df_.columns = df_head.columns.to_list() 75 | 76 |
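77 | # A minimal sketch (assuming the df_revs frame built above): pivot the collected
78 | # revenue facts into one time series per company. 'ddate' and 'value' are standard
79 | # columns in the SEC financial-statement num.txt data set.
80 | df_revs_ts = df_revs.pivot_table(index='ddate', columns='cik', values='value', aggfunc='first')
81 | print(df_revs_ts.tail())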
-------------------------------------------------------------------------------- /06-flask/flask-rss/README.md: -------------------------------------------------------------------------------- 1 | # Flask-RSS 2 | $ python main.py 3 | -------------------------------------------------------------------------------- /06-flask/flask-rss/main.py: -------------------------------------------------------------------------------- 1 | 2 | import feedparser 3 | import pandas as pd 4 | from flask import Flask, render_template 5 | 6 | app = Flask(__name__) 7 | 8 | @app.route("/") 9 | def index(): 10 | 11 | feed = feedparser.parse(r'http://www.prweb.com/rss2/daily.xml') 12 | 13 | df = pd.json_normalize(feed.entries, sep='_') 14 | 15 | df['source'] = "prweb" 16 | 17 | df = df.sort_values('published', ascending=False) 18 | 19 | df = df[['published', 'link', 'title','source']] 20 | 21 | return render_template("reader.html", df=df.itertuples(), columns_to_display=['published', 'Source', 'Headline']) 22 | 23 | if __name__ == "__main__": 24 | app.run(debug=True) 25 | -------------------------------------------------------------------------------- /06-flask/flask-rss/static/css/bootstrap-theme.min.css.map: -------------------------------------------------------------------------------- 1 | {"version":3,"sources":["less/theme.less","less/mixins/vendor-prefixes.less","less/mixins/gradients.less","less/mixins/reset-filter.less"],"names":[],"mappings":";;;;AAmBA,YAAA,aAAA,UAAA,aAAA,aAAA,aAME,YAAA,EAAA,KAAA,EAAA,eC2CA,mBAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,iBDvCR,mBAAA,mBAAA,oBAAA,oBAAA,iBAAA,iBAAA,oBAAA,oBAAA,oBAAA,oBAAA,oBAAA,oBCsCA,mBAAA,MAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,MAAA,EAAA,IAAA,IAAA,iBDlCR,qBAAA,sBAAA,sBAAA,uBAAA,mBAAA,oBAAA,sBAAA,uBAAA,sBAAA,uBAAA,sBAAA,uBAAA,+BAAA,gCAAA,6BAAA,gCAAA,gCAAA,gCCiCA,mBAAA,KACQ,WAAA,KDlDV,mBAAA,oBAAA,iBAAA,oBAAA,oBAAA,oBAuBI,YAAA,KAyCF,YAAA,YAEE,iBAAA,KAKJ,aErEI,YAAA,EAAA,IAAA,EAAA,KACA,iBAAA,iDACA,iBAAA,4CAAA,iBAAA,qEAEA,iBAAA,+CCnBF,OAAA,+GH4CA,OAAA,0DACA,kBAAA,SAuC2C,aAAA,QAA2B,aAAA,KArCtE,mBAAA,mBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,oBAAA,oBAEE,iBAAA,QACA,aAAA,QAMA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,uBAAA,8BAAA,6BAAA,8BAAA,6BAAA,6BAAA,gCAAA,uCAAA,sCAAA,uCAAA,sCAAA,sCAME,iBAAA,QACA,iBAAA,KAgBN,aEtEI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,mBAAA,mBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,oBAAA,oBAEE,iBAAA,QACA,aAAA,QAMA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,uBAAA,8BAAA,6BAAA,8BAAA,6BAAA,6BAAA,gCAAA,uCAAA,sCAAA,uCAAA,sCAAA,sCAME,iBAAA,QACA,iBAAA,KAiBN,aEvEI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,mBAAA,mBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,oBAAA,oBAEE,iBAAA,QACA,aAAA,QAMA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,uBAAA,8BAAA,6BAAA,8BAAA,6BAAA,6BAAA,gCAAA,uCAAA,sCAAA,uCAAA,sCAAA,sCAME,iBAAA,QACA,iBAAA,KAkBN,UExEI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,gBAAA,gBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,iBAAA,iBAEE,iBAAA,QACA,aAAA,QAMA,mBAAA,0BAAA,yBAAA,0BAAA,yBAAA,yBAAA,oBAAA,2BAAA,0BAAA,2BAAA,0BAAA,0BAAA,6BAAA,oCAAA,mCAAA,oCAAA,mCAAA,mCAME,iBAAA,QACA,iBAAA,KAmBN,aEzEI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,mBAAA,mBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,oBAAA,oBAEE,iBAAA,QACA,aAAA,QAMA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,uBAAA,8BAAA,6BAAA,8BAAA,6BAAA,6BAAA,gCAAA,uCAAA,sCAAA,uCAAA,sCAAA,sCAME,iBAAA,QACA,iBAAA,KAoBN,YE1EI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iB
AAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,kBAAA,kBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,mBAAA,mBAEE,iBAAA,QACA,aAAA,QAMA,qBAAA,4BAAA,2BAAA,4BAAA,2BAAA,2BAAA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,+BAAA,sCAAA,qCAAA,sCAAA,qCAAA,qCAME,iBAAA,QACA,iBAAA,KA2BN,eAAA,WClCE,mBAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,EAAA,IAAA,IAAA,iBD2CV,0BAAA,0BE3FI,iBAAA,QACA,iBAAA,oDACA,iBAAA,+CAAA,iBAAA,wEACA,iBAAA,kDACA,OAAA,+GF0FF,kBAAA,SAEF,yBAAA,+BAAA,+BEhGI,iBAAA,QACA,iBAAA,oDACA,iBAAA,+CAAA,iBAAA,wEACA,iBAAA,kDACA,OAAA,+GFgGF,kBAAA,SASF,gBE7GI,iBAAA,iDACA,iBAAA,4CACA,iBAAA,qEAAA,iBAAA,+CACA,OAAA,+GACA,OAAA,0DCnBF,kBAAA,SH+HA,cAAA,ICjEA,mBAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,iBD6DV,sCAAA,oCE7GI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SD2CF,mBAAA,MAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,MAAA,EAAA,IAAA,IAAA,iBD0EV,cAAA,iBAEE,YAAA,EAAA,IAAA,EAAA,sBAIF,gBEhII,iBAAA,iDACA,iBAAA,4CACA,iBAAA,qEAAA,iBAAA,+CACA,OAAA,+GACA,OAAA,0DCnBF,kBAAA,SHkJA,cAAA,IAHF,sCAAA,oCEhII,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SD2CF,mBAAA,MAAA,EAAA,IAAA,IAAA,gBACQ,WAAA,MAAA,EAAA,IAAA,IAAA,gBDgFV,8BAAA,iCAYI,YAAA,EAAA,KAAA,EAAA,gBAKJ,qBAAA,kBAAA,mBAGE,cAAA,EAqBF,yBAfI,mDAAA,yDAAA,yDAGE,MAAA,KE7JF,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,UFqKJ,OACE,YAAA,EAAA,IAAA,EAAA,qBC3HA,mBAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,gBACQ,WAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,gBDsIV,eEtLI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF8KF,aAAA,QAKF,YEvLI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF8KF,aAAA,QAMF,eExLI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF8KF,aAAA,QAOF,cEzLI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF8KF,aAAA,QAeF,UEjMI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFuMJ,cE3MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFwMJ,sBE5MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFyMJ,mBE7MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF0MJ,sBE9MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF2MJ,qBE/MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF+MJ,sBElLI,iBAAA,yKACA,iBAAA,oKACA,iBAAA,iKFyLJ,YACE,cAAA,IC9KA,mBAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,EAAA,IAAA,IAAA,iBDgLV,wBAAA,8BAAA,8BAGE,YAAA,EAAA,KAAA,EAAA,QEnOE,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFiOF,aAAA,QALF,+BAAA,qCAAA,qCAQI,YAAA,KAUJ,OCnME,mBAAA,EAAA,IAAA,IAAA,gBACQ,WAAA,EAAA,IAAA,IAAA,gBD4MV,8BE5PI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFyPJ,8BE7PI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF0PJ,8BE9PI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF2PJ,2BE/PI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF4PJ,8BEhQI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF6PJ,6BEjQI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFoQJ,MExQI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFsQF,aAAA,QC3NA,mBAAA,MAAA,EAAA,IAAA,IAAA,gBAAA,EAAA,IAAA,EAAA,qBACQ,WAAA,MAAA,EAAA,IAAA,IAAA,gBAAA,EAAA,IAAA,EAAA"} -------------------------------------------------------------------------------- /06-flask/flask-rss/static/css/reader.css: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/css/reader.css -------------------------------------------------------------------------------- /06-flask/flask-rss/static/css/style.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/css/style.css -------------------------------------------------------------------------------- /06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /06-flask/flask-rss/static/js/npm.js: -------------------------------------------------------------------------------- 1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment. 2 | require('../../js/transition.js') 3 | require('../../js/alert.js') 4 | require('../../js/button.js') 5 | require('../../js/carousel.js') 6 | require('../../js/collapse.js') 7 | require('../../js/dropdown.js') 8 | require('../../js/modal.js') 9 | require('../../js/tooltip.js') 10 | require('../../js/popover.js') 11 | require('../../js/scrollspy.js') 12 | require('../../js/tab.js') 13 | require('../../js/affix.js') -------------------------------------------------------------------------------- /06-flask/flask-rss/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block title %} 4 | 5 | {% endblock %} 6 | 7 | {% block content %} 8 | 9 |
<h1>Flask-RSS</h1> 9 | 10 | <p>RSS Reader</p> 11 | 12 | <h3>Feeds</h3> 13 | 18 | 19 | 20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /06-flask/flask-rss/templates/layout.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html> 3 | <head> 4 | <title>{% block title %}{% endblock %} ~ RssReader</title> 5 | 6 | 7 | 8 | 9 | 10 | {% block header %}{% endblock %} 11 | </head> 12 | 13 | <body> 14 | 15 | <div class="container"> 16 | 17 | {% block content %}{% endblock %} 18 | </div> 19 | 20 | 25 | 26 | 27 | </body> 28 | </html> -------------------------------------------------------------------------------- /06-flask/flask-rss/templates/notfound.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block title %} 4 | 404 5 | {% endblock %} 6 | 7 | {% block content %} 8 |
<h1>404 - Page not found</h1> 9 | 10 | <p>This page does not exist. Could it be that you made a typo?</p> 11 | <p>Or did someone give you a wrong link?! :o If so, feel free to give them a smack!</p> 12 | 13 | <h3>Featured Websites</h3> 14 | 21 | 22 | {% endblock %} -------------------------------------------------------------------------------- /06-flask/flask-rss/templates/reader.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block title %} 4 | {% if name %} 5 | {{ name }} 6 | {% else %} 7 | Reader 8 | {% endif %} 9 | {% endblock %} 10 | 11 | {% block content %} 12 | <table> 13 | <thead><tr> 14 | {% for column in columns_to_display %} 15 | <th>{{ column }}</th> 16 | {%- endfor -%} 17 | </tr></thead> 18 | 19 | <tbody> 20 | {% for row in df %} 21 | <tr> 22 | <td>{{ row.published }}</td> 23 | <td>{{ row.source }}</td> 24 | <td>{{ row.title }}</td> 25 | </tr> 26 | 27 | {%- endfor -%} 28 | </tbody> 29 | </table> 30 | 31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /06-flask/flask-rss/templates/table.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block title %} 4 | {% if name %} 5 | {{ name }} 6 | {% else %} 7 | Reader 8 | {% endif %} 9 | {% endblock %} 10 | 11 | {% block content %} 12 | <table> 13 | <thead><tr> 14 | {% for column in columns_to_display %} 15 | <th>{{ column }}</th> 16 | {%- endfor -%} 17 | </tr></thead> 18 | 19 | <tbody> 20 | {% for row in df %} 21 | <tr> 22 | <td>{{ row.published }}</td> 23 | <td>{{ row.source }}</td> 24 | <td>{{ row.title }}</td> 25 | </tr> 26 | 27 | {%- endfor -%} 28 | </tbody> 29 | </table>
30 | 31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /07-airflow/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Deployment 3 | 4 | https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html 5 | -------------------------------------------------------------------------------- /07-airflow/dags/example_postgres.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | from airflow.providers.postgres.operators.postgres import PostgresOperator 5 | 6 | default_args = {"owner": "airflow"} 7 | 8 | # create_stock_table, populate_stock_table, get_all_stocks are examples of tasks created by 9 | # instantiating the Postgres Operator 10 | 11 | with DAG( 12 | dag_id="postgres_operator_dag", 13 | start_date=datetime.datetime(2020, 2, 2), 14 | schedule_interval="@once", 15 | default_args=default_args, 16 | catchup=False, 17 | ) as dag: 18 | 19 | create_stock_table = PostgresOperator( 20 | task_id="create_stock_table", 21 | postgres_conn_id="postgres_default", 22 | sql="sql/stock_schema.sql" 23 | ) 24 | 25 | populate_stock_table = PostgresOperator( 26 | task_id="populate_stock_table", 27 | postgres_conn_id="postgres_default", 28 | sql="sql/stock_insert.sql" 29 | ) 30 | 31 | get_all_stocks = PostgresOperator( 32 | task_id="get_all_stocks", postgres_conn_id="postgres_default", sql="SELECT * FROM stocks;" 33 | ) 34 | 35 | create_stock_table >> populate_stock_table >> get_all_stocks 36 | -------------------------------------------------------------------------------- /07-airflow/dags/sql/stock_insert.sql: -------------------------------------------------------------------------------- 1 | insert into stocks values (1, 'MSFT', 'Microsoft', '2018-07-05', 124.35); 2 | insert into stocks values (2, 'GOOG', 'Google', '2019-05-01', 234.42); 3 | insert into stocks values (3, 'TSLA', 'Tesla', '2020-06-23', 2434.22); 4 | insert into stocks values (4, 'AMZN', 'Amazon', '2013-08-11', 2344.34); 5 | -------------------------------------------------------------------------------- /07-airflow/dags/sql/stock_schema.sql: -------------------------------------------------------------------------------- 1 | -- create pet table 2 | CREATE TABLE IF NOT EXISTS stocks ( 3 | id SERIAL PRIMARY KEY, 4 | symbol VARCHAR NOT NULL, 5 | name VARCHAR NOT NULL, 6 | date DATE NOT NULL, 7 | price numeric NOT NULL); 8 | -------------------------------------------------------------------------------- /07-airflow/dags/stock_analysis_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python_operator import PythonOperator 3 | 4 | from datetime import datetime, timedelta 5 | import datetime as dt 6 | import pandas as pd 7 | import yfinance as yf 8 | import requests 9 | 10 | from functools import reduce 11 | 12 | 13 | ############################################ 14 | # DEFINE AIRFLOW DAG (SETTINGS + SCHEDULE) 15 | ############################################ 16 | default_args = { 17 | 'owner': 'airflow', 18 | 'depends_on_past': False, 19 | 'email': ['user@gmail.com'], 20 | 'email_on_failure': False, 21 | 'email_on_retry': False, 22 | 'retries': 1 23 | } 24 | 25 | dag = DAG( 'stocks_analysis_ETL_7AM', 26 | default_args=default_args, 27 | description='Collect Stock Prices For Analysis', 28 | catchup=False, 29 | start_date= datetime(2020, 6, 23), 30 | 
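# (note) schedule_interval below uses standard five-field cron syntax: minute hour day-of-month month day-of-week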
schedule_interval='0 7 * * *' # <-- run once a day at 07:00 ('* 7 * * *' would fire every minute of that hour) 31 | ) 32 | 33 | tickers = ['AAPL', 'AMZN', 'BLK', 'T', 'TSLA'] # <-- Initial Tickers List. It will be available globally for all functions. 34 | 35 | #################################################### 36 | # DEFINE PYTHON FUNCTIONS 37 | #################################################### 38 | 39 | def fetch_prices_function(**kwargs): # <-- Remember to include "**kwargs" in all the defined functions 40 | print('1 Fetching stock prices and removing duplicates...') 41 | stocks_prices = [] 42 | for i in range(0, len(tickers)): 43 | prices = yf.download(tickers[i], period = 'max').iloc[: , :5].dropna(axis=0, how='any') 44 | prices = prices.loc[~prices.index.duplicated(keep='last')] 45 | prices = prices.reset_index() 46 | prices.insert(loc = 1, column = 'Stock', value = tickers[i]) 47 | stocks_prices.append(prices) 48 | return stocks_prices # <-- This list is the output of the fetch_prices_function and the input for the functions below 49 | 50 | 51 | def stocks_plot_function(**kwargs): 52 | print('2 Pulling stocks_prices to concatenate sub-lists to create a combined dataset + write to CSV file...') 53 | ti = kwargs['ti'] 54 | stocks_prices = ti.xcom_pull(task_ids='fetch_prices_task') # <-- xcom_pull is used to pull the stocks_prices list generated above 55 | stock_plots_data = pd.concat(stocks_prices, ignore_index=True) 56 | stock_plots_data.to_csv('/Users/anbento/Documents/Data_Sets/Medium/stocks_plots_data.csv', index=False) 57 | 58 | print('DF Shape: ', stock_plots_data.shape) 59 | print(stock_plots_data.head(5)) 60 | print('Completed \n\n') 61 | 62 | def stocks_table_function(**kwargs): 63 | print('3 Creating aggregated dataframe with stock stats for last available date + write to CSV file...') 64 | ti = kwargs['ti'] 65 | stocks_prices = ti.xcom_pull(task_ids='fetch_prices_task') # <-- xcom_pull is used to pull the stocks_prices list generated above 66 | stocks_adj_close = [] 67 | for i in range(0, len(stocks_prices)): 68 | adj_price= stocks_prices[i][['Date','Adj Close']] 69 | adj_price.set_index('Date', inplace = True) 70 | adj_price.columns = [tickers[i]] 71 | stocks_adj_close.append(adj_price) 72 | 73 | stocks_adj_close = reduce(lambda left,right: pd.merge(left, right, left_index = True, right_index = True ,how='outer'), stocks_adj_close) 74 | stocks_adj_close.sort_index(ascending = False, inplace = True) 75 | stocks_adj_close.index = pd.to_datetime(stocks_adj_close.index).date 76 | stocks_adj_close.to_csv('/Users/anbento/Documents/Data_Sets/Medium/stocks_table_data.csv') # <-- write the aggregated table; path assumed, mirroring the plot output above 77 | 78 | ########################################## 79 | # DEFINE AIRFLOW OPERATORS 80 | ########################################## 81 | 82 | fetch_prices_task = PythonOperator(task_id = 'fetch_prices_task', 83 | python_callable = fetch_prices_function, 84 | provide_context = True, 85 | dag= dag ) 86 | 87 | stocks_plot_task= PythonOperator(task_id = 'stocks_plot_task', 88 | python_callable = stocks_plot_function, 89 | provide_context = True, 90 | dag= dag) 91 | 92 | stocks_table_task = PythonOperator(task_id = 'stocks_table_task', 93 | python_callable = stocks_table_function, 94 | provide_context = True, 95 | dag= dag) 96 | 97 | ########################################## 98 | # DEFINE TASKS HIERARCHY 99 | ########################################## 100 | 101 | fetch_prices_task >> stocks_plot_task >> stocks_table_task 102 | -------------------------------------------------------------------------------- /07-airflow/dags/stocks.py: -------------------------------------------------------------------------------- 1 | # import json 2 | # from datetime import datetime, timedelta 3 | # 4
| # import redis 5 | # from airflow.models import DAG 6 | # from airflow.operators import PythonOperator 7 | # 8 | # stocks = ('AAPL', 'AMZN', 'GOOGL', 'MSFT', 9 | # 'FB', 'BABA', 'BRK.B', 'JPM', 10 | # 'XOM', 'JNJ', 'V', 'BAC', 'WFC', 11 | # 'WMT', 'UNH', 'INTC', 'T', 'CVX', 12 | # 'HD', 'PFE', 'VZ', 'MA', 'CSCO', 'PG', 13 | # 'BA', 'KO', 'ORCL', 'NFLX', 'C', 'MRK', 14 | # 'DIS') 15 | # 16 | # 17 | # def get_stocks(ds, **context): 18 | # symbol = context['params']['symbol'] 19 | # 20 | # pg_hook = postgres_hook(postgres_conn_id='stocks') 21 | # api_hook = http_hook(http_conn_id='alphavantage', method='GET') 22 | # 23 | # # If either of these raises an exception then we'll be notified via 24 | # # Airflow 25 | # resp = api_hook.run(f'query?function=TIME_SERIES_DAILY_ADJUSTED&symbol={symbol}&apikey=537201H9R203WT4C&datatype=csv') 26 | # resp = json.loads(resp.content) 27 | # 28 | # # These are the only valid stocks the DB supports at the moment. Anything 29 | # # else that turns up will be ignored. 30 | # 31 | # stocks_insert = f"""INSERT INTO stocks (symbol, valid_until, price) 32 | # VALUES ({symbol}, {valid_until}, {price});""" 33 | # 34 | # # If this raises an exception then we'll be notified via Airflow 35 | # valid_until = datetime.fromtimestamp(resp['timestamp']) 36 | # 37 | # for iso2, price in resp['stocks'].items(): 38 | # # If converting the price to a float fails for whatever reason then 39 | # # just move on. 40 | # try: 41 | # price = float(price) 42 | # except: 43 | # continue 44 | # 45 | # iso2 = iso2.upper().strip() 46 | # 47 | # if iso2 not in stocks or price < 0: 48 | # continue 49 | # 50 | # pg_hook.run(stocks_insert, parameters=(iso2, 51 | # valid_until, 52 | # price)) 53 | # 54 | # 55 | # def cache_latest_stocks(ds, **kwargs): 56 | # redis_conn = redis.StrictRedis(host='redis') 57 | # pg_hook = postgres_hook(postgres_conn_id='stocks') 58 | # latest_stocks = """SELECT DISTINCT ON (symbol) 59 | # symbol, price 60 | # FROM stocks 61 | # ORDER BY symbol, valid_until DESC;""" 62 | # 63 | # for iso2, stock in pg_hook.get_records(latest_stocks): 64 | # redis_conn.set(iso2, stock) 65 | # 66 | # 67 | # args = { 68 | # 'owner': 'ryan', 69 | # 'depends_on_past': False, 70 | # 'start_date': datetime.utcnow(), 71 | # 'retries': 1, 72 | # 'retry_delay': timedelta(minutes=5), 73 | # } 74 | # 75 | # # Run at the top of the hour Monday to Friday. 76 | # # Note: This doesn't line up with the market hours of 77 | # # 10PM Sunday till 10PM Friday GMT. 78 | # dag = DAG(dag_id='stocks', 79 | # default_args=args, 80 | # schedule_interval='0 * * * 1,2,3,4,5', 81 | # dagrun_timeout=timedelta(seconds=30)) 82 | # 83 | # # loop through the lob's we want to use to build up our dag 84 | # for stock in stocks: 85 | # get_stocks_task = \ 86 | # PythonOperator(task_id='get_stocks', 87 | # provide_context=True, 88 | # op_kwargs={"stock": stock}, 89 | # python_callable=get_stocks, 90 | # dag=dag) 91 | # 92 | # cache_latest_stocks_task = \ 93 | # PythonOperator(task_id='cache_latest_stocks', 94 | # provide_context=True, 95 | # python_callable=cache_latest_stocks, 96 | # dag=dag) 97 | # 98 | # get_stocks_task.set_downstream(cache_latest_stocks_task) 99 | -------------------------------------------------------------------------------- /07-airflow/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:master-python3.8 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_GID - Group ID in Airflow containers 31 | # Default: 50000 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. 35 | # Default: airflow 36 | # 37 | # Feel free to modify this file to suit your needs. 38 | --- 39 | version: '3' 40 | x-airflow-common: 41 | &airflow-common 42 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.0.2} 43 | environment: 44 | &airflow-common-env 45 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 46 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 47 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 48 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 49 | AIRFLOW__CORE__FERNET_KEY: '' 50 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 51 | AIRFLOW__CORE__LOAD_EXAMPLES: 'true' 52 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 53 | volumes: 54 | - ./dags:/opt/airflow/dags 55 | - ./logs:/opt/airflow/logs 56 | - ./plugins:/opt/airflow/plugins 57 | user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" 58 | depends_on: 59 | redis: 60 | condition: service_healthy 61 | postgres: 62 | condition: service_healthy 63 | # NOTE: the stock DAGs import yfinance and pandas; install them into the Airflow 64 | # containers by extending the image or, on Airflow >= 2.1.1, via _PIP_ADDITIONAL_REQUIREMENTS 65 | 66 | services: 67 | postgres: 68 | image: postgres:13 69 | environment: 70 | POSTGRES_USER: airflow 71 | POSTGRES_PASSWORD: airflow 72 | POSTGRES_DB: airflow 73 | volumes: 74 | - postgres-db-volume:/var/lib/postgresql/data 75 | healthcheck: 76 | test: [ "CMD", "pg_isready", "-U", "airflow" ] 77 | interval: 5s 78 | retries: 5 79 | restart: always 80 | 81 | redis: 82 | image: redis:latest 83 | ports: 84 | - 6379:6379 85 | healthcheck: 86 | test: [ "CMD", "redis-cli", "ping" ] 87 | interval: 5s 88 | timeout: 30s 89 | retries: 50 90 | restart: always 91 | 92 | airflow-webserver: 93 | <<: *airflow-common 94 | command: webserver 95 | ports: 96 | - 8080:8080 97 | healthcheck: 98 | test: [ "CMD", "curl", "--fail", "http://localhost:8080/health" ] 99 | interval: 10s 100 | timeout: 10s 101 | retries: 5 102 | restart: always 103 | 104 | airflow-scheduler: 105 | <<: *airflow-common 106 | command: scheduler 107 |
restart: always 108 | 109 | airflow-worker: 110 | <<: *airflow-common 111 | command: celery worker 112 | restart: always 113 | 114 | airflow-init: 115 | <<: *airflow-common 116 | command: version 117 | environment: 118 | <<: *airflow-common-env 119 | _AIRFLOW_DB_UPGRADE: 'true' 120 | _AIRFLOW_WWW_USER_CREATE: 'true' 121 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 122 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 123 | 124 | flower: 125 | <<: *airflow-common 126 | command: celery flower 127 | ports: 128 | - 5555:5555 129 | healthcheck: 130 | test: [ "CMD", "curl", "--fail", "http://localhost:5555/" ] 131 | interval: 10s 132 | timeout: 10s 133 | retries: 5 134 | restart: always 135 | 136 | volumes: 137 | postgres-db-volume: 138 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Ryan S. McCoy 9 | 10 | Contributors 11 | ------------ 12 | 13 | None yet. Why not be the first? 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every little bit 8 | helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | Report Bugs 16 | ~~~~~~~~~~~ 17 | 18 | Report bugs at https://github.com/ryansmccoy/spreadsheets_to_dataframes/issues. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | ~~~~~~~~ 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 30 | wanted" is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ~~~~~~~~~~~~~~~~~~ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "enhancement" 36 | and "help wanted" is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ~~~~~~~~~~~~~~~~~~~ 40 | 41 | Spreadsheets to DataFrames could always use more documentation, whether as part of the 42 | official Spreadsheets to DataFrames docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 44 | 45 | Submit Feedback 46 | ~~~~~~~~~~~~~~~ 47 | 48 | The best way to send feedback is to file an issue at https://github.com/ryansmccoy/spreadsheets_to_dataframes/issues. 49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ------------ 59 | 60 | Ready to contribute? Here's how to set up `spreadsheets_to_dataframes` for local development. 61 | 62 | 1. Fork the `spreadsheets_to_dataframes` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/spreadsheets_to_dataframes.git 66 | 67 | 3. Install your local copy into a virtualenv. 
Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 68 | 69 | $ mkvirtualenv spreadsheets_to_dataframes 70 | $ cd spreadsheets_to_dataframes/ 71 | $ python setup.py develop 72 | 73 | 4. Create a branch for local development:: 74 | 75 | $ git checkout -b name-of-your-bugfix-or-feature 76 | 77 | Now you can make your changes locally. 78 | 79 | 5. When you're done making changes, check that your changes pass flake8 and the 80 | tests, including testing other Python versions with tox:: 81 | 82 | $ flake8 spreadsheets_to_dataframes tests 83 | $ python setup.py test or pytest 84 | $ tox 85 | 86 | To get flake8 and tox, just pip install them into your virtualenv. 87 | 88 | 6. Commit your changes and push your branch to GitHub:: 89 | 90 | $ git add . 91 | $ git commit -m "Your detailed description of your changes." 92 | $ git push origin name-of-your-bugfix-or-feature 93 | 94 | 7. Submit a pull request through the GitHub website. 95 | 96 | Pull Request Guidelines 97 | ----------------------- 98 | 99 | Before you submit a pull request, check that it meets these guidelines: 100 | 101 | 1. The pull request should include tests. 102 | 2. If the pull request adds functionality, the docs should be updated. Put 103 | your new functionality into a function with a docstring, and add the 104 | feature to the list in README.rst. 105 | 3. The pull request should work for Python 2.7, 3.5, 3.6 and 3.7, and for PyPy. Check 106 | https://travis-ci.org/ryansmccoy/spreadsheets_to_dataframes/pull_requests 107 | and make sure that the tests pass for all supported Python versions. 108 | 109 | Tips 110 | ---- 111 | 112 | To run a subset of tests:: 113 | 114 | $ pytest tests.test_spreadsheets_to_dataframes 115 | 116 | 117 | Deploying 118 | --------- 119 | 120 | A reminder for the maintainers on how to deploy. 121 | Make sure all your changes are committed (including an entry in HISTORY.rst). 122 | Then run:: 123 | 124 | $ bump2version patch # possible: major / minor / patch 125 | $ git push 126 | $ git push --tags 127 | 128 | Travis will then deploy to PyPI if tests pass. 129 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 0.1.0 (2019-10-09) 6 | ------------------ 7 | 8 | * First release on PyPI. 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019, Ryan S. McCoy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | 4 | define BROWSER_PYSCRIPT 5 | import os, webbrowser, sys 6 | 7 | try: 8 | from urllib import pathname2url 9 | except: 10 | from urllib.request import pathname2url 11 | 12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 13 | endef 14 | export BROWSER_PYSCRIPT 15 | 16 | define PRINT_HELP_PYSCRIPT 17 | import re, sys 18 | 19 | for line in sys.stdin: 20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 21 | if match: 22 | target, help = match.groups() 23 | print("%-20s %s" % (target, help)) 24 | endef 25 | export PRINT_HELP_PYSCRIPT 26 | 27 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 28 | 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 33 | 34 | clean-build: ## remove build artifacts 35 | rm -fr build/ 36 | rm -fr dist/ 37 | rm -fr .eggs/ 38 | find . -name '*.egg-info' -exec rm -fr {} + 39 | find . -name '*.egg' -exec rm -f {} + 40 | 41 | clean-pyc: ## remove Python file artifacts 42 | find . -name '*.pyc' -exec rm -f {} + 43 | find . -name '*.pyo' -exec rm -f {} + 44 | find . -name '*~' -exec rm -f {} + 45 | find . -name '__pycache__' -exec rm -fr {} + 46 | 47 | clean-test: ## remove test and coverage artifacts 48 | rm -fr .tox/ 49 | rm -f .coverage 50 | rm -fr htmlcov/ 51 | rm -fr .pytest_cache 52 | 53 | lint: ## check style with flake8 54 | flake8 spreadsheets_to_dataframes tests 55 | 56 | test: ## run tests quickly with the default Python 57 | pytest 58 | 59 | test-all: ## run tests on every Python version with tox 60 | tox 61 | 62 | coverage: ## check code coverage quickly with the default Python 63 | coverage run --source spreadsheets_to_dataframes -m pytest 64 | coverage report -m 65 | coverage html 66 | $(BROWSER) htmlcov/index.html 67 | 68 | docs: ## generate Sphinx HTML documentation, including API docs 69 | rm -f docs/spreadsheets_to_dataframes.rst 70 | rm -f docs/modules.rst 71 | sphinx-apidoc -o docs/ spreadsheets_to_dataframes 72 | $(MAKE) -C docs clean 73 | $(MAKE) -C docs html 74 | $(BROWSER) docs/_build/html/index.html 75 | 76 | servedocs: docs ## compile the docs watching for changes 77 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 
78 | 79 | release: dist ## package and upload a release 80 | twine upload dist/* 81 | 82 | dist: clean ## builds source and wheel package 83 | python setup.py sdist 84 | python setup.py bdist_wheel 85 | ls -l dist 86 | 87 | install: clean ## install the package to the active Python's site-packages 88 | python setup.py install 89 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ======================================================================================= 2 | From Spreadsheets to DataFrames: Escaping Excel Hell with Python 3 | ======================================================================================= 4 | 5 | ============================================================================================================================================================================== 6 | 7 | `Pycon 2021 Tutorial Video [YouTube] - May 12, 2021 `_ 8 | 9 | 10 | Other Presentations: 11 | 12 | `STL Python Presentation [YouTube] `_ 13 | 14 | `Chicago Python Users Group [YouTube] `_ 15 | 16 | Details 17 | 18 | A spreadsheet is a wonderful invention and an excellent tool for certain jobs. All too often, however, spreadsheets are called upon to perform tasks that are beyond their capabilities. It’s like the old saying, 'If the only tool you have is a hammer, every problem looks like a nail.' However, some problems are better addressed with a screwdriver, with glue, or with a Swiss Army Knife. 19 | 20 | Python is described by some in the programming world as the Swiss Army Knife of programming languages because of its unrivaled versatility and flexibility in use. This allows its users to solve complex problems relatively easily compared with other programming languages and is one of the reasons why Python has become increasingly popular over time. 21 | 22 | In this tutorial, we’ll briefly discuss spreadsheets, signs that you might be living in “Excel Hell”, and then we’ll spend the rest of the time learning how to escape it using Python. 23 | 24 | In the first section, we’ll build on what spreadsheet users already know about cells, rows, columns, and formulas, and map them to their Python equivalents, such as variables, lists, dictionaries, and functions. At the end of this section, we’ll do an interactive exercise and learn how we can perform a simple calculation, similar to one you might do in Excel, but instead using Python. 25 | 26 | In the second section, we’ll discuss (and attempt) how we can perform more complex tasks, including web scraping, data processing, analysis, and visualization, by utilizing a few popular third-party libraries, including Requests, Pandas, Flask, Matplotlib, and others. 27 | 28 | In the last section, we’ll round out our discussion with a few important concepts in data management, including the concept of tidy data, building a data pipeline, and a few strategies (and packages) to use when approaching various data problems, including a demo using Apache Airflow.
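As a quick taste of that first-section mapping, here is a minimal sketch (the numbers and names below are made up for illustration) of an Excel-style calculation done in Python:

.. code-block:: python

    # a range of cells (say A1:A4) becomes a Python list
    monthly_sales = [1200, 950, 1100, 1300]

    # a formula like =SUM(A1:A4) becomes a built-in function call
    total_sales = sum(monthly_sales)

    # a lookup table (think VLOOKUP) maps naturally onto a dictionary
    region = {"STL": "Midwest", "NYC": "Northeast"}

    print(total_sales, region["STL"])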
29 | 30 | Slides 31 | ====================== 32 | 33 | `Intro [Slides] `_ 34 | 35 | `Excel to Python [Slides] `_ 36 | 37 | `Python Libraries & Resources [Slides] `_ 38 | 39 | `Data Management [Slides] `_ 40 | 41 | Tutorial Code 42 | ====================== 43 | 44 | Section 1 - Python Fundamentals for an Excel User 45 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 46 | 47 | `01 basics_but_important_stuff.ipynb `_ 48 | 49 | `02 files_lists_dictionaries.ipynb `_ 50 | 51 | Section 1 - Challenges 52 | ~~~~~~~~~~~~~~~~~~~~~~~ 53 | 54 | `challenge_1.py `_ 55 | 56 | `challenge_1_answer.py `_ 57 | 58 | `challenge_2.py `_ 59 | 60 | `challenge_2_answer.py `_ 61 | 62 | `challenge_3.py `_ 63 | 64 | `challenge_3_answer.py `_ 65 | 66 | Section 2 - Real-World Python Example for an Excel User 67 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 68 | 69 | `01-real-world-example.py `_ 70 | 71 | `02-real-world-example-refactored.py `_ 72 | 73 | Section 2 - Challenge 74 | ~~~~~~~~~~~~~~~~~~~~~~~ 75 | 76 | `section2_challenge.rst `_ 77 | 78 | 79 | Section 3 - Best Practices in Python & Data for an Excel User 80 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 81 | 82 | `Data Management [Slides] `_ 83 | 84 | `07-airflow `_ 85 | 86 | STL Python - Talk Code 87 | ====================== 88 | 89 | `01-basics.ipynb `_ 90 | 91 | `02-webscraping.ipynb `_ 92 | 93 | `03-tidy-data.ipynb `_ 94 | 95 | `04-pandas.ipynb `_ 96 | 97 | `05-data-analysis.ipynb `_ 98 | 99 | `06-data-visualizations.ipynb `_ 100 | 101 | STL Python - Folders 102 | =================================================== 103 | 104 | * 01-basics - examples used in presentation 105 | * 02-webscraping - program that clicks through a calendar (written in javascript) and exports csv files 106 | * 02-selenium-safari - program that logs in to a website, scrapes html from a javascript-generated page, cleans the html, and exports to pdf files 107 | * 02-webscrape-celery - example that uses a message queue and celery to download a list of urls 108 | * 04-other-analysis - examples of different quantitative notebooks 109 | * 05-other-visualizations - examples of different data visualization tools 110 | * 06-flask - different flask examples 111 | * 07-airflow - example that uses airflow to download and store stock prices 112 | 113 | Quick Start Guides 114 | ====================== 115 | 116 | 117 | `Install Anaconda & Pycharm `_ 118 | 119 | * Anaconda = manages your Python environments 120 | 121 | * Pycharm = code editor 122 | 123 | `Install Git `_ - Allows you to git clone/download GitHub projects 124 | 125 | Setup Environment & Run Example (Windows): 126 | ================================================== 127 | 128 | .. code-block:: bash 129 | 130 | $ git clone https://github.com/ryansmccoy/spreadsheets-to-dataframes.git 131 | $ cd spreadsheets-to-dataframes 132 | $ conda create -n spreadsheets-to-dataframes python=3.8 pandas scipy numpy lxml jupyter matplotlib -y 133 | $ activate spreadsheets-to-dataframes 134 | $ pip install -r requirements_dev.txt 135 | 136 | Setup Environment & Run Example (Linux): 137 | ================================================== 138 | 139 | .. 
code-block:: bash 140 | 141 | $ git clone https://github.com/ryansmccoy/spreadsheets-to-dataframes.git 142 | $ cd spreadsheets-to-dataframes 143 | $ conda create -n spreadsheets-to-dataframes python=3.8 pandas scipy numpy lxml jupyter matplotlib -y 144 | $ source activate spreadsheets-to-dataframes 145 | $ pip install -r requirements_dev.txt 146 | 147 | Running Jupyter Notebooks: 148 | ================================================== 149 | 150 | Navigate to spreadsheet-to-dataframe directory/folder: 151 | 152 | .. code-block:: bash 153 | 154 | $ activate spreadsheets-to-dataframes 155 | $ jupyter notebook 156 | 157 | (Optional) Install Docker to Run Airflow Example 158 | =================================================== 159 | 160 | https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html 161 | 162 | Python Books & Videos: 163 | =================================================== 164 | 165 | `(Book) Python Crash Course, 2nd Edition `_ 166 | 167 | `(Book) Introducing Python: Modern Computing in Simple Packages `_ 168 | 169 | `(Book) Learning Python, 5th Edition `_ 170 | 171 | `(Book) Automate the Boring Stuff with Python, 2nd Edition: Practical Programming for Total Beginners `_ 172 | 173 | `(Book) Think Python: How to Think Like a Computer Scientist `_ 174 | 175 | `(Book) The Quick Python Book (Book) `_ 176 | 177 | `(Book) Serious Python: Black-Belt Advice on Deployment, Scalability, Testing, and More `_ 178 | 179 | `(Github) A Whirlwind Tour of Python `_ 180 | 181 | `(Github) Python Data Science Handbook `_ 182 | 183 | `(Github) Introduction to Python `_ 184 | 185 | Cookiecutter: 186 | =================================================== 187 | 188 | $ pip install cookiecutter 189 | 190 | Resources: 191 | 192 | https://github.com/cookiecutter/cookiecutter 193 | 194 | https://github.com/audreyfeldroy/cookiecutter-pypackage 195 | 196 | https://towardsdatascience.com/cookiecutter-creating-custom-reusable-project-templates-fc85c8627b07 197 | 198 | Requests 199 | =================================================== 200 | 201 | $ pip install requests 202 | 203 | Resources: 204 | 205 | https://python.readthedocs.io/en/stable/library/stdtypes.html 206 | 207 | https://realpython.com/python-requests/ 208 | 209 | Have you mastered Requests? Then you should check out multithreading, concurrency, asyncio, message queues, parallelism. 210 | 211 | https://yasoob.me/2019/05/29/speeding-up-python-code-using-multithreading/ 212 | 213 | https://www.toptal.com/python/beginners-guide-to-concurrency-and-parallelism-in-python 214 | 215 | https://creativedata.stream/multi-threading-api-requests-in-python/ 216 | 217 | https://levelup.gitconnected.com/asynchronous-tasks-in-python-with-celery-rabbitmq-redis-480f6e506d76 218 | 219 | https://tests4geeks.com/blog/python-celery-rabbitmq-tutorial/ 220 | 221 | https://codeburst.io/automated-web-scraping-with-python-and-celery-ac02a4a9ce51 222 | 223 | https://github.com/ryansmccoy/zmq-high-speed-subs 224 | 225 | 226 | Pandas 227 | =================================================== 228 | 229 | $ pip install pandas 230 | 231 | Resources: 232 | 233 | `Dealing With Data `_ 234 | 235 | `Pandas Cookbook `_ 236 | 237 | `brandon-rhodes\pycon-pandas-tutorial `_ 238 | 239 | `Python pandas Q&A video series `_ 240 | 241 | `Master Data Analysis with Python `_ 242 | 243 | Have you mastered Pandas? Then you should check out Dask and Spark. 
244 | 245 | https://dask.org/ 246 | 247 | https://spark.apache.org/docs/latest/api/python/ 248 | 249 | Visualization: 250 | =================================================== 251 | 252 | $ pip install matplotlib 253 | 254 | Resources: 255 | 256 | https://github.com/fasouto/awesome-dataviz 257 | 258 | https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html 259 | 260 | https://www.toptal.com/designers/data-visualization/data-visualization-tools 261 | 262 | https://realpython.com/pandas-plot-python/ 263 | 264 | Have you mastered Matplotlib? Then you should check out JavaScript, D3, React, Tableau 265 | 266 | Flask: 267 | =================================================== 268 | 269 | $ pip install flask 270 | 271 | Resources: 272 | 273 | https://www.fullstackpython.com/flask.html 274 | 275 | https://blog.miguelgrinberg.com/ 276 | 277 | Have you mastered Flask? Then you should check out FastAPI, JavaScript, Node, React 278 | -------------------------------------------------------------------------------- /data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx -------------------------------------------------------------------------------- /data/WMT_US.csv: -------------------------------------------------------------------------------- 1 | Ticker,Company Name,Year End,Total Sales,Total Expenses 2 | WMT US,WAL-MART STORES INC,12/31/2014,476293988352,460271988736 3 | WMT US,WAL-MART STORES INC,12/31/2013,469162000384,452163000320 4 | WMT US,WAL-MART STORES INC,12/31/2012,446950014976,431251014656 5 | WMT US,WAL-MART STORES INC,12/31/2011,421849006080,405460005888 6 | WMT US,WAL-MART STORES INC,12/31/2010,408214011904,393879012352 7 | WMT US,WAL-MART STORES INC,12/31/2009,405606989824,392206989312 8 | WMT US,WAL-MART STORES INC,12/31/2008,378798997504,366067997696 9 | WMT US,WAL-MART STORES INC,12/31/2007,348650012672,337366012928 10 | WMT US,WAL-MART STORES INC,12/31/2006,312426987520,301195987968 11 | WMT US,WAL-MART STORES INC,12/31/2005,287989006336,277722006528 12 | WMT US,WAL-MART STORES INC,12/31/2004,256329007104,247275006976 13 | WMT US,WAL-MART STORES INC,12/31/2003,229615992832,221660993024 14 | -------------------------------------------------------------------------------- /data/WMT_US_pandas.csv: -------------------------------------------------------------------------------- 1 | ,Ticker,Company Name,Year End,Total Sales,Total Expenses,Total Profit 2 | 0,WMT US,WAL-MART STORES INC,12/31/2014,476293988352,460271988736,16021999616 3 | 1,WMT US,WAL-MART STORES INC,12/31/2013,469162000384,452163000320,16999000064 4 | 2,WMT US,WAL-MART STORES INC,12/31/2012,446950014976,431251014656,15699000320 5 | 3,WMT US,WAL-MART STORES INC,12/31/2011,421849006080,405460005888,16389000192 6 | 4,WMT US,WAL-MART STORES INC,12/31/2010,408214011904,393879012352,14334999552 7 | 5,WMT US,WAL-MART STORES INC,12/31/2009,405606989824,392206989312,13400000512 8 | 6,WMT US,WAL-MART STORES INC,12/31/2008,378798997504,366067997696,12730999808 9 | 7,WMT US,WAL-MART STORES INC,12/31/2007,348650012672,337366012928,11283999744 10 | 8,WMT US,WAL-MART STORES INC,12/31/2006,312426987520,301195987968,11230999552 11 | 9,WMT US,WAL-MART STORES INC,12/31/2005,287989006336,277722006528,10266999808 12 | 10,WMT US,WAL-MART STORES INC,12/31/2004,256329007104,247275006976,9054000128 13 | 
11,WMT US,WAL-MART STORES INC,12/31/2003,229615992832,221660993024,7954999808 14 | -------------------------------------------------------------------------------- /data/WMT_US_updated.csv: -------------------------------------------------------------------------------- 1 | ,ticker,name,date,sales,expenses,profit 2 | 0,WMT US,WAL-MART STORES INC,2014-12-31,476293988352,460271988736,16021999616 3 | 1,WMT US,WAL-MART STORES INC,2013-12-31,469162000384,452163000320,16999000064 4 | 2,WMT US,WAL-MART STORES INC,2012-12-31,446950014976,431251014656,15699000320 5 | 3,WMT US,WAL-MART STORES INC,2011-12-31,421849006080,405460005888,16389000192 6 | 4,WMT US,WAL-MART STORES INC,2010-12-31,408214011904,393879012352,14334999552 7 | 5,WMT US,WAL-MART STORES INC,2009-12-31,405606989824,392206989312,13400000512 8 | 6,WMT US,WAL-MART STORES INC,2008-12-31,378798997504,366067997696,12730999808 9 | 7,WMT US,WAL-MART STORES INC,2007-12-31,348650012672,337366012928,11283999744 10 | 8,WMT US,WAL-MART STORES INC,2006-12-31,312426987520,301195987968,11230999552 11 | 9,WMT US,WAL-MART STORES INC,2005-12-31,287989006336,277722006528,10266999808 12 | 10,WMT US,WAL-MART STORES INC,2004-12-31,256329007104,247275006976,9054000128 13 | 11,WMT US,WAL-MART STORES INC,2003-12-31,229615992832,221660993024,7954999808 14 | -------------------------------------------------------------------------------- /data/country_timeseries.csv: -------------------------------------------------------------------------------- 1 | Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali 2 | 1/5/2015,289,2776,,10030,,,,,,1786,,2977,,,,, 3 | 1/4/2015,288,2775,,9780,,,,,,1781,,2943,,,,, 4 | 1/3/2015,287,2769,8166,9722,,,,,,1767,3496,2915,,,,, 5 | 1/2/2015,286,,8157,,,,,,,,3496,,,,,, 6 | 12/31/2014,284,2730,8115,9633,,,,,,1739,3471,2827,,,,, 7 | 12/28/2014,281,2706,8018,9446,,,,,,1708,3423,2758,,,,, 8 | 12/27/2014,280,2695,,9409,,,,,,1697,,2732,,,,, 9 | 12/24/2014,277,2630,7977,9203,,,,,,,3413,2655,,,,, 10 | 12/21/2014,273,2597,,9004,,,,,,1607,,2582,,,,, 11 | 12/20/2014,272,2571,7862,8939,,,,,,1586,3384,2556,,,,, 12 | 12/18/2014,271,,7830,,,,,,,,3376,,,,,, 13 | 12/14/2014,267,2416,,8356,,,,,,1525,,2085,,,,, 14 | 12/9/2014,262,,7797,,,,,,,,3290,,,,,, 15 | 12/7/2014,260,2292,,7897,20,1,4,1,7,1428,,1768,8,0,1,0,6 16 | 12/3/2014,256,,7719,,,,,,,,3177,,,,,, 17 | 11/30/2014,253,2164,,7312,20,1,4,1,7,1327,,1583,8,0,1,0,6 18 | 11/28/2014,251,,7635,,,,,,,,3145,,,,,, 19 | 11/23/2014,246,2134,,6599,20,1,4,1,7,1260,,1398,8,0,1,0,6 20 | 11/22/2014,245,,7168,,,,,,,,3016,,,,,, 21 | 11/18/2014,241,2047,7082,6190,20,1,4,1,6,1214,2963,1267,8,0,1,0,6 22 | 11/16/2014,239,1971,,6073,20,1,4,1,5,1192,,1250,8,0,1,0,5 23 | 11/15/2014,238,,7069,,,,,,,,2964,,,,,, 24 | 11/11/2014,234,1919,,5586,20,1,4,1,4,1166,,1187,8,0,1,0,3 25 | 11/10/2014,233,,6878,,,,,,,,2812,,,,,, 26 | 11/9/2014,232,1878,,5368,20,1,4,1,1,1142,,1169,8,0,1,0,1 27 | 11/8/2014,231,,6822,,,,,,,,2836,,,,,, 28 | 11/4/2014,227,,6619,4862,20,1,4,1,1,,2766,1130,8,0,1,0,1 29 | 11/3/2014,226,1760,,,,,,,,1054,,,,,,, 30 | 11/2/2014,225,1731,,4759,20,1,4,1,1,1041,,1070,8,0,1,0,1 31 | 10/31/2014,222,,6525,,,,,,,,2697,,,,,, 32 | 10/29/2014,220,1667,,5338,20,1,4,1,1,1018,,1510,8,0,1,0,1 33 | 10/27/2014,218,1906,,5235,20,1,4,1,1,997,,1500,8,0,1,0,1 34 | 10/25/2014,216,,6535,,,,,,,,2413,,,,,, 35 | 
10/22/2014,214,,,3896,,,4,1,1,,,1281,,,1,0,1 36 | 10/21/2014,213,1553,,,,,,,,926,,,,,,, 37 | 10/19/2014,211,1540,,3706,20,1,3,1,,904,,1259,8,0,1,0, 38 | 10/18/2014,210,,4665,,,,,,,,2705,,,,,, 39 | 10/14/2014,206,1519,,3410,20,1,3,1,,862,,1200,8,0,0,1, 40 | 10/13/2014,205,,4262,,,,,,,,2484,,,,,, 41 | 10/12/2014,204,1472,,3252,20,1,2,1,,843,,1183,8,0,1,1, 42 | 10/11/2014,203,,4249,,,,,,,,2458,,,,,, 43 | 10/8/2014,200,,,2950,20,1,1,1,,,,930,8,0,1,1, 44 | 10/7/2014,199,1350,4076,,,,,,,778,2316,,,,,, 45 | 10/5/2014,197,1298,,2789,20,1,1,,,768,,879,8,0,0,, 46 | 10/4/2014,196,,3924,,,,,,,,2210,,,,,, 47 | 10/1/2014,193,1199,3834,2437,20,1,1,,,739,2069,623,8,0,0,, 48 | 9/28/2014,190,1157,3696,2304,20,1,,,,710,1998,622,8,0,,, 49 | 9/23/2014,185,1074,3458,2021,20,1,,,,648,1830,605,8,0,,, 50 | 9/21/2014,183,1022,3280,1940,20,1,,,,635,1677,597,8,0,,, 51 | 9/20/2014,182,,,1813,,,,,,,,593,,,,, 52 | 9/19/2014,181,1008,,,,,,,,632,,,,,,, 53 | 9/17/2014,179,,3022,,,,,,,,1578,,,,,, 54 | 9/14/2014,176,942,2710,1673,,,,,,601,1459,562,,,,, 55 | 9/13/2014,175,936,,1620,21,1,,,,595,1296,562,8,0,,, 56 | 9/10/2014,172,899,,1478,21,1,,,,568,,536,8,,,, 57 | 9/9/2014,171,,2407,,,,,,,,,,,,,, 58 | 9/7/2014,169,861,2081,1424,21,3,,,,557,1137,524,8,0,,, 59 | 9/5/2014,167,812,1871,1261,22,1,,,,517,1089,491,8,,,, 60 | 8/31/2014,162,771,1698,1216,21,1,,,,494,871,476,7,,,, 61 | 8/26/2014,157,648,1378,1026,17,,,,,430,694,422,6,,,, 62 | 8/20/2014,151,607,1082,910,16,,,,,406,624,392,5,,,, 63 | 8/18/2014,149,579,972,907,15,,,,,396,576,374,4,,,, 64 | 8/16/2014,147,543,834,848,15,,,,,394,466,365,4,,,, 65 | 8/13/2014,144,519,786,810,12,,,,,380,413,348,4,,,, 66 | 8/11/2014,142,510,670,783,12,,,,,377,355,334,3,,,, 67 | 8/9/2014,140,506,599,730,13,,,,,373,323,315,2,,,, 68 | 8/6/2014,137,495,554,717,13,,,,,367,294,298,2,,,, 69 | 8/4/2014,135,495,516,691,9,,,,,363,282,286,1,,,, 70 | 8/1/2014,132,485,468,646,4,,,,,358,255,273,1,,,, 71 | 7/30/2014,129,472,391,574,3,,,,,346,227,252,1,,,, 72 | 7/27/2014,126,460,329,533,1,,,,,339,156,233,1,,,, 73 | 7/23/2014,123,427,249,525,0,,,,,319,129,224,0,,,, 74 | 7/20/2014,120,415,224,454,,,,,,314,127,219,,,,, 75 | 7/17/2014,117,410,196,442,,,,,,310,116,206,,,,, 76 | 7/14/2014,114,411,174,397,,,,,,310,106,197,,,,, 77 | 7/12/2014,112,406,172,386,,,,,,304,105,194,,,,, 78 | 7/8/2014,108,409,142,337,,,,,,309,88,142,,,,, 79 | 7/6/2014,106,408,131,305,,,,,,307,84,127,,,,, 80 | 7/2/2014,102,412,115,252,,,,,,305,75,101,,,,, 81 | 6/30/2014,100,413,107,239,,,,,,303,65,99,,,,, 82 | 6/22/2014,92,,51,,,,,,,,34,,,,,, 83 | 6/20/2014,90,390,,158,,,,,,270,,34,,,,, 84 | 6/19/2014,89,,41,,,,,,,,25,,,,,, 85 | 6/18/2014,88,390,,136,,,,,,267,,28,,,,, 86 | 6/17/2014,87,,,97,,,,,,,,49,,,,, 87 | 6/16/2014,86,398,33,,,,,,,264,24,,,,,, 88 | 6/10/2014,80,351,13,89,,,,,,226,24,7,,,,, 89 | 6/5/2014,75,,13,81,,,,,,,,6,,,,, 90 | 6/3/2014,73,344,13,,,,,,,215,12,6,,,,, 91 | 6/1/2014,71,328,13,79,,,,,,208,12,6,,,,, 92 | 5/28/2014,67,291,13,50,,,,,,193,12,6,,,,, 93 | 5/27/2014,66,281,12,16,,,,,,186,11,5,,,,, 94 | 5/23/2014,62,258,12,0,,,,,,174,11,0,,,,, 95 | 5/12/2014,51,248,12,0,,,,,,171,11,0,,,,, 96 | 5/10/2014,49,233,12,0,,,,,,157,11,0,,,,, 97 | 5/7/2014,46,236,13,0,,,,,,158,11,0,,,,, 98 | 5/5/2014,44,235,13,0,,,,,,157,11,0,,,,, 99 | 5/3/2014,42,231,13,0,,,,,,155,11,0,,,,, 100 | 5/1/2014,40,226,13,0,,,,,,149,11,0,,,,, 101 | 4/26/2014,35,224,,0,,,,,,143,,0,,,,, 102 | 4/24/2014,33,,35,0,,,,,,,,0,,,,, 103 | 4/23/2014,32,218,,0,,,,,,141,,0,,,,, 104 | 4/22/2014,31,,,0,,,,,,,,0,,,,, 105 | 4/21/2014,30,,34,,,,,,,,11,,,,,, 106 | 
4/20/2014,29,208,,,,,,,,136,6,,,,,, 107 | 4/17/2014,26,203,27,,,,,,,129,,,,,,, 108 | 4/16/2014,25,197,27,,,,,,,122,13,,,,,, 109 | 4/15/2014,24,,,12,,,,,,,,,,,,, 110 | 4/14/2014,23,168,,,,,,,,108,,,,,,, 111 | 4/11/2014,20,159,26,2,,,,,,106,13,2,,,,, 112 | 4/9/2014,18,158,25,2,,,,,,101,12,2,,,,, 113 | 4/7/2014,16,151,21,2,,,,,,95,10,2,,,,, 114 | 4/4/2014,13,143,18,2,,,,,,86,7,2,,,,, 115 | 4/1/2014,10,127,8,2,,,,,,83,5,2,,,,, 116 | 3/31/2014,9,122,8,2,,,,,,80,4,2,,,,, 117 | 3/29/2014,7,112,7,,,,,,,70,2,,,,,, 118 | 3/28/2014,6,112,3,2,,,,,,70,3,2,,,,, 119 | 3/27/2014,5,103,8,6,,,,,,66,6,5,,,,, 120 | 3/26/2014,4,86,,,,,,,,62,,,,,,, 121 | 3/25/2014,3,86,,,,,,,,60,,,,,,, 122 | 3/24/2014,2,86,,,,,,,,59,,,,,,, 123 | 3/22/2014,0,49,,,,,,,,29,,,,,,, -------------------------------------------------------------------------------- /data/fortune_1000.csv: -------------------------------------------------------------------------------- 1 | rank,name,industry,location,employees,revenues_millions 2 | 1,Walmart,General Merchandisers,"Bentonville, AR","2,200,000","$523,964 " 3 | 2,Amazon,Internet Services and Retailing,"Seattle, WA","798,000","$280,522 " 4 | 3,Exxon Mobil,Petroleum Refining,"Irving, TX","74,900","$264,938 " 5 | 4,Apple,"Computers, Office Equipment","Cupertino, CA","137,000","$260,174 " 6 | 5,CVS Health,Food and Drug Stores,"Woonsocket, RI","290,000","$256,776 " 7 | 6,Berkshire Hathaway,Insurance: Property and Casualty (Stock),"Omaha, NE","391,500","$254,616 " 8 | 7,UnitedHealth Group,Health Care: Insurance and Managed Care,"Minnetonka, MN","325,000","$242,155 " 9 | 8,McKesson,Wholesalers: Health Care,"San Francisco, CA","70,000","$214,319 " 10 | 9,AT&T,Telecommunications,"Dallas, TX","247,800","$181,193 " 11 | 10,AmerisourceBergen,Wholesalers: Health Care,"Chesterbrook, PA","21,500","$179,589 " 12 | 12,Ford Motor,Motor Vehicles and Parts,"Dearborn, MI","190,000","$155,900 " 13 | 13,Cigna,Health Care: Insurance and Managed Care,"Bloomfield, CT","73,700","$153,566 " 14 | 14,Costco Wholesale,General Merchandisers,"Issaquah, WA","201,500","$152,703 " 15 | 15,Chevron,Petroleum Refining,"San Ramon, CA","48,200","$146,516 " 16 | 16,Cardinal Health,Wholesalers: Health Care,"Dublin, OH","49,500","$145,534 " 17 | 17,JPMorgan Chase,Commercial Banks,"New York, NY","256,981","$142,422 " 18 | 18,General Motors,Motor Vehicles and Parts,"Detroit, MI","164,000","$137,237 " 19 | 19,Walgreens Boots Alliance,Food and Drug Stores,"Deerfield, IL","287,000","$136,866 " 20 | 20,Verizon Communications,Telecommunications,"New York, NY","135,000","$131,868 " 21 | 21,Microsoft,Computer Software,"Redmond, WA","144,000","$125,843 " 22 | 22,Marathon Petroleum,Petroleum Refining,"Findlay, OH","60,910","$124,813 " 23 | 23,Kroger,Food and Drug Stores,"Cincinnati, OH","435,000","$122,286 " 24 | 24,Fannie Mae,Diversified Financials,"Washington, DC","7,500","$120,304 " 25 | 25,Bank of America,Commercial Banks,"Charlotte, NC","208,131","$113,589 " 26 | 26,Home Depot,Specialty Retailers: Other,"Atlanta, GA","415,700","$110,225 " 27 | 27,Phillips 66,Petroleum Refining,"Houston, TX","14,500","$109,559 " 28 | 28,Comcast NBCUniversal,Telecommunications,"Philadelphia, PA","190,000","$108,942 " 29 | 29,Anthem,Health Care: Insurance and Managed Care,"Indianapolis, IN","70,600","$104,213 " 30 | 30,Wells Fargo,Commercial Banks,"San Francisco, CA","259,800","$103,915 " 31 | -------------------------------------------------------------------------------- /data/linkedin_industries.html: 
-------------------------------------------------------------------------------- [linkedin_industries.html: the page's HTML markup (roughly 750 mostly empty lines, page title "Title") was stripped during extraction; only the industry-code table below was recoverable]
CodeGroupsDescription
47corp, finAccounting
94man, tech, tranAirlines/Aviation
120leg, orgAlternative Dispute Resolution
125hlthAlternative Medicine
127art, medAnimation
19goodApparel & Fashion
50consArchitecture & Planning
111art, med, recArts and Crafts
53manAutomotive
52gov, manAviation & Aerospace
41finBanking
12gov, hlth, techBiotechnology
36med, recBroadcast Media
49consBuilding Materials
138corp, manBusiness Supplies and Equipment
129finCapital Markets
54manChemicals
90org, servCivic & Social Organization
51cons, govCivil Engineering
128cons, corp, finCommercial Real Estate
118techComputer & Network Security
109med, recComputer Games
3techComputer Hardware
5techComputer Networking
4techComputer Software
48consConstruction
24good, manConsumer Electronics
25good, manConsumer Goods
91org, servConsumer Services
18goodCosmetics
65agrDairy
1gov, techDefense & Space
99art, medDesign
69eduEducation Management
132edu, orgE-Learning
112good, manElectrical/Electronic Manufacturing
28med, recEntertainment
86org, servEnvironmental Services
110corp, rec, servEvents Services
76govExecutive Office
122corp, servFacilities Services
63agrFarming
43finFinancial Services
38art, med, recFine Art
66agrFishery
34rec, servFood & Beverages
23good, man, servFood Production
101orgFund-Raising
26good, manFurniture
29recGambling & Casinos
145cons, manGlass, Ceramics & Concrete
75govGovernment Administration
148govGovernment Relations
140art, medGraphic Design
124hlth, recHealth, Wellness and Fitness
68eduHigher Education
14hlthHospital & Health Care
31rec, serv, tranHospitality
137corpHuman Resources
134corp, good, tranImport and Export
88org, servIndividual & Family Services
147cons, manIndustrial Automation
84med, servInformation Services
96techInformation Technology and Services
42finInsurance
74govInternational Affairs
141gov, org, tranInternational Trade and Development
6techInternet
45finInvestment Banking
46finInvestment Management
73gov, legJudiciary
77gov, legLaw Enforcement
9legLaw Practice
10legLegal Services
72gov, legLegislative Office
30rec, serv, tranLeisure, Travel & Tourism
85med, rec, servLibraries
116corp, tranLogistics and Supply Chain
143goodLuxury Goods & Jewelry
55manMachinery
11corpManagement Consulting
95tranMaritime
97corpMarket Research
80corp, medMarketing and Advertising
135cons, gov, manMechanical or Industrial Engineering
126med, recMedia Production
17hlthMedical Devices
13hlthMedical Practice
139hlthMental Health Care
71govMilitary
56manMining & Metals
35art, med, recMotion Pictures and Film
37art, med, recMuseums and Institutions
115art, recMusic
114gov, man, techNanotechnology
81med, recNewspapers
100orgNon-Profit Organization Management
57manOil & Energy
113medOnline Media
123corpOutsourcing/Offshoring
87serv, tranPackage/Freight Delivery
146good, manPackaging and Containers
61manPaper & Forest Products
39art, med, recPerforming Arts
15hlth, techPharmaceuticals
131orgPhilanthropy
136art, med, recPhotography
117manPlastics
107gov, orgPolitical Organization
67eduPrimary/Secondary Education
83med, recPrinting
105corpProfessional Training & Coaching
102corp, orgProgram Development
79govPublic Policy
98corpPublic Relations and Communications
78govPublic Safety
82med, recPublishing
62manRailroad Manufacture
64agrRanching
44cons, fin, goodReal Estate
40rec, servRecreational Facilities and Services
89org, servReligious Institutions
144gov, man, orgRenewables & Environment
70edu, govResearch
32rec, servRestaurants
27good, manRetail
121corp, org, servSecurity and Investigations
7techSemiconductors
58manShipbuilding
20good, recSporting Goods
33recSports
104corpStaffing and Recruiting
22goodSupermarkets
8gov, techTelecommunications
60manTextiles
130gov, orgThink Tanks
21goodTobacco
108corp, gov, servTranslation and Localization
92tranTransportation/Trucking/Railroad
59manUtilities
106fin, techVenture Capital & Private Equity
16hlthVeterinary
93tranWarehousing
133goodWholesale
142good, man, recWine and Spirits
119techWireless
103art, med, recWriting and Editing
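The industry-code table above is stored as raw HTML in data/linkedin_industries.html. As a minimal sketch (not part of the repo; the path, header handling, and column names are assumptions, since the markup was stripped above), it could be loaded with pandas.read_html, which needs an HTML parser such as lxml, already listed in requirements_dev.txt:

import pandas as pd

# read_html parses every <table> in the document and returns a list of DataFrames
tables = pd.read_html("data/linkedin_industries.html")
industries = tables[0]

# label the columns to match the header row above
industries.columns = ["Code", "Groups", "Description"]

# e.g. every industry tagged with the "fin" group
print(industries[industries["Groups"].str.contains("fin", na=False)])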
752 | 753 | 754 | -------------------------------------------------------------------------------- /data/msft_stock_key_data.csv: -------------------------------------------------------------------------------- 1 | Symbol,MSFT 2 | Name,Microsoft Corporation Common Stock 3 | Exchange,NASDAQ-GS 4 | Sector,Technology 5 | Industry,Computer Software: Prepackaged Software 6 | 1 Year Target,$277.50 7 | Today's High/Low,$261.00/$257.60 8 | Share Volume,"24,878,582" 9 | Average Volume,"28,320,974" 10 | Previous Close,$259.50 11 | 52 Week High/Low,$259.93/$166.11 12 | Market Cap,"1,966,557,339,088" 13 | P/E Ratio,38.86 14 | Forward P/E 1 Yr.,35.21 15 | Earnings Per Share(EPS),$6.71 16 | Annualized Dividend,$2.24 17 | Ex Dividend Date,19-May-21 18 | Dividend Pay Date,10-Jun-21 19 | Current Yield,0.88% 20 | Beta,0.8 21 | -------------------------------------------------------------------------------- /data/pew.csv: -------------------------------------------------------------------------------- 1 | "religion","<$10k","$10-20k","$20-30k","$30-40k","$40-50k","$50-75k","$75-100k","$100-150k",">150k","Don't know/refused" 2 | "Agnostic",27,34,60,81,76,137,122,109,84,96 3 | "Atheist",12,27,37,52,35,70,73,59,74,76 4 | "Buddhist",27,21,30,34,33,58,62,39,53,54 5 | "Catholic",418,617,732,670,638,1116,949,792,633,1489 6 | "Don’t know/refused",15,14,15,11,10,35,21,17,18,116 7 | "Evangelical Prot",575,869,1064,982,881,1486,949,723,414,1529 8 | "Hindu",1,9,7,9,11,34,47,48,54,37 9 | "Historically Black Prot",228,244,236,238,197,223,131,81,78,339 10 | "Jehovah's Witness",20,27,24,24,21,30,15,11,6,37 11 | "Jewish",19,19,25,25,30,95,69,87,151,162 12 | "Mainline Prot",289,495,619,655,651,1107,939,753,634,1328 13 | "Mormon",29,40,48,51,56,112,85,49,42,69 14 | "Muslim",6,7,9,10,9,23,16,8,6,22 15 | "Orthodox",13,17,23,32,32,47,38,42,46,73 16 | "Other Christian",9,7,11,13,13,14,18,14,12,18 17 | "Other Faiths",20,33,40,46,49,63,46,40,41,71 18 | "Other World Religions",5,2,3,4,2,7,3,4,4,8 19 | "Unaffiliated",217,299,374,365,341,528,407,321,258,597 20 | -------------------------------------------------------------------------------- /data/portfolio.csv: -------------------------------------------------------------------------------- 1 | Ticker,Date,Shares,Price 2 | GOOG,2019-10-01,100,1 3 | MSFT,2019-10-01,200,1 4 | IBM,2019-10-01,500,1 5 | TSLA,2019-10-01,300,1 6 | 7 | -------------------------------------------------------------------------------- /data/pycon_sponsor_levels.csv: -------------------------------------------------------------------------------- 1 | sponsor_level,amount 2 | VISIONARY,150000 3 | SUSTAINABILITY,90000 4 | MAINTAINING,60000 5 | CONTRIBUTING,30000 6 | SUPPORTING,15000 7 | PARTNER,7500 8 | PARTICIPATING,3750 9 | ASSOCIATE,1500 10 | -------------------------------------------------------------------------------- /data/pycon_sponsors.csv: -------------------------------------------------------------------------------- 1 | symbol,name,sponsor_level 2 | GOOG,ALPHABET INC.,VISIONARY 3 | AMZN,AMAZON COM INC,SUSTAINABILITY 4 | #N/A,BLOOMBERG,VISIONARY 5 | COF,CAPITAL ONE FINANCIAL CORP,MAINTAINING 6 | GLW,CORNING INC,MAINTAINING 7 | ESTC,ELASTIC N.V.,PARTNER 8 | FB,FACEBOOK INC,SUSTAINABILITY 9 | #N/A,HUAWEI TECHNOLOGIES,SUSTAINABILITY 10 | IBM,INTERNATIONAL BUSINESS MACHINES CORP,CONTRIBUTING 11 | JPM,JPMORGAN CHASE & CO,SUPPORTING 12 | MSFT,MICROSOFT CORP,VISIONARY 13 | NFLX,NETFLIX INC,PARTNER 14 | CRM,SALESFORCE.COM INC.,SUSTAINABILITY 15 | WORK,SLACK TECHNOLOGIES INC.,MAINTAINING 16 | 
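pycon_sponsor_levels.csv and pycon_sponsors.csv above form the lookup pair used by the section 1 challenges, and sponsors_vlookup.csv further below holds the joined result. As a minimal sketch (assuming the repo root as the working directory), the pandas equivalent of that Excel VLOOKUP is a left merge:

import pandas as pd

sponsors = pd.read_csv("data/pycon_sponsors.csv")
levels = pd.read_csv("data/pycon_sponsor_levels.csv")

# a left join keeps every sponsor and pulls in its matching donation amount
vlookup = sponsors.merge(levels, on="sponsor_level", how="left")
print(vlookup.head())

Note that read_csv treats the '#N/A' symbols as missing values by default; passing keep_default_na=False would keep them as literal text, matching the original file.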
-------------------------------------------------------------------------------- /data/retail_sales.csv: -------------------------------------------------------------------------------- 1 | date,sales 2 | 2009-10-01,338630 3 | 2009-11-01,339386 4 | 2009-12-01,400264 5 | 2010-01-01,314640 6 | 2010-02-01,311022 7 | 2010-03-01,360819 8 | 2010-04-01,356460 9 | 2010-05-01,365713 10 | 2010-06-01,358675 11 | 2010-07-01,362027 12 | 2010-08-01,362682 13 | 2010-09-01,346069 14 | 2010-10-01,355212 15 | 2010-11-01,365809 16 | 2010-12-01,426654 17 | 2011-01-01,335608 18 | 2011-02-01,337352 19 | 2011-03-01,387092 20 | 2011-04-01,380754 21 | 2011-05-01,391970 22 | 2011-06-01,388636 23 | 2011-07-01,384600 24 | 2011-08-01,394548 25 | 2011-09-01,374895 26 | 2011-10-01,379364 27 | 2011-11-01,391081 28 | 2011-12-01,451669 29 | 2012-01-01,355058 30 | 2012-02-01,372523 31 | 2012-03-01,414275 32 | 2012-04-01,393035 33 | 2012-05-01,418648 34 | 2012-06-01,400996 35 | 2012-07-01,396020 36 | 2012-08-01,417911 37 | 2012-09-01,385597 38 | 2012-10-01,399341 39 | 2012-11-01,410992 40 | 2012-12-01,461994 41 | 2013-01-01,375537 42 | 2013-02-01,373938 43 | 2013-03-01,421638 44 | 2013-04-01,408381 45 | 2013-05-01,436985 46 | 2013-06-01,414701 47 | 2013-07-01,422357 48 | 2013-08-01,434950 49 | 2013-09-01,396199 50 | 2013-10-01,415740 51 | 2013-11-01,423611 52 | 2013-12-01,477205 53 | 2014-01-01,383399 54 | 2014-02-01,380315 55 | 2014-03-01,432806 56 | 2014-04-01,431415 57 | 2014-05-01,458822 58 | 2014-06-01,433152 59 | 2014-07-01,443005 60 | 2014-08-01,450913 61 | 2014-09-01,420871 62 | 2014-10-01,437702 63 | 2014-11-01,437910 64 | 2014-12-01,501232 65 | 2015-01-01,397252 66 | 2015-02-01,386935 67 | 2015-03-01,444110 68 | 2015-04-01,438217 69 | 2015-05-01,462615 70 | 2015-06-01,448229 71 | 2015-07-01,457710 72 | 2015-08-01,456340 73 | 2015-09-01,430917 74 | -------------------------------------------------------------------------------- /data/sponsors_vlookup.csv: -------------------------------------------------------------------------------- 1 | symbol,name,sponsor_level,amount 2 | GOOG,ALPHABET INC.,VISIONARY,150000 3 | AMZN,AMAZON COM INC,SUSTAINABILITY,90000 4 | #N/A,BLOOMBERG,VISIONARY,150000 5 | COF,CAPITAL ONE FINANCIAL CORP,MAINTAINING,60000 6 | GLW,CORNING INC,MAINTAINING,60000 7 | ESTC,ELASTIC N.V.,PARTNER,7500 8 | FB,FACEBOOK INC,SUSTAINABILITY,90000 9 | #N/A,HUAWEI TECHNOLOGIES,SUSTAINABILITY,90000 10 | IBM,INTERNATIONAL BUSINESS MACHINES CORP,CONTRIBUTING,30000 11 | JPM,JPMORGAN CHASE & CO,SUPPORTING,15000 12 | MSFT,MICROSOFT CORP,VISIONARY,150000 13 | NFLX,NETFLIX INC,PARTNER,7500 14 | CRM,SALESFORCE.COM INC.,SUSTAINABILITY,90000 15 | WORK,SLACK TECHNOLOGIES INC.,MAINTAINING,60000 16 | -------------------------------------------------------------------------------- /data/stlcom_larget_employers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/data/stlcom_larget_employers.xlsx -------------------------------------------------------------------------------- /data/stlregionalchamber_largest_employers_.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/data/stlregionalchamber_largest_employers_.xlsx -------------------------------------------------------------------------------- /data/stock_data_simple.csv: 
-------------------------------------------------------------------------------- 1 | ticker,company_name,sector,trade_date,price,price_change_percent,market_capitalization,annual_sales,shares_outstanding 2 | WMT,Wal-Mart Stores,Retail,1/16/2014,76.76,-1.20%,"248,377","55,688",3235772 3 | AAPL,Apple Inc,Technology,1/16/2014,554.25,-0.60%,"494,697","37,472",892553 4 | IBM,Intl Business Machines,Technology,1/16/2014,188.76,0.50%,"204,965","23,720",1085854 5 | BAC,Bank Of America Corp,Financial,1/16/2014,17.08,-0.40%,"182,177","23,553",10666133 6 | SGL.KR,Samsung Electronics,Technology,1/16/2014,"1,301,000.00",0.20%,"180,329","23,444",147299 7 | NESN.CH,Nestle 'R',Consumer Staple,1/16/2014,67.45,1.20%,"239,974","22,584",3224798 8 | MSFT,Microsoft Corp,Technology,1/16/2014,36.89,0.40%,"307,956","18,529",8347968 9 | AMZN,Amazon.Com Inc,Retail,1/16/2014,395.8,0.00%,"181,170","17,092",457733 10 | GOOG,Google Inc,Technology,1/16/2014,"1,156.22",0.70%,"386,278","14,893",334087 11 | PFE,Pfizer Inc,Health Care,1/16/2014,31.17,0.00%,"202,014","12,643",6481070 12 | -------------------------------------------------------------------------------- /data/stock_data_simple.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/data/stock_data_simple.xlsx -------------------------------------------------------------------------------- /data/table1.csv: -------------------------------------------------------------------------------- 1 | "country","year","cases","population" 2 | "Afghanistan",1999,745,19987071 3 | "Afghanistan",2000,2666,20595360 4 | "Brazil",1999,37737,172006362 5 | "Brazil",2000,80488,174504898 6 | "China",1999,212258,1272915272 7 | "China",2000,213766,1280428583 8 | -------------------------------------------------------------------------------- /data/table2.csv: -------------------------------------------------------------------------------- 1 | "country","year","type","count" 2 | "Afghanistan",1999,"cases",745 3 | "Afghanistan",1999,"population",19987071 4 | "Afghanistan",2000,"cases",2666 5 | "Afghanistan",2000,"population",20595360 6 | "Brazil",1999,"cases",37737 7 | "Brazil",1999,"population",172006362 8 | "Brazil",2000,"cases",80488 9 | "Brazil",2000,"population",174504898 10 | "China",1999,"cases",212258 11 | "China",1999,"population",1272915272 12 | "China",2000,"cases",213766 13 | "China",2000,"population",1280428583 14 | -------------------------------------------------------------------------------- /data/table3.csv: -------------------------------------------------------------------------------- 1 | "country","year","rate" 2 | "Afghanistan",1999,"745/19987071" 3 | "Afghanistan",2000,"2666/20595360" 4 | "Brazil",1999,"37737/172006362" 5 | "Brazil",2000,"80488/174504898" 6 | "China",1999,"212258/1272915272" 7 | "China",2000,"213766/1280428583" 8 | -------------------------------------------------------------------------------- /data/table4a.csv: -------------------------------------------------------------------------------- 1 | "country","1999","2000" 2 | "Afghanistan",745,2666 3 | "Brazil",37737,80488 4 | "China",212258,213766 5 | -------------------------------------------------------------------------------- /data/table4b.csv: -------------------------------------------------------------------------------- 1 | "country","1999","2000" 2 | "Afghanistan",19987071,20595360 3 | "Brazil",172006362,174504898 4 | "China",1272915272,1280428583 
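table4a.csv and table4b.csv above spread the same measurements across one column per year, while table1.csv above holds the same data in tidy form. As a minimal sketch (paths assumed relative to the repo root), melting both wide tables and joining the results reproduces table1:

import pandas as pd

cases = pd.read_csv("data/table4a.csv")
population = pd.read_csv("data/table4b.csv")

# melt turns the year columns into rows: one (country, year, value) triple per row
cases_long = cases.melt(id_vars="country", var_name="year", value_name="cases")
pop_long = population.melt(id_vars="country", var_name="year", value_name="population")

# joining the two long tables yields table1's layout
table1 = cases_long.merge(pop_long, on=["country", "year"])
print(table1.sort_values(["country", "year"]))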
5 | -------------------------------------------------------------------------------- /data/weather.csv: -------------------------------------------------------------------------------- 1 | "id","year","month","element","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" 2 | "MX17004",2010,1,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,27.8,NA 3 | "MX17004",2010,1,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,14.5,NA 4 | "MX17004",2010,2,"tmax",NA,27.3,24.1,NA,NA,NA,NA,NA,NA,NA,29.7,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,29.9,NA,NA,NA,NA,NA,NA,NA,NA 5 | "MX17004",2010,2,"tmin",NA,14.4,14.4,NA,NA,NA,NA,NA,NA,NA,13.4,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,10.7,NA,NA,NA,NA,NA,NA,NA,NA 6 | "MX17004",2010,3,"tmax",NA,NA,NA,NA,32.1,NA,NA,NA,NA,34.5,NA,NA,NA,NA,NA,31.1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 7 | "MX17004",2010,3,"tmin",NA,NA,NA,NA,14.2,NA,NA,NA,NA,16.8,NA,NA,NA,NA,NA,17.6,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 8 | "MX17004",2010,4,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,36.3,NA,NA,NA,NA 9 | "MX17004",2010,4,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,16.7,NA,NA,NA,NA 10 | "MX17004",2010,5,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,33.2,NA,NA,NA,NA 11 | "MX17004",2010,5,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,18.2,NA,NA,NA,NA 12 | "MX17004",2010,6,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,28,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,30.1,NA,NA 13 | "MX17004",2010,6,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,17.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,18,NA,NA 14 | "MX17004",2010,7,"tmax",NA,NA,28.6,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,29.9,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 15 | "MX17004",2010,7,"tmin",NA,NA,17.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,16.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 16 | "MX17004",2010,8,"tmax",NA,NA,NA,NA,29.6,NA,NA,29,NA,NA,NA,NA,29.8,NA,NA,NA,NA,NA,NA,NA,NA,NA,26.4,NA,29.7,NA,NA,NA,28,NA,25.4 17 | "MX17004",2010,8,"tmin",NA,NA,NA,NA,15.8,NA,NA,17.3,NA,NA,NA,NA,16.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,15,NA,15.6,NA,NA,NA,15.3,NA,15.4 18 | "MX17004",2010,10,"tmax",NA,NA,NA,NA,27,NA,28.1,NA,NA,NA,NA,NA,NA,29.5,28.7,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,31.2,NA,NA,NA 19 | "MX17004",2010,10,"tmin",NA,NA,NA,NA,14,NA,12.9,NA,NA,NA,NA,NA,NA,13,10.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,15,NA,NA,NA 20 | "MX17004",2010,11,"tmax",NA,31.3,NA,27.2,26.3,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,28.1,27.7,NA,NA,NA,NA 21 | "MX17004",2010,11,"tmin",NA,16.3,NA,12,7.9,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,12.1,14.2,NA,NA,NA,NA 22 | "MX17004",2010,12,"tmax",29.9,NA,NA,NA,NA,27.8,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 23 | "MX17004",2010,12,"tmin",13.8,NA,NA,NA,NA,10.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 
5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = spreadsheets_to_dataframes 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # spreadsheets_to_dataframes documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | # 21 | import os 22 | import sys 23 | sys.path.insert(0, os.path.abspath('..')) 24 | 25 | import spreadsheets_to_dataframes 26 | 27 | # -- General configuration --------------------------------------------- 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = u'Spreadsheets to DataFrames' 51 | copyright = u"2019, Ryan S. McCoy" 52 | author = u"Ryan S. McCoy" 53 | 54 | # The version info for the project you're documenting, acts as replacement 55 | # for |version| and |release|, also used in various other places throughout 56 | # the built documents. 57 | # 58 | # The short X.Y version. 59 | version = spreadsheets_to_dataframes.__version__ 60 | # The full version, including alpha/beta/rc tags. 61 | release = spreadsheets_to_dataframes.__version__ 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 
68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | # If true, `todo` and `todoList` produce output, else they produce nothing. 79 | todo_include_todos = False 80 | 81 | 82 | # -- Options for HTML output ------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 86 | # 87 | html_theme = 'alabaster' 88 | 89 | # Theme options are theme-specific and customize the look and feel of a 90 | # theme further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | 101 | # -- Options for HTMLHelp output --------------------------------------- 102 | 103 | # Output file base name for HTML help builder. 104 | htmlhelp_basename = 'spreadsheets_to_dataframesdoc' 105 | 106 | 107 | # -- Options for LaTeX output ------------------------------------------ 108 | 109 | latex_elements = { 110 | # The paper size ('letterpaper' or 'a4paper'). 111 | # 112 | # 'papersize': 'letterpaper', 113 | 114 | # The font size ('10pt', '11pt' or '12pt'). 115 | # 116 | # 'pointsize': '10pt', 117 | 118 | # Additional stuff for the LaTeX preamble. 119 | # 120 | # 'preamble': '', 121 | 122 | # Latex figure (float) alignment 123 | # 124 | # 'figure_align': 'htbp', 125 | } 126 | 127 | # Grouping the document tree into LaTeX files. List of tuples 128 | # (source start file, target name, title, author, documentclass 129 | # [howto, manual, or own class]). 130 | latex_documents = [ 131 | (master_doc, 'spreadsheets_to_dataframes.tex', 132 | u'Spreadsheets to DataFrames Documentation', 133 | u'Ryan S. McCoy', 'manual'), 134 | ] 135 | 136 | 137 | # -- Options for manual page output ------------------------------------ 138 | 139 | # One entry per manual page. List of tuples 140 | # (source start file, name, description, authors, manual section). 141 | man_pages = [ 142 | (master_doc, 'spreadsheets_to_dataframes', 143 | u'Spreadsheets to DataFrames Documentation', 144 | [author], 1) 145 | ] 146 | 147 | 148 | # -- Options for Texinfo output ---------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'spreadsheets_to_dataframes', 155 | u'Spreadsheets to DataFrames Documentation', 156 | author, 157 | 'spreadsheets_to_dataframes', 158 | 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Spreadsheets to DataFrames's documentation! 2 | ====================================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | readme 9 | installation 10 | usage 11 | modules 12 | contributing 13 | authors 14 | history 15 | 16 | Indices and tables 17 | ================== 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Stable release 9 | -------------- 10 | 11 | To install Spreadsheets to DataFrames, run this command in your terminal: 12 | 13 | .. code-block:: console 14 | 15 | $ pip install spreadsheets_to_dataframes 16 | 17 | This is the preferred method to install Spreadsheets to DataFrames, as it will always install the most recent stable release. 18 | 19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 20 | you through the process. 21 | 22 | .. _pip: https://pip.pypa.io 23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 24 | 25 | 26 | From sources 27 | ------------ 28 | 29 | The sources for Spreadsheets to DataFrames can be downloaded from the `Github repo`_. 30 | 31 | You can either clone the public repository: 32 | 33 | .. code-block:: console 34 | 35 | $ git clone git://github.com/ryansmccoy/spreadsheets_to_dataframes 36 | 37 | Or download the `tarball`_: 38 | 39 | .. code-block:: console 40 | 41 | $ curl -OJL https://github.com/ryansmccoy/spreadsheets_to_dataframes/tarball/master 42 | 43 | Once you have a copy of the source, you can install it with: 44 | 45 | .. code-block:: console 46 | 47 | $ python setup.py install 48 | 49 | 50 | .. _Github repo: https://github.com/ryansmccoy/spreadsheets_to_dataframes 51 | .. _tarball: https://github.com/ryansmccoy/spreadsheets_to_dataframes/tarball/master 52 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=spreadsheets_to_dataframes 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | To use Spreadsheets to DataFrames in a project:: 6 | 7 | import spreadsheets_to_dataframes 8 | -------------------------------------------------------------------------------- /img/basics/basic_python_style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/basic_python_style.png -------------------------------------------------------------------------------- /img/basics/built-in_data_structures.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/built-in_data_structures.png -------------------------------------------------------------------------------- /img/basics/built-in_functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/built-in_functions.png -------------------------------------------------------------------------------- /img/basics/built-in_len.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/built-in_len.png -------------------------------------------------------------------------------- /img/basics/calculations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/calculations.png -------------------------------------------------------------------------------- /img/basics/cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/cell.png -------------------------------------------------------------------------------- /img/basics/cell_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/cell_ex.png -------------------------------------------------------------------------------- /img/basics/cell_types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/cell_types.png 
-------------------------------------------------------------------------------- /img/basics/cells.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/cells.png -------------------------------------------------------------------------------- /img/basics/comments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/comments.png -------------------------------------------------------------------------------- /img/basics/data-types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/data-types.png -------------------------------------------------------------------------------- /img/basics/data_collections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/data_collections.png -------------------------------------------------------------------------------- /img/basics/excel-built-in-string.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/excel-built-in-string.png -------------------------------------------------------------------------------- /img/basics/excel-built-in.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/excel-built-in.png -------------------------------------------------------------------------------- /img/basics/excel-pre-installed-add-ins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/excel-pre-installed-add-ins.png -------------------------------------------------------------------------------- /img/basics/jupyter-method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/jupyter-method.png -------------------------------------------------------------------------------- /img/basics/pycharm-function-pop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycharm-function-pop.png -------------------------------------------------------------------------------- /img/basics/pycharm-function-popup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycharm-function-popup.png -------------------------------------------------------------------------------- /img/basics/pycharm-methods.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycharm-methods.png -------------------------------------------------------------------------------- /img/basics/pycon-files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycon-files.png -------------------------------------------------------------------------------- /img/basics/pycon_sponsor_levels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycon_sponsor_levels.png -------------------------------------------------------------------------------- /img/basics/pycon_sponsors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycon_sponsors.png -------------------------------------------------------------------------------- /img/basics/python-pre-installed-add-ins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/python-pre-installed-add-ins.png -------------------------------------------------------------------------------- /img/basics/reserved_words.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/reserved_words.png -------------------------------------------------------------------------------- /img/basics/standard-library-import.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/standard-library-import.png -------------------------------------------------------------------------------- /img/basics/standard-library.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/standard-library.png -------------------------------------------------------------------------------- /img/basics/vscode-method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/vscode-method.png -------------------------------------------------------------------------------- /img/dataframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/dataframe.png -------------------------------------------------------------------------------- /img/dataframe_components.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/dataframe_components.png -------------------------------------------------------------------------------- /img/excel_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/excel_table.png -------------------------------------------------------------------------------- /img/pandas_dataframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/pandas_dataframe.png -------------------------------------------------------------------------------- /img/split_apply_combine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/split_apply_combine.png -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | Click 2 | 3 | pandas 4 | numpy 5 | scipy 6 | requests 7 | openpyxl 8 | cookiecutter 9 | sqlalchemy 10 | flask 11 | feedparser 12 | bs4 13 | # selenium 14 | 15 | # statsmodels 16 | # tldextract 17 | # pyflux 18 | # fbprophet 19 | lxml 20 | jupyter 21 | matplotlib 22 | 23 | # celery==3.1.25 24 | 25 | # alpha_vantage 26 | -------------------------------------------------------------------------------- /section1_challenge_1.py: -------------------------------------------------------------------------------- 1 | # Perform an Excel VLOOKUP with a Python Dictionary 2 | 3 | # Challenge 1 4 | # Modify the code below to match the Expected Output at the bottom 5 | 6 | import csv 7 | import os 8 | from pprint import pprint 9 | 10 | current_directory = os.getcwd() 11 | 12 | pycon_sponsors_filename = 'pycon_sponsors.csv' 13 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename) 14 | 15 | # print(pycon_sponsors_filepath) 16 | 17 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000}, 18 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000}, 19 | {'sponsor_level': 'MAINTAINING', 'amount': 60000}, 20 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000}, 21 | {'sponsor_level': 'SUPPORTING', 'amount': 15000}, 22 | {'sponsor_level': 'PARTNER', 'amount': 7500}, 23 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750}, 24 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}] 25 | 26 | pprint(sponsor_levels) 27 | 28 | pycon_sponsors = [] 29 | 30 | # print(pycon_sponsors_filepath) 31 | 32 | with open(pycon_sponsors_filepath, 'r') as f: 33 | rows = csv.reader(f) 34 | 35 | header = next(f) 36 | 37 | for row_number, row in enumerate(rows): 38 | print("Row Number:\t", row_number, "Values:\t", row) 39 | 40 | """ 41 | Current Output: 42 | 43 | [{'amount': 150000, 'sponsor_level': 'VISIONARY'}, 44 | {'amount': 90000, 'sponsor_level': 'SUSTAINABILITY'}, 45 | {'amount': 60000, 'sponsor_level': 'MAINTAINING'}, 46 | {'amount': 30000, 'sponsor_level': 'CONTRIBUTING'}, 47 | {'amount': 15000, 'sponsor_level': 'SUPPORTING'}, 48 | {'amount': 7500, 'sponsor_level': 'PARTNER'}, 49 | {'amount': 3750, 'sponsor_level': 'PARTICIPATING'}, 50 | {'amount': 1500, 'sponsor_level': 'ASSOCIATE'}] 51 | 52 | Row 
Number: 0 Values: ['GOOG', 'ALPHABET INC.', 'VISIONARY'] 53 | Row Number: 1 Values: ['AMZN', 'AMAZON COM INC', 'SUSTAINABILITY'] 54 | Row Number: 2 Values: ['#N/A', 'BLOOMBERG', 'VISIONARY'] 55 | Row Number: 3 Values: ['COF', 'CAPITAL ONE FINANCIAL CORP', 'MAINTAINING'] 56 | Row Number: 4 Values: ['GLW', 'CORNING INC', 'MAINTAINING'] 57 | Row Number: 5 Values: ['ESTC', 'ELASTIC N.V.', 'PARTNER'] 58 | Row Number: 6 Values: ['FB', 'FACEBOOK INC', 'SUSTAINABILITY'] 59 | Row Number: 7 Values: ['#N/A', 'HUAWEI TECHNOLOGIES', 'SUSTAINABILITY'] 60 | Row Number: 8 Values: ['IBM', 'INTERNATIONAL BUSINESS MACHINES CORP', 'CONTRIBUTING'] 61 | Row Number: 9 Values: ['JPM', 'JPMORGAN CHASE & CO', 'SUPPORTING'] 62 | Row Number: 10 Values: ['MSFT', 'MICROSOFT CORP', 'VISIONARY'] 63 | Row Number: 11 Values: ['NFLX', 'NETFLIX INC', 'PARTNER'] 64 | Row Number: 12 Values: ['CRM', 'SALESFORCE.COM INC.', 'SUSTAINABILITY'] 65 | Row Number: 13 Values: ['WORK', 'SLACK TECHNOLOGIES INC.', 'MAINTAINING'] 66 | 67 | Expected Output: 68 | 69 | Company Number: 0 70 | DCompany: ALPHABET INC. 71 | Level: VISIONARY 72 | Donated: 150000 73 | Company Number: 1 74 | DCompany: AMAZON COM INC 75 | Level: SUSTAINABILITY 76 | Donated: 90000 77 | Company Number: 2 78 | DCompany: BLOOMBERG 79 | Level: VISIONARY 80 | Donated: 150000 81 | Company Number: 3 82 | DCompany: CAPITAL ONE FINANCIAL CORP 83 | Level: MAINTAINING 84 | Donated: 60000 85 | Company Number: 4 86 | DCompany: CORNING INC 87 | Level: MAINTAINING 88 | Donated: 60000 89 | Company Number: 5 90 | DCompany: ELASTIC N.V. 91 | Level: PARTNER 92 | Donated: 7500 93 | Company Number: 6 94 | DCompany: FACEBOOK INC 95 | Level: SUSTAINABILITY 96 | Donated: 90000 97 | Company Number: 7 98 | DCompany: HUAWEI TECHNOLOGIES 99 | Level: SUSTAINABILITY 100 | Donated: 90000 101 | Company Number: 8 102 | DCompany: INTERNATIONAL BUSINESS MACHINES CORP 103 | Level: CONTRIBUTING 104 | Donated: 30000 105 | Company Number: 9 106 | DCompany: JPMORGAN CHASE & CO 107 | Level: SUPPORTING 108 | Donated: 15000 109 | Company Number: 10 110 | DCompany: MICROSOFT CORP 111 | Level: VISIONARY 112 | Donated: 150000 113 | Company Number: 11 114 | DCompany: NETFLIX INC 115 | Level: PARTNER 116 | Donated: 7500 117 | Company Number: 12 118 | DCompany: SALESFORCE.COM INC. 119 | Level: SUSTAINABILITY 120 | Donated: 90000 121 | Company Number: 13 122 | DCompany: SLACK TECHNOLOGIES INC. 
123 | Level: MAINTAINING 124 | Donated: 60000 125 | 126 | """ 127 | -------------------------------------------------------------------------------- /section1_challenge_1_answer.py: -------------------------------------------------------------------------------- 1 | # Perform an Excel VLOOKUP with a Python Dictionary 2 | 3 | # Modify the code below to match the Expected Output at the bottom 4 | 5 | import csv 6 | import os 7 | from pprint import pprint 8 | 9 | current_directory = os.getcwd() 10 | 11 | pycon_sponsors_filename = 'pycon_sponsors.csv' 12 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename) 13 | 14 | print(pycon_sponsors_filepath) 15 | 16 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000}, 17 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000}, 18 | {'sponsor_level': 'MAINTAINING', 'amount': 60000}, 19 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000}, 20 | {'sponsor_level': 'SUPPORTING', 'amount': 15000}, 21 | {'sponsor_level': 'PARTNER', 'amount': 7500}, 22 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750}, 23 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}] 24 | 25 | sponsor_vlookup = {} 26 | 27 | for sponsor_level in sponsor_levels: 28 | sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount'] 29 | 30 | pprint(sponsor_levels) 31 | 32 | pycon_sponsors = [] 33 | 34 | print(pycon_sponsors_filepath) 35 | 36 | with open(pycon_sponsors_filepath, 'r') as f: 37 | rows = csv.reader(f) 38 | 39 | header = next(f) 40 | 41 | for row_number, row in enumerate(rows): 42 | ticker, name, level = row 43 | print("Company Number:\t", row_number, "\n\tDCompany:", name, "\n\tLevel: ",level, "\n\tDonated:", sponsor_vlookup[row[2]], "\n") 44 | 45 | """ 46 | Company Number: 0 47 | DCompany: ALPHABET INC. 48 | Level: VISIONARY 49 | Donated: 150000 50 | Company Number: 1 51 | DCompany: AMAZON COM INC 52 | Level: SUSTAINABILITY 53 | Donated: 90000 54 | Company Number: 2 55 | DCompany: BLOOMBERG 56 | Level: VISIONARY 57 | Donated: 150000 58 | Company Number: 3 59 | DCompany: CAPITAL ONE FINANCIAL CORP 60 | Level: MAINTAINING 61 | Donated: 60000 62 | Company Number: 4 63 | DCompany: CORNING INC 64 | Level: MAINTAINING 65 | Donated: 60000 66 | Company Number: 5 67 | DCompany: ELASTIC N.V. 68 | Level: PARTNER 69 | Donated: 7500 70 | Company Number: 6 71 | DCompany: FACEBOOK INC 72 | Level: SUSTAINABILITY 73 | Donated: 90000 74 | Company Number: 7 75 | DCompany: HUAWEI TECHNOLOGIES 76 | Level: SUSTAINABILITY 77 | Donated: 90000 78 | Company Number: 8 79 | DCompany: INTERNATIONAL BUSINESS MACHINES CORP 80 | Level: CONTRIBUTING 81 | Donated: 30000 82 | Company Number: 9 83 | DCompany: JPMORGAN CHASE & CO 84 | Level: SUPPORTING 85 | Donated: 15000 86 | Company Number: 10 87 | DCompany: MICROSOFT CORP 88 | Level: VISIONARY 89 | Donated: 150000 90 | Company Number: 11 91 | DCompany: NETFLIX INC 92 | Level: PARTNER 93 | Donated: 7500 94 | Company Number: 12 95 | DCompany: SALESFORCE.COM INC. 96 | Level: SUSTAINABILITY 97 | Donated: 90000 98 | Company Number: 13 99 | DCompany: SLACK TECHNOLOGIES INC. 
100 | Level: MAINTAINING 101 | Donated: 60000 102 | """ 103 | -------------------------------------------------------------------------------- /section1_challenge_2.py: -------------------------------------------------------------------------------- 1 | # Perform an Excel VLOOKUP with a Python Dictionary 2 | 3 | # Challenge 2 4 | # Modify the code below to sum up all the donations by the companies in the list 5 | 6 | import csv 7 | import os 8 | from pprint import pprint 9 | 10 | current_directory = os.getcwd() 11 | 12 | pycon_sponsors_filename = 'pycon_sponsors.csv' 13 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename) 14 | 15 | # print(pycon_sponsors_filepath) 16 | 17 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000}, 18 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000}, 19 | {'sponsor_level': 'MAINTAINING', 'amount': 60000}, 20 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000}, 21 | {'sponsor_level': 'SUPPORTING', 'amount': 15000}, 22 | {'sponsor_level': 'PARTNER', 'amount': 7500}, 23 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750}, 24 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}] 25 | 26 | sponsor_vlookup = {} 27 | 28 | for sponsor_level in sponsor_levels: 29 | sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount'] 30 | 31 | pprint(sponsor_levels) 32 | 33 | pycon_sum = [] 34 | 35 | # print(pycon_sponsors_filepath) 36 | 37 | with open(pycon_sponsors_filepath, 'r') as f: 38 | rows = csv.reader(f) 39 | 40 | header = next(f) 41 | 42 | for row_number, row in enumerate(rows): 43 | ticker, name, level = row 44 | print("Company Number:\t", row_number, "\n\tDCompany:", name, "\n\tLevel: ",level, "\n\tDonated:", sponsor_vlookup[row[2]], "\n") 45 | 46 | """ 47 | Current Output: 48 | 49 | [{'amount': 150000, 'sponsor_level': 'VISIONARY'}, 50 | {'amount': 90000, 'sponsor_level': 'SUSTAINABILITY'}, 51 | {'amount': 60000, 'sponsor_level': 'MAINTAINING'}, 52 | {'amount': 30000, 'sponsor_level': 'CONTRIBUTING'}, 53 | {'amount': 15000, 'sponsor_level': 'SUPPORTING'}, 54 | {'amount': 7500, 'sponsor_level': 'PARTNER'}, 55 | {'amount': 3750, 'sponsor_level': 'PARTICIPATING'}, 56 | {'amount': 1500, 'sponsor_level': 'ASSOCIATE'}] 57 | 58 | Company Number: 0 59 | DCompany: ALPHABET INC. 60 | Level: VISIONARY 61 | Donated: 150000 62 | Company Number: 1 63 | DCompany: AMAZON COM INC 64 | Level: SUSTAINABILITY 65 | Donated: 90000 66 | Company Number: 2 67 | DCompany: BLOOMBERG 68 | Level: VISIONARY 69 | Donated: 150000 70 | Company Number: 3 71 | DCompany: CAPITAL ONE FINANCIAL CORP 72 | Level: MAINTAINING 73 | Donated: 60000 74 | Company Number: 4 75 | DCompany: CORNING INC 76 | Level: MAINTAINING 77 | Donated: 60000 78 | Company Number: 5 79 | DCompany: ELASTIC N.V. 80 | Level: PARTNER 81 | Donated: 7500 82 | Company Number: 6 83 | DCompany: FACEBOOK INC 84 | Level: SUSTAINABILITY 85 | Donated: 90000 86 | Company Number: 7 87 | DCompany: HUAWEI TECHNOLOGIES 88 | Level: SUSTAINABILITY 89 | Donated: 90000 90 | Company Number: 8 91 | DCompany: INTERNATIONAL BUSINESS MACHINES CORP 92 | Level: CONTRIBUTING 93 | Donated: 30000 94 | Company Number: 9 95 | DCompany: JPMORGAN CHASE & CO 96 | Level: SUPPORTING 97 | Donated: 15000 98 | Company Number: 10 99 | DCompany: MICROSOFT CORP 100 | Level: VISIONARY 101 | Donated: 150000 102 | Company Number: 11 103 | DCompany: NETFLIX INC 104 | Level: PARTNER 105 | Donated: 7500 106 | Company Number: 12 107 | DCompany: SALESFORCE.COM INC. 
108 | Level: SUSTAINABILITY 109 | Donated: 90000 110 | Company Number: 13 111 | DCompany: SLACK TECHNOLOGIES INC. 112 | Level: MAINTAINING 113 | Donated: 60000 114 | 115 | """ 116 | """ 117 | Expected Output: 118 | 119 | Total Sum: 1050000 120 | 121 | """ 122 | -------------------------------------------------------------------------------- /section1_challenge_2_answer.py: -------------------------------------------------------------------------------- 1 | # Perform an Excel VLOOKUP with a Python Dictionary 2 | # Modify the code below to sum up all the donations by the companies in the list 3 | 4 | import csv 5 | import os 6 | from pprint import pprint 7 | 8 | current_directory = os.getcwd() 9 | 10 | pycon_sponsors_filename = 'pycon_sponsors.csv' 11 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename) 12 | 13 | print(pycon_sponsors_filepath) 14 | 15 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000}, 16 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000}, 17 | {'sponsor_level': 'MAINTAINING', 'amount': 60000}, 18 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000}, 19 | {'sponsor_level': 'SUPPORTING', 'amount': 15000}, 20 | {'sponsor_level': 'PARTNER', 'amount': 7500}, 21 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750}, 22 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}] 23 | 24 | sponsor_vlookup = {} 25 | 26 | for sponsor_level in sponsor_levels: 27 | sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount'] 28 | 29 | pprint(sponsor_levels) 30 | 31 | pycon_sum = [] 32 | 33 | print(pycon_sponsors_filepath) 34 | 35 | with open(pycon_sponsors_filepath, 'r') as f: 36 | rows = csv.reader(f) 37 | 38 | header = next(f) 39 | 40 | for row_number, row in enumerate(rows): 41 | ticker, name, level = row 42 | print("Company Number:\t", row_number, "\n\tDCompany:", name, "\n\tLevel: ",level, "\n\tDonated:", sponsor_vlookup[row[2]], "\n") 43 | value = int(sponsor_vlookup[row[2]]) 44 | pycon_sum.append(value) 45 | 46 | print("Total Sum", sum(pycon_sum)) 47 | 48 | """ 49 | Output: 50 | 51 | 1050000 52 | 53 | """ 54 | -------------------------------------------------------------------------------- /section1_challenge_3.py: -------------------------------------------------------------------------------- 1 | # Perform an Excel VLOOKUP with a Python Dictionary 2 | 3 | # Challenge 3 4 | # Create a function that takes a filepath as a parameter and returns the sum of donations 5 | 6 | import csv 7 | import os 8 | from pprint import pprint 9 | 10 | current_directory = os.getcwd() 11 | 12 | pycon_sponsors_filename = 'pycon_sponsors.csv' 13 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename) 14 | 15 | print(pycon_sponsors_filepath) 16 | 17 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000}, 18 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000}, 19 | {'sponsor_level': 'MAINTAINING', 'amount': 60000}, 20 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000}, 21 | {'sponsor_level': 'SUPPORTING', 'amount': 15000}, 22 | {'sponsor_level': 'PARTNER', 'amount': 7500}, 23 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750}, 24 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}] 25 | 26 | sponsor_vlookup = {} 27 | 28 | for sponsor_level in sponsor_levels: 29 | sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount'] 30 | 31 | pprint(sponsor_levels) 32 | 33 | pycon_sum = [] 34 | 35 | print(pycon_sponsors_filepath) 36 | 37 | with open(pycon_sponsors_filepath, 'r') as 
f:
38 |     rows = csv.reader(f)
39 | 
40 |     header = next(f)
41 | 
42 |     for row_number, row in enumerate(rows):
43 |         ticker, name, level = row
44 |         print("Company Number:\t", row_number, "\n\tDCompany:", name, "\n\tLevel: ", level, "\n\tDonated:", sponsor_vlookup[row[2]], "\n")
45 |         value = int(sponsor_vlookup[row[2]])
46 |         pycon_sum.append(value)
47 | 
48 | print("Total Sum", sum(pycon_sum))
49 | 
50 | """
51 | Output:
52 | 
53 | 1050000
54 | 
55 | """
56 | 
--------------------------------------------------------------------------------
/section1_challenge_3_answer.py:
--------------------------------------------------------------------------------
1 | # Perform an Excel VLOOKUP with a Python Dictionary
2 | 
3 | # Challenge 3
4 | # Create a function that takes a filepath as a parameter and returns the sum of donations
5 | 
6 | import csv
7 | import os
8 | from pprint import pprint
9 | 
10 | current_directory = os.getcwd()
11 | 
12 | pycon_sponsors_filename = 'pycon_sponsors.csv'
13 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename)
14 | 
15 | print(pycon_sponsors_filepath)
16 | 
17 | def sum_donations(filepath):
18 | 
19 |     sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000},
20 |                       {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000},
21 |                       {'sponsor_level': 'MAINTAINING', 'amount': 60000},
22 |                       {'sponsor_level': 'CONTRIBUTING', 'amount': 30000},
23 |                       {'sponsor_level': 'SUPPORTING', 'amount': 15000},
24 |                       {'sponsor_level': 'PARTNER', 'amount': 7500},
25 |                       {'sponsor_level': 'PARTICIPATING', 'amount': 3750},
26 |                       {'sponsor_level': 'ASSOCIATE', 'amount': 1500}]
27 | 
28 |     sponsor_vlookup = {}
29 | 
30 |     for sponsor_level in sponsor_levels:
31 |         sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount']
32 | 
33 |     pycon_sum = []
34 | 
35 |     with open(filepath, 'r') as f:
36 |         rows = csv.reader(f)
37 | 
38 |         header = next(rows)  # skip the header row
39 | 
40 |         for row_number, row in enumerate(rows):
41 |             ticker, name, level = row
42 |             value = int(sponsor_vlookup[level])
43 |             pycon_sum.append(value)
44 | 
45 |     return sum(pycon_sum)  # return the total donated, not the list of individual amounts
46 | 
47 | 
48 | """
49 | Usage:
50 | 
51 | total_donations = sum_donations(pycon_sponsors_filepath)
52 | 
53 | print("Total Donation:\t", total_donations)  # 1050000
54 | 
55 | """
56 | 
--------------------------------------------------------------------------------
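A quick aside on the pattern these challenge files keep rebuilding: the for loop that turns sponsor_levels into the sponsor_vlookup dictionary can be written as a single dict comprehension. A minimal sketch, assuming the same sponsor_levels list of dicts shown above (this snippet is an illustration, not one of the repo's files):

    # build the VLOOKUP-style mapping {'VISIONARY': 150000, ...} in one expression
    sponsor_vlookup = {level['sponsor_level']: level['amount'] for level in sponsor_levels}

    print(sponsor_vlookup['PARTNER'])  # 7500

Both forms produce the same dictionary; the comprehension simply states the key-to-value rule in one line.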
--------------------------------------------------------------------------------
/section2-02-real-world-example-refactored.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import zipfile
4 | import re  # regular expression
5 | 
6 | import requests
7 | 
8 | import pandas as pd
9 | 
10 | # fixes display of dataframes in Python Console
11 | pd.set_option('display.float_format', lambda x: f'{x:.5f}')
12 | pd.set_option('display.max_columns', 500)
13 | pd.set_option('display.max_rows', 500)
14 | pd.set_option('display.width', 600)
15 | 
16 | current_directory = os.getcwd()
17 | 
18 | 
19 | def extract_zip_contents(filepath):
20 |     zip_file_local_extract_path = filepath.replace(".zip", "")
21 | 
22 |     # skip extraction if the target folder already exists
23 |     if os.path.exists(zip_file_local_extract_path):
24 | 
25 |         print("Folder already Exists!")
26 | 
27 |     else:
28 |         try:
29 | 
30 |             z = zipfile.ZipFile(filepath)  # open the downloaded .zip itself, not the extract folder
31 | 
32 |             z.extractall(zip_file_local_extract_path)
33 | 
34 |             print("Extracting Contents: \t", zip_file_local_extract_path)
35 |         except Exception:
36 |             print("Issue Extracting, Going to Skip :)")
37 |             return None
38 | 
39 |     return zip_file_local_extract_path
40 | 
41 | 
42 | def download_filings(start_year, end_year, output_directory):
43 |     quarters = ['q1', 'q2', 'q3', 'q4']
44 | 
45 |     zip_filepaths = []
46 | 
47 |     for year in range(start_year, end_year):  # note: range() stops before end_year
48 |         for quarter in quarters:
49 | 
50 |             url = rf'https://www.sec.gov/files/dera/data/financial-statement-data-sets/{year}{quarter}.zip'
51 | 
52 |             try:
53 | 
54 |                 # we can get the filename (basename) of the url using basename
55 |                 basename = os.path.basename(url)
56 | 
57 |                 print(basename)
58 | 
59 |                 zip_file_local_filepath = os.path.join(output_directory, basename)
60 | 
61 |                 print(zip_file_local_filepath)
62 | 
63 |                 zip_filepaths.append(zip_file_local_filepath)
64 | 
65 |                 if not os.path.exists(zip_file_local_filepath):
66 | 
67 |                     print(f"Downloading: \t{url}")
68 | 
69 |                     r = requests.get(url)
70 | 
71 |                     if r.status_code == 200:
72 | 
73 |                         print("Download Complete")
74 | 
75 |                         with open(zip_file_local_filepath, 'wb') as fd:
76 |                             fd.write(r.content)
77 | 
78 |                     else:
79 |                         print("Got an Error Code!")
80 | 
81 |                 else:
82 |                     print("It appears Zip File already exists", zip_file_local_filepath)
83 | 
84 |             except Exception as E:
85 |                 print("Error Downloading", url, E)
86 | 
87 |     return zip_filepaths
88 | 
89 | 
90 | def transform_data(numbers_filepath, submissions_filepath, df_sic_list, df_symbol_cik, metric="Revenues", form_type='10-'):
91 |     print("Transforming ", numbers_filepath)
92 | 
93 |     df_numbers = pd.read_csv(numbers_filepath, delimiter="\t")
94 | 
95 |     df_submissions = pd.read_csv(submissions_filepath, delimiter="\t")
96 | 
97 |     # convert sic to string
98 |     df_submissions['sic'] = df_submissions['sic'].astype('Int64').astype('str')
99 | 
100 |     df_submissions = df_submissions[['adsh', 'cik', 'name', 'sic', 'countryba', 'stprba', 'fye', 'form', 'period', 'filed', 'instance']]
101 | 
102 |     df_symbol_cik['symbol'] = df_symbol_cik['symbol'].str.upper()
103 | 
104 |     # create list of dataframe column names
105 |     submissions_columns = df_submissions.columns.tolist()
106 | 
107 |     # going to merge two dataframes into one
108 |     df_submissions_symbols = pd.merge(df_submissions, df_symbol_cik)
109 | 
110 |     # merge sic codes onto submission dataframe
111 |     df_submissions_symbols = pd.merge(df_submissions_symbols, df_sic_list, on="sic")
112 | 
113 |     # we can drop columns by name using drop
114 |     df_submissions_symbols = df_submissions_symbols.drop(columns=['instance'])
115 | 
116 |     new_submissions_columns = ["symbol", "industry_title"] + submissions_columns
117 | 
118 |     df_submissions_symbols = df_submissions_symbols.reindex(columns=new_submissions_columns)
119 | 
120 |     df_submissions_symbols = df_submissions_symbols[df_submissions_symbols['form'].str.contains(form_type, flags=re.IGNORECASE, regex=True)]
121 | 
122 |     df_submission_numbers = pd.merge(df_numbers, df_submissions_symbols, left_on='adsh', right_on='adsh', how='inner')
123 | 
124 |     new_column_order = ['cik',
125 |                         'symbol',
126 |                         'name',
127 |                         'sic',
128 |                         'industry_title',
129 |                         'countryba',
130 |                         'stprba',
131 |                         'fye',
132 |                         'form',
133 |                         'period',
134 |                         'filed',
135 |                         'adsh',
136 |                         'tag',
137 |                         'version',
138 |                         'coreg',
139 |                         'ddate',
140 |                         'qtrs',
141 |                         'uom',
142 |                         'value'
143 |                         ]
144 | 
145 |     # reorder columns
146 |     df_submission_numbers = df_submission_numbers.reindex(columns=new_column_order)
147 | 
148 |     # Group by: split-apply-combine
149 |     if metric:
150 |         df_values = df_submission_numbers[df_submission_numbers['tag'].isin([metric])]
151 |     else:
152 |         df_values = df_submission_numbers.copy()
153 | 
154 |     df_values = df_values.dropna(subset=['value'])
155 | 
156 |     # only show companies with 4 quarters (1 year) worth of data
157 |     df_values = df_values[df_values['qtrs'] == 4]
158 |     df_values = df_values[(df_values['uom'] == "USD") | (df_values['uom'] == "EUR")]
159 | 
160 |     df_values = df_values.sort_values('ddate', ascending=True)
161 | 
162 |     group = []
163 | 
164 |     for (symbol, qtrs), df_group in df_values.groupby(["symbol", "qtrs"]):
165 |         df_group = df_group.assign(pct_change=df_group['value'].pct_change())  # assign avoids SettingWithCopyWarning on the group slice
166 |         group.append(df_group)
167 | 
168 |     df_values_pct = pd.concat(group)
169 | 
170 |     df_values_pct = df_values_pct.sort_values('ddate', ascending=False)
171 | 
172 |     print("Done Transforming ", numbers_filepath)
173 | 
174 |     return df_values_pct
175 | 
176 | 
177 | def filter_ticker_list(df_submissions_symbols):
178 |     pycon_sponsors = [{'symbol': 'GOOG', 'name': 'ALPHABET INC.', 'sponsor_level': 'VISIONARY'},
179 |                       {'symbol': 'AMZN', 'name': 'AMAZON COM INC', 'sponsor_level': 'SUSTAINABILITY'},
180 |                       {'symbol': '#N/A', 'name': 'BLOOMBERG', 'sponsor_level': 'VISIONARY'},
181 |                       {'symbol': 'COF', 'name': 'CAPITAL ONE FINANCIAL CORP', 'sponsor_level': 'MAINTAINING'},
182 |                       {'symbol': 'GLW', 'name': 'CORNING INC', 'sponsor_level': 'MAINTAINING'},
183 |                       {'symbol': 'ESTC', 'name': 'ELASTIC N.V.', 'sponsor_level': 'PARTNER'},
184 |                       {'symbol': 'FB', 'name': 'FACEBOOK INC', 'sponsor_level': 'SUSTAINABILITY'},
185 |                       {'symbol': '#N/A', 'name': 'HUAWEI TECHNOLOGIES', 'sponsor_level': 'SUSTAINABILITY'},
186 |                       {'symbol': 'IBM', 'name': 'INTERNATIONAL BUSINESS MACHINES CORP', 'sponsor_level': 'CONTRIBUTING'},
187 |                       {'symbol': 'JPM', 'name': 'JPMORGAN CHASE & CO', 'sponsor_level': 'SUPPORTING'},
188 |                       {'symbol': 'MSFT', 'name': 'MICROSOFT CORP', 'sponsor_level': 'VISIONARY'},
189 |                       {'symbol': 'NFLX', 'name': 'NETFLIX INC', 'sponsor_level': 'PARTNER'},
190 |                       {'symbol': 'CRM', 'name': 'SALESFORCE.COM INC.', 'sponsor_level': 'SUSTAINABILITY'},
191 |                       {'symbol': 'WORK', 'name': 'SLACK TECHNOLOGIES INC.', 'sponsor_level': 'MAINTAINING'}]
192 | 
193 |     df_companies = pd.DataFrame(pycon_sponsors)
194 | 
195 |     ticker_list_pycon_sponsors = df_companies['symbol'].tolist()
196 | 
197 |     df_selected_submissions = df_submissions_symbols[df_submissions_symbols['symbol'].isin(ticker_list_pycon_sponsors)]
198 | 
199 |     new_submissions_columns = ['cik',
200 |                                'symbol',
201 |                                'name',
202 |                                'sic',
203 |                                'industry_title',
204 |                                'countryba',
205 |                                'stprba',
206 |                                'fye',
207 |                                'form',
208 |                                'period',
209 |                                'filed',
210 |                                'adsh'
211 |                                ]
212 | 
213 |     df_selected_submissions = df_selected_submissions.reindex(columns=new_submissions_columns)
214 | 
215 |     return df_selected_submissions
216 | 
217 | 
218 | def main(start_year, end_year):
219 |     url = 'https://www.sec.gov/include/ticker.txt'
220 | 
221 |     df_symbol_cik = pd.read_csv(url, delimiter="\t", names=['symbol', 'cik'])
222 | 
223 |     # standard industrial classification
224 |     sic_url = r'https://www.sec.gov/info/edgar/siccodes.htm'
225 |     # we can extract table from html by passing in url
226 |     sics_tables = pd.read_html(sic_url)
227 |     df_sic_list = sics_tables[0]
228 | 
229 |     # rename columns to lower, no spaces, and rename sic_code to sic
230 |     df_sic_list.columns = df_sic_list.columns.str.lower().str.replace(" ", "_").str.replace("sic_code", "sic")
231 | 
232 |     # convert sic column to string
233 |     df_sic_list['sic'] = df_sic_list['sic'].astype('Int64').astype('str')
234 | 
235 |     output_directory = os.path.join(current_directory, "zip-data")
236 | 
237 |     # create directory for zip files
238 |     if os.path.exists(output_directory):
239 |         print("Folder already Exists!")
240 |     else:
241 |         print("Folder doesn't exist")
242 |         os.mkdir(output_directory)
243 |         print("Created Directory!")
244 | 
245 |     zip_filepaths = download_filings(start_year, end_year, output_directory)
246 | 
247 |     zip_folders = []
248 | 
249 |     for zip_filepath in zip_filepaths:
250 |         zip_folder = extract_zip_contents(zip_filepath)
251 | 
252 |         if zip_folder:
253 |             zip_folders.append(zip_folder)
254 | 
255 |     # get list of all extracted files
256 |     files = glob.glob(os.path.join(output_directory, "*", "*.*"))  # os.path.join keeps the pattern cross-platform
257 | 
258 |     num_files = [file for file in files if "num.txt" in file]
259 |     sub_files = [file for file in files if "sub.txt" in file]
260 | 
261 |     pre_files = [file for file in files if "pre.txt" in file]
262 |     tag_files = [file for file in files if "tag.txt" in file]
263 |     readme_files = [file for file in files if "readme.htm" in file]
264 | 
265 |     num_files.sort(reverse=True)
266 |     sub_files.sort(reverse=True)
267 | 
268 |     if len(num_files) == len(sub_files):
269 |         sub_num_files = list(zip(sub_files, num_files))
270 | 
271 |     filings = []
272 | 
273 |     for sub_file, num_file in sub_num_files[1:5]:  # process a small sample of quarters
274 |         df_companies_pct_chg = transform_data(num_file, sub_file, df_sic_list, df_symbol_cik, metric="Revenues", form_type='10-')
275 | 
276 |         filings.append(df_companies_pct_chg)
277 | 
278 |     df_all_filings = pd.concat(filings)
279 | 
280 |     # df_all_filings = df_all_filings.dropna(subset=['pct_change'])
281 | 
282 |     # df_all_filings = df_all_filings[df_all_filings['pct_change'] > 0]
283 |     #
284 |     # df_all_filings = df_all_filings.drop_duplicates(keep='first', subset=['cik']).sort_values('value', ascending=False)
285 | 
286 |     df_all_filings.to_csv('all_filings.csv')
287 | 
288 | 
289 | if __name__ == "__main__":
290 |     start_year = 2020
291 |     end_year = 2022
292 | 
293 |     main(start_year, end_year)
294 | 
--------------------------------------------------------------------------------
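The grouped percent-change loop in transform_data above (split the frame into symbol/qtrs groups, call pct_change on each, collect the pieces, then pd.concat) can also be expressed with pandas' built-in grouped pct_change. A minimal equivalent sketch, assuming a df_values frame with the same 'symbol', 'qtrs', 'ddate', and 'value' columns (an alternative, not the repo's code):

    # sort first, exactly as the loop does: pct_change compares consecutive rows within a group
    df_values = df_values.sort_values('ddate', ascending=True)

    # one grouped operation replaces the explicit loop, the list, and pd.concat
    df_values['pct_change'] = df_values.groupby(['symbol', 'qtrs'])['value'].pct_change()

    df_values_pct = df_values.sort_values('ddate', ascending=False)

The result matches the loop's output, and keeping the grouping inside pandas avoids mutating group slices and is typically faster on large filing sets.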
243 | print("Created Directory!") 244 | 245 | zip_filepaths = download_filings(start_year, end_year, output_directory) 246 | 247 | zip_folders = [] 248 | 249 | for zip_filepath in zip_filepaths: 250 | zip_folder = extract_zip_contents(zip_filepath) 251 | 252 | if zip_folder: 253 | zip_folders.append(zip_folder) 254 | 255 | # get list of all extracted files 256 | files = glob.glob(output_directory + "\\*\\*.*") 257 | 258 | num_files = [file for file in files if "num.txt" in file] 259 | sub_files = [file for file in files if "sub.txt" in file] 260 | 261 | pre_files = [file for file in files if "pre.txt" in file] 262 | tag_files = [file for file in files if "tag.txt" in file] 263 | readme_files = [file for file in files if "readme.htm" in file] 264 | 265 | num_files.sort(reverse=True) 266 | sub_files.sort(reverse=True) 267 | 268 | if len(num_files) == len(sub_files): 269 | sub_num_files = list(zip(sub_files, num_files)) 270 | 271 | filings = [] 272 | 273 | for sub_file, num_file in sub_num_files[1:5]: 274 | df_companies_pct_chg = transform_data(num_file, sub_file, df_sic_list, df_symbol_cik, metric="Revenues", form_type='10-') 275 | 276 | filings.append(df_companies_pct_chg) 277 | 278 | df_all_filings = pd.concat(filings) 279 | 280 | # df_all_filings = df_all_filings.dropna(subset=['pct_change']) 281 | 282 | # df_all_filings = df_all_filings[df_all_filings['pct_change'] > 0] 283 | # 284 | # df_all_filings = df_all_filings.drop_duplicates(keep='first', subset=['cik']).sort_values('value', ascending=False) 285 | 286 | df_all_filings.to_csv('all_filings.csv') 287 | 288 | 289 | if __name__ == "__main__": 290 | start_year = 2020 291 | end_year = 2022 292 | 293 | main(start_year, end_year) 294 | -------------------------------------------------------------------------------- /section2_challenge.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Visit Awesome Public Datasets on Github 4 | 5 | https://github.com/awesomedata/awesome-public-datasets 6 | 7 | # Find a dataset you find interesting, easily downloadable, and format Pandas can work with 8 | 9 | # Create a Github Username 10 | 11 | # Think of an awesome project Name 12 | 13 | # Create a Github Repository 14 | 15 | # Create a new project structure using Cookiecutter 16 | 17 | # Commit and push your new project to Github 18 | 19 | # Write Code to Download the Dataset using Requests 20 | 21 | # Write Code to Transform & Analyze the Dataset using Pandas 22 | 23 | # Write Code to Visualize Dataset using Your favorite Pandas Visualization Library 24 | 25 | # Write Code to Display your Data in a Browser Using Flask 26 | 27 | # Refactor & clean up your code to make it easy to maintain and share 28 | 29 | # Setup Airflow Schedule to automatically download, transform, and output your results of your Analysis 30 | 31 | # Repeat, but with a different dataset 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.1.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:spreadsheets_to_dataframes/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [aliases] 21 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 0.1.0
3 | commit = True
4 | tag = True
5 | 
6 | [bumpversion:file:setup.py]
7 | search = version='{current_version}'
8 | replace = version='{new_version}'
9 | 
10 | [bumpversion:file:spreadsheets_to_dataframes/__init__.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | 
20 | [aliases]
21 | # Define setup.py command aliases here
22 | test = pytest
23 | 
24 | [tool:pytest]
25 | collect_ignore = ['setup.py']
26 | 
27 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """The setup script."""
5 | 
6 | from setuptools import setup, find_packages
7 | 
8 | with open('README.rst') as readme_file:
9 |     readme = readme_file.read()
10 | 
11 | with open('HISTORY.rst') as history_file:
12 |     history = history_file.read()
13 | 
14 | requirements = ['Click>=7.0', ]
15 | 
16 | setup_requirements = ['pytest-runner', ]
17 | 
18 | test_requirements = ['pytest>=3', ]
19 | 
20 | setup(
21 |     author="Ryan S. McCoy",
22 |     author_email='github@ryansmccoy.com',
23 |     python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*',
24 |     classifiers=[
25 |         'Development Status :: 2 - Pre-Alpha',
26 |         'Intended Audience :: Developers',
27 |         'License :: OSI Approved :: MIT License',
28 |         'Natural Language :: English',
29 |         "Programming Language :: Python :: 2",
30 |         'Programming Language :: Python :: 2.7',
31 |         'Programming Language :: Python :: 3',
32 |         'Programming Language :: Python :: 3.5',
33 |         'Programming Language :: Python :: 3.6',
34 |         'Programming Language :: Python :: 3.7',
35 |     ],
36 |     description="Examples from Presentation",
37 |     entry_points={
38 |         'console_scripts': [
39 |             'spreadsheets_to_dataframes=spreadsheets_to_dataframes.cli:main',
40 |         ],
41 |     },
42 |     install_requires=requirements,
43 |     license="MIT license",
44 |     long_description=readme + '\n\n' + history,
45 |     include_package_data=True,
46 |     keywords='spreadsheets_to_dataframes',
47 |     name='spreadsheets_to_dataframes',
48 |     packages=find_packages(include=['spreadsheets_to_dataframes', 'spreadsheets_to_dataframes.*']),
49 |     setup_requires=setup_requirements,
50 |     test_suite='tests',
51 |     tests_require=test_requirements,
52 |     url='https://github.com/ryansmccoy/spreadsheets_to_dataframes',
53 |     version='0.1.0',
54 |     zip_safe=False,
55 | )
56 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27, py35, py36, py37, flake8
3 | 
4 | [travis]
5 | python =
6 |     3.7: py37
7 |     3.6: py36
8 |     3.5: py35
9 |     2.7: py27
10 | 
11 | [testenv:flake8]
12 | basepython = python
13 | deps = flake8
14 | commands = flake8 spreadsheets_to_dataframes
15 | 
16 | [testenv]
17 | setenv =
18 |     PYTHONPATH = {toxinidir}
19 | deps =
20 |     -r{toxinidir}/requirements_dev.txt
21 | ; If you want to make tox run the tests with the same versions, create a
22 | ; requirements.txt with the pinned versions and uncomment the following line:
23 | ;     -r{toxinidir}/requirements.txt
24 | commands =
25 |     pip install -U pip
26 |     pytest --basetemp={envtmpdir}
27 | 
28 | 
--------------------------------------------------------------------------------
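One loose end worth noting: setup.py above registers a console script pointing at spreadsheets_to_dataframes.cli:main, but no cli module appears in this listing. A minimal sketch of a Click-based cli.py that would satisfy that entry point (the option and message are hypothetical, not the repo's actual CLI):

    import click


    @click.command()
    @click.option('--name', default='world', help='Name to greet.')
    def main(name):
        """Console script for spreadsheets_to_dataframes."""
        click.echo(f'Hello, {name}! The entry point is wired up.')


    if __name__ == '__main__':
        main()

With a module like this in place, installing the package puts a spreadsheets_to_dataframes command on the PATH that dispatches to main().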