├── .editorconfig
├── .gitignore
├── .travis.yml
├── 01-basics.ipynb
├── 01-presentation-example
│   ├── 01_simple_open.py
│   ├── 02_open_file.py
│   ├── 03_data_manipulation.py
│   ├── 04_perform_calculation.py
│   ├── 04_perform_calculation_no_comments.py
│   ├── 05_storing_data.py
│   ├── 06_pandas.py
│   └── 06_pandas_no_comments.py
├── 02-selenium-examples
│   └── download_calendar.py
├── 02-selenium-safari
│   ├── create_names.py
│   ├── gather_links_for_processing.py
│   ├── html_to_pdf.py
│   ├── main.py
│   ├── merge_pdf_files.py
│   ├── process_html_remove_junk.py
│   ├── requirements.txt
│   └── zip_folder.py
├── 02-webscrape-celery
│   ├── __init__.py
│   ├── basic_consumer.py
│   ├── basic_producer.py
│   ├── consumer.py
│   ├── data.html
│   ├── producer.py
│   └── urls.txt
├── 02-webscraping.ipynb
├── 03-tidy-data.ipynb
├── 04-other-analysis
│   ├── Autoregression_retail_sales.ipynb
│   ├── Dynamic Linear Regression Models in Python.ipynb
│   ├── example_pandas.py
│   └── read_sec.py
├── 04-pandas-other
│   └── pandas-selecting-rows.ipynb
├── 04-pandas.ipynb
├── 05-data-analysis.ipynb
├── 05-other-visualizations
│   ├── Visualization.ipynb
│   └── visualize-football-stadiums.ipynb
├── 06-data-visualizations.ipynb
├── 06-flask
│   └── flask-rss
│       ├── README.md
│       ├── main.py
│       ├── static
│       │   ├── css
│       │   │   ├── bootstrap-theme.css
│       │   │   ├── bootstrap-theme.css.map
│       │   │   ├── bootstrap-theme.min.css
│       │   │   ├── bootstrap-theme.min.css.map
│       │   │   ├── bootstrap.css
│       │   │   ├── bootstrap.css.map
│       │   │   ├── bootstrap.min.css
│       │   │   ├── bootstrap.min.css.map
│       │   │   ├── reader.css
│       │   │   └── style.css
│       │   ├── fonts
│       │   │   ├── glyphicons-halflings-regular.eot
│       │   │   ├── glyphicons-halflings-regular.svg
│       │   │   ├── glyphicons-halflings-regular.ttf
│       │   │   ├── glyphicons-halflings-regular.woff
│       │   │   └── glyphicons-halflings-regular.woff2
│       │   └── js
│       │       ├── bootstrap.js
│       │       ├── bootstrap.min.js
│       │       ├── jquery-2.2.0.js
│       │       └── npm.js
│       └── templates
│           ├── index.html
│           ├── layout.html
│           ├── notfound.html
│           ├── reader.html
│           └── table.html
├── 07-airflow
│   ├── README.md
│   ├── dags
│   │   ├── example_postgres.py
│   │   ├── sql
│   │   │   ├── stock_insert.sql
│   │   │   └── stock_schema.sql
│   │   ├── stock_analysis_dag.py
│   │   └── stocks.py
│   └── docker-compose.yml
├── AUTHORS.rst
├── CONTRIBUTING.rst
├── HISTORY.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── data
│   ├── 20180806_ALL_EQUITY_meetup.csv
│   ├── WA_Fn-UseC_-HR-Employee-Attrition.xlsx
│   ├── WMT_US.csv
│   ├── WMT_US_pandas.csv
│   ├── WMT_US_updated.csv
│   ├── billboard.csv
│   ├── country_timeseries.csv
│   ├── fortune_1000.csv
│   ├── gapminder.tsv
│   ├── global_equity_historic_sales_1999_2018_usd_all_meetup.csv
│   ├── linkedin_industries.html
│   ├── msft_stock_key_data.csv
│   ├── pew.csv
│   ├── portfolio.csv
│   ├── pycon_sponsor_levels.csv
│   ├── pycon_sponsors.csv
│   ├── retail_sales.csv
│   ├── sponsors_vlookup.csv
│   ├── stl.csv
│   ├── stlcom_larget_employers.xlsx
│   ├── stlregionalchamber_largest_employers_.xlsx
│   ├── stock_data_simple.csv
│   ├── stock_data_simple.xlsx
│   ├── stock_description.csv
│   ├── table1.csv
│   ├── table2.csv
│   ├── table3.csv
│   ├── table4a.csv
│   ├── table4b.csv
│   └── weather.csv
├── docs
│   ├── Makefile
│   ├── authors.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── history.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── make.bat
│   ├── readme.rst
│   └── usage.rst
├── img
│   ├── basics
│   │   ├── basic_python_style.png
│   │   ├── built-in_data_structures.png
│   │   ├── built-in_functions.png
│   │   ├── built-in_len.png
│   │   ├── calculations.png
│   │   ├── cell.png
│   │   ├── cell_ex.png
│   │   ├── cell_types.png
│   │   ├── cells.png
│   │   ├── comments.png
│   │   ├── data-types.png
│   │   ├── data_collections.png
│   │   ├── excel-built-in-string.png
│   │   ├── excel-built-in.png
│   │   ├── excel-pre-installed-add-ins.png
│   │   ├── jupyter-method.png
│   │   ├── pycharm-function-pop.png
│   │   ├── pycharm-function-popup.png
│   │   ├── pycharm-methods.png
│   │   ├── pycon-files.png
│   │   ├── pycon_sponsor_levels.png
│   │   ├── pycon_sponsors.png
│   │   ├── python-pre-installed-add-ins.png
│   │   ├── reserved_words.png
│   │   ├── standard-library-import.png
│   │   ├── standard-library.png
│   │   └── vscode-method.png
│   ├── dataframe.png
│   ├── dataframe_components.png
│   ├── excel_table.png
│   ├── pandas_dataframe.png
│   └── split_apply_combine.png
├── requirements_dev.txt
├── section1-01-basics_but_important_stuff.ipynb
├── section1-02-files_lists_dictionaries.ipynb
├── section1_challenge_1.py
├── section1_challenge_1_answer.py
├── section1_challenge_2.py
├── section1_challenge_2_answer.py
├── section1_challenge_3.py
├── section1_challenge_3_answer.py
├── section2-01-real-world-example.py
├── section2-02-real-world-example-refactored.py
├── section2_challenge.rst
├── setup.cfg
├── setup.py
└── tox.ini
/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 |
3 | root = true
4 |
5 | [*]
6 | indent_style = space
7 | indent_size = 4
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 |
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 |
17 | [LICENSE]
18 | insert_final_newline = false
19 |
20 | [Makefile]
21 | indent_style = tab
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # Jupyter Notebook
72 | .ipynb_checkpoints
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # SageMath parsed files
81 | *.sage.py
82 |
83 | # dotenv
84 | .env
85 |
86 | # virtualenv
87 | .venv
88 | venv/
89 | ENV/
90 |
91 | # Spyder project settings
92 | .spyderproject
93 | .spyproject
94 |
95 | # Rope project settings
96 | .ropeproject
97 |
98 | # mkdocs documentation
99 | /site
100 |
101 | # mypy
102 | .mypy_cache/
103 | venv/
104 | stock_algo/
105 | sec.gov.zip
106 | .ipynb_checkpoints/
107 | .idea/
108 | zip-data/
109 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # Config file for automatic testing at travis-ci.org
2 |
3 | language: python
4 | python:
5 | - 3.7
6 | - 3.6
7 | - 3.5
8 | - 2.7
9 |
10 | # Command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
11 | install: pip install -U tox-travis
12 |
13 | # Command to run tests, e.g. python setup.py test
14 | script: tox
15 |
16 |
17 |
--------------------------------------------------------------------------------
/01-basics.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Intro to Python\n",
8 | "\n",
9 | "## string"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {
16 | "pycharm": {
17 | "is_executing": false,
18 | "name": "#%%\n"
19 | },
20 | "scrolled": true
21 | },
22 | "outputs": [],
23 | "source": [
24 | "tickers = \"GOOG MSFT IBM TSLA\""
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "### Save String to File"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {
38 | "pycharm": {
39 | "name": "#%%\n"
40 | }
41 | },
42 | "outputs": [],
43 | "source": [
44 | "# write to file\n",
45 | "f = open('tickers.txt', 'wt')\n",
46 | "f.write(tickers)\n",
47 | "f.close()"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "### Tuple"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {
61 | "pycharm": {
62 | "name": "#%%\n"
63 | }
64 | },
65 | "outputs": [],
66 | "source": [
67 | "# tuple\n",
68 | "tickers = (\"GOOG\",\n",
69 | " \"MSFT\",\n",
70 | " \"IBM\",\n",
71 | " \"TSLA\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "### For-Loop over tuple and sum values"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "metadata": {
85 | "pycharm": {
86 | "name": "#%%\n"
87 | }
88 | },
89 | "outputs": [
90 | {
91 | "name": "stdout",
92 | "output_type": "stream",
93 | "text": [
94 | "(1, 1)\n",
95 | "The sum of the tuple:\t2\n",
96 | "(2, 2)\n",
97 | "The sum of the tuple:\t4\n",
98 | "(3, 3)\n",
99 | "The sum of the tuple:\t6\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "list_of_tuples = [(1,1),\n",
105 | " (2,2),\n",
106 | " (3,3)]\n",
107 | "\n",
108 | "for values in list_of_tuples:\n",
109 | " print(f\"{values}\")\n",
110 | " total = sum(values)\n",
111 | " print(f\"The sum of the tuple:\\t{total}\")"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "## For-Loop Over tickers"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 5,
124 | "metadata": {
125 | "pycharm": {
126 | "name": "#%%\n"
127 | }
128 | },
129 | "outputs": [
130 | {
131 | "name": "stdout",
132 | "output_type": "stream",
133 | "text": [
134 | "GOOG\n",
135 | "MSFT\n",
136 | "IBM\n",
137 | "TSLA\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "# loolist\n",
143 | "tickers = [\"GOOG\",\n",
144 | " \"MSFT\",\n",
145 | " \"IBM\",\n",
146 | " \"TSLA\"]\n",
147 | "\n",
148 | "for ticker in tickers:\n",
149 | " print(ticker)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "## For Loop - String Formatting"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 6,
162 | "metadata": {
163 | "pycharm": {
164 | "name": "#%%\n"
165 | }
166 | },
167 | "outputs": [
168 | {
169 | "name": "stdout",
170 | "output_type": "stream",
171 | "text": [
172 | "Ticker: GOOG\n",
173 | "Ticker: MSFT\n",
174 | "Ticker: IBM\n",
175 | "Ticker: TSLA\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | "# loop through list\n",
181 | "tickers = [\"GOOG\",\"MSFT\",\"IBM\",\"TSLA\"]\n",
182 | "\n",
183 | "for ticker in tickers:\n",
184 | " print(f\"Ticker: {ticker}\")"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "metadata": {},
190 | "source": []
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 7,
195 | "metadata": {
196 | "pycharm": {
197 | "name": "#%%\n"
198 | }
199 | },
200 | "outputs": [],
201 | "source": [
202 | "tickers = set([\"GOOG\",\n",
203 | " \"MSFT\",\n",
204 | " \"IBM\",\n",
205 | " \"TSLA\"])\n",
206 | "\n",
207 | "tickers = (\"GOOG\",\n",
208 | " \"MSFT\",\n",
209 | " \"IBM\",\n",
210 | " \"TSLA\")"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "# Tuple"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 9,
223 | "metadata": {
224 | "pycharm": {
225 | "name": "#%%\n"
226 | }
227 | },
228 | "outputs": [],
229 | "source": [
230 | "tickers = tuple([\"GOOG\",\n",
231 | " \"MSFT\",\n",
232 | " \"IBM\",\n",
233 | " \"TSLA\"])"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "# Dictionary"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 11,
246 | "metadata": {
247 | "pycharm": {
248 | "name": "#%%\n"
249 | }
250 | },
251 | "outputs": [],
252 | "source": [
253 | "tickers = {1: \"GOOG\",\n",
254 | " 2: \"MSFT\",\n",
255 | " 3: \"IBM\",\n",
256 | " 4: \"TSLA\"}"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "# Opening Files"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 12,
269 | "metadata": {
270 | "pycharm": {
271 | "name": "#%%\n"
272 | }
273 | },
274 | "outputs": [
275 | {
276 | "name": "stdout",
277 | "output_type": "stream",
278 | "text": [
279 | "Ticker,Date,Shares,Price\n",
280 | "\n",
281 | "GOOG,2019-10-01,100,1\n",
282 | "\n",
283 | "MSFT,2019-10-01,200,1\n",
284 | "\n",
285 | "IBM,2019-10-01,500,1\n",
286 | "\n",
287 | "TSLA,2019-10-01,300,1\n",
288 | "\n",
289 | "\n",
290 | "\n"
291 | ]
292 | }
293 | ],
294 | "source": [
295 | "import os\n",
296 | "# Contents of portfolio.csv:\n",
297 | "\"\"\"\n",
298 | "Ticker,Date,Shares,Price\n",
299 | "GOOG,2019-10-01,100,1\n",
300 | "MSFT,2019-10-01,200,1\n",
301 | "IBM,2019-10-01,500,1\n",
302 | "TSLA,2019-10-01,300,1\n",
303 | "\"\"\"\n",
304 | "\n",
305 | "# basic - open a file\n",
306 | "file = open('data/portfolio.csv', 'r')\n",
307 | "\n",
308 | "# print each line\n",
309 | "for line in file:\n",
310 | " print(line)\n",
311 | "\n",
312 | "# don't forget to close the file\n",
313 | "file.close()"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "metadata": {},
319 | "source": [
320 | "# Opening Files - Preferred Way"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 13,
326 | "metadata": {
327 | "pycharm": {
328 | "is_executing": false,
329 | "name": "#%%\n"
330 | }
331 | },
332 | "outputs": [
333 | {
334 | "name": "stdout",
335 | "output_type": "stream",
336 | "text": [
337 | "['GOOG', '2019-10-01', '100', '1']\n",
338 | "['MSFT', '2019-10-01', '200', '1']\n",
339 | "['IBM', '2019-10-01', '500', '1']\n",
340 | "['TSLA', '2019-10-01', '300', '1']\n",
341 | "['']\n"
342 | ]
343 | }
344 | ],
345 | "source": [
346 | "### Better way to a file\n",
347 | "### with automatically closes the file for you\n",
348 | "\n",
349 | "with open('data/portfolio.csv', 'r') as f:\n",
350 | " headers = next(f) # skip a single of input\n",
351 | " for line in f:\n",
352 | " line = line.strip() #strip the whitespace\n",
353 | " parts = line.split(\",\")\n",
354 | " print(parts)\n"
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "metadata": {},
360 | "source": [
361 | "# Example of indexing into lists and if statement"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": 14,
367 | "metadata": {
368 | "pycharm": {
369 | "is_executing": false,
370 | "name": "#%%\n"
371 | }
372 | },
373 | "outputs": [
374 | {
375 | "name": "stdout",
376 | "output_type": "stream",
377 | "text": [
378 | "Ticker:GOOG\tDate: 2019-10-01\tShares: 100\tPrice: 1\n",
379 | "\n",
380 | "Ticker:MSFT\tDate: 2019-10-01\tShares: 200\tPrice: 1\n",
381 | "\n",
382 | "Ticker:IBM\tDate: 2019-10-01\tShares: 500\tPrice: 1\n",
383 | "\n",
384 | "Ticker:TSLA\tDate: 2019-10-01\tShares: 300\tPrice: 1\n",
385 | "\n"
386 | ]
387 | }
388 | ],
389 | "source": [
390 | "\n",
391 | "with open(r'data/portfolio.csv', 'r') as f:\n",
392 | " headers = next(f) # skip a single line of input, or skip header\n",
393 | " for line in f:\n",
394 | " parts = line.split(\",\")\n",
395 | " # check if number of items in list greater than 1\n",
396 | " # this will skip lines with only 1 element\n",
397 | " if len(parts) > 1:\n",
398 | " ticker = parts[0] # take the first item in the list\n",
399 | " date = parts[1] # take the second item\n",
400 | " shares = parts[2]\n",
401 | " price = parts[3]\n",
402 | " # f-strings formatting\n",
403 | " print(f\"Ticker:{ticker}\\tDate: {date}\\tShares: {shares}\\tPrice: {price}\")"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {},
409 | "source": [
410 | "# \"Batteries included\" with csv module"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 15,
416 | "metadata": {
417 | "pycharm": {
418 | "is_executing": false,
419 | "name": "#%%\n"
420 | }
421 | },
422 | "outputs": [
423 | {
424 | "name": "stdout",
425 | "output_type": "stream",
426 | "text": [
427 | "[{'date': '2019-10-01', 'price': '1', 'shares': '100', 'ticker': 'GOOG'},\n",
428 | " {'date': '2019-10-01', 'price': '1', 'shares': '200', 'ticker': 'MSFT'},\n",
429 | " {'date': '2019-10-01', 'price': '1', 'shares': '500', 'ticker': 'IBM'},\n",
430 | " {'date': '2019-10-01', 'price': '1', 'shares': '300', 'ticker': 'TSLA'}]\n"
431 | ]
432 | }
433 | ],
434 | "source": [
435 | "import csv\n",
436 | "from pprint import pprint\n",
437 | "\n",
438 | "portfolio = list() # need to create a list before you try using it\n",
439 | "# or, more common way to create list\n",
440 | "portfolio = [] # create a list to store tickers\n",
441 | "\n",
442 | "with open(r'data/portfolio.csv', 'r') as f:\n",
443 | " rows = csv.reader(f)\n",
444 | " headers = next(f) # skip a single of input\n",
445 | " for row in rows:\n",
446 | " if len(row) > 1:\n",
447 | " record = {\n",
448 | " 'ticker' : row[0],\n",
449 | " 'date' : row[1],\n",
450 | " 'shares' : row[2],\n",
451 | " 'price': row[3]\n",
452 | " }\n",
453 | " portfolio.append(record)\n",
454 | "\n",
455 | "pprint(portfolio)"
456 | ]
457 | },
458 | {
459 | "cell_type": "markdown",
460 | "metadata": {},
461 | "source": [
462 | "# Create a function that takes a filename and returns the contents"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 16,
468 | "metadata": {
469 | "pycharm": {
470 | "is_executing": false,
471 | "name": "#%%\n"
472 | }
473 | },
474 | "outputs": [
475 | {
476 | "name": "stdout",
477 | "output_type": "stream",
478 | "text": [
479 | "[{'date': '2019-10-01', 'price': '1', 'shares': '100', 'ticker': 'GOOG'},\n",
480 | " {'date': '2019-10-01', 'price': '1', 'shares': '200', 'ticker': 'MSFT'},\n",
481 | " {'date': '2019-10-01', 'price': '1', 'shares': '500', 'ticker': 'IBM'},\n",
482 | " {'date': '2019-10-01', 'price': '1', 'shares': '300', 'ticker': 'TSLA'}]\n"
483 | ]
484 | }
485 | ],
486 | "source": [
487 | "import csv\n",
488 | "from pprint import pprint\n",
489 | "\n",
490 | "def read_portfolio(filename):\n",
491 | "\n",
492 | " portfolio = list() # create a list to store tickers\n",
493 | "\n",
494 | " with open(filename, 'r') as f:\n",
495 | " rows = csv.reader(f)\n",
496 | " headers = next(f) # skip a single of input\n",
497 | " for row in rows:\n",
498 | " if len(row) > 1:\n",
499 | " record = {\n",
500 | " 'ticker' : row[0],\n",
501 | " 'date' : row[1],\n",
502 | " 'shares' : row[2],\n",
503 | " 'price': row[3]\n",
504 | " }\n",
505 | " portfolio.append(record)\n",
506 | " return portfolio\n",
507 | "\n",
508 | "portfolio = read_portfolio(r'data/portfolio.csv')\n",
509 | "\n",
510 | "pprint(portfolio)\n"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {
516 | "pycharm": {
517 | "name": "#%% md\n"
518 | }
519 | },
520 | "source": [
521 | "# SQL Connectivity"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": 17,
527 | "metadata": {
528 | "pycharm": {
529 | "name": "#%%\n"
530 | }
531 | },
532 | "outputs": [
533 | {
534 | "name": "stdout",
535 | "output_type": "stream",
536 | "text": [
537 | "2020-05-14 14:47:03.454172\n",
538 | "2019-11-01 00:00:00\n",
539 | "2020-05-14 14:47:03.455172\n"
540 | ]
541 | }
542 | ],
543 | "source": [
544 | "# $ pip install sqlalchemy\n",
545 | "# sqlalchemy_uri = \"dialect+driver://user:password@host:port/dbname\"\n",
546 | "import sqlalchemy as sa\n",
547 | "\n",
548 | "conn = sa.create_engine('sqlite://')\n",
549 | "\n",
550 | "conn.execute('''CREATE TABLE zoo\n",
551 | " (critter VARCHAR(20) PRIMARY KEY,\n",
552 | " count INT,\n",
553 | " damages FLOAT)''')\n",
554 | "\n",
555 | "import datetime\n",
556 | "\n",
557 | "today = datetime.datetime.today()\n",
558 | "print(today)\n",
559 | "\n",
560 | "today = datetime.datetime.strptime(\"11/01/2019\",\"%m/%d/%Y\" )\n",
561 | "print(today)\n",
562 | "\n",
563 | "today = datetime.datetime.now()\n",
564 | "print(today)"
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "outputs": [],
571 | "source": [],
572 | "metadata": {
573 | "collapsed": false,
574 | "pycharm": {
575 | "name": "#%%\n"
576 | }
577 | }
578 | },
579 | {
580 | "cell_type": "code",
581 | "execution_count": null,
582 | "metadata": {},
583 | "outputs": [],
584 | "source": []
585 | }
586 | ],
587 | "metadata": {
588 | "kernelspec": {
589 | "display_name": "Python 3",
590 | "language": "python",
591 | "name": "python3"
592 | },
593 | "language_info": {
594 | "codemirror_mode": {
595 | "name": "ipython",
596 | "version": 3
597 | },
598 | "file_extension": ".py",
599 | "mimetype": "text/x-python",
600 | "name": "python",
601 | "nbconvert_exporter": "python",
602 | "pygments_lexer": "ipython3",
603 | "version": "3.7.7"
604 | }
605 | },
606 | "nbformat": 4,
607 | "nbformat_minor": 1
608 | }
--------------------------------------------------------------------------------
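A note on the csv cells above: they build each portfolio record by hand from csv.reader rows. The standard library's csv.DictReader does that header-to-dict mapping automatically (keys are taken from the header row as-is). A minimal sketch against the same data/portfolio.csv:

import csv
from pprint import pprint

# DictReader keys each row by the header row, so there is no manual
# header skip or record-building (blank lines are skipped for us too)
with open('data/portfolio.csv', 'r') as f:
    portfolio = list(csv.DictReader(f))

pprint(portfolio)
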
/01-presentation-example/01_simple_open.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | filename = r'data\WMT_US.csv'
4 |
5 | f = open(filename, 'r')
6 |
7 | print(f)
8 |
9 | data = f.read()
10 |
11 | print(data)
12 |
13 | f.close()
14 |
15 | f = open(filename, 'r') # open file
16 |
17 | for line in f:
18 | print(line)
19 |
20 | f.close() # close file
21 |
--------------------------------------------------------------------------------
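A variation on the script above: the with statement closes the file even if an exception is raised mid-loop, matching the "preferred way" shown in 01-basics.ipynb.

filename = r'data\WMT_US.csv'

# the context manager closes the file automatically, even on error
with open(filename, 'r') as f:
    for line in f:
        print(line.rstrip('\n'))  # rstrip avoids print's doubled blank lines
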
/01-presentation-example/02_open_file.py:
--------------------------------------------------------------------------------
1 |
2 | import csv
3 |
4 | filename = r'data\WMT_US.csv'
5 |
6 | total = 0.0
7 |
8 | with open(filename, 'r') as f:
9 | rows = csv.reader(f)
10 |
11 | # save header row
12 | header = next(f)
13 | # and skip to next row
14 |
15 | for row in rows:
16 | print(row)
17 |
18 |
--------------------------------------------------------------------------------
/01-presentation-example/03_data_manipulation.py:
--------------------------------------------------------------------------------
1 |
2 | import csv
3 | from datetime import datetime
4 |
5 | filename = r'data\WMT_US.csv'
6 |
7 | with open(filename, 'r') as f:
8 | rows = csv.reader(f)
9 |
10 | # skip header row
11 | header = next(f)
12 |
13 | for row in rows:
14 | row[2] = datetime.strptime(row[2], "%m/%d/%Y")
15 |
16 | # convert string to integer
17 | row[3] = int(row[3])
18 | row[4] = int(row[4])
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/01-presentation-example/04_perform_calculation.py:
--------------------------------------------------------------------------------
1 |
2 | import csv
3 | from datetime import datetime
4 |
5 | filename = r'data\WMT_US.csv'
6 |
7 | records = []
8 |
9 | with open(filename, 'r') as f:
10 | rows = csv.reader(f)
11 |
12 | # skip header row
13 | header = next(f)
14 |
15 | for row in rows:
16 | # print(row)
17 | # ['WMT US', 'WAL-MART STORES INC', '12/31/2014', '476293988352', '460271988736']
18 |
19 | # convert string to date object
20 | row_date = datetime.strptime(row[2], "%m/%d/%Y")
21 | # print(row_date)
22 | # 2003-12-31 00:00:00
23 | # gives us ability to ask for year
24 | row_date_year = row_date.year
25 |
26 | # need to convert sales and expenses values from string to integer
27 |         # so we can perform mathematical operations
28 | row_sales = int(row[3])
29 | row_expenses = int(row[4])
30 |
31 | # perform profit calculation
32 | profit = row_sales - row_expenses
33 |
34 | print(f"{row_date_year} Profit = {profit:,}")
35 |
36 | """
37 | Output:
38 |
39 | 2014 Profit = 16,021,999,616
40 | 2013 Profit = 16,999,000,064
41 | 2012 Profit = 15,699,000,320
42 | 2011 Profit = 16,389,000,192
43 | 2010 Profit = 14,334,999,552
44 | 2009 Profit = 13,400,000,512
45 | 2008 Profit = 12,730,999,808
46 | 2007 Profit = 11,283,999,744
47 | 2006 Profit = 11,230,999,552
48 | 2005 Profit = 10,266,999,808
49 | 2004 Profit = 9,054,000,128
50 | 2003 Profit = 7,954,999,808
51 | """
52 |
53 |
--------------------------------------------------------------------------------
/01-presentation-example/04_perform_calculation_no_comments.py:
--------------------------------------------------------------------------------
1 |
2 | import csv
3 | from datetime import datetime
4 |
5 | filename = r'data\WMT_US.csv'
6 |
7 | with open(filename, 'r') as f:
8 | rows = csv.reader(f)
9 | header = next(f)
10 |
11 | for row in rows:
12 | row_date_year = datetime.strptime(row[2], "%m/%d/%Y").year
13 |
14 | row_sales = int(row[3])
15 | row_expenses = int(row[4])
16 |
17 | profit = row_sales - row_expenses
18 |
19 | print(f"{row_date_year} Profit = {profit:,}")
20 |
21 | """
22 | Output:
23 |
24 | 2014 Profit = 16,021,999,616
25 | 2013 Profit = 16,999,000,064
26 | 2012 Profit = 15,699,000,320
27 | 2011 Profit = 16,389,000,192
28 | 2010 Profit = 14,334,999,552
29 | 2009 Profit = 13,400,000,512
30 | 2008 Profit = 12,730,999,808
31 | 2007 Profit = 11,283,999,744
32 | 2006 Profit = 11,230,999,552
33 | 2005 Profit = 10,266,999,808
34 | 2004 Profit = 9,054,000,128
35 | 2003 Profit = 7,954,999,808
36 | """
37 |
38 |
--------------------------------------------------------------------------------
/01-presentation-example/05_storing_data.py:
--------------------------------------------------------------------------------
1 |
2 | import csv
3 | from datetime import datetime
4 | from pprint import pprint
5 |
6 | filename = r'data\WMT_US.csv'
7 |
8 | records = []
9 |
10 | with open(filename, 'r') as f:
11 | rows = csv.reader(f)
12 |
13 | # skip header row
14 | header = next(f)
15 |
16 | for row in rows:
17 | row[2] = datetime.strptime(row[2], "%m/%d/%Y")
18 | row[3] = int(row[3])
19 | row[4] = int(row[4])
20 | # perform calculation
21 | profit = row[3] - row[4]
22 |
23 | record = {
24 | "ticker": row[0],
25 | "name": row[1],
26 | "date": row[2],
27 | "sales": row[3],
28 | "expenses": row[4],
29 | "profit": profit
30 | }
31 |
32 | records.append(record)
33 |
34 | pprint(records)
35 |
--------------------------------------------------------------------------------
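Once the rows are stored as a list of dicts, they can be queried directly. A small sketch, assuming records as built above:

# find the most profitable year with max() and a key function
best = max(records, key=lambda r: r["profit"])
print(f"Most profitable year: {best['date'].year}, profit {best['profit']:,}")
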
/01-presentation-example/06_pandas.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas as pd
3 |
4 | pd.set_option('display.float_format', lambda x: f'{x:.5f}')
5 | pd.set_option('display.max_columns', 100)
6 | pd.set_option('display.max_rows', 100)
7 | pd.set_option('display.width', 600)
8 |
9 | filename = r'data\WMT_US.csv'
10 |
11 | df = pd.read_csv(filename)
12 |
13 | # check the data types of each columns
14 | print(df.dtypes)
15 |
16 | """
17 | Ticker object
18 | Company Name object
19 | Year End object
20 | Total Sales int64
21 | Total Expenses int64
22 | dtype: object
23 | """
24 |
25 | # convert the date column to python date object
26 | # which makes it easier to work with
27 | df['Year End'] = pd.to_datetime(df['Year End'])
28 |
29 | # print(df.dtypes)
30 | """
31 | Ticker object
32 | Company Name object
33 | Year End datetime64[ns]
34 | Total Sales int64
35 | Total Expenses int64
36 | dtype: object
37 |
38 | """
39 |
40 | # calculate total profit
41 | df['Total Profit'] = df['Total Sales'] - df['Total Expenses']
42 |
43 | # print(df)
44 | """
45 | Ticker Company Name Year End Total Sales Total Expenses Total Profit
46 | 0 WMT US WAL-MART STORES INC 2014-12-31 476293988352 460271988736 16021999616
47 | 1 WMT US WAL-MART STORES INC 2013-12-31 469162000384 452163000320 16999000064
48 | 2 WMT US WAL-MART STORES INC 2012-12-31 446950014976 431251014656 15699000320
49 | 3 WMT US WAL-MART STORES INC 2011-12-31 421849006080 405460005888 16389000192
50 | 4 WMT US WAL-MART STORES INC 2010-12-31 408214011904 393879012352 14334999552
51 | 5 WMT US WAL-MART STORES INC 2009-12-31 405606989824 392206989312 13400000512
52 | 6 WMT US WAL-MART STORES INC 2008-12-31 378798997504 366067997696 12730999808
53 | 7 WMT US WAL-MART STORES INC 2007-12-31 348650012672 337366012928 11283999744
54 | 8 WMT US WAL-MART STORES INC 2006-12-31 312426987520 301195987968 11230999552
55 | 9 WMT US WAL-MART STORES INC 2005-12-31 287989006336 277722006528 10266999808
56 | 10 WMT US WAL-MART STORES INC 2004-12-31 256329007104 247275006976 9054000128
57 | 11 WMT US WAL-MART STORES INC 2003-12-31 229615992832 221660993024 7954999808
58 | """
59 |
60 | df['Profit Margin'] = (df['Total Profit'] / df['Total Sales']) * 100
61 |
62 | # print(df)
63 | """
64 | Ticker Company Name Year End Total Sales Total Expenses Total Profit Profit Margin
65 | 0 WMT US WAL-MART STORES INC 2014-12-31 476293988352 460271988736 16021999616 3.36389
66 | 1 WMT US WAL-MART STORES INC 2013-12-31 469162000384 452163000320 16999000064 3.62327
67 | 2 WMT US WAL-MART STORES INC 2012-12-31 446950014976 431251014656 15699000320 3.51247
68 | 3 WMT US WAL-MART STORES INC 2011-12-31 421849006080 405460005888 16389000192 3.88504
69 | 4 WMT US WAL-MART STORES INC 2010-12-31 408214011904 393879012352 14334999552 3.51164
70 | 5 WMT US WAL-MART STORES INC 2009-12-31 405606989824 392206989312 13400000512 3.30369
71 | 6 WMT US WAL-MART STORES INC 2008-12-31 378798997504 366067997696 12730999808 3.36089
72 | 7 WMT US WAL-MART STORES INC 2007-12-31 348650012672 337366012928 11283999744 3.23648
73 | 8 WMT US WAL-MART STORES INC 2006-12-31 312426987520 301195987968 11230999552 3.59476
74 | 9 WMT US WAL-MART STORES INC 2005-12-31 287989006336 277722006528 10266999808 3.56507
75 | 10 WMT US WAL-MART STORES INC 2004-12-31 256329007104 247275006976 9054000128 3.53218
76 | 11 WMT US WAL-MART STORES INC 2003-12-31 229615992832 221660993024 7954999808 3.46448
77 | """
78 |
79 | # percent change needs to be ascending dates
80 | df.sort_values("Year End", inplace=True)
81 | df['Sales Growth YoY %'] = df['Total Sales'].pct_change() * 100
82 |
83 | # print(df)
84 | """
85 | Ticker Company Name Year End Total Sales Total Expenses Total Profit Profit Margin Sales Growth YoY %
86 | 11 WMT US WAL-MART STORES INC 2003-12-31 229615992832 221660993024 7954999808 3.46448 nan
87 | 10 WMT US WAL-MART STORES INC 2004-12-31 256329007104 247275006976 9054000128 3.53218 11.63378
88 | 9 WMT US WAL-MART STORES INC 2005-12-31 287989006336 277722006528 10266999808 3.56507 12.35131
89 | 8 WMT US WAL-MART STORES INC 2006-12-31 312426987520 301195987968 11230999552 3.59476 8.48573
90 | 7 WMT US WAL-MART STORES INC 2007-12-31 348650012672 337366012928 11283999744 3.23648 11.59408
91 | 6 WMT US WAL-MART STORES INC 2008-12-31 378798997504 366067997696 12730999808 3.36089 8.64735
92 | 5 WMT US WAL-MART STORES INC 2009-12-31 405606989824 392206989312 13400000512 3.30369 7.07710
93 | 4 WMT US WAL-MART STORES INC 2010-12-31 408214011904 393879012352 14334999552 3.51164 0.64275
94 | 3 WMT US WAL-MART STORES INC 2011-12-31 421849006080 405460005888 16389000192 3.88504 3.34016
95 | 2 WMT US WAL-MART STORES INC 2012-12-31 446950014976 431251014656 15699000320 3.51247 5.95024
96 | 1 WMT US WAL-MART STORES INC 2013-12-31 469162000384 452163000320 16999000064 3.62327 4.96968
97 | 0 WMT US WAL-MART STORES INC 2014-12-31 476293988352 460271988736 16021999616 3.36389 1.52015
98 | """
99 |
100 | new_filename = filename.replace(".csv", "_pandas.csv")
101 |
102 | df.to_csv(new_filename)
103 |
--------------------------------------------------------------------------------
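A small alternative to the separate pd.to_datetime() step above: pandas can parse the date column while reading. A sketch against the same file:

import pandas as pd

# parse 'Year End' at read time instead of converting after loading
df = pd.read_csv(r'data\WMT_US.csv', parse_dates=['Year End'])
print(df.dtypes)  # 'Year End' now shows as datetime64[ns]
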
/01-presentation-example/06_pandas_no_comments.py:
--------------------------------------------------------------------------------
1 |
2 | import pandas as pd
3 |
4 | input_filename = r'data\WMT_US.csv'
5 | output_filename = r'data\WMT_US_output.csv'
6 |
7 | df = pd.read_csv(input_filename)
8 |
9 | df['Total Profit'] = df['Total Sales'] - df['Total Expenses']
10 |
11 | df.to_csv(output_filename)
12 |
13 | print(df)
14 | """
15 | Ticker Company Name Year End Total Sales Total Expenses Total Profit
16 | 0 WMT US WAL-MART STORES INC 12/31/2014 476293988352 460271988736 16021999616
17 | 1 WMT US WAL-MART STORES INC 12/31/2013 469162000384 452163000320 16999000064
18 | 2 WMT US WAL-MART STORES INC 12/31/2012 446950014976 431251014656 15699000320
19 | 3 WMT US WAL-MART STORES INC 12/31/2011 421849006080 405460005888 16389000192
20 | 4 WMT US WAL-MART STORES INC 12/31/2010 408214011904 393879012352 14334999552
21 | 5 WMT US WAL-MART STORES INC 12/31/2009 405606989824 392206989312 13400000512
22 | 6 WMT US WAL-MART STORES INC 12/31/2008 378798997504 366067997696 12730999808
23 | 7 WMT US WAL-MART STORES INC 12/31/2007 348650012672 337366012928 11283999744
24 | 8 WMT US WAL-MART STORES INC 12/31/2006 312426987520 301195987968 11230999552
25 | 9 WMT US WAL-MART STORES INC 12/31/2005 287989006336 277722006528 10266999808
26 | 10 WMT US WAL-MART STORES INC 12/31/2004 256329007104 247275006976 9054000128
27 | 11 WMT US WAL-MART STORES INC 12/31/2003 229615992832 221660993024 7954999808
28 | """
29 |
--------------------------------------------------------------------------------
/02-selenium-examples/download_calendar.py:
--------------------------------------------------------------------------------
1 | #! py27w
2 | import os, time
3 | from datetime import datetime
4 | from datetime import date
5 | from datetime import timedelta
6 | from selenium import webdriver
7 | from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
8 | from selenium.common.exceptions import NoSuchElementException
9 | from selenium.webdriver.common.by import By
10 | from selenium.webdriver.support.ui import WebDriverWait
11 | from selenium.webdriver.support import expected_conditions as EC
12 |
13 | fp = webdriver.FirefoxProfile()
14 | fp.set_preference('browser.download.folderList', 2)
15 | fp.set_preference("browser.download.manager.showWhenStarting", False)
16 | fp.set_preference('browser.download.dir', os.getcwd())
17 | fp.set_preference("browser.helperApps.neverAsk.saveToDisk", 'application/vnd.ms-excel')
18 | fp.set_preference("browser.download.dir", "c:\\tmp");
19 | driver = webdriver.Firefox(firefox_profile=fp)
20 | driver.get('https://www.zacks.com/earnings/earnings-reports')
21 |
22 |
23 | def click_calendar():
24 | try:
25 | element_xpath = '//*[@id="earnings_release"]/div[1]/p/a'
26 | element = WebDriverWait(driver, 10).until(
27 | lambda driver: driver.find_element_by_xpath(element_xpath).click()
28 | )
29 | finally:
30 | print("clicked calendar")
31 |
32 |
33 | def click_prev_day(x):
34 | s = 'datespan_%d' % (x)
35 | try:
36 | WebDriverWait(driver, 10).until(
37 | lambda driver: driver.find_element_by_id(s).click()
38 | )
39 | except:
40 | result = False
41 | else:
42 | result = True
43 | return result
44 |
45 |
46 | def click_export():
47 | try:
48 | element = WebDriverWait(driver, 10).until(
49 | lambda driver: driver.find_element_by_id('export_excel').click()
50 | )
51 | except:
52 | result = False
53 | else:
54 | result = True
55 | return result
56 |
57 |
58 | def click_prev_month():
59 | try:
60 | driver.find_element_by_id('prevCal').click()
61 | except:
62 | result = False
63 | else:
64 | result = True
65 | i = 31
66 | while i > 27:
67 | try:
68 | click_prev_day(i)
69 | return False
70 | except:
71 | print('could not find %s in prev month' % (i))
72 | i -= 1
73 |
74 |
75 | def subtract_day(n):
76 | y = n - 1
77 | return y
78 |
79 |
80 | def start_date():
81 | return datetime(2016, 2, 29)
82 |
83 |
84 | def click_to_start_date():
85 | start_date = datetime(2016, 2, 28)
86 | a = date.today()
87 | b = start_date
88 | c = a.month - b.month
89 | if c > 0:
90 | click_calendar()
91 | while c > 0:
92 | click_prev_month()
93 | c -= 1
94 | try:
95 | click_prev_day(31)
96 | except:
97 | click_prev_day(30)
98 |
99 |
100 | def main():
101 | # click_to_start_date()
102 | # sdate = start_date()
103 | m = 12
104 | while m > 0:
105 | m -= 1
106 | for x in range(31, 0, -1):
107 | click_calendar()
108 | click_prev_day(x)
109 | click_export()
110 |
111 | click_calendar()
112 | click_prev_month()
113 |
114 |
115 | if __name__ == '__main__':
116 | main()
117 |
--------------------------------------------------------------------------------
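download_calendar.py uses the find_element_by_* helpers and lambda-based waits from Selenium 2/3; those helpers were removed in Selenium 4. A minimal sketch of the equivalent explicit wait with By locators (the export_excel id is taken from the script above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get('https://www.zacks.com/earnings/earnings-reports')

# wait up to 10 seconds for the export link to become clickable, then click it
export_link = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, 'export_excel'))
)
export_link.click()
driver.quit()
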
/02-selenium-safari/create_names.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import urllib.parse
5 | from pyquery import PyQuery as pq
6 | from bs4 import BeautifulSoup
7 | import configparser
8 |
9 |
10 | def create_filenames_for_conversion(filepath, filename, file_extension):
11 |     print(filepath, filename)
12 |     timestr = time.strftime("%Y%m%d-%H%M%S", time.localtime(os.path.getmtime(os.path.join(filepath, filename))))
13 |     filename = filename.replace(file_extension, "")
14 |     filename = filename.translate(dict((ord(char), None) for char in '\/*?:"<>|,.'))
15 |     filename_html, filename_pdf = timestr + '_' + filename + '(clean)' + file_extension, timestr + '_' + filename + '(clean).pdf'
16 |     print('starting creation of: ' + filename_html)
17 |     return filename_html, filename_pdf
18 |
19 |
20 | def create_filename_from_url(url):
21 | url, fragment = urllib.parse.urldefrag(url)
22 | parsed = urllib.parse.urlsplit(url)
23 | stripped = parsed.path.replace(URL_REPLACE, '')
24 | filename = stripped.translate(dict((ord(char), None) for char in '\/*?:"<>|'))
25 | print(filename)
26 | return filename
27 |
28 | def create_folder_path_from_url(base_dir, url):
29 | path = os.path.join(base_dir, str(url.split("/")[5]+"_"+url.split("/")[6]).translate(dict((ord(char), None) for char in '\/*?:"<>|')))
30 |     if not os.path.exists(path):
31 | os.makedirs(path)
32 | print(path)
33 | return path
34 |
35 |
36 | def create_file(filename, w_page_source, URL_WEBSITE):
37 | d = pq(w_page_source, parser='html')
38 | ab = d.make_links_absolute(URL_WEBSITE)
39 | soup = BeautifulSoup(ab.html(), "html.parser")
40 | try:
41 | with open(filename, "w", encoding='utf-8') as f:
42 |             f.write(soup.decode_contents())  # call the method; str() of the bound method would write its repr
43 | except:
44 | print('something broke: ', filename)
45 | return filename
46 |
47 |
48 |
--------------------------------------------------------------------------------
/02-selenium-safari/gather_links_for_processing.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import configparser
3 | import time
4 | from create_names import create_file
5 | #sys.stdout = codecs.getwriter('utf8')(sys.stdout)
6 | import urllib.parse
7 | import lxml
8 | import lxml.html
9 | from lxml.html import parse, tostring, open_in_browser, fromstring
10 |
11 |
12 | def get_toc_links(filename, w_page_source, URL_WEBSITE,toc_xpath=None):
13 | create_file(filename, w_page_source, URL_WEBSITE)
14 | html = lxml.html.fromstring(w_page_source)
15 | html.make_links_absolute(URL_WEBSITE)
16 | ab = lxml.html.tostring(html,pretty_print=True, method="html")
17 | soup = BeautifulSoup(ab, 'lxml')
18 | links = []
19 | for link in soup.find_all('a'):
20 | if 'href' in link.attrs:
21 | links.append(str(link.attrs['href']))
22 | urls = []
23 | for i in links:
24 | url, fragment = urllib.parse.urldefrag(i)
25 | urls.append(url)
26 | urls = f7(urls)
27 | newurls = []
28 | for i in urls:
29 | if 'htm' in i:
30 | newurls.append(i)
31 |     return newurls
32 |
33 | def f7(seq):
34 | seen = set()
35 | seen_add = seen.add
36 | return [x for x in seq if not (x in seen or seen_add(x))]
37 |
--------------------------------------------------------------------------------
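f7() above is the classic order-preserving de-duplication recipe. On Python 3.7+, where dicts preserve insertion order, dict.fromkeys gives the same result in one line:

def dedupe_preserving_order(seq):
    # dict keys keep first-seen order in Python 3.7+
    return list(dict.fromkeys(seq))

assert dedupe_preserving_order(['a', 'b', 'a', 'c']) == ['a', 'b', 'c']
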
/02-selenium-safari/html_to_pdf.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | # try importing scandir and, if found, use it: it's a few orders of magnitude faster than stock os.walk
4 |
5 | import sys
6 | import os
7 | # Generate type library so that we can access constants
8 |
9 | import process_html_remove_junk
10 | def convertHTML2PDF(htmlPath, pdfPath):
11 | import win32com.client.makepy
12 | import win32com.client
13 | from win32com.client import Dispatch
14 | from win32com.client.dynamic import ERRORS_BAD_CONTEXT
15 | import winerror
16 | win32com.client.makepy.GenerateFromTypeLibSpec('Acrobat')
17 |     # Use Unicode characters instead of their ascii pseudo-replacements
18 |     UNICODE_SNOB = 0  # unused here
19 |     # Convert an HTML document to PDF format
20 | # Connect to Adobe Acrobat
21 | import win32com.client
22 | avDoc = win32com.client.DispatchEx('AcroExch.AVDoc')
23 | avDoc.Open(os.path.abspath(htmlPath), 'html2pdf')
24 | # Save in PDF format
25 | pdDoc = avDoc.GetPDDoc()
26 | pdDoc.Save(win32com.client.constants.PDSaveFull, os.path.abspath(pdfPath))
27 | pdDoc.Close()
28 | # Close HTML document without prompting to save
29 | avDoc.Close(True)
30 |
31 | def file_conversion(folder):
32 |     #folder = os.path.normpath(sys.argv[1])
33 |     # fall back to C:\HTML when no folder is given
34 |     if folder is None:
35 |         directory = 'C:\\HTML'
36 |         files = process_html_remove_junk.walk_dir_fullfilename(directory)
37 |     else:
38 |         nfolder = os.path.join(folder, 'clean')
39 |         files = [os.path.join(nfolder, x) for x in os.listdir(nfolder)]
40 | for filename in files:
41 | basename = os.path.basename(filename)
42 | extname = os.path.splitext(basename)
43 | dirname = os.path.dirname(filename)
44 | pdf = os.path.join(folder,'pdf', extname[0]+'.pdf')
45 | try:
46 | print(pdf)
47 | convertHTML2PDF(filename, pdf)
48 | except:
49 | print('problem with: ' + filename)
50 |
51 |
52 |
--------------------------------------------------------------------------------
/02-selenium-safari/main.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | import os
4 | import selenium
5 | from selenium import webdriver
6 | import create_names
7 | from create_names import create_folder_path_from_url, create_filename_from_url, create_file
8 | from zip_folder import zip_directory
9 | from merge_pdf_files import pyMerger
10 | import time
11 | import gather_links_for_processing
12 | from gather_links_for_processing import get_toc_links
13 | import random
14 | import process_html_remove_junk
15 | import html_to_pdf
16 |
17 |
18 | def file_merge(directory):
19 | for path, dirnames, files in os.walk(directory):
20 | pyMerger(path)
21 | print(path)
22 |
23 | def post_process():
24 | file_merge(BASE_DIR)
25 | zip_directory(BASE_DIR)
26 |
27 | def pause_for_random_time():
28 | time.sleep(random.randint(3,7))
29 |
30 | def process_html_files(directory=None):
31 | files_processed = process_html_remove_junk.process_html_files_removing_junk(directory)
32 | return files_processed
33 |
34 | def process_cleaned_files_into_pdf(directory):
35 | files_processed_pdf = html_to_pdf.file_conversion(directory)
36 | return files_processed_pdf
37 |
38 | def grab_urls_from_file(INPUT_FILE):
39 | file = INPUT_FILE
40 | urls = []
41 | list_of_list_of_filenames=[]
42 | with open(file, 'r') as f:
43 | urls = f.read().splitlines()
44 | if len(urls) < 1:
45 | urls = [sys.argv[1]]
46 | print(urls)
47 | return urls
48 |
49 | def main():
50 |     '''
51 |     Reads the file of URLs (INPUT_FILE) and processes each page in the browser.
52 |     '''
53 | if len(sys.argv) < 2:
54 | sys.exit(0)
55 |
56 | w = webdriver.Chrome()
57 |
58 | domain_url, base_login = URL_WEBSITE, URL_LOGIN
59 | w.get(domain_url + base_login)
60 | loginElem = w.find_element_by_name('email')
61 | loginElem.send_keys(USERNAME)
62 | loginPass = w.find_element_by_name('password1')
63 | loginPass.send_keys(PASSWORD)
64 | time.sleep(3)
65 | loginPass.submit()
66 | time.sleep(3)
67 |
68 | urls = grab_urls_from_file(INPUT_FILE)
69 |
70 | for url in urls:
71 | w.get(url)
72 | base_dir = os.path.abspath(os.sep)
73 | path = create_names.create_folder_path_from_url(BASE_DIR, url)
74 | filename = os.path.join(path,create_names.create_filename_from_url(url) + '(t).html')
75 | page_source = w.page_source
76 | toc_table_only = page_source
77 | toc = gather_links_for_processing.get_toc_links(filename, w.page_source, URL_WEBSITE)
78 | for webpage_url in toc:
79 | try:
80 | w.get(webpage_url)
81 | filename = create_names.create_filename_from_url(w.current_url)
82 | fout = create_names.create_file(os.path.join(path,filename + '.html'), w.page_source, URL_WEBSITE)
83 | except:
84 | print('something broke: ', filename)
85 | pause_for_random_time()
86 | list_of_list_of_filenames = process_html_files(path)
87 | process_cleaned_files_into_pdf(path)
88 | #pyMerger(directory)
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 | if __name__ == '__main__':
99 | main()
100 |
--------------------------------------------------------------------------------
/02-selenium-safari/merge_pdf_files.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from PyPDF2 import PdfFileMerger
4 | 
5 | def pyMerger(directory):
6 |     pdfFiles = [f for f in os.listdir(directory) if f.lower().endswith("pdf")]
7 |     merger = PdfFileMerger()
8 | 
9 |     if pdfFiles != []:  # check if directory has pdf files in it
10 |         for filename in pdfFiles:
11 |             if filename != "_mergedFull.pdf":  # skip a previously merged file
12 |                 merger.append(os.path.join(directory, filename))
13 | 
14 |         outputFile = os.path.join(r'c:\pdf', directory + "_mergedFull.pdf")
15 |         merger.write(outputFile)  # overwrites the output file if it already exists
16 |     else:
17 |         print(directory + " has no pdf files in it.")
18 | 
--------------------------------------------------------------------------------
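PdfFileMerger was removed in PyPDF2 3.x. A hedged sketch of the same merge using the successor package pypdf, whose PdfWriter absorbed the merger's append(); the input file names here are hypothetical:

# pip install pypdf
from pypdf import PdfWriter

merger = PdfWriter()
for path in ["a.pdf", "b.pdf"]:  # hypothetical inputs
    merger.append(path)
merger.write("merged.pdf")
merger.close()
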
/02-selenium-safari/process_html_remove_junk.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | from bs4 import BeautifulSoup
4 | import create_names
5 | from create_names import create_filenames_for_conversion
6 |
7 |
8 |
9 | def grab_junk_tag(file):
10 | with open(file, 'r', encoding='utf-8') as f:
11 | data = f.read()
12 | bsObj = BeautifulSoup(data, "html.parser")
13 | head_elements_blacklist = ['topbar t-topbar']
14 | body_elements_blacklist = ['expanded', 'annotator-modal-wrapper annotator-editor-modal annotator-editor annotator-hide', 'annotator-modal-wrapper annotator-delete-confirm-modal', 'annotator-adder','sbo-reading-menu sbo-menu-top', 'interface-controls interface-controls-top', 'sample-message', 'font-flyout','t-sbo-next sbo-next sbo-nav-bottom', 't-sbo-next sbo-next sbo-nav-top', 't-sbo-prev sbo-prev sbo-nav-bottom', 't-sbo-prev sbo-prev sbo-nav-top', 'reading-controls-bottom']
15 | footer_elements_blacklist = ['pagefoot t-pagefoot']
16 | html_elements_blacklists = [{'header': head_elements_blacklist}, {'div': body_elements_blacklist}, {'footer': footer_elements_blacklist}]
17 |
18 | for elements in html_elements_blacklists:
19 | for element, tags in elements.items():
20 | for tag in tags:
21 | try:
22 | temp = bsObj.find(element, {'class': tag})
23 | temp.decompose()
24 | #print('processed: ' + element + ' ' + tag)
25 | except:
26 | print('error: ' + tag)
27 | continue
28 | return(bsObj)
29 |
30 | def check_for_folder_and_create(destfolder,additional=None):
31 | if additional != None:
32 | new_folders = []
33 | for folder in additional:
34 | newfolder = os.path.join(destfolder,folder)
35 | if not os.path.isdir(newfolder):
36 | os.makedirs(newfolder)
37 | new_folders.append(newfolder)
38 | return new_folders
39 | if not os.path.isdir(destfolder):
40 | os.makedirs(destfolder)
41 | return destfolder
42 |
43 |
44 | def get_fullfilepaths_files_in_folder(folder_to_process, extfilter=None):
45 | files_in_folder = [os.path.join(folder_to_process, x) for x in os.listdir(folder_to_process) if extfilter in x]
46 | return files_in_folder
47 |
48 | def walk_dir_fullfilename(directory, extfilter=None):
49 | all_files = []
50 | for path, dirnames, files in os.walk(directory):
51 | for file in files:
52 | filepath, filename = path, file
53 | fullfilepath = os.path.join(path, file)
54 | if extfilter != None:
55 | if extfilter in fullfilepath and '(clean)' not in fullfilepath:
56 | all_files.append(fullfilepath)
57 | else:
58 | pass
59 | else:
60 | all_files.append(fullfilepath)
61 | return all_files
62 |
63 | #walk_test=walk_dir_fullfilename(directory, extfilter='htm')
64 |
65 | def process_html_files_removing_junk(directory):
66 | #folder = os.path.normpath(sys.argv[1])
67 | if directory is None:
68 | directory = 'C:\\HTML'
69 | files = walk_dir_fullfilename(directory)
70 | else:
71 | files = get_fullfilepaths_files_in_folder(directory, extfilter='htm')
72 |     list_of_list_of_filenames = []  # collect results across all files (was reset inside the loop)
73 |     for filename in files:
74 |         try:
75 | basename = os.path.basename(filename)
76 | extname = os.path.splitext(basename)
77 | dirname = os.path.dirname(filename)
78 | destfolder = directory
79 | filename_html, filename_pdf = create_names.create_filenames_for_conversion(destfolder, filename, extname[1])
80 | filepath = check_for_folder_and_create(destfolder,additional=["clean","pdf","html"])
81 | pdf = os.path.join(filepath[1],filename_pdf)
82 | html_clean = os.path.join(filepath[0], filename_html)
83 | try:
84 | bsObj = grab_junk_tag(filename)
85 | except:
86 |                 print('error processing: ' + filename)
87 |             try:
88 |                 with open(html_clean, "w", encoding='utf-8') as file:
89 |                     file.write(bsObj.decode_contents())  # call the method; str() of the bound method wrote its repr
90 |             except:
91 |                 with open(html_clean, "w", encoding='utf-8') as file:
92 |                     file.write(str(bsObj))  # fall back to writing the full document
93 | list_of_list_of_filenames.append([filename, html_clean, pdf])
94 | except:
95 | print('problem with:' + filename)
96 | return list_of_list_of_filenames
--------------------------------------------------------------------------------
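The core technique in the file above is BeautifulSoup's decompose(), which removes a tag and everything inside it from the parse tree. A self-contained sketch with made-up HTML:

from bs4 import BeautifulSoup

html = '<div class="content">keep</div><div class="sample-message">junk</div>'
soup = BeautifulSoup(html, 'html.parser')

# remove every element whose class is on the blacklist
for tag in soup.find_all('div', {'class': 'sample-message'}):
    tag.decompose()

print(soup)  # -> <div class="content">keep</div>
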
/02-selenium-safari/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/02-selenium-safari/requirements.txt
--------------------------------------------------------------------------------
/02-selenium-safari/zip_folder.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import zipfile
4 | import shutil
5 | 
6 | def zip_folder(folder_path, output_path):
7 |     parent_folder = os.path.dirname(folder_path)
8 |     # Retrieve the paths of the folder contents.
9 |     contents = os.walk(folder_path)
10 |     try:
11 |         zip_file = zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED)
12 |         for root, folders, files in contents:
13 |             # Include all subfolders, including empty ones.
14 |             for folder_name in folders:
15 |                 absolute_path = os.path.join(root, folder_name)
16 |                 relative_path = absolute_path.replace(parent_folder + '\\', '')
17 |                 print("Adding '{:s}' to archive.".format(absolute_path))
18 |                 zip_file.write(absolute_path, relative_path)
19 |             for file_name in files:
20 |                 absolute_path = os.path.join(root, file_name)
21 |                 relative_path = absolute_path.replace(parent_folder + '\\', '')
22 |                 print("Adding '{:s}' to archive.".format(absolute_path))
23 |                 zip_file.write(absolute_path, relative_path)
24 |         print("'{:s}' created successfully.".format(output_path))
25 |     except (IOError, OSError, zipfile.BadZipfile) as err:
26 |         print(err)
27 |         sys.exit(1)
28 |     finally:
29 |         zip_file.close()
30 |     shutil.rmtree(folder_path)
31 | 
32 | def zip_directory(directory):
33 |     folders = [name for name in os.listdir(directory) if os.path.isdir(os.path.join(directory, name))]
34 |     for i in folders:
35 |         zip_folder(os.path.join(directory, i), os.path.join(directory, i + ".zip"))
36 | 
--------------------------------------------------------------------------------
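For the common case, the standard library builds the same archive in one call. A sketch with a hypothetical my_folder, mirroring zip_folder()'s parent-relative paths but without deleting the source:

import shutil

# creates my_folder.zip; base_dir makes the stored paths start with 'my_folder/'
shutil.make_archive('my_folder', 'zip', root_dir='.', base_dir='my_folder')
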
/02-webscrape-celery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/02-webscrape-celery/__init__.py
--------------------------------------------------------------------------------
/02-webscrape-celery/basic_consumer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import pika
3 | import requests
4 | from bs4 import BeautifulSoup
5 | # python3 -m venv venv
6 |
7 | # activates the virtualenv
8 | # source venv/bin/activate
9 | # pip3 install bs4 requests celery pika
10 | # python basic_consumer.py
11 |
12 | def on_message(channel, method_frame, header_frame, body):
13 | print(f"-> Starting: [{body}]")
14 | r = requests.get(body)
15 |     soup = BeautifulSoup(r.text, "html.parser")  # name the parser explicitly to avoid the guessing warning
16 | print(f"-> Extracted: {soup.html.head.title}")
17 | print(f"-> Done: [{body}]")
18 | channel.basic_ack(delivery_tag=method_frame.delivery_tag)
19 |
20 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
21 | channel = connection.channel()
22 | print('* Handling messages.')
23 |
24 | channel.basic_consume('pages', on_message)
25 |
26 | try:
27 | channel.start_consuming()
28 | except KeyboardInterrupt:
29 | channel.stop_consuming()
30 |
31 | connection.close()
32 |
--------------------------------------------------------------------------------
/02-webscrape-celery/basic_producer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import time
3 | import pika
4 | # activate spreadsheets_to_dataframes
5 | # python basic_producer.py
6 |
7 | print("* Connecting to RabbitMQ broker")
8 |
9 | connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
10 | channel = connection.channel()
11 | channel.queue_declare(queue='pages')
12 |
13 | def produce():
14 |     with open('urls.txt', 'r') as f:  # urls.txt lives alongside this script
15 | urls = f.read().splitlines()
16 |
17 | for url in urls:
18 | print(f"* Pushed: [{url}]")
19 | channel.basic_publish(exchange='', routing_key='pages', body=url)
20 |
21 | counter = 0
22 |
23 | produce()
24 |
25 | connection.close()
26 |
--------------------------------------------------------------------------------
/02-webscrape-celery/consumer.py:
--------------------------------------------------------------------------------
1 |
2 | #####################
3 | # using docker for both broker and backend
4 | # $ docker run -d -p 5672:5672 -p 15672:15672 --name url-rabbit rabbitmq:management
5 | # $ celery -A consumer worker --loglevel=info
6 |
7 |
8 | import requests
9 | from celery import Celery
10 | # pip install celery==3.1.21
11 | # ^ windows
12 |
13 | app = Celery('tasks', broker='amqp://localhost/')
14 |
15 | @app.task
16 | def download_url(url):
17 | print(f"-> Starting: [{url}]")
18 | try:
19 | req = requests.get(url)
20 |         if req.status_code == 200:
21 |             print(f"-> Success Download: [{url}]")
22 | 
23 | except:
24 | print(f'error: {url}')
25 |
26 |
27 | # celery -A consumer worker --loglevel=info
28 | # ^ run above celery command in terminal while situated in same folder as current file
29 |
30 | # from celery.task.control import discard_all
31 | # discard_all()
32 | # ^ use above to clear celery queue
33 |
34 |
35 |
--------------------------------------------------------------------------------
/02-webscrape-celery/data.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/02-webscrape-celery/data.html
--------------------------------------------------------------------------------
/02-webscrape-celery/producer.py:
--------------------------------------------------------------------------------
1 | import time
2 | from celery import Celery
3 | import consumer
4 |
5 | app = Celery('tasks', broker='amqp://localhost//')
6 |
7 | def produce():
8 |     with open('urls.txt', 'r') as f:
9 | urls = f.read().splitlines()
10 |
11 | for url in urls:
12 | consumer.download_url.delay(url)
13 | print(f"* Submitted: [{url}]")
14 |
15 | produce()
16 |
17 | #####################
18 |
19 | # urls.txt # example
20 |
21 | """
22 | http://www.apple.com
23 | http://www.amazon.com
24 | http://www.abc.xyz
25 | http://www.microsoft.com
26 | http://www.facebook.com
27 | http://www.alibabagroup.com
28 | http://www.tencent.com
29 | http://www.berkshirehathaway.com
30 | http://www.jpmorganchase.com
31 | http://www.exxonmobil.com
32 | http://www.jnj.com
33 | http://usa.visa.com
34 | http://www.shell.com
35 | http://www.samsung.com
36 | http://www.bankofamerica.com
37 | http://www.icbc.com.cn
38 | http://www.wellsfargo.com
39 | http://corporate.walmart.com
40 | http://www.nestle.com
41 | http://www.unitedhealthgroup.com
42 | http://www.intel.com
43 | http://www.att.com
44 | http://www.chevron.com
45 | http://www.ccb.com
46 | http://www.homedepot.com
47 | http://www.pfizer.com
48 | http://www.verizon.com
49 | http://www.toyota.co.jp
50 | http://www.ab-inbev.com
51 | http://www.mastercard.com
52 | """
53 |
--------------------------------------------------------------------------------
/02-webscrape-celery/urls.txt:
--------------------------------------------------------------------------------
1 | http://www.apple.com
2 | http://www.amazon.com
3 | http://www.abc.xyz
4 | http://www.microsoft.com
5 | http://www.facebook.com
6 | http://www.alibabagroup.com
7 | http://www.tencent.com
8 | http://www.berkshirehathaway.com
9 | http://www.jpmorganchase.com
10 | http://www.exxonmobil.com
11 | http://www.jnj.com
12 | http://usa.visa.com
13 | http://www.shell.com
14 | http://www.samsung.com
15 | http://www.bankofamerica.com
16 | http://www.icbc.com.cn
17 | http://www.wellsfargo.com
18 | http://corporate.walmart.com
19 | http://www.nestle.com
20 | http://www.unitedhealthgroup.com
21 | http://www.intel.com
22 | http://www.att.com
23 | http://www.chevron.com
24 | http://www.ccb.com
25 | http://www.homedepot.com
26 | http://www.pfizer.com
27 | http://www.verizon.com
28 | http://www.toyota.co.jp
29 | http://www.ab-inbev.com
30 | http://www.mastercard.com
31 | http://www.cisco.com
32 | http://www.pg.com
33 | http://www.novartis.com
34 | http://www.petrochina.com.cn
35 | http://www.roche.com
36 | http://www.boeing.com
37 | http://www.coca-colacompany.com
38 | http://www.hsbc.com
39 | http://www.tsmc.com
40 | http://www.chinamobileltd.com
41 | http://www.oracle.com
42 | http://www.abchina.com
43 | http://www.netflix.com
44 | http://www.citigroup.com
45 | http://www.lvmh.com
46 | http://www.merck.com
47 | http://www.total.com
48 | http://www.pingan.com
49 | http://www.thewaltdisneycompany.com
50 | http://www.pepsico.com
51 |
--------------------------------------------------------------------------------
/04-other-analysis/example_pandas.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 |
4 | import pandas as pd
5 | import tldextract
6 |
7 | filepath = r'2019-11-04T00-02-57_stlregionalchamber_largest_employers.csv'
8 |
9 | df = pd.read_csv(filepath)
10 | # normalize column names: strip whitespace, snake_case, drop parentheses
11 | df.columns = df.columns.str.strip()
12 | df.columns = df.columns.str.replace(" ", "_", regex=False)
13 | df.columns = df.columns.str.replace("(", "_", regex=False).str.replace(")", "_", regex=False)
14 | df.columns = df.columns.str.lower()
15 |
16 | df_nan = df[df.website.isna()]  # rows with no website, kept aside for inspection
17 | df = df.dropna(subset=['website'])
18 |
19 | df['website'] = df.website.str.replace('www.', "", regex=False)
20 | df['website'] = df['website'].apply(lambda x: "https://" + x)
21 |
22 | folderpath = r'D:\PROJECTS\presentations\stl_data'
23 | files = glob.glob(folderpath + "\\*.csv")
24 |
25 | df_list = []
26 | for file in files:
27 |     df = pd.read_csv(file)
28 |     df['year'] = os.path.basename(file).split("_")[0]  # filenames are prefixed with the year
29 |     df_list.append(df)
30 |
31 | df = pd.concat(df_list)
32 |
33 | # repeat the column clean-up on the combined frame
34 | df.columns = df.columns.str.strip()
35 | df.columns = df.columns.str.replace(" ", "_", regex=False)
36 | df.columns = df.columns.str.replace("(", "_", regex=False).str.replace(")", "_", regex=False)
37 | df.columns = df.columns.str.lower()
38 | df.columns = df.columns.str.replace(".", "", regex=False)  # regex=False: a bare '.' is a regex wildcard and would blank every name
39 |
40 | df = df.dropna(subset=['website'])
41 | df['website'] = df.website.str.replace('www.', "", regex=False)
42 | df['website'] = df['website'].apply(lambda x: "https://" + x)
43 | df['st_louis_employees'] = df.st_louis_employees.astype(int)  # convert before sorting so the sort is numeric
44 | df = df.sort_values('st_louis_employees', ascending=False)   # sort_values returns a copy, so assign it back
45 | df['website_domain'] = df['website'].apply(lambda x: tldextract.extract(x).domain)
46 |
47 | df_groups = []
48 |
49 | # year-over-year change in employees, per website domain
50 | for i, df_group in df.groupby('website_domain'):
51 |     if df_group.index.size > 1:
52 |         df_group = df_group.sort_values('year')
53 |         df_group['pct_chg'] = df_group['st_louis_employees'].pct_change()
54 |     else:
55 |         df_group['pct_chg'] = None
56 |
57 |     df_groups.append(df_group)
58 |
59 | df_all = pd.concat(df_groups)
60 | df_all = df_all.sort_values('st_louis_employees', ascending=False)
61 |
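62 | # A more compact, assumed-equivalent version of the loop above, using a grouped
63 | # pct_change (single-row domains end up NaN instead of None):
64 | #   df_all = df.sort_values('year')
65 | #   df_all['pct_chg'] = df_all.groupby('website_domain')['st_louis_employees'].pct_change()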
--------------------------------------------------------------------------------
/04-other-analysis/read_sec.py:
--------------------------------------------------------------------------------
1 | # exploratory script for the SEC "Financial Statement Data Sets" quarterly zips
2 | import os
3 | import zipfile
4 |
5 | import pandas as pd
6 |
7 | folder_path = r'.'                   # placeholder: directory containing the SEC zip files
8 | files = os.listdir(folder_path)      # e.g. ['2019q1.zip', '2019q2.zip', ...]
9 | subs = []
10 |
11 | for filename in files:
12 | print(f"Companies in {filename}")
13 | zip_filepath = os.path.join(folder_path, filename)
14 | data_file = zipfile.ZipFile(zip_filepath)
15 |
16 | df_sub = pd.read_csv(data_file.open('sub.txt'), sep='\t', error_bad_lines=False)  # pandas >= 1.3: use on_bad_lines='skip'
17 |
18 | subs.append(df_sub)
19 |
20 | revenues = []
21 | for filename in files:
22 | print(f"Companies in {filename}")
23 | zip_filepath = os.path.join(folder_path, filename)
24 | data_file = zipfile.ZipFile(zip_filepath)
25 |
26 | df_num = pd.read_csv(data_file.open('num.txt'), encoding="latin1", sep='\t', error_bad_lines=False)
27 | df_revenues = df_num[df_num['tag'].str.contains('Revenues', regex=True)]
28 | revenues.append(df_revenues)
29 |
30 | df_revs = pd.concat(revenues)
31 |
32 | df_revs.sort_values('ddate', inplace=True)
33 | df_revs['cik'] = df_revs['adsh'].apply(lambda x: x.split("-")[0])
34 |
35 | for i, df_group in df_revs.groupby('cik'):
36 | print(df_group)
37 |
38 | # df_sub_nodupes = df_sub.drop_duplicates(subset='name')
39 | # df_sub_nodupes.head(100)
40 |
41 | # df['stprinc'].drop_duplicates()
42 |
43 | df_missouri = df_sub[(df_sub['stprinc'].isin(['MO']) | df_sub['stprma'].isin(['MO'])) ]
44 | print(df_missouri[df_missouri['form'].isin(['10-K'])].sort_values('name'))
45 |
46 | df_sec_symbols = pd.read_json(r'D:\PROJECTS\presentations\meetup-2019-spreadsheets-to-dataframes\company_tickers.json').T
47 |
48 | """
49 | curl "https://api-global.morningstar.com/sal-service/v1/stock/newfinancials/0P0000014I/incomeStatement/detail?dataType=A^&reportType=A^&locale=en^&^&operation=export" -H "Sec-Fetch-Mode: cors" -H "Origin: https://www.morningstar.com" -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36" -H "Accept: application/json, text/plain, */*" -H "Referer: https://www.morningstar.com/stocks/xnys/cat/financials" -H "X-API-RequestId: 52a823bc-0d1f-6c2a-51a9-fb553553a192" -H "ApiKey: lstzFDEOhfFNMLikKa0am9mgEKLBl49T" -H "X-API-REALTIME-E: eyJlbmMiOiJBMTI4R0NNIiwiYWxnIjoiUlNBLU9BRVAifQ.XmuAS3x5r-0MJuwLDdD4jNC6zjsY7HAFNo2VdvGg6jGcj4hZ4NaJgH20ez313H8An9UJrsUj8ERH0R8UyjQu2UGMUnJ5B1ooXFPla0LQEbN_Em3-IG84YPFcWVmEgcs1Fl2jjlKHVqZp04D21UvtgQ4xyPwQ-QDdTxHqyvSCpcE.ACRnQsNuTh1K_C9R.xpLNZ8Cc9faKoOYhss1CD0A4hG4m0M7-LZQ0fISw7NUHwzQs2AEo9ZXfwOvAj1fCbcE96mbKQo8gr7Oq1a2-piYXM1X5yNMcCxEaYyGinpnf6PGqbdr6zbYZdqyJk0KrxWVhKSQchLJaLGJOts4GlpqujSqJObJQcWWbkJQYKG9K7oKsdtMAKsHIVo5-0BCUbjKVnHJNsYwTsI7xn2Om8zGm4A.nBOuiEDssVFHC_N68tDjVA" -H "X-SAL-ContentType: e7FDDltrTy+tA2HnLovvGL0LFMwT+KkEptGju5wXVTU=" -H "DNT: 1" --compressed
50 |
51 | """
52 |
53 | df_missouri_qtr_ann = df_missouri[df_missouri['form'].isin(['10-Q', '10-K'])]
54 |
55 | df_missouri['instance'].apply(lambda x: x.split("-"))
56 |
57 | df_microsoft = df_sub[df_sub['name'].str.contains('COCA COLA CO', regex=True)]  # note: despite the name, this filters Coca-Cola
58 |
59 | microsoft_adsh = df_microsoft.adsh.to_list()[0]
60 |
61 | df_pre = pd.read_csv(data_file.open('pre.txt'), sep='\t', error_bad_lines=False)
62 | df_num = pd.read_csv(data_file.open('num.txt'), sep='\t', error_bad_lines=False)
63 | df_pre.head(100)
64 |
65 | df_ko_num = df_num[df_num['adsh'].isin(['0000021344-19-000034'])]
66 | df_revenues = df_num[df_num['tag'].str.contains('Revenues', regex=True)]
67 | df_revenues.sort_values('adsh').head(100)
68 |
69 | # scratch: df_microsoft_num is referenced before it is ever created; it was
70 | # presumably df_num filtered down to microsoft_adsh
71 | # df_microsoft_num.sort_values("tag").drop_duplicates(subset=['tag'])
72 | # df_microsoft_num = df_microsoft_num[df_microsoft_num['form'].isin(['10-Q', '10-K'])]
71 |
72 |
73 | # scratch: df_ and df_head are not defined anywhere in this file
74 | # df_ = df_.iloc[:, 0:len(df_head.columns.tolist())]
75 | # df_.columns = df_head.columns.to_list()
75 |
76 |
--------------------------------------------------------------------------------
/06-flask/flask-rss/README.md:
--------------------------------------------------------------------------------
1 | # Flask-RSS
2 | $ python main.py
3 |
--------------------------------------------------------------------------------
/06-flask/flask-rss/main.py:
--------------------------------------------------------------------------------
1 |
2 | import feedparser
3 | import pandas as pd
4 | from flask import Flask, render_template
5 |
6 | app = Flask(__name__)
7 |
8 | @app.route("/")
9 | def index():
10 |
11 | feed = feedparser.parse(r'http://www.prweb.com/rss2/daily.xml')
12 |
13 | df = pd.json_normalize(feed.entries, sep='_')
14 |
15 | df['source'] = "prweb"
16 |
17 | df = df.sort_values('published', ascending=False)
18 |
19 | df = df[['published', 'link', 'title','source']]
20 |
21 | return render_template("reader.html", df=df.itertuples(), columns_to_display=['Published', 'Source', 'Headline'])
22 |
23 | if __name__ == "__main__":
24 | app.run(debug=True)
25 |
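26 | # dependencies (assumed): pip install flask feedparser "pandas>=1.0"
27 | # pandas >= 1.0 is needed for pd.json_normalize; on older versions it lived at
28 | # pandas.io.json.json_normalize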
--------------------------------------------------------------------------------
/06-flask/flask-rss/static/css/bootstrap-theme.min.css.map:
--------------------------------------------------------------------------------
1 | {"version":3,"sources":["less/theme.less","less/mixins/vendor-prefixes.less","less/mixins/gradients.less","less/mixins/reset-filter.less"],"names":[],"mappings":";;;;AAmBA,YAAA,aAAA,UAAA,aAAA,aAAA,aAME,YAAA,EAAA,KAAA,EAAA,eC2CA,mBAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,iBDvCR,mBAAA,mBAAA,oBAAA,oBAAA,iBAAA,iBAAA,oBAAA,oBAAA,oBAAA,oBAAA,oBAAA,oBCsCA,mBAAA,MAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,MAAA,EAAA,IAAA,IAAA,iBDlCR,qBAAA,sBAAA,sBAAA,uBAAA,mBAAA,oBAAA,sBAAA,uBAAA,sBAAA,uBAAA,sBAAA,uBAAA,+BAAA,gCAAA,6BAAA,gCAAA,gCAAA,gCCiCA,mBAAA,KACQ,WAAA,KDlDV,mBAAA,oBAAA,iBAAA,oBAAA,oBAAA,oBAuBI,YAAA,KAyCF,YAAA,YAEE,iBAAA,KAKJ,aErEI,YAAA,EAAA,IAAA,EAAA,KACA,iBAAA,iDACA,iBAAA,4CAAA,iBAAA,qEAEA,iBAAA,+CCnBF,OAAA,+GH4CA,OAAA,0DACA,kBAAA,SAuC2C,aAAA,QAA2B,aAAA,KArCtE,mBAAA,mBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,oBAAA,oBAEE,iBAAA,QACA,aAAA,QAMA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,uBAAA,8BAAA,6BAAA,8BAAA,6BAAA,6BAAA,gCAAA,uCAAA,sCAAA,uCAAA,sCAAA,sCAME,iBAAA,QACA,iBAAA,KAgBN,aEtEI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,mBAAA,mBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,oBAAA,oBAEE,iBAAA,QACA,aAAA,QAMA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,uBAAA,8BAAA,6BAAA,8BAAA,6BAAA,6BAAA,gCAAA,uCAAA,sCAAA,uCAAA,sCAAA,sCAME,iBAAA,QACA,iBAAA,KAiBN,aEvEI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,mBAAA,mBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,oBAAA,oBAEE,iBAAA,QACA,aAAA,QAMA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,uBAAA,8BAAA,6BAAA,8BAAA,6BAAA,6BAAA,gCAAA,uCAAA,sCAAA,uCAAA,sCAAA,sCAME,iBAAA,QACA,iBAAA,KAkBN,UExEI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,gBAAA,gBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,iBAAA,iBAEE,iBAAA,QACA,aAAA,QAMA,mBAAA,0BAAA,yBAAA,0BAAA,yBAAA,yBAAA,oBAAA,2BAAA,0BAAA,2BAAA,0BAAA,0BAAA,6BAAA,oCAAA,mCAAA,oCAAA,mCAAA,mCAME,iBAAA,QACA,iBAAA,KAmBN,aEzEI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,mBAAA,mBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,oBAAA,oBAEE,iBAAA,QACA,aAAA,QAMA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,uBAAA,8BAAA,6BAAA,8BAAA,6BAAA,6BAAA,gCAAA,uCAAA,sCAAA,uCAAA,sCAAA,sCAME,iBAAA,QACA,iBAAA,KAoBN,YE1EI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDAEA,OAAA,+GCnBF,OAAA,0DH4CA,kBAAA,SACA,aAAA,QAEA,kBAAA,kBAEE,iBAAA,QACA,oBAAA,EAAA,MAGF,mBAAA,mBAEE,iBAAA,QACA,aAAA,QAMA,qBAAA,4BAAA,2BAAA,4BAAA,2BAAA,2BAAA,sBAAA,6BAAA,4BAAA,6BAAA,4BAAA,4BAAA,+BAAA,sCAAA,qCAAA,sCAAA,qCAAA,qCAME,iBAAA,QACA,iBAAA,KA2BN,eAAA,WClCE,mBAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,EAAA,IAAA,IAAA,iBD2CV,0BAAA,0BE3FI,iBAAA,QACA,iBAAA,oDACA,iBAAA,+CAAA,iBAAA,wEACA,iBAAA,kDACA,OAAA,+GF0FF,kBAAA,SAEF,yBAAA,+BAAA,+BEhGI,iBAAA,QACA,iBAAA,oDACA,iBAAA,+CAAA,iBAAA,wEACA,iBAAA,kDACA,OAAA,+GFgGF,kBAAA,SASF,gBE7GI,iBAAA,iDACA,iBAAA,4CACA,iBAAA,qEAAA,iBAAA,+CACA,OAAA,+GACA,OAAA,0DCnBF,kBAAA,SH+HA,cAAA,ICjEA,mBAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,iBD6DV,sCAAA,oCE7GI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SD2CF,mBAAA,MAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,MAAA,EAAA,IAAA,IAAA,iBD0EV,cAAA,iBAEE,YAAA,EAAA,IAAA,EAAA,sBAIF,gBEhII,iBAAA,iDACA,iBAAA,4CACA,iBAAA,qEAAA,iBAAA,+CACA,OAAA,+GACA,OAAA,0DCnBF,kBAAA,SHkJA,cAAA,IAHF,sCAAA,oCEhII,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SD2CF,mBAAA,MAAA,EAAA,IAAA,IAAA,gBACQ,WAAA,MAAA,EAAA,IAAA,IAAA,gBDgFV,8BAAA,iCAYI,YAAA,EAAA,KAAA,EAAA,gBAKJ,qBAAA,kBAAA,mBAGE,cAAA,EAqBF,yBAfI,mDAAA,yDAAA,yDAGE,MAAA,KE7JF,iBAAA,oDACA,iBAAA,+CACA,i
BAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,UFqKJ,OACE,YAAA,EAAA,IAAA,EAAA,qBC3HA,mBAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,gBACQ,WAAA,MAAA,EAAA,IAAA,EAAA,sBAAA,EAAA,IAAA,IAAA,gBDsIV,eEtLI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF8KF,aAAA,QAKF,YEvLI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF8KF,aAAA,QAMF,eExLI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF8KF,aAAA,QAOF,cEzLI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF8KF,aAAA,QAeF,UEjMI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFuMJ,cE3MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFwMJ,sBE5MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFyMJ,mBE7MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF0MJ,sBE9MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF2MJ,qBE/MI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF+MJ,sBElLI,iBAAA,yKACA,iBAAA,oKACA,iBAAA,iKFyLJ,YACE,cAAA,IC9KA,mBAAA,EAAA,IAAA,IAAA,iBACQ,WAAA,EAAA,IAAA,IAAA,iBDgLV,wBAAA,8BAAA,8BAGE,YAAA,EAAA,KAAA,EAAA,QEnOE,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFiOF,aAAA,QALF,+BAAA,qCAAA,qCAQI,YAAA,KAUJ,OCnME,mBAAA,EAAA,IAAA,IAAA,gBACQ,WAAA,EAAA,IAAA,IAAA,gBD4MV,8BE5PI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFyPJ,8BE7PI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF0PJ,8BE9PI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF2PJ,2BE/PI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF4PJ,8BEhQI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SF6PJ,6BEjQI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFoQJ,MExQI,iBAAA,oDACA,iBAAA,+CACA,iBAAA,wEAAA,iBAAA,kDACA,OAAA,+GACA,kBAAA,SFsQF,aAAA,QC3NA,mBAAA,MAAA,EAAA,IAAA,IAAA,gBAAA,EAAA,IAAA,EAAA,qBACQ,WAAA,MAAA,EAAA,IAAA,IAAA,gBAAA,EAAA,IAAA,EAAA"}
--------------------------------------------------------------------------------
/06-flask/flask-rss/static/css/reader.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/css/reader.css
--------------------------------------------------------------------------------
/06-flask/flask-rss/static/css/style.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/css/style.css
--------------------------------------------------------------------------------
/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.eot
--------------------------------------------------------------------------------
/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.ttf
--------------------------------------------------------------------------------
/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.woff
--------------------------------------------------------------------------------
/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/06-flask/flask-rss/static/fonts/glyphicons-halflings-regular.woff2
--------------------------------------------------------------------------------
/06-flask/flask-rss/static/js/npm.js:
--------------------------------------------------------------------------------
1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment.
2 | require('../../js/transition.js')
3 | require('../../js/alert.js')
4 | require('../../js/button.js')
5 | require('../../js/carousel.js')
6 | require('../../js/collapse.js')
7 | require('../../js/dropdown.js')
8 | require('../../js/modal.js')
9 | require('../../js/tooltip.js')
10 | require('../../js/popover.js')
11 | require('../../js/scrollspy.js')
12 | require('../../js/tab.js')
13 | require('../../js/affix.js')
--------------------------------------------------------------------------------
/06-flask/flask-rss/templates/index.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 |
3 | {% block title %}
4 |
5 | {% endblock %}
6 |
7 | {% block content %}
8 |
9 | Flask-RSS
10 | RSS Reader
11 |
12 | Feeds
13 |
14 | {% for key, value in featured.items() %}
15 | - {{ key }}: View Feed
16 | {% endfor %}
17 |
18 |
19 |
20 | {% endblock %}
21 |
--------------------------------------------------------------------------------
/06-flask/flask-rss/templates/layout.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | {% block title %}{% endblock %} ~ RssReader
5 |
6 |
7 |
8 |
9 |
10 | {% block header %}{% endblock %}
11 |
12 |
13 |
14 |
15 |
16 |
17 | {% block content %}{% endblock %}
18 |
19 |
20 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/06-flask/flask-rss/templates/notfound.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 |
3 | {% block title %}
4 | 404
5 | {% endblock %}
6 |
7 | {% block content %}
8 | 404 - Page not found
9 |
10 | This page doesn't exist. Did you perhaps mistype the address?
11 | Or did someone give you a bad link?! :o If so, feel free to give them an earful!
12 |
13 | Featured Websites
14 |
15 | {% for key, value in featured.items() %}
16 | - {{ key }}: View Feed
17 | {% endfor %}
18 |
20 |
21 |
22 | {% endblock %}
--------------------------------------------------------------------------------
/06-flask/flask-rss/templates/reader.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 |
3 | {% block title %}
4 | {% if name %}
5 | {{ name }}
6 | {% else %}
7 | Reader
8 | {% endif %}
9 | {% endblock %}
10 |
11 | {% block content %}
12 |
13 |
14 | {% for column in columns_to_display %}
15 | {{ column }} |
16 | {%- endfor -%}
17 |
18 |
19 |
20 | {% for row in df %}
21 |
22 | {{ row.published }} |
23 | {{ row.source }} |
24 |
25 | {{ row.title }} |
26 |
27 | {%- endfor -%}
28 |
29 |
30 |
31 | {% endblock %}
32 |
--------------------------------------------------------------------------------
/06-flask/flask-rss/templates/table.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 |
3 | {% block title %}
4 | {% if name %}
5 | {{ name }}
6 | {% else %}
7 | Reader
8 | {% endif %}
9 | {% endblock %}
10 |
11 | {% block content %}
12 |
13 |
14 | {% for column in columns_to_display %}
15 | {{ column }} |
16 | {%- endfor -%}
17 |
18 |
19 |
20 | {% for row in df %}
21 |
22 | {{ row.published }} |
23 | {{ row.source }} |
24 |
25 | {{ row.title }} |
26 |
27 | {%- endfor -%}
28 |
29 |
30 |
31 | {% endblock %}
32 |
--------------------------------------------------------------------------------
/07-airflow/README.md:
--------------------------------------------------------------------------------
1 |
2 | ### Deployment
3 |
4 | https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html
5 |
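6 | A minimal local run, following the linked quickstart (service names assumed from the docker-compose.yml in this folder):
7 |
8 | ```bash
9 | docker-compose up airflow-init   # one-time: migrates the metadata DB and creates the admin user
10 | docker-compose up                # starts webserver, scheduler, worker, redis, and postgres
11 | ```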
--------------------------------------------------------------------------------
/07-airflow/dags/example_postgres.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from airflow import DAG
4 | from airflow.providers.postgres.operators.postgres import PostgresOperator
5 |
6 | default_args = {"owner": "airflow"}
7 |
8 | # create_stock_table, populate_stock_table, get_all_stocks are examples of tasks created by
9 | # instantiating the Postgres Operator
10 |
11 | with DAG(
12 | dag_id="postgres_operator_dag",
13 | start_date=datetime.datetime(2020, 2, 2),
14 | schedule_interval="@once",
15 | default_args=default_args,
16 | catchup=False,
17 | ) as dag:
18 |
19 | create_stock_table = PostgresOperator(
20 | task_id="create_stock_table",
21 | postgres_conn_id="postgres_default",
22 | sql="sql/stock_schema.sql"
23 | )
24 |
25 | populate_stock_table = PostgresOperator(
26 | task_id="populate_stock_table",
27 | postgres_conn_id="postgres_default",
28 | sql="sql/stock_insert.sql"
29 | )
30 |
31 | get_all_stocks = PostgresOperator(
32 | task_id="get_all_stocks", postgres_conn_id="postgres_default", sql="SELECT * FROM stocks;"
33 | )
34 |
35 | create_stock_table >> populate_stock_table >> get_all_stocks
36 |
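37 | # Notes (assumptions, not part of the original DAG):
38 | # - requires the provider package:  pip install apache-airflow-providers-postgres
39 | # - the sql="sql/..." paths are resolved relative to this DAG file's folder
40 | # - the postgres_conn_id="postgres_default" connection must be defined in Admin -> Connections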
--------------------------------------------------------------------------------
/07-airflow/dags/sql/stock_insert.sql:
--------------------------------------------------------------------------------
1 | insert into stocks values (1, 'MSFT', 'Microsoft', '2018-07-05', 124.35);
2 | insert into stocks values (2, 'GOOG', 'Google', '2019-05-01', 234.42);
3 | insert into stocks values (3, 'TSLA', 'Tesla', '2020-06-23', 2434.22);
4 | insert into stocks values (4, 'AMZN', 'Amazon', '2013-08-11', 2344.34);
5 |
--------------------------------------------------------------------------------
/07-airflow/dags/sql/stock_schema.sql:
--------------------------------------------------------------------------------
1 | -- create pet table
2 | CREATE TABLE IF NOT EXISTS stocks (
3 | id SERIAL PRIMARY KEY,
4 | symbol VARCHAR NOT NULL,
5 | name VARCHAR NOT NULL,
6 | date DATE NOT NULL,
7 | price numeric NOT NULL);
8 |
--------------------------------------------------------------------------------
/07-airflow/dags/stock_analysis_dag.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.operators.python_operator import PythonOperator
3 |
4 | from datetime import datetime, timedelta
5 | import datetime as dt
6 | import pandas as pd
7 | import yfinance as yf
8 | import requests
9 |
10 | from functools import reduce
11 |
12 |
13 | ############################################
14 | # DEFINE AIRFLOW DAG (SETTINGS + SCHEDULE)
15 | ############################################
16 | default_args = {
17 | 'owner': 'airflow',
18 | 'depends_on_past': False,
19 | 'email': ['user@gmail.com'],
20 | 'email_on_failure': False,
21 | 'email_on_retry': False,
22 | 'retries': 1
23 | }
24 |
25 | dag = DAG( 'stocks_analysis_ETL_7AM',
26 | default_args=default_args,
27 | description='Collect Stock Prices For Analysis',
28 | catchup=False,
29 | start_date= datetime(2020, 6, 23),
30 | schedule_interval= '0 7 * * *'   # daily at 07:00 ('* 7 * * *' would fire every minute of the 7 AM hour)
31 | )
32 |
33 | tickers = ['AAPL', 'AMZN', 'BLK', 'T', 'TSLA'] # <-- Initial Tickers List. It will be available globally for all functions.
34 |
35 | ####################################################
36 | # DEFINE PYTHON FUNCTIONS
37 | ####################################################
38 |
39 | def fetch_prices_function(**kwargs): # <-- Remember to include "**kwargs" in all the defined functions
40 | print('1 Fetching stock prices and remove duplicates...')
41 | stocks_prices = []
42 | for i in range(0, len(tickers)):
43 | prices = yf.download(tickers[i], period = 'max').iloc[: , :5].dropna(axis=0, how='any')
44 | prices = prices.loc[~prices.index.duplicated(keep='last')]
45 | prices = prices.reset_index()
46 | prices.insert(loc = 1, column = 'Stock', value = tickers[i])
47 | stocks_prices.append(prices)
48 | return stocks_prices # <-- This list is the output of the fetch_prices_function and the input for the functions below
49 |
50 |
51 | def stocks_plot_function(**kwargs):
52 | print('2 Pulling stocks_prices to concatenate sub-lists to create a combined dataset + write to CSV file...')
53 | ti = kwargs['ti']
54 | stocks_prices = ti.xcom_pull(task_ids='fetch_prices_task') # <-- xcom_pull is used to pull the stocks_prices list generated above
55 | stock_plots_data = pd.concat(stocks_prices, ignore_index=True)
56 | stock_plots_data.to_csv('/Users/anbento/Documents/Data_Sets/Medium/stocks_plots_data.csv', index=False)  # hardcoded output path -- change for your machine
57 |
58 | print('DF Shape: ', stock_plots_data.shape)
59 | print(stock_plots_data.head(5))
60 | print('Completed \n\n')
61 |
62 | def stocks_table_function(**kwargs):
63 | print('3 Creating aggregated dataframe with stock stats for last available date + write to CSV file...')
64 | ti = kwargs['ti']
65 | stocks_prices = ti.xcom_pull(task_ids='fetch_prices_task') # <-- xcom_pull is used to pull the stocks_prices list generated above
66 | stocks_adj_close = []
67 | for i in range(0, len(stocks_prices)):
68 | adj_price= stocks_prices[i][['Date','Adj Close']]
69 | adj_price.set_index('Date', inplace = True)
70 | adj_price.columns = [tickers[i]]
71 | stocks_adj_close.append(adj_price)
72 |
73 | stocks_adj_close = reduce(lambda left,right: pd.merge(left, right, left_index = True, right_index = True ,how='outer'), stocks_adj_close)
74 | stocks_adj_close.sort_index(ascending = False, inplace = True)
75 | stocks_adj_close.index = pd.to_datetime(stocks_adj_close.index).date  # note: this summary table is built but never written out or returned
76 |
77 | ##########################################
78 | # DEFINE AIRFLOW OPERATORS
79 | ##########################################
80 |
81 | fetch_prices_task = PythonOperator(task_id = 'fetch_prices_task',
82 | python_callable = fetch_prices_function,
83 | provide_context = True,
84 | dag= dag )
85 |
86 | stocks_plot_task= PythonOperator(task_id = 'stocks_plot_task',
87 | python_callable = stocks_plot_function,
88 | provide_context = True,
89 | dag= dag)
90 |
91 | stocks_table_task = PythonOperator(task_id = 'stocks_table_task',
92 | python_callable = stocks_table_function,
93 | provide_context = True,
94 | dag= dag)
95 |
96 | ##########################################
97 | # DEFINE TASKS HIERARCHY
98 | ##########################################
99 |
100 | fetch_prices_task >> stocks_plot_task >> stocks_table_task
101 |
--------------------------------------------------------------------------------
/07-airflow/dags/stocks.py:
--------------------------------------------------------------------------------
1 | # import json
2 | # from datetime import datetime, timedelta
3 | #
4 | # import redis
5 | # from airflow.models import DAG
6 | # from airflow.operators import PythonOperator
7 | #
8 | # stocks = ('AAPL', 'AMZN', 'GOOGL', 'MSFT',
9 | # 'FB', 'BABA', 'BRK.B', 'JPM',
10 | # 'XOM', 'JNJ', 'V', 'BAC', 'WFC',
11 | # 'WMT', 'UNH', 'INTC', 'T', 'CVX',
12 | # 'HD', 'PFE', 'VZ', 'MA', 'CSCO', 'PG',
13 | # 'BA', 'KO', 'ORCL', 'NFLX', 'C', 'MRK',
14 | # 'DIS')
15 | #
16 | #
17 | # def get_stocks(ds, **context):
18 | # symbol = context['params']['symbol']
19 | #
20 | # pg_hook = postgres_hook(postgres_conn_id='stocks')
21 | # api_hook = http_hook(http_conn_id='alphavantage', method='GET')
22 | #
23 | # # If either of these raises an exception then we'll be notified via
24 | # # Airflow
25 | # resp = api_hook.run(f'query?function=TIME_SERIES_DAILY_ADJUSTED&symbol={symbol}&apikey=537201H9R203WT4C&datatype=csv')
26 | # resp = json.loads(resp.content)
27 | #
28 | # # These are the only valid stocks the DB supports at the moment. Anything
29 | # # else that turns up will be ignored.
30 | #
31 | # stocks_insert = f"""INSERT INTO stocks (symbol, valid_until, price)
32 | # VALUES ({symbol}, {valid_until}, {price});"""
33 | #
34 | # # If this raises an exception then we'll be notified via Airflow
35 | # valid_until = datetime.fromtimestamp(resp['timestamp'])
36 | #
37 | # for iso2, price in resp['stocks'].items():
38 | # # If converting the price to a float fails for whatever reason then
39 | # # just move on.
40 | # try:
41 | # price = float(price)
42 | # except:
43 | # continue
44 | #
45 | # iso2 = iso2.upper().strip()
46 | #
47 | # if iso2 not in stocks or price < 0:
48 | # continue
49 | #
50 | # pg_hook.run(stocks_insert, parameters=(iso2,
51 | # valid_until,
52 | # price))
53 | #
54 | #
55 | # def cache_latest_stocks(ds, **kwargs):
56 | # redis_conn = redis.StrictRedis(host='redis')
57 | # pg_hook = postgres_hook(postgres_conn_id='stocks')
58 | # latest_stocks = """SELECT DISTINCT ON (symbol)
59 | # symbol, price
60 | # FROM stocks
61 | # ORDER BY symbol, valid_until DESC;"""
62 | #
63 | # for iso2, stock in pg_hook.get_records(latest_stocks):
64 | # redis_conn.set(iso2, stock)
65 | #
66 | #
67 | # args = {
68 | # 'owner': 'ryan',
69 | # 'depends_on_past': False,
70 | # 'start_date': datetime.utcnow(),
71 | # 'retries': 1,
72 | # 'retry_delay': timedelta(minutes=5),
73 | # }
74 | #
75 | # # Run at the top of the hour Monday to Friday.
76 | # # Note: This doesn't line up with the market hours of
77 | # # 10PM Sunday till 10PM Friday GMT.
78 | # dag = DAG(dag_id='stocks',
79 | # default_args=args,
80 | # schedule_interval='0 * * * 1,2,3,4,5',
81 | # dagrun_timeout=timedelta(seconds=30))
82 | #
83 | # # loop through the lob's we want to use to build up our dag
84 | # for stock in stocks:
85 | # get_stocks_task = \
86 | # PythonOperator(task_id='get_stocks',
87 | # provide_context=True,
88 | # op_kwargs={"stock": stock},
89 | # python_callable=get_stocks,
90 | # dag=dag)
91 | #
92 | # cache_latest_stocks_task = \
93 | # PythonOperator(task_id='cache_latest_stocks',
94 | # provide_context=True,
95 | # python_callable=cache_latest_stocks,
96 | # dag=dag)
97 | #
98 | # get_stocks_task.set_downstream(cache_latest_stocks_task)
99 |
--------------------------------------------------------------------------------
/07-airflow/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 |
19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
20 | #
21 | # WARNING: This configuration is for local development. Do not use it in a production deployment.
22 | #
23 | # This configuration supports basic configuration using environment variables or an .env file
24 | # The following variables are supported:
25 | #
26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
27 | # Default: apache/airflow:master-python3.8
28 | # AIRFLOW_UID - User ID in Airflow containers
29 | # Default: 50000
30 | # AIRFLOW_GID - Group ID in Airflow containers
31 | # Default: 50000
32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account.
33 | # Default: airflow
34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account.
35 | # Default: airflow
36 | #
37 | # Feel free to modify this file to suit your needs.
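38 | # Example .env placed next to this file (values assumed; they match the defaults above):
39 | #   AIRFLOW_IMAGE_NAME=apache/airflow:2.0.2
40 | #   AIRFLOW_UID=50000
41 | #   AIRFLOW_GID=50000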
38 | ---
39 | version: '3'
40 | x-airflow-common:
41 | &airflow-common
42 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.0.2}
43 | environment:
44 | &airflow-common-env
45 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor
46 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
47 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
48 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
49 | AIRFLOW__CORE__FERNET_KEY: ''
50 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
51 | AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
52 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
53 | volumes:
54 | - ./dags:/opt/airflow/dags
55 | - ./logs:/opt/airflow/logs
56 | - ./plugins:/opt/airflow/plugins
57 | user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}"
58 | depends_on:
59 | redis:
60 | condition: service_healthy
61 | postgres:
62 | condition: service_healthy
63 | # 'before_script' is GitLab CI syntax, not valid docker-compose, so it is disabled here;
64 | # extra packages (yfinance, pandas) should be baked into the Airflow image instead
65 |
66 | services:
67 | postgres:
68 | image: postgres:13
69 | environment:
70 | POSTGRES_USER: airflow
71 | POSTGRES_PASSWORD: airflow
72 | POSTGRES_DB: airflow
73 | volumes:
74 | - postgres-db-volume:/var/lib/postgresql/data
75 | healthcheck:
76 | test: [ "CMD", "pg_isready", "-U", "airflow" ]
77 | interval: 5s
78 | retries: 5
79 | restart: always
80 |
81 | redis:
82 | image: redis:latest
83 | ports:
84 | - 6379:6379
85 | healthcheck:
86 | test: [ "CMD", "redis-cli", "ping" ]
87 | interval: 5s
88 | timeout: 30s
89 | retries: 50
90 | restart: always
91 |
92 | airflow-webserver:
93 | <<: *airflow-common
94 | command: webserver
95 | ports:
96 | - 8080:8080
97 | healthcheck:
98 | test: [ "CMD", "curl", "--fail", "http://localhost:8080/health" ]
99 | interval: 10s
100 | timeout: 10s
101 | retries: 5
102 | restart: always
103 |
104 | airflow-scheduler:
105 | <<: *airflow-common
106 | command: scheduler
107 | restart: always
108 |
109 | airflow-worker:
110 | <<: *airflow-common
111 | command: celery worker
112 | restart: always
113 |
114 | airflow-init:
115 | <<: *airflow-common
116 | command: version
117 | environment:
118 | <<: *airflow-common-env
119 | _AIRFLOW_DB_UPGRADE: 'true'
120 | _AIRFLOW_WWW_USER_CREATE: 'true'
121 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
122 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
123 |
124 | flower:
125 | <<: *airflow-common
126 | command: celery flower
127 | ports:
128 | - 5555:5555
129 | healthcheck:
130 | test: [ "CMD", "curl", "--fail", "http://localhost:5555/" ]
131 | interval: 10s
132 | timeout: 10s
133 | retries: 5
134 | restart: always
135 |
136 | volumes:
137 | postgres-db-volume:
138 |
--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
1 | =======
2 | Credits
3 | =======
4 |
5 | Development Lead
6 | ----------------
7 |
8 | * Ryan S. McCoy
9 |
10 | Contributors
11 | ------------
12 |
13 | None yet. Why not be the first?
14 |
--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
1 | .. highlight:: shell
2 |
3 | ============
4 | Contributing
5 | ============
6 |
7 | Contributions are welcome, and they are greatly appreciated! Every little bit
8 | helps, and credit will always be given.
9 |
10 | You can contribute in many ways:
11 |
12 | Types of Contributions
13 | ----------------------
14 |
15 | Report Bugs
16 | ~~~~~~~~~~~
17 |
18 | Report bugs at https://github.com/ryansmccoy/spreadsheets_to_dataframes/issues.
19 |
20 | If you are reporting a bug, please include:
21 |
22 | * Your operating system name and version.
23 | * Any details about your local setup that might be helpful in troubleshooting.
24 | * Detailed steps to reproduce the bug.
25 |
26 | Fix Bugs
27 | ~~~~~~~~
28 |
29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
30 | wanted" is open to whoever wants to implement it.
31 |
32 | Implement Features
33 | ~~~~~~~~~~~~~~~~~~
34 |
35 | Look through the GitHub issues for features. Anything tagged with "enhancement"
36 | and "help wanted" is open to whoever wants to implement it.
37 |
38 | Write Documentation
39 | ~~~~~~~~~~~~~~~~~~~
40 |
41 | Spreadsheets to DataFrames could always use more documentation, whether as part of the
42 | official Spreadsheets to DataFrames docs, in docstrings, or even on the web in blog posts,
43 | articles, and such.
44 |
45 | Submit Feedback
46 | ~~~~~~~~~~~~~~~
47 |
48 | The best way to send feedback is to file an issue at https://github.com/ryansmccoy/spreadsheets_to_dataframes/issues.
49 |
50 | If you are proposing a feature:
51 |
52 | * Explain in detail how it would work.
53 | * Keep the scope as narrow as possible, to make it easier to implement.
54 | * Remember that this is a volunteer-driven project, and that contributions
55 | are welcome :)
56 |
57 | Get Started!
58 | ------------
59 |
60 | Ready to contribute? Here's how to set up `spreadsheets_to_dataframes` for local development.
61 |
62 | 1. Fork the `spreadsheets_to_dataframes` repo on GitHub.
63 | 2. Clone your fork locally::
64 |
65 | $ git clone git@github.com:your_name_here/spreadsheets_to_dataframes.git
66 |
67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
68 |
69 | $ mkvirtualenv spreadsheets_to_dataframes
70 | $ cd spreadsheets_to_dataframes/
71 | $ python setup.py develop
72 |
73 | 4. Create a branch for local development::
74 |
75 | $ git checkout -b name-of-your-bugfix-or-feature
76 |
77 | Now you can make your changes locally.
78 |
79 | 5. When you're done making changes, check that your changes pass flake8 and the
80 | tests, including testing other Python versions with tox::
81 |
82 | $ flake8 spreadsheets_to_dataframes tests
83 | $ python setup.py test or pytest
84 | $ tox
85 |
86 | To get flake8 and tox, just pip install them into your virtualenv.
87 |
88 | 6. Commit your changes and push your branch to GitHub::
89 |
90 | $ git add .
91 | $ git commit -m "Your detailed description of your changes."
92 | $ git push origin name-of-your-bugfix-or-feature
93 |
94 | 7. Submit a pull request through the GitHub website.
95 |
96 | Pull Request Guidelines
97 | -----------------------
98 |
99 | Before you submit a pull request, check that it meets these guidelines:
100 |
101 | 1. The pull request should include tests.
102 | 2. If the pull request adds functionality, the docs should be updated. Put
103 | your new functionality into a function with a docstring, and add the
104 | feature to the list in README.rst.
105 | 3. The pull request should work for Python 2.7, 3.5, 3.6 and 3.7, and for PyPy. Check
106 | https://travis-ci.org/ryansmccoy/spreadsheets_to_dataframes/pull_requests
107 | and make sure that the tests pass for all supported Python versions.
108 |
109 | Tips
110 | ----
111 |
112 | To run a subset of tests::
113 |
114 | $ pytest tests.test_spreadsheets_to_dataframes
115 |
116 |
117 | Deploying
118 | ---------
119 |
120 | A reminder for the maintainers on how to deploy.
121 | Make sure all your changes are committed (including an entry in HISTORY.rst).
122 | Then run::
123 |
124 | $ bump2version patch # possible: major / minor / patch
125 | $ git push
126 | $ git push --tags
127 |
128 | Travis will then deploy to PyPI if tests pass.
129 |
--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
1 | =======
2 | History
3 | =======
4 |
5 | 0.1.0 (2019-10-09)
6 | ------------------
7 |
8 | * First release on PyPI.
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019, Ryan S. McCoy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include AUTHORS.rst
2 | include CONTRIBUTING.rst
3 | include HISTORY.rst
4 | include LICENSE
5 | include README.rst
6 |
7 | recursive-include tests *
8 | recursive-exclude * __pycache__
9 | recursive-exclude * *.py[co]
10 |
11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
12 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean clean-test clean-pyc clean-build docs help
2 | .DEFAULT_GOAL := help
3 |
4 | define BROWSER_PYSCRIPT
5 | import os, webbrowser, sys
6 |
7 | try:
8 | from urllib import pathname2url
9 | except:
10 | from urllib.request import pathname2url
11 |
12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
13 | endef
14 | export BROWSER_PYSCRIPT
15 |
16 | define PRINT_HELP_PYSCRIPT
17 | import re, sys
18 |
19 | for line in sys.stdin:
20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
21 | if match:
22 | target, help = match.groups()
23 | print("%-20s %s" % (target, help))
24 | endef
25 | export PRINT_HELP_PYSCRIPT
26 |
27 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
28 |
29 | help:
30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
31 |
32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
33 |
34 | clean-build: ## remove build artifacts
35 | rm -fr build/
36 | rm -fr dist/
37 | rm -fr .eggs/
38 | find . -name '*.egg-info' -exec rm -fr {} +
39 | find . -name '*.egg' -exec rm -f {} +
40 |
41 | clean-pyc: ## remove Python file artifacts
42 | find . -name '*.pyc' -exec rm -f {} +
43 | find . -name '*.pyo' -exec rm -f {} +
44 | find . -name '*~' -exec rm -f {} +
45 | find . -name '__pycache__' -exec rm -fr {} +
46 |
47 | clean-test: ## remove test and coverage artifacts
48 | rm -fr .tox/
49 | rm -f .coverage
50 | rm -fr htmlcov/
51 | rm -fr .pytest_cache
52 |
53 | lint: ## check style with flake8
54 | flake8 spreadsheets_to_dataframes tests
55 |
56 | test: ## run tests quickly with the default Python
57 | pytest
58 |
59 | test-all: ## run tests on every Python version with tox
60 | tox
61 |
62 | coverage: ## check code coverage quickly with the default Python
63 | coverage run --source spreadsheets_to_dataframes -m pytest
64 | coverage report -m
65 | coverage html
66 | $(BROWSER) htmlcov/index.html
67 |
68 | docs: ## generate Sphinx HTML documentation, including API docs
69 | rm -f docs/spreadsheets_to_dataframes.rst
70 | rm -f docs/modules.rst
71 | sphinx-apidoc -o docs/ spreadsheets_to_dataframes
72 | $(MAKE) -C docs clean
73 | $(MAKE) -C docs html
74 | $(BROWSER) docs/_build/html/index.html
75 |
76 | servedocs: docs ## compile the docs watching for changes
77 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
78 |
79 | release: dist ## package and upload a release
80 | twine upload dist/*
81 |
82 | dist: clean ## builds source and wheel package
83 | python setup.py sdist
84 | python setup.py bdist_wheel
85 | ls -l dist
86 |
87 | install: clean ## install the package to the active Python's site-packages
88 | python setup.py install
89 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =======================================================================================
2 | From Spreadsheets to DataFrames: Escaping Excel Hell with Python
3 | =======================================================================================
4 |
5 | ==============================================================================================================================================================================
6 |
7 | `Pycon 2021 Tutorial Video [YouTube] - May 12, 2021 `_
8 |
9 |
10 | Other Presentations:
11 |
12 | `STL Python Presentation [YouTube] `_
13 |
14 | `Chicago Python Users Group [YouTube] `_
15 |
16 | Details
17 |
18 | A spreadsheet is a wonderful invention and an excellent tool for certain jobs. All too often, however, spreadsheets are called upon to perform tasks that are beyond their capabilities. It’s like the old saying, 'If the only tool you have is a hammer, every problem looks like a nail.' However, some problems are better addressed with a screwdriver, with glue, or with a Swiss Army Knife.
19 |
20 | Python is described by some in the programming world as the Swiss Army Knife of programming languages because of its unrivaled versatility and flexibility in use. This allows its users to solve complex problems relatively easily compared with other programming languages and is one of the reasons why Python has become increasingly popular over time.
21 |
22 | In this tutorial, we’ll briefly discuss spreadsheets, signs that you might be living in “Excel Hell”, and then we’ll spend the rest of the time learning how to escape it using Python.
23 |
24 | In the first section, we’ll build on what spreadsheet users already know about cells, rows, columns, and formulas, and map them to their Python equivalents, such as variables, lists, dictionaries, and functions. At the end of this section, we’ll do an interactive exercise and learn how we can perform a simple calculation, similar to one you might do in Excel, but instead using Python.
25 |
26 | In the second section, we’ll discuss (and attempt) performing more complex tasks, including web scraping, data processing, analysis, and visualization, using a few popular third-party libraries, including Requests, Pandas, Flask, Matplotlib, and others.
27 |
28 | In the last section, we’ll round out our discussion with a few important concepts in data management, including the concept of tidy data, building a data pipeline, and a few strategies (and packages) for approaching various data problems, finishing with a demo using Apache Airflow.
29 |
30 | Slides
31 | ======================
32 |
33 | `Intro [Slides] `_
34 |
35 | `Excel to Python [Slides] `_
36 |
37 | `Python Libraries & Resources [Slides] `_
38 |
39 | `Data Management [Slides] `_
40 |
41 | Tutorial Code
42 | ======================
43 |
44 | Section 1 - Python Fundamentals for an Excel User
45 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
46 |
47 | `01 basics_but_important_stuff.ipynb `_
48 |
49 | `02 files_lists_dictionaries.ipynb `_
50 |
51 | Section 1 - Challenges
52 | ~~~~~~~~~~~~~~~~~~~~~~~
53 |
54 | `challenge_1.py `_
55 |
56 | `challenge_1_answer.py `_
57 |
58 | `challenge_2.py `_
59 |
60 | `challenge_2_answer.py `_
61 |
62 | `challenge_3.py `_
63 |
64 | `challenge_3_answer.py `_
65 |
66 | Section 2 - Real-World Python Example for an Excel User
67 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
68 |
69 | `01-real-world-example.py `_
70 |
71 | `02-real-world-example-refactored.py `_
72 |
73 | Section 2 - Challenge
74 | ~~~~~~~~~~~~~~~~~~~~~~~
75 |
76 | `section2_challenge.rst `_
77 |
78 |
79 | Section 3 - Best Practices in Python & Data for an Excel User
80 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
81 |
82 | `Data Management [Slides] `_
83 |
84 | `07-airflow `_
85 |
86 | STL Python - Talk Code
87 | ======================
88 |
89 | `01-basics.ipynb `_
90 |
91 | `02-webscraping.ipynb `_
92 |
93 | `03-tidy-data.ipynb `_
94 |
95 | `04-pandas.ipynb `_
96 |
97 | `05-data-analysis.ipynb `_
98 |
99 | `06-data-visualizations.ipynb `_
100 |
101 | STL Python - Folders
102 | ===================================================
103 |
104 | * 01-basics - examples used in presentation
105 | * 02-webscraping - program that clicks through a calendar (written in JavaScript) and exports csv files
106 | * 02-selenium-safari - program that logs in to a website, scrapes HTML from a JavaScript-generated page, cleans the HTML, and exports it to pdf files
107 | * 02-webscrape-celery - example that uses a message queue and Celery to download a list of urls
108 | * 04-other-analysis - examples of different quantitative notebooks
109 | * 05-other-visualizations - examples of different data visualization tools
110 | * 06-flask - different flask examples
111 | * 07-airflow - example that uses airflow to download and store stock prices
112 |
113 | Quick Start Guides
114 | ======================
115 |
116 |
117 | `Install Anaconda & Pycharm `_
118 |
119 | * Anaconda = manages your Python environments
120 |
121 | * Pycharm = code editor
122 |
123 | `Install Git `_ - Allows you to git clone/download GitHub projects
124 |
125 | Setup Environment & Run Example (Windows):
126 | ==================================================
127 |
128 | .. code-block:: bash
129 |
130 | $ git clone https://github.com/ryansmccoy/spreadsheets-to-dataframes.git
131 | $ cd spreadsheets-to-dataframes
132 | $ conda create -n spreadsheets-to-dataframes python=3.8 pandas scipy numpy lxml jupyter matplotlib -y
133 | $ activate spreadsheets-to-dataframes
134 | $ pip install -r requirements_dev.txt
135 |
136 | Setup Environment & Run Example (Linux):
137 | ==================================================
138 |
139 | .. code-block:: bash
140 |
141 | $ git clone https://github.com/ryansmccoy/spreadsheets-to-dataframes.git
142 | $ cd spreadsheets-to-dataframes
143 | $ conda create -n spreadsheets-to-dataframes python=3.8 pandas scipy numpy lxml jupyter matplotlib -y
144 | $ source activate spreadsheets-to-dataframes
145 | $ pip install -r requirements_dev.txt
146 |
147 | Running Jupyter Notebooks:
148 | ==================================================
149 |
150 | Navigate to the spreadsheets-to-dataframes directory/folder:
151 |
152 | .. code-block:: bash
153 |
154 | $ activate spreadsheets-to-dataframes
155 | $ jupyter notebook
156 |
157 | (Optional) Install Docker to Run Airflow Example
158 | ===================================================
159 |
160 | https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html
161 |
162 | Python Books & Videos:
163 | ===================================================
164 |
165 | `(Book) Python Crash Course, 2nd Edition `_
166 |
167 | `(Book) Introducing Python: Modern Computing in Simple Packages `_
168 |
169 | `(Book) Learning Python, 5th Edition `_
170 |
171 | `(Book) Automate the Boring Stuff with Python, 2nd Edition: Practical Programming for Total Beginners `_
172 |
173 | `(Book) Think Python: How to Think Like a Computer Scientist `_
174 |
175 | `(Book) The Quick Python Book (Book) `_
176 |
177 | `(Book) Serious Python: Black-Belt Advice on Deployment, Scalability, Testing, and More `_
178 |
179 | `(Github) A Whirlwind Tour of Python `_
180 |
181 | `(Github) Python Data Science Handbook `_
182 |
183 | `(Github) Introduction to Python `_
184 |
185 | Cookiecutter:
186 | ===================================================
187 |
188 | $ pip install cookiecutter
189 |
190 | Resources:
191 |
192 | https://github.com/cookiecutter/cookiecutter
193 |
194 | https://github.com/audreyfeldroy/cookiecutter-pypackage
195 |
196 | https://towardsdatascience.com/cookiecutter-creating-custom-reusable-project-templates-fc85c8627b07
197 |
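198 | A typical first run, generating a project skeleton from the pypackage template linked above (the gh: shorthand pulls it from GitHub):
199 |
200 | .. code-block:: bash
201 |
202 |     $ cookiecutter gh:audreyfeldroy/cookiecutter-pypackage
203 |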
198 | Requests
199 | ===================================================
200 |
201 | $ pip install requests
202 |
203 | Resources:
204 |
205 | https://python.readthedocs.io/en/stable/library/stdtypes.html
206 |
207 | https://realpython.com/python-requests/
208 |
209 | Have you mastered Requests? Then you should check out multithreading, concurrency, asyncio, message queues, and parallelism (a small sketch follows the links below).
210 |
211 | https://yasoob.me/2019/05/29/speeding-up-python-code-using-multithreading/
212 |
213 | https://www.toptal.com/python/beginners-guide-to-concurrency-and-parallelism-in-python
214 |
215 | https://creativedata.stream/multi-threading-api-requests-in-python/
216 |
217 | https://levelup.gitconnected.com/asynchronous-tasks-in-python-with-celery-rabbitmq-redis-480f6e506d76
218 |
219 | https://tests4geeks.com/blog/python-celery-rabbitmq-tutorial/
220 |
221 | https://codeburst.io/automated-web-scraping-with-python-and-celery-ac02a4a9ce51
222 |
223 | https://github.com/ryansmccoy/zmq-high-speed-subs
224 |
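225 | A minimal multithreaded-download sketch in the spirit of the links above (the urls list is a placeholder):
226 |
227 | .. code-block:: python
228 |
229 |     from concurrent.futures import ThreadPoolExecutor
230 |
231 |     import requests
232 |
233 |     urls = ["http://www.apple.com", "http://www.amazon.com"]
234 |
235 |     def fetch(url):
236 |         # network-bound work releases the GIL, so threads overlap nicely here
237 |         return url, requests.get(url, timeout=10).status_code
238 |
239 |     with ThreadPoolExecutor(max_workers=8) as pool:
240 |         for url, status in pool.map(fetch, urls):
241 |             print(url, status)
242 |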
225 |
226 | Pandas
227 | ===================================================
228 |
229 | $ pip install pandas
230 |
231 | Resources:
232 |
233 | `Dealing With Data `_
234 |
235 | `Pandas Cookbook `_
236 |
237 | `brandon-rhodes/pycon-pandas-tutorial `_
238 |
239 | `Python pandas Q&A video series `_
240 |
241 | `Master Data Analysis with Python `_
242 |
243 | Have you mastered Pandas? Then you should check out Dask and Spark.
244 |
245 | https://dask.org/
246 |
247 | https://spark.apache.org/docs/latest/api/python/
248 |
249 | Visualization:
250 | ===================================================
251 |
252 | $ pip install matplotlib
253 |
254 | Resources:
255 |
256 | https://github.com/fasouto/awesome-dataviz
257 |
258 | https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html
259 |
260 | https://www.toptal.com/designers/data-visualization/data-visualization-tools
261 |
262 | https://realpython.com/pandas-plot-python/
263 |
264 | Have you mastered Matplotlib? Then you should check out JavaScript, D3, React, and Tableau (a small plotting sketch follows below).
265 |
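266 | A minimal sketch of the pandas/matplotlib plotting pattern covered in the links above (the numbers are made up):
267 |
268 | .. code-block:: python
269 |
270 |     import matplotlib.pyplot as plt
271 |     import pandas as pd
272 |
273 |     df = pd.DataFrame({"year": [2003, 2004, 2005], "sales": [229.6, 256.3, 288.0]})
274 |     df.plot(x="year", y="sales", kind="line", title="Sales ($ billions)")
275 |     plt.show()
276 |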
266 | Flask:
267 | ===================================================
268 |
269 | $ pip install flask
270 |
271 | Resources:
272 |
273 | https://www.fullstackpython.com/flask.html
274 |
275 | https://blog.miguelgrinberg.com/
276 |
277 | Have you mastered Flask? Then you should check out FastAPI, JavaScript, Node, and React (a minimal Flask sketch follows below).
278 |
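279 | A minimal Flask sketch, the same pattern used in 06-flask/flask-rss/main.py:
280 |
281 | .. code-block:: python
282 |
283 |     from flask import Flask
284 |
285 |     app = Flask(__name__)
286 |
287 |     @app.route("/")
288 |     def index():
289 |         return "hello from flask"
290 |
291 |     if __name__ == "__main__":
292 |         app.run(debug=True)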
--------------------------------------------------------------------------------
/data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx
--------------------------------------------------------------------------------
/data/WMT_US.csv:
--------------------------------------------------------------------------------
1 | Ticker,Company Name,Year End,Total Sales,Total Expenses
2 | WMT US,WAL-MART STORES INC,12/31/2014,476293988352,460271988736
3 | WMT US,WAL-MART STORES INC,12/31/2013,469162000384,452163000320
4 | WMT US,WAL-MART STORES INC,12/31/2012,446950014976,431251014656
5 | WMT US,WAL-MART STORES INC,12/31/2011,421849006080,405460005888
6 | WMT US,WAL-MART STORES INC,12/31/2010,408214011904,393879012352
7 | WMT US,WAL-MART STORES INC,12/31/2009,405606989824,392206989312
8 | WMT US,WAL-MART STORES INC,12/31/2008,378798997504,366067997696
9 | WMT US,WAL-MART STORES INC,12/31/2007,348650012672,337366012928
10 | WMT US,WAL-MART STORES INC,12/31/2006,312426987520,301195987968
11 | WMT US,WAL-MART STORES INC,12/31/2005,287989006336,277722006528
12 | WMT US,WAL-MART STORES INC,12/31/2004,256329007104,247275006976
13 | WMT US,WAL-MART STORES INC,12/31/2003,229615992832,221660993024
14 |
--------------------------------------------------------------------------------
/data/WMT_US_pandas.csv:
--------------------------------------------------------------------------------
1 | ,Ticker,Company Name,Year End,Total Sales,Total Expenses,Total Profit
2 | 0,WMT US,WAL-MART STORES INC,12/31/2014,476293988352,460271988736,16021999616
3 | 1,WMT US,WAL-MART STORES INC,12/31/2013,469162000384,452163000320,16999000064
4 | 2,WMT US,WAL-MART STORES INC,12/31/2012,446950014976,431251014656,15699000320
5 | 3,WMT US,WAL-MART STORES INC,12/31/2011,421849006080,405460005888,16389000192
6 | 4,WMT US,WAL-MART STORES INC,12/31/2010,408214011904,393879012352,14334999552
7 | 5,WMT US,WAL-MART STORES INC,12/31/2009,405606989824,392206989312,13400000512
8 | 6,WMT US,WAL-MART STORES INC,12/31/2008,378798997504,366067997696,12730999808
9 | 7,WMT US,WAL-MART STORES INC,12/31/2007,348650012672,337366012928,11283999744
10 | 8,WMT US,WAL-MART STORES INC,12/31/2006,312426987520,301195987968,11230999552
11 | 9,WMT US,WAL-MART STORES INC,12/31/2005,287989006336,277722006528,10266999808
12 | 10,WMT US,WAL-MART STORES INC,12/31/2004,256329007104,247275006976,9054000128
13 | 11,WMT US,WAL-MART STORES INC,12/31/2003,229615992832,221660993024,7954999808
14 |
--------------------------------------------------------------------------------
/data/WMT_US_updated.csv:
--------------------------------------------------------------------------------
1 | ,ticker,name,date,sales,expenses,profit
2 | 0,WMT US,WAL-MART STORES INC,2014-12-31,476293988352,460271988736,16021999616
3 | 1,WMT US,WAL-MART STORES INC,2013-12-31,469162000384,452163000320,16999000064
4 | 2,WMT US,WAL-MART STORES INC,2012-12-31,446950014976,431251014656,15699000320
5 | 3,WMT US,WAL-MART STORES INC,2011-12-31,421849006080,405460005888,16389000192
6 | 4,WMT US,WAL-MART STORES INC,2010-12-31,408214011904,393879012352,14334999552
7 | 5,WMT US,WAL-MART STORES INC,2009-12-31,405606989824,392206989312,13400000512
8 | 6,WMT US,WAL-MART STORES INC,2008-12-31,378798997504,366067997696,12730999808
9 | 7,WMT US,WAL-MART STORES INC,2007-12-31,348650012672,337366012928,11283999744
10 | 8,WMT US,WAL-MART STORES INC,2006-12-31,312426987520,301195987968,11230999552
11 | 9,WMT US,WAL-MART STORES INC,2005-12-31,287989006336,277722006528,10266999808
12 | 10,WMT US,WAL-MART STORES INC,2004-12-31,256329007104,247275006976,9054000128
13 | 11,WMT US,WAL-MART STORES INC,2003-12-31,229615992832,221660993024,7954999808
14 |
--------------------------------------------------------------------------------
/data/country_timeseries.csv:
--------------------------------------------------------------------------------
1 | Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
2 | 1/5/2015,289,2776,,10030,,,,,,1786,,2977,,,,,
3 | 1/4/2015,288,2775,,9780,,,,,,1781,,2943,,,,,
4 | 1/3/2015,287,2769,8166,9722,,,,,,1767,3496,2915,,,,,
5 | 1/2/2015,286,,8157,,,,,,,,3496,,,,,,
6 | 12/31/2014,284,2730,8115,9633,,,,,,1739,3471,2827,,,,,
7 | 12/28/2014,281,2706,8018,9446,,,,,,1708,3423,2758,,,,,
8 | 12/27/2014,280,2695,,9409,,,,,,1697,,2732,,,,,
9 | 12/24/2014,277,2630,7977,9203,,,,,,,3413,2655,,,,,
10 | 12/21/2014,273,2597,,9004,,,,,,1607,,2582,,,,,
11 | 12/20/2014,272,2571,7862,8939,,,,,,1586,3384,2556,,,,,
12 | 12/18/2014,271,,7830,,,,,,,,3376,,,,,,
13 | 12/14/2014,267,2416,,8356,,,,,,1525,,2085,,,,,
14 | 12/9/2014,262,,7797,,,,,,,,3290,,,,,,
15 | 12/7/2014,260,2292,,7897,20,1,4,1,7,1428,,1768,8,0,1,0,6
16 | 12/3/2014,256,,7719,,,,,,,,3177,,,,,,
17 | 11/30/2014,253,2164,,7312,20,1,4,1,7,1327,,1583,8,0,1,0,6
18 | 11/28/2014,251,,7635,,,,,,,,3145,,,,,,
19 | 11/23/2014,246,2134,,6599,20,1,4,1,7,1260,,1398,8,0,1,0,6
20 | 11/22/2014,245,,7168,,,,,,,,3016,,,,,,
21 | 11/18/2014,241,2047,7082,6190,20,1,4,1,6,1214,2963,1267,8,0,1,0,6
22 | 11/16/2014,239,1971,,6073,20,1,4,1,5,1192,,1250,8,0,1,0,5
23 | 11/15/2014,238,,7069,,,,,,,,2964,,,,,,
24 | 11/11/2014,234,1919,,5586,20,1,4,1,4,1166,,1187,8,0,1,0,3
25 | 11/10/2014,233,,6878,,,,,,,,2812,,,,,,
26 | 11/9/2014,232,1878,,5368,20,1,4,1,1,1142,,1169,8,0,1,0,1
27 | 11/8/2014,231,,6822,,,,,,,,2836,,,,,,
28 | 11/4/2014,227,,6619,4862,20,1,4,1,1,,2766,1130,8,0,1,0,1
29 | 11/3/2014,226,1760,,,,,,,,1054,,,,,,,
30 | 11/2/2014,225,1731,,4759,20,1,4,1,1,1041,,1070,8,0,1,0,1
31 | 10/31/2014,222,,6525,,,,,,,,2697,,,,,,
32 | 10/29/2014,220,1667,,5338,20,1,4,1,1,1018,,1510,8,0,1,0,1
33 | 10/27/2014,218,1906,,5235,20,1,4,1,1,997,,1500,8,0,1,0,1
34 | 10/25/2014,216,,6535,,,,,,,,2413,,,,,,
35 | 10/22/2014,214,,,3896,,,4,1,1,,,1281,,,1,0,1
36 | 10/21/2014,213,1553,,,,,,,,926,,,,,,,
37 | 10/19/2014,211,1540,,3706,20,1,3,1,,904,,1259,8,0,1,0,
38 | 10/18/2014,210,,4665,,,,,,,,2705,,,,,,
39 | 10/14/2014,206,1519,,3410,20,1,3,1,,862,,1200,8,0,0,1,
40 | 10/13/2014,205,,4262,,,,,,,,2484,,,,,,
41 | 10/12/2014,204,1472,,3252,20,1,2,1,,843,,1183,8,0,1,1,
42 | 10/11/2014,203,,4249,,,,,,,,2458,,,,,,
43 | 10/8/2014,200,,,2950,20,1,1,1,,,,930,8,0,1,1,
44 | 10/7/2014,199,1350,4076,,,,,,,778,2316,,,,,,
45 | 10/5/2014,197,1298,,2789,20,1,1,,,768,,879,8,0,0,,
46 | 10/4/2014,196,,3924,,,,,,,,2210,,,,,,
47 | 10/1/2014,193,1199,3834,2437,20,1,1,,,739,2069,623,8,0,0,,
48 | 9/28/2014,190,1157,3696,2304,20,1,,,,710,1998,622,8,0,,,
49 | 9/23/2014,185,1074,3458,2021,20,1,,,,648,1830,605,8,0,,,
50 | 9/21/2014,183,1022,3280,1940,20,1,,,,635,1677,597,8,0,,,
51 | 9/20/2014,182,,,1813,,,,,,,,593,,,,,
52 | 9/19/2014,181,1008,,,,,,,,632,,,,,,,
53 | 9/17/2014,179,,3022,,,,,,,,1578,,,,,,
54 | 9/14/2014,176,942,2710,1673,,,,,,601,1459,562,,,,,
55 | 9/13/2014,175,936,,1620,21,1,,,,595,1296,562,8,0,,,
56 | 9/10/2014,172,899,,1478,21,1,,,,568,,536,8,,,,
57 | 9/9/2014,171,,2407,,,,,,,,,,,,,,
58 | 9/7/2014,169,861,2081,1424,21,3,,,,557,1137,524,8,0,,,
59 | 9/5/2014,167,812,1871,1261,22,1,,,,517,1089,491,8,,,,
60 | 8/31/2014,162,771,1698,1216,21,1,,,,494,871,476,7,,,,
61 | 8/26/2014,157,648,1378,1026,17,,,,,430,694,422,6,,,,
62 | 8/20/2014,151,607,1082,910,16,,,,,406,624,392,5,,,,
63 | 8/18/2014,149,579,972,907,15,,,,,396,576,374,4,,,,
64 | 8/16/2014,147,543,834,848,15,,,,,394,466,365,4,,,,
65 | 8/13/2014,144,519,786,810,12,,,,,380,413,348,4,,,,
66 | 8/11/2014,142,510,670,783,12,,,,,377,355,334,3,,,,
67 | 8/9/2014,140,506,599,730,13,,,,,373,323,315,2,,,,
68 | 8/6/2014,137,495,554,717,13,,,,,367,294,298,2,,,,
69 | 8/4/2014,135,495,516,691,9,,,,,363,282,286,1,,,,
70 | 8/1/2014,132,485,468,646,4,,,,,358,255,273,1,,,,
71 | 7/30/2014,129,472,391,574,3,,,,,346,227,252,1,,,,
72 | 7/27/2014,126,460,329,533,1,,,,,339,156,233,1,,,,
73 | 7/23/2014,123,427,249,525,0,,,,,319,129,224,0,,,,
74 | 7/20/2014,120,415,224,454,,,,,,314,127,219,,,,,
75 | 7/17/2014,117,410,196,442,,,,,,310,116,206,,,,,
76 | 7/14/2014,114,411,174,397,,,,,,310,106,197,,,,,
77 | 7/12/2014,112,406,172,386,,,,,,304,105,194,,,,,
78 | 7/8/2014,108,409,142,337,,,,,,309,88,142,,,,,
79 | 7/6/2014,106,408,131,305,,,,,,307,84,127,,,,,
80 | 7/2/2014,102,412,115,252,,,,,,305,75,101,,,,,
81 | 6/30/2014,100,413,107,239,,,,,,303,65,99,,,,,
82 | 6/22/2014,92,,51,,,,,,,,34,,,,,,
83 | 6/20/2014,90,390,,158,,,,,,270,,34,,,,,
84 | 6/19/2014,89,,41,,,,,,,,25,,,,,,
85 | 6/18/2014,88,390,,136,,,,,,267,,28,,,,,
86 | 6/17/2014,87,,,97,,,,,,,,49,,,,,
87 | 6/16/2014,86,398,33,,,,,,,264,24,,,,,,
88 | 6/10/2014,80,351,13,89,,,,,,226,24,7,,,,,
89 | 6/5/2014,75,,13,81,,,,,,,,6,,,,,
90 | 6/3/2014,73,344,13,,,,,,,215,12,6,,,,,
91 | 6/1/2014,71,328,13,79,,,,,,208,12,6,,,,,
92 | 5/28/2014,67,291,13,50,,,,,,193,12,6,,,,,
93 | 5/27/2014,66,281,12,16,,,,,,186,11,5,,,,,
94 | 5/23/2014,62,258,12,0,,,,,,174,11,0,,,,,
95 | 5/12/2014,51,248,12,0,,,,,,171,11,0,,,,,
96 | 5/10/2014,49,233,12,0,,,,,,157,11,0,,,,,
97 | 5/7/2014,46,236,13,0,,,,,,158,11,0,,,,,
98 | 5/5/2014,44,235,13,0,,,,,,157,11,0,,,,,
99 | 5/3/2014,42,231,13,0,,,,,,155,11,0,,,,,
100 | 5/1/2014,40,226,13,0,,,,,,149,11,0,,,,,
101 | 4/26/2014,35,224,,0,,,,,,143,,0,,,,,
102 | 4/24/2014,33,,35,0,,,,,,,,0,,,,,
103 | 4/23/2014,32,218,,0,,,,,,141,,0,,,,,
104 | 4/22/2014,31,,,0,,,,,,,,0,,,,,
105 | 4/21/2014,30,,34,,,,,,,,11,,,,,,
106 | 4/20/2014,29,208,,,,,,,,136,6,,,,,,
107 | 4/17/2014,26,203,27,,,,,,,129,,,,,,,
108 | 4/16/2014,25,197,27,,,,,,,122,13,,,,,,
109 | 4/15/2014,24,,,12,,,,,,,,,,,,,
110 | 4/14/2014,23,168,,,,,,,,108,,,,,,,
111 | 4/11/2014,20,159,26,2,,,,,,106,13,2,,,,,
112 | 4/9/2014,18,158,25,2,,,,,,101,12,2,,,,,
113 | 4/7/2014,16,151,21,2,,,,,,95,10,2,,,,,
114 | 4/4/2014,13,143,18,2,,,,,,86,7,2,,,,,
115 | 4/1/2014,10,127,8,2,,,,,,83,5,2,,,,,
116 | 3/31/2014,9,122,8,2,,,,,,80,4,2,,,,,
117 | 3/29/2014,7,112,7,,,,,,,70,2,,,,,,
118 | 3/28/2014,6,112,3,2,,,,,,70,3,2,,,,,
119 | 3/27/2014,5,103,8,6,,,,,,66,6,5,,,,,
120 | 3/26/2014,4,86,,,,,,,,62,,,,,,,
121 | 3/25/2014,3,86,,,,,,,,60,,,,,,,
122 | 3/24/2014,2,86,,,,,,,,59,,,,,,,
123 | 3/22/2014,0,49,,,,,,,,29,,,,,,,
--------------------------------------------------------------------------------
/data/fortune_1000.csv:
--------------------------------------------------------------------------------
1 | rank,name,industry,location,employees,revenues_millions
2 | 1,Walmart,General Merchandisers,"Bentonville, AR","2,200,000","$523,964 "
3 | 2,Amazon,Internet Services and Retailing,"Seattle, WA","798,000","$280,522 "
4 | 3,Exxon Mobil,Petroleum Refining,"Irving, TX","74,900","$264,938 "
5 | 4,Apple,"Computers, Office Equipment","Cupertino, CA","137,000","$260,174 "
6 | 5,CVS Health,Food and Drug Stores,"Woonsocket, RI","290,000","$256,776 "
7 | 6,Berkshire Hathaway,Insurance: Property and Casualty (Stock),"Omaha, NE","391,500","$254,616 "
8 | 7,UnitedHealth Group,Health Care: Insurance and Managed Care,"Minnetonka, MN","325,000","$242,155 "
9 | 8,McKesson,Wholesalers: Health Care,"San Francisco, CA","70,000","$214,319 "
10 | 9,AT&T,Telecommunications,"Dallas, TX","247,800","$181,193 "
11 | 10,AmerisourceBergen,Wholesalers: Health Care,"Chesterbrook, PA","21,500","$179,589 "
12 | 12,Ford Motor,Motor Vehicles and Parts,"Dearborn, MI","190,000","$155,900 "
13 | 13,Cigna,Health Care: Insurance and Managed Care,"Bloomfield, CT","73,700","$153,566 "
14 | 14,Costco Wholesale,General Merchandisers,"Issaquah, WA","201,500","$152,703 "
15 | 15,Chevron,Petroleum Refining,"San Ramon, CA","48,200","$146,516 "
16 | 16,Cardinal Health,Wholesalers: Health Care,"Dublin, OH","49,500","$145,534 "
17 | 17,JPMorgan Chase,Commercial Banks,"New York, NY","256,981","$142,422 "
18 | 18,General Motors,Motor Vehicles and Parts,"Detroit, MI","164,000","$137,237 "
19 | 19,Walgreens Boots Alliance,Food and Drug Stores,"Deerfield, IL","287,000","$136,866 "
20 | 20,Verizon Communications,Telecommunications,"New York, NY","135,000","$131,868 "
21 | 21,Microsoft,Computer Software,"Redmond, WA","144,000","$125,843 "
22 | 22,Marathon Petroleum,Petroleum Refining,"Findlay, OH","60,910","$124,813 "
23 | 23,Kroger,Food and Drug Stores,"Cincinnati, OH","435,000","$122,286 "
24 | 24,Fannie Mae,Diversified Financials,"Washington, DC","7,500","$120,304 "
25 | 25,Bank of America,Commercial Banks,"Charlotte, NC","208,131","$113,589 "
26 | 26,Home Depot,Specialty Retailers: Other,"Atlanta, GA","415,700","$110,225 "
27 | 27,Phillips 66,Petroleum Refining,"Houston, TX","14,500","$109,559 "
28 | 28,Comcast NBCUniversal,Telecommunications,"Philadelphia, PA","190,000","$108,942 "
29 | 29,Anthem,Health Care: Insurance and Managed Care,"Indianapolis, IN","70,600","$104,213 "
30 | 30,Wells Fargo,Commercial Banks,"San Francisco, CA","259,800","$103,915 "
31 |
--------------------------------------------------------------------------------
/data/linkedin_industries.html:
--------------------------------------------------------------------------------
Code | Groups | Description
47 | corp, fin | Accounting
94 | man, tech, tran | Airlines/Aviation
120 | leg, org | Alternative Dispute Resolution
125 | hlth | Alternative Medicine
127 | art, med | Animation
19 | good | Apparel & Fashion
50 | cons | Architecture & Planning
111 | art, med, rec | Arts and Crafts
53 | man | Automotive
52 | gov, man | Aviation & Aerospace
41 | fin | Banking
12 | gov, hlth, tech | Biotechnology
36 | med, rec | Broadcast Media
49 | cons | Building Materials
138 | corp, man | Business Supplies and Equipment
129 | fin | Capital Markets
54 | man | Chemicals
90 | org, serv | Civic & Social Organization
51 | cons, gov | Civil Engineering
128 | cons, corp, fin | Commercial Real Estate
118 | tech | Computer & Network Security
109 | med, rec | Computer Games
3 | tech | Computer Hardware
5 | tech | Computer Networking
4 | tech | Computer Software
48 | cons | Construction
24 | good, man | Consumer Electronics
25 | good, man | Consumer Goods
91 | org, serv | Consumer Services
18 | good | Cosmetics
65 | agr | Dairy
1 | gov, tech | Defense & Space
99 | art, med | Design
69 | edu | Education Management
132 | edu, org | E-Learning
112 | good, man | Electrical/Electronic Manufacturing
28 | med, rec | Entertainment
86 | org, serv | Environmental Services
110 | corp, rec, serv | Events Services
76 | gov | Executive Office
122 | corp, serv | Facilities Services
63 | agr | Farming
43 | fin | Financial Services
38 | art, med, rec | Fine Art
66 | agr | Fishery
34 | rec, serv | Food & Beverages
23 | good, man, serv | Food Production
101 | org | Fund-Raising
26 | good, man | Furniture
29 | rec | Gambling & Casinos
145 | cons, man | Glass, Ceramics & Concrete
75 | gov | Government Administration
148 | gov | Government Relations
140 | art, med | Graphic Design
124 | hlth, rec | Health, Wellness and Fitness
68 | edu | Higher Education
14 | hlth | Hospital & Health Care
31 | rec, serv, tran | Hospitality
137 | corp | Human Resources
134 | corp, good, tran | Import and Export
88 | org, serv | Individual & Family Services
147 | cons, man | Industrial Automation
84 | med, serv | Information Services
96 | tech | Information Technology and Services
42 | fin | Insurance
74 | gov | International Affairs
141 | gov, org, tran | International Trade and Development
6 | tech | Internet
45 | fin | Investment Banking
46 | fin | Investment Management
73 | gov, leg | Judiciary
77 | gov, leg | Law Enforcement
9 | leg | Law Practice
10 | leg | Legal Services
72 | gov, leg | Legislative Office
30 | rec, serv, tran | Leisure, Travel & Tourism
85 | med, rec, serv | Libraries
116 | corp, tran | Logistics and Supply Chain
143 | good | Luxury Goods & Jewelry
55 | man | Machinery
11 | corp | Management Consulting
95 | tran | Maritime
97 | corp | Market Research
80 | corp, med | Marketing and Advertising
135 | cons, gov, man | Mechanical or Industrial Engineering
126 | med, rec | Media Production
17 | hlth | Medical Devices
13 | hlth | Medical Practice
139 | hlth | Mental Health Care
71 | gov | Military
56 | man | Mining & Metals
35 | art, med, rec | Motion Pictures and Film
37 | art, med, rec | Museums and Institutions
115 | art, rec | Music
114 | gov, man, tech | Nanotechnology
81 | med, rec | Newspapers
100 | org | Non-Profit Organization Management
57 | man | Oil & Energy
113 | med | Online Media
123 | corp | Outsourcing/Offshoring
87 | serv, tran | Package/Freight Delivery
146 | good, man | Packaging and Containers
61 | man | Paper & Forest Products
39 | art, med, rec | Performing Arts
15 | hlth, tech | Pharmaceuticals
131 | org | Philanthropy
136 | art, med, rec | Photography
117 | man | Plastics
107 | gov, org | Political Organization
67 | edu | Primary/Secondary Education
83 | med, rec | Printing
105 | corp | Professional Training & Coaching
102 | corp, org | Program Development
79 | gov | Public Policy
98 | corp | Public Relations and Communications
78 | gov | Public Safety
82 | med, rec | Publishing
62 | man | Railroad Manufacture
64 | agr | Ranching
44 | cons, fin, good | Real Estate
40 | rec, serv | Recreational Facilities and Services
89 | org, serv | Religious Institutions
144 | gov, man, org | Renewables & Environment
70 | edu, gov | Research
32 | rec, serv | Restaurants
27 | good, man | Retail
121 | corp, org, serv | Security and Investigations
7 | tech | Semiconductors
58 | man | Shipbuilding
20 | good, rec | Sporting Goods
33 | rec | Sports
104 | corp | Staffing and Recruiting
22 | good | Supermarkets
8 | gov, tech | Telecommunications
60 | man | Textiles
130 | gov, org | Think Tanks
21 | good | Tobacco
108 | corp, gov, serv | Translation and Localization
92 | tran | Transportation/Trucking/Railroad
59 | man | Utilities
106 | fin, tech | Venture Capital & Private Equity
16 | hlth | Veterinary
93 | tran | Warehousing
133 | good | Wholesale
142 | good, man, rec | Wine and Spirits
119 | tech | Wireless
103 | art, med, rec | Writing and Editing
--------------------------------------------------------------------------------
/data/msft_stock_key_data.csv:
--------------------------------------------------------------------------------
1 | Symbol,MSFT
2 | Name,Microsoft Corporation Common Stock
3 | Exchange,NASDAQ-GS
4 | Sector,Technology
5 | Industry,Computer Software: Prepackaged Software
6 | 1 Year Target,$277.50
7 | Today's High/Low,$261.00/$257.60
8 | Share Volume,"24,878,582"
9 | Average Volume,"28,320,974"
10 | Previous Close,$259.50
11 | 52 Week High/Low,$259.93/$166.11
12 | Market Cap,"1,966,557,339,088"
13 | P/E Ratio,38.86
14 | Forward P/E 1 Yr.,35.21
15 | Earnings Per Share(EPS),$6.71
16 | Annualized Dividend,$2.24
17 | Ex Dividend Date,19-May-21
18 | Dividend Pay Date,10-Jun-21
19 | Current Yield,0.88%
20 | Beta,0.8
21 |
--------------------------------------------------------------------------------
/data/pew.csv:
--------------------------------------------------------------------------------
1 | "religion","<$10k","$10-20k","$20-30k","$30-40k","$40-50k","$50-75k","$75-100k","$100-150k",">150k","Don't know/refused"
2 | "Agnostic",27,34,60,81,76,137,122,109,84,96
3 | "Atheist",12,27,37,52,35,70,73,59,74,76
4 | "Buddhist",27,21,30,34,33,58,62,39,53,54
5 | "Catholic",418,617,732,670,638,1116,949,792,633,1489
6 | "Don’t know/refused",15,14,15,11,10,35,21,17,18,116
7 | "Evangelical Prot",575,869,1064,982,881,1486,949,723,414,1529
8 | "Hindu",1,9,7,9,11,34,47,48,54,37
9 | "Historically Black Prot",228,244,236,238,197,223,131,81,78,339
10 | "Jehovah's Witness",20,27,24,24,21,30,15,11,6,37
11 | "Jewish",19,19,25,25,30,95,69,87,151,162
12 | "Mainline Prot",289,495,619,655,651,1107,939,753,634,1328
13 | "Mormon",29,40,48,51,56,112,85,49,42,69
14 | "Muslim",6,7,9,10,9,23,16,8,6,22
15 | "Orthodox",13,17,23,32,32,47,38,42,46,73
16 | "Other Christian",9,7,11,13,13,14,18,14,12,18
17 | "Other Faiths",20,33,40,46,49,63,46,40,41,71
18 | "Other World Religions",5,2,3,4,2,7,3,4,4,8
19 | "Unaffiliated",217,299,374,365,341,528,407,321,258,597
20 |
--------------------------------------------------------------------------------
/data/portfolio.csv:
--------------------------------------------------------------------------------
1 | Ticker,Date,Shares,Price
2 | GOOG,2019-10-01,100,1
3 | MSFT,2019-10-01,200,1
4 | IBM,2019-10-01,500,1
5 | TSLA,2019-10-01,300,1
6 |
7 |
--------------------------------------------------------------------------------
/data/pycon_sponsor_levels.csv:
--------------------------------------------------------------------------------
1 | sponsor_level,amount
2 | VISIONARY,150000
3 | SUSTAINABILITY,90000
4 | MAINTAINING,60000
5 | CONTRIBUTING,30000
6 | SUPPORTING,15000
7 | PARTNER,7500
8 | PARTICIPATING,3750
9 | ASSOCIATE,1500
10 |
--------------------------------------------------------------------------------
/data/pycon_sponsors.csv:
--------------------------------------------------------------------------------
1 | symbol,name,sponsor_level
2 | GOOG,ALPHABET INC.,VISIONARY
3 | AMZN,AMAZON COM INC,SUSTAINABILITY
4 | #N/A,BLOOMBERG,VISIONARY
5 | COF,CAPITAL ONE FINANCIAL CORP,MAINTAINING
6 | GLW,CORNING INC,MAINTAINING
7 | ESTC,ELASTIC N.V.,PARTNER
8 | FB,FACEBOOK INC,SUSTAINABILITY
9 | #N/A,HUAWEI TECHNOLOGIES,SUSTAINABILITY
10 | IBM,INTERNATIONAL BUSINESS MACHINES CORP,CONTRIBUTING
11 | JPM,JPMORGAN CHASE & CO,SUPPORTING
12 | MSFT,MICROSOFT CORP,VISIONARY
13 | NFLX,NETFLIX INC,PARTNER
14 | CRM,SALESFORCE.COM INC.,SUSTAINABILITY
15 | WORK,SLACK TECHNOLOGIES INC.,MAINTAINING
16 |
--------------------------------------------------------------------------------
/data/retail_sales.csv:
--------------------------------------------------------------------------------
1 | date,sales
2 | 2009-10-01,338630
3 | 2009-11-01,339386
4 | 2009-12-01,400264
5 | 2010-01-01,314640
6 | 2010-02-01,311022
7 | 2010-03-01,360819
8 | 2010-04-01,356460
9 | 2010-05-01,365713
10 | 2010-06-01,358675
11 | 2010-07-01,362027
12 | 2010-08-01,362682
13 | 2010-09-01,346069
14 | 2010-10-01,355212
15 | 2010-11-01,365809
16 | 2010-12-01,426654
17 | 2011-01-01,335608
18 | 2011-02-01,337352
19 | 2011-03-01,387092
20 | 2011-04-01,380754
21 | 2011-05-01,391970
22 | 2011-06-01,388636
23 | 2011-07-01,384600
24 | 2011-08-01,394548
25 | 2011-09-01,374895
26 | 2011-10-01,379364
27 | 2011-11-01,391081
28 | 2011-12-01,451669
29 | 2012-01-01,355058
30 | 2012-02-01,372523
31 | 2012-03-01,414275
32 | 2012-04-01,393035
33 | 2012-05-01,418648
34 | 2012-06-01,400996
35 | 2012-07-01,396020
36 | 2012-08-01,417911
37 | 2012-09-01,385597
38 | 2012-10-01,399341
39 | 2012-11-01,410992
40 | 2012-12-01,461994
41 | 2013-01-01,375537
42 | 2013-02-01,373938
43 | 2013-03-01,421638
44 | 2013-04-01,408381
45 | 2013-05-01,436985
46 | 2013-06-01,414701
47 | 2013-07-01,422357
48 | 2013-08-01,434950
49 | 2013-09-01,396199
50 | 2013-10-01,415740
51 | 2013-11-01,423611
52 | 2013-12-01,477205
53 | 2014-01-01,383399
54 | 2014-02-01,380315
55 | 2014-03-01,432806
56 | 2014-04-01,431415
57 | 2014-05-01,458822
58 | 2014-06-01,433152
59 | 2014-07-01,443005
60 | 2014-08-01,450913
61 | 2014-09-01,420871
62 | 2014-10-01,437702
63 | 2014-11-01,437910
64 | 2014-12-01,501232
65 | 2015-01-01,397252
66 | 2015-02-01,386935
67 | 2015-03-01,444110
68 | 2015-04-01,438217
69 | 2015-05-01,462615
70 | 2015-06-01,448229
71 | 2015-07-01,457710
72 | 2015-08-01,456340
73 | 2015-09-01,430917
74 |
--------------------------------------------------------------------------------
/data/sponsors_vlookup.csv:
--------------------------------------------------------------------------------
1 | symbol,name,sponsor_level,amount
2 | GOOG,ALPHABET INC.,VISIONARY,150000
3 | AMZN,AMAZON COM INC,SUSTAINABILITY,90000
4 | #N/A,BLOOMBERG,VISIONARY,150000
5 | COF,CAPITAL ONE FINANCIAL CORP,MAINTAINING,60000
6 | GLW,CORNING INC,MAINTAINING,60000
7 | ESTC,ELASTIC N.V.,PARTNER,7500
8 | FB,FACEBOOK INC,SUSTAINABILITY,90000
9 | #N/A,HUAWEI TECHNOLOGIES,SUSTAINABILITY,90000
10 | IBM,INTERNATIONAL BUSINESS MACHINES CORP,CONTRIBUTING,30000
11 | JPM,JPMORGAN CHASE & CO,SUPPORTING,15000
12 | MSFT,MICROSOFT CORP,VISIONARY,150000
13 | NFLX,NETFLIX INC,PARTNER,7500
14 | CRM,SALESFORCE.COM INC.,SUSTAINABILITY,90000
15 | WORK,SLACK TECHNOLOGIES INC.,MAINTAINING,60000
16 |
--------------------------------------------------------------------------------
/data/stlcom_larget_employers.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/data/stlcom_larget_employers.xlsx
--------------------------------------------------------------------------------
/data/stlregionalchamber_largest_employers_.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/data/stlregionalchamber_largest_employers_.xlsx
--------------------------------------------------------------------------------
/data/stock_data_simple.csv:
--------------------------------------------------------------------------------
1 | ticker,company_name,sector,trade_date,price,price_change_percent,market_capitalization,annual_sales,shares_outstanding
2 | WMT,Wal-Mart Stores,Retail,1/16/2014,76.76,-1.20%,"248,377","55,688",3235772
3 | AAPL,Apple Inc,Technology,1/16/2014,554.25,-0.60%,"494,697","37,472",892553
4 | IBM,Intl Business Machines,Technology,1/16/2014,188.76,0.50%,"204,965","23,720",1085854
5 | BAC,Bank Of America Corp,Financial,1/16/2014,17.08,-0.40%,"182,177","23,553",10666133
6 | SGL.KR,Samsung Electronics,Technology,1/16/2014,"1,301,000.00",0.20%,"180,329","23,444",147299
7 | NESN.CH,Nestle 'R',Consumer Staple,1/16/2014,67.45,1.20%,"239,974","22,584",3224798
8 | MSFT,Microsoft Corp,Technology,1/16/2014,36.89,0.40%,"307,956","18,529",8347968
9 | AMZN,Amazon.Com Inc,Retail,1/16/2014,395.8,0.00%,"181,170","17,092",457733
10 | GOOG,Google Inc,Technology,1/16/2014,"1,156.22",0.70%,"386,278","14,893",334087
11 | PFE,Pfizer Inc,Health Care,1/16/2014,31.17,0.00%,"202,014","12,643",6481070
12 |
--------------------------------------------------------------------------------
/data/stock_data_simple.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/data/stock_data_simple.xlsx
--------------------------------------------------------------------------------
/data/table1.csv:
--------------------------------------------------------------------------------
1 | "country","year","cases","population"
2 | "Afghanistan",1999,745,19987071
3 | "Afghanistan",2000,2666,20595360
4 | "Brazil",1999,37737,172006362
5 | "Brazil",2000,80488,174504898
6 | "China",1999,212258,1272915272
7 | "China",2000,213766,1280428583
8 |
--------------------------------------------------------------------------------
/data/table2.csv:
--------------------------------------------------------------------------------
1 | "country","year","type","count"
2 | "Afghanistan",1999,"cases",745
3 | "Afghanistan",1999,"population",19987071
4 | "Afghanistan",2000,"cases",2666
5 | "Afghanistan",2000,"population",20595360
6 | "Brazil",1999,"cases",37737
7 | "Brazil",1999,"population",172006362
8 | "Brazil",2000,"cases",80488
9 | "Brazil",2000,"population",174504898
10 | "China",1999,"cases",212258
11 | "China",1999,"population",1272915272
12 | "China",2000,"cases",213766
13 | "China",2000,"population",1280428583
14 |
--------------------------------------------------------------------------------
/data/table3.csv:
--------------------------------------------------------------------------------
1 | "country","year","rate"
2 | "Afghanistan",1999,"745/19987071"
3 | "Afghanistan",2000,"2666/20595360"
4 | "Brazil",1999,"37737/172006362"
5 | "Brazil",2000,"80488/174504898"
6 | "China",1999,"212258/1272915272"
7 | "China",2000,"213766/1280428583"
8 |
--------------------------------------------------------------------------------
/data/table4a.csv:
--------------------------------------------------------------------------------
1 | "country","1999","2000"
2 | "Afghanistan",745,2666
3 | "Brazil",37737,80488
4 | "China",212258,213766
5 |
--------------------------------------------------------------------------------
/data/table4b.csv:
--------------------------------------------------------------------------------
1 | "country","1999","2000"
2 | "Afghanistan",19987071,20595360
3 | "Brazil",172006362,174504898
4 | "China",1272915272,1280428583
5 |
--------------------------------------------------------------------------------
/data/weather.csv:
--------------------------------------------------------------------------------
1 | "id","year","month","element","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31"
2 | "MX17004",2010,1,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,27.8,NA
3 | "MX17004",2010,1,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,14.5,NA
4 | "MX17004",2010,2,"tmax",NA,27.3,24.1,NA,NA,NA,NA,NA,NA,NA,29.7,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,29.9,NA,NA,NA,NA,NA,NA,NA,NA
5 | "MX17004",2010,2,"tmin",NA,14.4,14.4,NA,NA,NA,NA,NA,NA,NA,13.4,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,10.7,NA,NA,NA,NA,NA,NA,NA,NA
6 | "MX17004",2010,3,"tmax",NA,NA,NA,NA,32.1,NA,NA,NA,NA,34.5,NA,NA,NA,NA,NA,31.1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
7 | "MX17004",2010,3,"tmin",NA,NA,NA,NA,14.2,NA,NA,NA,NA,16.8,NA,NA,NA,NA,NA,17.6,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
8 | "MX17004",2010,4,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,36.3,NA,NA,NA,NA
9 | "MX17004",2010,4,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,16.7,NA,NA,NA,NA
10 | "MX17004",2010,5,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,33.2,NA,NA,NA,NA
11 | "MX17004",2010,5,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,18.2,NA,NA,NA,NA
12 | "MX17004",2010,6,"tmax",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,28,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,30.1,NA,NA
13 | "MX17004",2010,6,"tmin",NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,17.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,18,NA,NA
14 | "MX17004",2010,7,"tmax",NA,NA,28.6,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,29.9,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
15 | "MX17004",2010,7,"tmin",NA,NA,17.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,16.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
16 | "MX17004",2010,8,"tmax",NA,NA,NA,NA,29.6,NA,NA,29,NA,NA,NA,NA,29.8,NA,NA,NA,NA,NA,NA,NA,NA,NA,26.4,NA,29.7,NA,NA,NA,28,NA,25.4
17 | "MX17004",2010,8,"tmin",NA,NA,NA,NA,15.8,NA,NA,17.3,NA,NA,NA,NA,16.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,15,NA,15.6,NA,NA,NA,15.3,NA,15.4
18 | "MX17004",2010,10,"tmax",NA,NA,NA,NA,27,NA,28.1,NA,NA,NA,NA,NA,NA,29.5,28.7,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,31.2,NA,NA,NA
19 | "MX17004",2010,10,"tmin",NA,NA,NA,NA,14,NA,12.9,NA,NA,NA,NA,NA,NA,13,10.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,15,NA,NA,NA
20 | "MX17004",2010,11,"tmax",NA,31.3,NA,27.2,26.3,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,28.1,27.7,NA,NA,NA,NA
21 | "MX17004",2010,11,"tmin",NA,16.3,NA,12,7.9,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,12.1,14.2,NA,NA,NA,NA
22 | "MX17004",2010,12,"tmax",29.9,NA,NA,NA,NA,27.8,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
23 | "MX17004",2010,12,"tmin",13.8,NA,NA,NA,NA,10.5,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
24 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = python -msphinx
7 | SPHINXPROJ = spreadsheets_to_dataframes
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../AUTHORS.rst
2 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # spreadsheets_to_dataframes documentation build configuration file, created by
5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another
17 | # directory, add these directories to sys.path here. If the directory is
18 | # relative to the documentation root, use os.path.abspath to make it
19 | # absolute, like shown here.
20 | #
21 | import os
22 | import sys
23 | sys.path.insert(0, os.path.abspath('..'))
24 |
25 | import spreadsheets_to_dataframes
26 |
27 | # -- General configuration ---------------------------------------------
28 |
29 | # If your documentation needs a minimal Sphinx version, state it here.
30 | #
31 | # needs_sphinx = '1.0'
32 |
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
36 |
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ['_templates']
39 |
40 | # The suffix(es) of source filenames.
41 | # You can specify multiple suffix as a list of string:
42 | #
43 | # source_suffix = ['.rst', '.md']
44 | source_suffix = '.rst'
45 |
46 | # The master toctree document.
47 | master_doc = 'index'
48 |
49 | # General information about the project.
50 | project = u'Spreadsheets to DataFrames'
51 | copyright = u"2019, Ryan S. McCoy"
52 | author = u"Ryan S. McCoy"
53 |
54 | # The version info for the project you're documenting, acts as replacement
55 | # for |version| and |release|, also used in various other places throughout
56 | # the built documents.
57 | #
58 | # The short X.Y version.
59 | version = spreadsheets_to_dataframes.__version__
60 | # The full version, including alpha/beta/rc tags.
61 | release = spreadsheets_to_dataframes.__version__
62 |
63 | # The language for content autogenerated by Sphinx. Refer to documentation
64 | # for a list of supported languages.
65 | #
66 | # This is also used if you do content translation via gettext catalogs.
67 | # Usually you set "language" from the command line for these cases.
68 | language = None
69 |
70 | # List of patterns, relative to source directory, that match files and
71 | # directories to ignore when looking for source files.
72 | # This patterns also effect to html_static_path and html_extra_path
73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
74 |
75 | # The name of the Pygments (syntax highlighting) style to use.
76 | pygments_style = 'sphinx'
77 |
78 | # If true, `todo` and `todoList` produce output, else they produce nothing.
79 | todo_include_todos = False
80 |
81 |
82 | # -- Options for HTML output -------------------------------------------
83 |
84 | # The theme to use for HTML and HTML Help pages. See the documentation for
85 | # a list of builtin themes.
86 | #
87 | html_theme = 'alabaster'
88 |
89 | # Theme options are theme-specific and customize the look and feel of a
90 | # theme further. For a list of options available for each theme, see the
91 | # documentation.
92 | #
93 | # html_theme_options = {}
94 |
95 | # Add any paths that contain custom static files (such as style sheets) here,
96 | # relative to this directory. They are copied after the builtin static files,
97 | # so a file named "default.css" will overwrite the builtin "default.css".
98 | html_static_path = ['_static']
99 |
100 |
101 | # -- Options for HTMLHelp output ---------------------------------------
102 |
103 | # Output file base name for HTML help builder.
104 | htmlhelp_basename = 'spreadsheets_to_dataframesdoc'
105 |
106 |
107 | # -- Options for LaTeX output ------------------------------------------
108 |
109 | latex_elements = {
110 | # The paper size ('letterpaper' or 'a4paper').
111 | #
112 | # 'papersize': 'letterpaper',
113 |
114 | # The font size ('10pt', '11pt' or '12pt').
115 | #
116 | # 'pointsize': '10pt',
117 |
118 | # Additional stuff for the LaTeX preamble.
119 | #
120 | # 'preamble': '',
121 |
122 | # Latex figure (float) alignment
123 | #
124 | # 'figure_align': 'htbp',
125 | }
126 |
127 | # Grouping the document tree into LaTeX files. List of tuples
128 | # (source start file, target name, title, author, documentclass
129 | # [howto, manual, or own class]).
130 | latex_documents = [
131 | (master_doc, 'spreadsheets_to_dataframes.tex',
132 | u'Spreadsheets to DataFrames Documentation',
133 | u'Ryan S. McCoy', 'manual'),
134 | ]
135 |
136 |
137 | # -- Options for manual page output ------------------------------------
138 |
139 | # One entry per manual page. List of tuples
140 | # (source start file, name, description, authors, manual section).
141 | man_pages = [
142 | (master_doc, 'spreadsheets_to_dataframes',
143 | u'Spreadsheets to DataFrames Documentation',
144 | [author], 1)
145 | ]
146 |
147 |
148 | # -- Options for Texinfo output ----------------------------------------
149 |
150 | # Grouping the document tree into Texinfo files. List of tuples
151 | # (source start file, target name, title, author,
152 | # dir menu entry, description, category)
153 | texinfo_documents = [
154 | (master_doc, 'spreadsheets_to_dataframes',
155 | u'Spreadsheets to DataFrames Documentation',
156 | author,
157 | 'spreadsheets_to_dataframes',
158 | 'One line description of project.',
159 | 'Miscellaneous'),
160 | ]
161 |
162 |
163 |
164 |
--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 |
--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../HISTORY.rst
2 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to Spreadsheets to DataFrames's documentation!
2 | ========================================================
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 | :caption: Contents:
7 |
8 | readme
9 | installation
10 | usage
11 | modules
12 | contributing
13 | authors
14 | history
15 |
16 | Indices and tables
17 | ==================
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 |
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | .. highlight:: shell
2 |
3 | ============
4 | Installation
5 | ============
6 |
7 |
8 | Stable release
9 | --------------
10 |
11 | To install Spreadsheets to DataFrames, run this command in your terminal:
12 |
13 | .. code-block:: console
14 |
15 | $ pip install spreadsheets_to_dataframes
16 |
17 | This is the preferred method to install Spreadsheets to DataFrames, as it will always install the most recent stable release.
18 |
19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide
20 | you through the process.
21 |
22 | .. _pip: https://pip.pypa.io
23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
24 |
25 |
26 | From sources
27 | ------------
28 |
29 | The sources for Spreadsheets to DataFrames can be downloaded from the `Github repo`_.
30 |
31 | You can either clone the public repository:
32 |
33 | .. code-block:: console
34 |
35 | $ git clone git://github.com/ryansmccoy/spreadsheets_to_dataframes
36 |
37 | Or download the `tarball`_:
38 |
39 | .. code-block:: console
40 |
41 | $ curl -OJL https://github.com/ryansmccoy/spreadsheets_to_dataframes/tarball/master
42 |
43 | Once you have a copy of the source, you can install it with:
44 |
45 | .. code-block:: console
46 |
47 | $ python setup.py install
48 |
49 |
50 | .. _Github repo: https://github.com/ryansmccoy/spreadsheets_to_dataframes
51 | .. _tarball: https://github.com/ryansmccoy/spreadsheets_to_dataframes/tarball/master
52 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=python -msphinx
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=spreadsheets_to_dataframes
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | echo.then set the SPHINXBUILD environment variable to point to the full
21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | echo.Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 |
--------------------------------------------------------------------------------
/docs/usage.rst:
--------------------------------------------------------------------------------
1 | =====
2 | Usage
3 | =====
4 |
5 | To use Spreadsheets to DataFrames in a project::
6 |
7 | import spreadsheets_to_dataframes
8 |
--------------------------------------------------------------------------------
/img/basics/basic_python_style.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/basic_python_style.png
--------------------------------------------------------------------------------
/img/basics/built-in_data_structures.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/built-in_data_structures.png
--------------------------------------------------------------------------------
/img/basics/built-in_functions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/built-in_functions.png
--------------------------------------------------------------------------------
/img/basics/built-in_len.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/built-in_len.png
--------------------------------------------------------------------------------
/img/basics/calculations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/calculations.png
--------------------------------------------------------------------------------
/img/basics/cell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/cell.png
--------------------------------------------------------------------------------
/img/basics/cell_ex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/cell_ex.png
--------------------------------------------------------------------------------
/img/basics/cell_types.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/cell_types.png
--------------------------------------------------------------------------------
/img/basics/cells.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/cells.png
--------------------------------------------------------------------------------
/img/basics/comments.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/comments.png
--------------------------------------------------------------------------------
/img/basics/data-types.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/data-types.png
--------------------------------------------------------------------------------
/img/basics/data_collections.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/data_collections.png
--------------------------------------------------------------------------------
/img/basics/excel-built-in-string.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/excel-built-in-string.png
--------------------------------------------------------------------------------
/img/basics/excel-built-in.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/excel-built-in.png
--------------------------------------------------------------------------------
/img/basics/excel-pre-installed-add-ins.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/excel-pre-installed-add-ins.png
--------------------------------------------------------------------------------
/img/basics/jupyter-method.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/jupyter-method.png
--------------------------------------------------------------------------------
/img/basics/pycharm-function-pop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycharm-function-pop.png
--------------------------------------------------------------------------------
/img/basics/pycharm-function-popup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycharm-function-popup.png
--------------------------------------------------------------------------------
/img/basics/pycharm-methods.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycharm-methods.png
--------------------------------------------------------------------------------
/img/basics/pycon-files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycon-files.png
--------------------------------------------------------------------------------
/img/basics/pycon_sponsor_levels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycon_sponsor_levels.png
--------------------------------------------------------------------------------
/img/basics/pycon_sponsors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/pycon_sponsors.png
--------------------------------------------------------------------------------
/img/basics/python-pre-installed-add-ins.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/python-pre-installed-add-ins.png
--------------------------------------------------------------------------------
/img/basics/reserved_words.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/reserved_words.png
--------------------------------------------------------------------------------
/img/basics/standard-library-import.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/standard-library-import.png
--------------------------------------------------------------------------------
/img/basics/standard-library.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/standard-library.png
--------------------------------------------------------------------------------
/img/basics/vscode-method.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/basics/vscode-method.png
--------------------------------------------------------------------------------
/img/dataframe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/dataframe.png
--------------------------------------------------------------------------------
/img/dataframe_components.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/dataframe_components.png
--------------------------------------------------------------------------------
/img/excel_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/excel_table.png
--------------------------------------------------------------------------------
/img/pandas_dataframe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/pandas_dataframe.png
--------------------------------------------------------------------------------
/img/split_apply_combine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryansmccoy/spreadsheets-to-dataframes/22b4a3393626a8df24e8f5a188b3407d20a6430f/img/split_apply_combine.png
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | Click
2 |
3 | pandas
4 | numpy
5 | scipy
6 | requests
7 | openpyxl
8 | cookiecutter
9 | sqlalchemy
10 | flask
11 | feedparser
12 | bs4
13 | # selenium
14 |
15 | # statsmodels
16 | # tldextract
17 | # pyflux
18 | # fbprophet
19 | lxml
20 | jupyter
21 | matplotlib
22 |
23 | # celery==3.1.25
24 |
25 | # alpha_vantage
26 |
--------------------------------------------------------------------------------
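
A note on the list above: the packages are unpinned, so a fresh environment pulls the latest version of each. Assuming a virtual environment is already active, installation is one command:

    pip install -r requirements_dev.txt

The commented-out entries (selenium, statsmodels, celery, and the rest) are only needed for the optional sections and can be uncommented when those examples are run.
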
/section1_challenge_1.py:
--------------------------------------------------------------------------------
1 | # Perform an Excel VLOOKUP with a Python Dictionary
2 |
3 | # Challenge 1
4 | # Modify the code below to match the Expected Output at the bottom
5 |
6 | import csv
7 | import os
8 | from pprint import pprint
9 |
10 | current_directory = os.getcwd()
11 |
12 | pycon_sponsors_filename = 'pycon_sponsors.csv'
13 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename)
14 |
15 | # print(pycon_sponsors_filepath)
16 |
17 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000},
18 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000},
19 | {'sponsor_level': 'MAINTAINING', 'amount': 60000},
20 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000},
21 | {'sponsor_level': 'SUPPORTING', 'amount': 15000},
22 | {'sponsor_level': 'PARTNER', 'amount': 7500},
23 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750},
24 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}]
25 |
26 | pprint(sponsor_levels)
27 |
28 | pycon_sponsors = []
29 |
30 | # print(pycon_sponsors_filepath)
31 |
32 | with open(pycon_sponsors_filepath, 'r') as f:
33 | rows = csv.reader(f)
34 |
35 | header = next(f)
36 |
37 | for row_number, row in enumerate(rows):
38 | print("Row Number:\t", row_number, "Values:\t", row)
39 |
40 | """
41 | Current Output:
42 |
43 | [{'amount': 150000, 'sponsor_level': 'VISIONARY'},
44 | {'amount': 90000, 'sponsor_level': 'SUSTAINABILITY'},
45 | {'amount': 60000, 'sponsor_level': 'MAINTAINING'},
46 | {'amount': 30000, 'sponsor_level': 'CONTRIBUTING'},
47 | {'amount': 15000, 'sponsor_level': 'SUPPORTING'},
48 | {'amount': 7500, 'sponsor_level': 'PARTNER'},
49 | {'amount': 3750, 'sponsor_level': 'PARTICIPATING'},
50 | {'amount': 1500, 'sponsor_level': 'ASSOCIATE'}]
51 |
52 | Row Number: 0 Values: ['GOOG', 'ALPHABET INC.', 'VISIONARY']
53 | Row Number: 1 Values: ['AMZN', 'AMAZON COM INC', 'SUSTAINABILITY']
54 | Row Number: 2 Values: ['#N/A', 'BLOOMBERG', 'VISIONARY']
55 | Row Number: 3 Values: ['COF', 'CAPITAL ONE FINANCIAL CORP', 'MAINTAINING']
56 | Row Number: 4 Values: ['GLW', 'CORNING INC', 'MAINTAINING']
57 | Row Number: 5 Values: ['ESTC', 'ELASTIC N.V.', 'PARTNER']
58 | Row Number: 6 Values: ['FB', 'FACEBOOK INC', 'SUSTAINABILITY']
59 | Row Number: 7 Values: ['#N/A', 'HUAWEI TECHNOLOGIES', 'SUSTAINABILITY']
60 | Row Number: 8 Values: ['IBM', 'INTERNATIONAL BUSINESS MACHINES CORP', 'CONTRIBUTING']
61 | Row Number: 9 Values: ['JPM', 'JPMORGAN CHASE & CO', 'SUPPORTING']
62 | Row Number: 10 Values: ['MSFT', 'MICROSOFT CORP', 'VISIONARY']
63 | Row Number: 11 Values: ['NFLX', 'NETFLIX INC', 'PARTNER']
64 | Row Number: 12 Values: ['CRM', 'SALESFORCE.COM INC.', 'SUSTAINABILITY']
65 | Row Number: 13 Values: ['WORK', 'SLACK TECHNOLOGIES INC.', 'MAINTAINING']
66 |
67 | Expected Output:
68 |
69 | Company Number: 0
70 | Company: ALPHABET INC.
71 | Level: VISIONARY
72 | Donated: 150000
73 | Company Number: 1
74 | Company: AMAZON COM INC
75 | Level: SUSTAINABILITY
76 | Donated: 90000
77 | Company Number: 2
78 | Company: BLOOMBERG
79 | Level: VISIONARY
80 | Donated: 150000
81 | Company Number: 3
82 | Company: CAPITAL ONE FINANCIAL CORP
83 | Level: MAINTAINING
84 | Donated: 60000
85 | Company Number: 4
86 | Company: CORNING INC
87 | Level: MAINTAINING
88 | Donated: 60000
89 | Company Number: 5
90 | Company: ELASTIC N.V.
91 | Level: PARTNER
92 | Donated: 7500
93 | Company Number: 6
94 | Company: FACEBOOK INC
95 | Level: SUSTAINABILITY
96 | Donated: 90000
97 | Company Number: 7
98 | Company: HUAWEI TECHNOLOGIES
99 | Level: SUSTAINABILITY
100 | Donated: 90000
101 | Company Number: 8
102 | Company: INTERNATIONAL BUSINESS MACHINES CORP
103 | Level: CONTRIBUTING
104 | Donated: 30000
105 | Company Number: 9
106 | Company: JPMORGAN CHASE & CO
107 | Level: SUPPORTING
108 | Donated: 15000
109 | Company Number: 10
110 | Company: MICROSOFT CORP
111 | Level: VISIONARY
112 | Donated: 150000
113 | Company Number: 11
114 | Company: NETFLIX INC
115 | Level: PARTNER
116 | Donated: 7500
117 | Company Number: 12
118 | Company: SALESFORCE.COM INC.
119 | Level: SUSTAINABILITY
120 | Donated: 90000
121 | Company Number: 13
122 | Company: SLACK TECHNOLOGIES INC.
123 | Level: MAINTAINING
124 | Donated: 60000
125 |
126 | """
127 |
--------------------------------------------------------------------------------
/section1_challenge_1_answer.py:
--------------------------------------------------------------------------------
1 | # Perform an Excel VLOOKUP with a Python Dictionary
2 |
3 | # Modify the code below to match the Expected Output at the bottom
4 |
5 | import csv
6 | import os
7 | from pprint import pprint
8 |
9 | current_directory = os.getcwd()
10 |
11 | pycon_sponsors_filename = 'pycon_sponsors.csv'
12 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename)
13 |
14 | print(pycon_sponsors_filepath)
15 |
16 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000},
17 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000},
18 | {'sponsor_level': 'MAINTAINING', 'amount': 60000},
19 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000},
20 | {'sponsor_level': 'SUPPORTING', 'amount': 15000},
21 | {'sponsor_level': 'PARTNER', 'amount': 7500},
22 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750},
23 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}]
24 |
25 | sponsor_vlookup = {}
26 |
27 | for sponsor_level in sponsor_levels:
28 | sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount']
29 |
30 | pprint(sponsor_levels)
31 |
32 | pycon_sponsors = []
33 |
34 | print(pycon_sponsors_filepath)
35 |
36 | with open(pycon_sponsors_filepath, 'r') as f:
37 | rows = csv.reader(f)
38 |
39 | header = next(f)
40 |
41 | for row_number, row in enumerate(rows):
42 | ticker, name, level = row
43 | print("Company Number:\t", row_number, "\n\tDCompany:", name, "\n\tLevel: ",level, "\n\tDonated:", sponsor_vlookup[row[2]], "\n")
44 |
45 | """
46 | Company Number: 0
47 | Company: ALPHABET INC.
48 | Level: VISIONARY
49 | Donated: 150000
50 | Company Number: 1
51 | Company: AMAZON COM INC
52 | Level: SUSTAINABILITY
53 | Donated: 90000
54 | Company Number: 2
55 | Company: BLOOMBERG
56 | Level: VISIONARY
57 | Donated: 150000
58 | Company Number: 3
59 | Company: CAPITAL ONE FINANCIAL CORP
60 | Level: MAINTAINING
61 | Donated: 60000
62 | Company Number: 4
63 | Company: CORNING INC
64 | Level: MAINTAINING
65 | Donated: 60000
66 | Company Number: 5
67 | Company: ELASTIC N.V.
68 | Level: PARTNER
69 | Donated: 7500
70 | Company Number: 6
71 | Company: FACEBOOK INC
72 | Level: SUSTAINABILITY
73 | Donated: 90000
74 | Company Number: 7
75 | Company: HUAWEI TECHNOLOGIES
76 | Level: SUSTAINABILITY
77 | Donated: 90000
78 | Company Number: 8
79 | Company: INTERNATIONAL BUSINESS MACHINES CORP
80 | Level: CONTRIBUTING
81 | Donated: 30000
82 | Company Number: 9
83 | Company: JPMORGAN CHASE & CO
84 | Level: SUPPORTING
85 | Donated: 15000
86 | Company Number: 10
87 | Company: MICROSOFT CORP
88 | Level: VISIONARY
89 | Donated: 150000
90 | Company Number: 11
91 | Company: NETFLIX INC
92 | Level: PARTNER
93 | Donated: 7500
94 | Company Number: 12
95 | Company: SALESFORCE.COM INC.
96 | Level: SUSTAINABILITY
97 | Donated: 90000
98 | Company Number: 13
99 | Company: SLACK TECHNOLOGIES INC.
100 | Level: MAINTAINING
101 | Donated: 60000
102 | """
103 |
--------------------------------------------------------------------------------
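
The loop above that builds sponsor_vlookup is the dictionary equivalent of an Excel VLOOKUP. Two common refinements, shown as a self-contained sketch: a dict comprehension builds the same table in one expression, and .get() returns a default instead of raising KeyError for a level that is missing from the table ('PLATINUM' below is a made-up level, used only for illustration):

    sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000},
                      {'sponsor_level': 'PARTNER', 'amount': 7500}]

    # build the lookup table in a single expression
    sponsor_vlookup = {row['sponsor_level']: row['amount'] for row in sponsor_levels}

    print(sponsor_vlookup['VISIONARY'])        # 150000
    # .get() avoids a KeyError when the key is absent
    print(sponsor_vlookup.get('PLATINUM', 0))  # 0
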
/section1_challenge_2.py:
--------------------------------------------------------------------------------
1 | # Perform an Excel VLOOKUP with a Python Dictionary
2 |
3 | # Challenge 2
4 | # Modify the code below to sum up all the donations by the companies in the list
5 |
6 | import csv
7 | import os
8 | from pprint import pprint
9 |
10 | current_directory = os.getcwd()
11 |
12 | pycon_sponsors_filename = 'pycon_sponsors.csv'
13 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename)
14 |
15 | # print(pycon_sponsors_filepath)
16 |
17 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000},
18 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000},
19 | {'sponsor_level': 'MAINTAINING', 'amount': 60000},
20 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000},
21 | {'sponsor_level': 'SUPPORTING', 'amount': 15000},
22 | {'sponsor_level': 'PARTNER', 'amount': 7500},
23 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750},
24 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}]
25 |
26 | sponsor_vlookup = {}
27 |
28 | for sponsor_level in sponsor_levels:
29 | sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount']
30 |
31 | pprint(sponsor_levels)
32 |
33 | pycon_sum = []
34 |
35 | # print(pycon_sponsors_filepath)
36 |
37 | with open(pycon_sponsors_filepath, 'r') as f:
38 | rows = csv.reader(f)
39 |
40 | header = next(f)
41 |
42 | for row_number, row in enumerate(rows):
43 | ticker, name, level = row
44 | print("Company Number:\t", row_number, "\n\tDCompany:", name, "\n\tLevel: ",level, "\n\tDonated:", sponsor_vlookup[row[2]], "\n")
45 |
46 | """
47 | Current Output:
48 |
49 | [{'amount': 150000, 'sponsor_level': 'VISIONARY'},
50 | {'amount': 90000, 'sponsor_level': 'SUSTAINABILITY'},
51 | {'amount': 60000, 'sponsor_level': 'MAINTAINING'},
52 | {'amount': 30000, 'sponsor_level': 'CONTRIBUTING'},
53 | {'amount': 15000, 'sponsor_level': 'SUPPORTING'},
54 | {'amount': 7500, 'sponsor_level': 'PARTNER'},
55 | {'amount': 3750, 'sponsor_level': 'PARTICIPATING'},
56 | {'amount': 1500, 'sponsor_level': 'ASSOCIATE'}]
57 |
58 | Company Number: 0
59 | Company: ALPHABET INC.
60 | Level: VISIONARY
61 | Donated: 150000
62 | Company Number: 1
63 | Company: AMAZON COM INC
64 | Level: SUSTAINABILITY
65 | Donated: 90000
66 | Company Number: 2
67 | Company: BLOOMBERG
68 | Level: VISIONARY
69 | Donated: 150000
70 | Company Number: 3
71 | Company: CAPITAL ONE FINANCIAL CORP
72 | Level: MAINTAINING
73 | Donated: 60000
74 | Company Number: 4
75 | Company: CORNING INC
76 | Level: MAINTAINING
77 | Donated: 60000
78 | Company Number: 5
79 | Company: ELASTIC N.V.
80 | Level: PARTNER
81 | Donated: 7500
82 | Company Number: 6
83 | Company: FACEBOOK INC
84 | Level: SUSTAINABILITY
85 | Donated: 90000
86 | Company Number: 7
87 | Company: HUAWEI TECHNOLOGIES
88 | Level: SUSTAINABILITY
89 | Donated: 90000
90 | Company Number: 8
91 | Company: INTERNATIONAL BUSINESS MACHINES CORP
92 | Level: CONTRIBUTING
93 | Donated: 30000
94 | Company Number: 9
95 | Company: JPMORGAN CHASE & CO
96 | Level: SUPPORTING
97 | Donated: 15000
98 | Company Number: 10
99 | Company: MICROSOFT CORP
100 | Level: VISIONARY
101 | Donated: 150000
102 | Company Number: 11
103 | Company: NETFLIX INC
104 | Level: PARTNER
105 | Donated: 7500
106 | Company Number: 12
107 | Company: SALESFORCE.COM INC.
108 | Level: SUSTAINABILITY
109 | Donated: 90000
110 | Company Number: 13
111 | Company: SLACK TECHNOLOGIES INC.
112 | Level: MAINTAINING
113 | Donated: 60000
114 |
115 | """
116 | """
117 | Expected Output:
118 |
119 | Total Sum: 1050000
120 |
121 | """
122 |
--------------------------------------------------------------------------------
/section1_challenge_2_answer.py:
--------------------------------------------------------------------------------
1 | # Perform an Excel VLOOKUP with a Python Dictionary
2 | # Modify the code below to sum up all the donations by the companies in the list
3 |
4 | import csv
5 | import os
6 | from pprint import pprint
7 |
8 | current_directory = os.getcwd()
9 |
10 | pycon_sponsors_filename = 'pycon_sponsors.csv'
11 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename)
12 |
13 | print(pycon_sponsors_filepath)
14 |
15 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000},
16 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000},
17 | {'sponsor_level': 'MAINTAINING', 'amount': 60000},
18 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000},
19 | {'sponsor_level': 'SUPPORTING', 'amount': 15000},
20 | {'sponsor_level': 'PARTNER', 'amount': 7500},
21 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750},
22 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}]
23 |
24 | sponsor_vlookup = {}
25 |
26 | for sponsor_level in sponsor_levels:
27 | sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount']
28 |
29 | pprint(sponsor_levels)
30 |
31 | pycon_sum = []
32 |
33 | print(pycon_sponsors_filepath)
34 |
35 | with open(pycon_sponsors_filepath, 'r') as f:
36 | rows = csv.reader(f)
37 |
38 | header = next(f)
39 |
40 | for row_number, row in enumerate(rows):
41 | ticker, name, level = row
42 | print("Company Number:\t", row_number, "\n\tDCompany:", name, "\n\tLevel: ",level, "\n\tDonated:", sponsor_vlookup[row[2]], "\n")
43 | value = int(sponsor_vlookup[row[2]])
44 | pycon_sum.append(value)
45 |
46 | print("Total Sum", sum(pycon_sum))
47 |
48 | """
49 | Output:
50 |
51 | Total Sum: 1050000
52 |
53 | """
54 |
--------------------------------------------------------------------------------
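
Appending every amount to pycon_sum and calling sum() at the end works fine; the intermediate list can also be skipped with a running total. A sketch that assumes the same sponsor_vlookup dict and pycon_sponsors_filepath defined in the answer above:

    import csv

    total = 0

    with open(pycon_sponsors_filepath, 'r') as f:
        rows = csv.reader(f)
        next(rows)  # skip the header row

        for ticker, name, level in rows:
            total += sponsor_vlookup[level]

    print("Total Sum:", total)
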
/section1_challenge_3.py:
--------------------------------------------------------------------------------
1 | # Perform an Excel VLOOKUP with a Python Dictionary
2 |
3 | # Challenge 3
4 | # Create a function that takes a filepath as a parameter and returns the sum of donations
5 |
6 | import csv
7 | import os
8 | from pprint import pprint
9 |
10 | current_directory = os.getcwd()
11 |
12 | pycon_sponsors_filename = 'pycon_sponsors.csv'
13 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename)
14 |
15 | print(pycon_sponsors_filepath)
16 |
17 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000},
18 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000},
19 | {'sponsor_level': 'MAINTAINING', 'amount': 60000},
20 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000},
21 | {'sponsor_level': 'SUPPORTING', 'amount': 15000},
22 | {'sponsor_level': 'PARTNER', 'amount': 7500},
23 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750},
24 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}]
25 |
26 | sponsor_vlookup = {}
27 |
28 | for sponsor_level in sponsor_levels:
29 | sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount']
30 |
31 | pprint(sponsor_levels)
32 |
33 | pycon_sum = []
34 |
35 | print(pycon_sponsors_filepath)
36 |
37 | with open(pycon_sponsors_filepath, 'r') as f:
38 | rows = csv.reader(f)
39 |
40 | header = next(f)
41 |
42 | for row_number, row in enumerate(rows):
43 | ticker, name, level = row
44 | print("Company Number:\t", row_number, "\n\tDCompany:", name, "\n\tLevel: ",level, "\n\tDonated:", sponsor_vlookup[row[2]], "\n")
45 | value = int(sponsor_vlookup[row[2]])
46 | pycon_sum.append(value)
47 |
48 | print("Total Sum", sum(pycon_sum))
49 |
50 | """
51 | Output:
52 |
53 | Total Sum: 1050000
54 |
55 | """
56 |
--------------------------------------------------------------------------------
/section1_challenge_3_answer.py:
--------------------------------------------------------------------------------
1 | # Perform an Excel VLOOKUP with a Python Dictionary
2 |
3 | # Challenge 3
4 | # Create a function that takes a filepath as a parameter and returns the sum of donations
5 |
6 | import csv
7 | import os
8 | from pprint import pprint
9 |
10 | current_directory = os.getcwd()
11 |
12 | pycon_sponsors_filename = 'pycon_sponsors.csv'
13 | pycon_sponsors_filepath = os.path.join(current_directory, "data", pycon_sponsors_filename)
14 |
15 | print(pycon_sponsors_filepath)
16 |
17 | def sum_donations(filepath):
18 |
19 | sponsor_levels = [{'sponsor_level': 'VISIONARY', 'amount': 150000},
20 | {'sponsor_level': 'SUSTAINABILITY', 'amount': 90000},
21 | {'sponsor_level': 'MAINTAINING', 'amount': 60000},
22 | {'sponsor_level': 'CONTRIBUTING', 'amount': 30000},
23 | {'sponsor_level': 'SUPPORTING', 'amount': 15000},
24 | {'sponsor_level': 'PARTNER', 'amount': 7500},
25 | {'sponsor_level': 'PARTICIPATING', 'amount': 3750},
26 | {'sponsor_level': 'ASSOCIATE', 'amount': 1500}]
27 |
28 | sponsor_vlookup = {}
29 |
30 | for sponsor_level in sponsor_levels:
31 | sponsor_vlookup[sponsor_level['sponsor_level']] = sponsor_level['amount']
32 |
33 | pycon_sum = []
34 |
35 | with open(filepath, 'r') as f:
36 | rows = csv.reader(f)
37 |
38 | header = next(f)
39 |
40 | for row_number, row in enumerate(rows):
41 | ticker, name, level = row
42 | value = int(sponsor_vlookup[level])
43 | pycon_sum.append(value)
44 |
45 | return sum(pycon_sum)  # the challenge asks for the sum of donations, not the list
46 |
47 |
48 | """
49 | Usage:
50 |
51 | total_donations = sum_donations(pycon_sponsors_filepath)
52 |
53 | print("Total Donations:\t", total_donations)
54 |
55 | """
56 |
--------------------------------------------------------------------------------
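
A more compact variant of the same function, shown as a sketch (SPONSOR_AMOUNTS is just the lookup table from above flattened into a plain dict, and the CSV is assumed to have the usual ticker, name, level columns):

    import csv

    SPONSOR_AMOUNTS = {'VISIONARY': 150000, 'SUSTAINABILITY': 90000,
                       'MAINTAINING': 60000, 'CONTRIBUTING': 30000,
                       'SUPPORTING': 15000, 'PARTNER': 7500,
                       'PARTICIPATING': 3750, 'ASSOCIATE': 1500}

    def sum_donations(filepath):
        with open(filepath, 'r') as f:
            rows = csv.reader(f)
            next(rows)  # skip the header row
            # row[2] is the sponsor level; look each one up and sum as we go
            return sum(SPONSOR_AMOUNTS[row[2]] for row in rows)
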
/section2-02-real-world-example-refactored.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import zipfile
4 | import re # regular expression
5 |
6 | import requests
7 |
8 | import pandas as pd
9 |
10 | # fixes display of dataframes in Python Console
11 | pd.set_option('display.float_format', lambda x: f'{x:.5f}')
12 | pd.set_option('display.max_columns', 500)
13 | pd.set_option('display.max_rows', 500)
14 | pd.set_option('display.width', 600)
15 |
16 | current_directory = os.getcwd()
17 |
18 |
19 | def extract_zip_contents(filepath):
20 | zip_file_local_extract_path = filepath.replace(".zip", "")
21 |
22 | # create directory for zip files
23 | if os.path.exists(zip_file_local_extract_path):
24 |
25 | print("Folder already Exists!")
26 |
27 | else:
28 | try:
29 |
30 | z = zipfile.ZipFile(filepath)  # open the downloaded .zip itself, not the extract folder
31 |
32 | z.extractall(zip_file_local_extract_path)
33 |
34 | print("Extracting Contents: \t", zip_file_local_extract_path)
35 | except Exception as e:
36 | print("Issue Extracting, Going to Skip :)", e)
37 | return None
38 |
39 | return zip_file_local_extract_path
40 |
41 |
42 | def download_filings(start_year, end_year, output_directory):
43 | quarters = ['q1', 'q2', 'q3', 'q4']
44 |
45 | zip_filepaths = []
46 |
47 | for year in range(start_year, end_year):  # note: range() stops before end_year
48 | for quarter in quarters:
49 |
50 | url = rf'https://www.sec.gov/files/dera/data/financial-statement-data-sets/{year}{quarter}.zip'
51 |
52 | try:
53 |
54 | # we can get the filename (basename) of the url using basename
55 | basename = os.path.basename(url)
56 |
57 | print(basename)
58 |
59 | zip_file_local_filepath = os.path.join(output_directory, basename)
60 |
61 | print(zip_file_local_filepath)
62 |
63 | zip_filepaths.append(zip_file_local_filepath)
64 |
65 | if not os.path.exists(zip_file_local_filepath):
66 |
67 | print(f"Downloading: \t{url}")
68 |
69 | r = requests.get(url, headers={"User-Agent": "Sample Company admin@example.com"})  # SEC requires a declared User-Agent
70 |
71 | if r.status_code == 200:
72 |
73 | print(f"Download Complete")
74 |
75 | with open(zip_file_local_filepath, 'wb') as fd:
76 | fd.write(r.content)
77 |
78 | else:
79 | print("Got an Error Code!")
80 |
81 | else:
82 | print("It appears Zip File already exists", zip_file_local_filepath)
83 |
84 | except Exception as E:
85 | print("Error Downloading", url, E)
86 |
87 | return zip_filepaths
88 |
89 |
90 | def transform_data(numbers_filepath, submissions_filepath, df_sic_list, df_symbol_cik, metric="Revenues", form_type='10-'):
91 | print("Transforming ", numbers_filepath)
92 |
93 | df_numbers = pd.read_csv(numbers_filepath, delimiter="\t")
94 |
95 | df_submissions = pd.read_csv(submissions_filepath, delimiter="\t")
96 |
97 | # convert sic to string
98 | df_submissions['sic'] = df_submissions['sic'].astype('Int64').astype('str')
99 |
100 | df_submissions = df_submissions[['adsh', 'cik', 'name', 'sic', 'countryba', 'stprba', 'fye', 'form', 'period', 'filed', 'instance']]
101 |
102 | df_symbol_cik['symbol'] = df_symbol_cik['symbol'].str.upper()
103 |
104 | # create list of dataframe column names
105 | submissions_columns = df_submissions.columns.tolist()
106 |
107 | # going to merge two dataframes into one
108 | df_submissions_symbols = pd.merge(df_submissions, df_symbol_cik)
109 |
110 | # merge sic codes onto submission dataframe
111 | df_submissions_symbols = pd.merge(df_submissions_symbols, df_sic_list, on="sic")
112 |
113 | # we can drop columns by name using drop
114 | df_submissions_symbols = df_submissions_symbols.drop(columns=['instance'])
115 |
116 | new_submissions_columns = ["symbol", "industry_title"] + submissions_columns
117 |
118 | df_submissions_symbols = df_submissions_symbols.reindex(columns=new_submissions_columns)
119 |
120 | df_submissions_symbols = df_submissions_symbols[df_submissions_symbols['form'].str.contains(form_type, flags=re.IGNORECASE, regex=True)]
121 |
122 | df_submission_numbers = pd.merge(df_numbers, df_submissions_symbols, left_on='adsh', right_on='adsh', how='inner')
123 |
124 | new_column_order = ['cik',
125 | 'symbol',
126 | 'name',
127 | 'sic',
128 | 'industry_title',
129 | 'countryba',
130 | 'stprba',
131 | 'fye',
132 | 'form',
133 | 'period',
134 | 'filed',
135 | 'adsh',
136 | 'tag',
137 | 'version',
138 | 'coreg',
139 | 'ddate',
140 | 'qtrs',
141 | 'uom',
142 | 'value'
143 | ]
144 |
145 | # reorder columns
146 | df_submission_numbers = df_submission_numbers.reindex(columns=new_column_order)
147 |
148 | # Group by: split-apply-combine
149 | if metric:
150 | df_values = df_submission_numbers[df_submission_numbers['tag'].isin([metric])]
151 | else:
152 | df_values = df_submission_numbers.copy()
153 |
154 | df_values = df_values.dropna(subset=['value'])
155 |
156 | # keep only values that span 4 quarters (i.e., annual figures)
157 | df_values = df_values[df_values['qtrs'] == 4]
158 | df_values = df_values[(df_values['uom'] == "USD") | (df_values['uom'] == "EUR")]
159 |
160 | df_values = df_values.sort_values('ddate', ascending=True)
161 |
162 | group = []
163 |
164 | for (symbol, qtrs), df_group in df_values.groupby(["symbol", "qtrs"]):
165 | df_group['pct_change'] = df_group['value'].pct_change()
166 | group.append(df_group)
167 |
168 | df_values_pct = pd.concat(group)
169 |
170 | df_values_pct = df_values_pct.sort_values('ddate', ascending=False)
171 |
172 | print("Done Transforming ", numbers_filepath)
173 |
174 | return df_values_pct
175 |
176 |
177 | def filter_ticker_list(df_submissions_symbols):
178 | pycon_sponsors = [{'symbol': 'GOOG', 'name': 'ALPHABET INC.', 'sponsor_level': 'VISIONARY'},
179 | {'symbol': 'AMZN', 'name': 'AMAZON COM INC', 'sponsor_level': 'SUSTAINABILITY'},
180 | {'symbol': '#N/A', 'name': 'BLOOMBERG', 'sponsor_level': 'VISIONARY'},
181 | {'symbol': 'COF', 'name': 'CAPITAL ONE FINANCIAL CORP', 'sponsor_level': 'MAINTAINING'},
182 | {'symbol': 'GLW', 'name': 'CORNING INC', 'sponsor_level': 'MAINTAINING'},
183 | {'symbol': 'ESTC', 'name': 'ELASTIC N.V.', 'sponsor_level': 'PARTNER'},
184 | {'symbol': 'FB', 'name': 'FACEBOOK INC', 'sponsor_level': 'SUSTAINABILITY'},
185 | {'symbol': '#N/A', 'name': 'HUAWEI TECHNOLOGIES', 'sponsor_level': 'SUSTAINABILITY'},
186 | {'symbol': 'IBM', 'name': 'INTERNATIONAL BUSINESS MACHINES CORP', 'sponsor_level': 'CONTRIBUTING'},
187 | {'symbol': 'JPM', 'name': 'JPMORGAN CHASE & CO', 'sponsor_level': 'SUPPORTING'},
188 | {'symbol': 'MSFT', 'name': 'MICROSOFT CORP', 'sponsor_level': 'VISIONARY'},
189 | {'symbol': 'NFLX', 'name': 'NETFLIX INC', 'sponsor_level': 'PARTNER'},
190 | {'symbol': 'CRM', 'name': 'SALESFORCE.COM INC.', 'sponsor_level': 'SUSTAINABILITY'},
191 | {'symbol': 'WORK', 'name': 'SLACK TECHNOLOGIES INC.', 'sponsor_level': 'MAINTAINING'}]
192 |
193 | df_companies = pd.DataFrame(pycon_sponsors)
194 |
195 | ticker_list_pycon_sponsors = df_companies['symbol'].tolist()
196 |
197 | df_selected_submissions = df_submissions_symbols[df_submissions_symbols['symbol'].isin(ticker_list_pycon_sponsors)]
198 |
199 | new_submissions_columns = ['cik',
200 | 'symbol',
201 | 'name',
202 | 'sic',
203 | 'industry_title',
204 | 'countryba',
205 | 'stprba',
206 | 'fye',
207 | 'form',
208 | 'period',
209 | 'filed',
210 | 'adsh'
211 | ]
212 |
213 | df_selected_submissions = df_selected_submissions.reindex(columns=new_submissions_columns)
214 |
215 | return df_selected_submissions
216 |
217 |
218 | def main(start_year, end_year):
219 | url = 'https://www.sec.gov/include/ticker.txt'
220 |
221 | df_symbol_cik = pd.read_csv(url, delimiter="\t", names=['symbol', 'cik'])
222 |
223 | # standard industrial classification
224 | sic_url = r'https://www.sec.gov/info/edgar/siccodes.htm'
225 | # we can extract table from html by passing in url
226 | sics_tables = pd.read_html(sic_url)
227 | df_sic_list = sics_tables[0]
228 |
229 | # rename columns to lower, no spaces, and rename sic_code to sic
230 | df_sic_list.columns = df_sic_list.columns.str.lower().str.replace(" ", "_").str.replace("sic_code", "sic")
231 |
232 | # convert sic column to string
233 | df_sic_list['sic'] = df_sic_list['sic'].astype('Int64').astype('str')
234 |
235 | output_directory = os.path.join(current_directory, "zip-data")
236 |
237 | # create directory for zip files
238 | if os.path.exists(output_directory):
239 | print("Folder already Exists!")
240 | else:
241 | print("Folder doesn't exist")
242 | os.mkdir(output_directory)
243 | print("Created Directory!")
244 |
245 | zip_filepaths = download_filings(start_year, end_year, output_directory)
246 |
247 | zip_folders = []
248 |
249 | for zip_filepath in zip_filepaths:
250 | zip_folder = extract_zip_contents(zip_filepath)
251 |
252 | if zip_folder:
253 | zip_folders.append(zip_folder)
254 |
255 | # get list of all extracted files
256 | files = glob.glob(os.path.join(output_directory, "*", "*.*"))  # portable across operating systems
257 |
258 | num_files = [file for file in files if "num.txt" in file]
259 | sub_files = [file for file in files if "sub.txt" in file]
260 |
261 | pre_files = [file for file in files if "pre.txt" in file]
262 | tag_files = [file for file in files if "tag.txt" in file]
263 | readme_files = [file for file in files if "readme.htm" in file]
264 |
265 | num_files.sort(reverse=True)
266 | sub_files.sort(reverse=True)
267 |
268 | if len(num_files) == len(sub_files):
269 | sub_num_files = list(zip(sub_files, num_files))
270 |
271 | filings = []
272 |
273 | for sub_file, num_file in sub_num_files[1:5]:  # only process a slice of the quarters here, to keep the run short
274 | df_companies_pct_chg = transform_data(num_file, sub_file, df_sic_list, df_symbol_cik, metric="Revenues", form_type='10-')
275 |
276 | filings.append(df_companies_pct_chg)
277 |
278 | df_all_filings = pd.concat(filings)
279 |
280 | # df_all_filings = df_all_filings.dropna(subset=['pct_change'])
281 |
282 | # df_all_filings = df_all_filings[df_all_filings['pct_change'] > 0]
283 | #
284 | # df_all_filings = df_all_filings.drop_duplicates(keep='first', subset=['cik']).sort_values('value', ascending=False)
285 |
286 | df_all_filings.to_csv('all_filings.csv')
287 |
288 |
289 | if __name__ == "__main__":
290 | start_year = 2020
291 | end_year = 2022
292 |
293 | main(start_year, end_year)
294 |
--------------------------------------------------------------------------------
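
One pandas note on transform_data above: the explicit groupby loop that appends each group and re-concatenates exists to compute pct_change within each (symbol, qtrs) group. GroupBy.pct_change does the same thing in one line and returns a series aligned to the original index. A minimal sketch on a toy frame (the column names mirror the SEC data, the values are made up):

    import pandas as pd

    df = pd.DataFrame({'symbol': ['MSFT', 'MSFT', 'IBM', 'IBM'],
                       'qtrs':   [4, 4, 4, 4],
                       'ddate':  [20200331, 20210331, 20200331, 20210331],
                       'value':  [100.0, 110.0, 50.0, 45.0]})

    df = df.sort_values('ddate')
    # computed within each (symbol, qtrs) group, aligned back to df's index
    df['pct_change'] = df.groupby(['symbol', 'qtrs'])['value'].pct_change()

    print(df)
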
/section2_challenge.rst:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Visit Awesome Public Datasets on Github
4 |
5 | https://github.com/awesomedata/awesome-public-datasets
6 |
7 | # Find a dataset that you find interesting, that is easily downloadable, and that is in a format Pandas can work with
8 |
9 | # Create a Github Username
10 |
11 | # Think of an awesome project Name
12 |
13 | # Create a Github Repository
14 |
15 | # Create a new project structure using Cookiecutter
16 |
17 | # Commit and push your new project to Github
18 |
19 | # Write Code to Download the Dataset using Requests (a minimal sketch follows this file)
20 |
21 | # Write Code to Transform & Analyze the Dataset using Pandas
22 |
23 | # Write Code to Visualize the Dataset using your favorite Pandas Visualization Library
24 |
25 | # Write Code to Display your Data in a Browser Using Flask
26 |
27 | # Refactor & clean up your code to make it easy to maintain and share
28 |
29 | # Set up an Airflow schedule to automatically download, transform, and output the results of your analysis
30 |
31 | # Repeat, but with a different dataset
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
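
For the Requests download step referenced above, a minimal sketch; the URL and output filename are placeholders rather than a specific dataset from the list:

    import requests

    url = "https://example.com/some-dataset.csv"  # placeholder: pick a dataset from awesome-public-datasets

    response = requests.get(url)
    response.raise_for_status()  # fail loudly on a bad status code

    with open("dataset.csv", "wb") as f:
        f.write(response.content)
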
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 0.1.0
3 | commit = True
4 | tag = True
5 |
6 | [bumpversion:file:setup.py]
7 | search = version='{current_version}'
8 | replace = version='{new_version}'
9 |
10 | [bumpversion:file:spreadsheets_to_dataframes/__init__.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 |
14 | [bdist_wheel]
15 | universal = 1
16 |
17 | [flake8]
18 | exclude = docs
19 |
20 | [aliases]
21 | # Define setup.py command aliases here
22 | test = pytest
23 |
24 | [tool:pytest]
25 | collect_ignore = ['setup.py']
26 |
27 |
--------------------------------------------------------------------------------
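
A usage note on the [bumpversion] section above: with bumpversion (or its maintained fork, bump2version) installed, a single command rewrites the version string in setup.py and spreadsheets_to_dataframes/__init__.py using the search/replace patterns defined here, and, because commit and tag are both true, also creates a git commit and tag:

    bumpversion patch   # 0.1.0 -> 0.1.1
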
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """The setup script."""
5 |
6 | from setuptools import setup, find_packages
7 |
8 | with open('README.rst') as readme_file:
9 | readme = readme_file.read()
10 |
11 | with open('HISTORY.rst') as history_file:
12 | history = history_file.read()
13 |
14 | requirements = ['Click>=7.0', ]
15 |
16 | setup_requirements = ['pytest-runner', ]
17 |
18 | test_requirements = ['pytest>=3', ]
19 |
20 | setup(
21 | author="Ryan S. McCoy",
22 | author_email='github@ryansmccoy.com',
23 | python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*',
24 | classifiers=[
25 | 'Development Status :: 2 - Pre-Alpha',
26 | 'Intended Audience :: Developers',
27 | 'License :: OSI Approved :: MIT License',
28 | 'Natural Language :: English',
29 | "Programming Language :: Python :: 2",
30 | 'Programming Language :: Python :: 2.7',
31 | 'Programming Language :: Python :: 3',
32 | 'Programming Language :: Python :: 3.5',
33 | 'Programming Language :: Python :: 3.6',
34 | 'Programming Language :: Python :: 3.7',
35 | ],
36 | description="Examples from Presentation",
37 | entry_points={
38 | 'console_scripts': [
39 | 'spreadsheets_to_dataframes=spreadsheets_to_dataframes.cli:main',
40 | ],
41 | },
42 | install_requires=requirements,
43 | license="MIT license",
44 | long_description=readme + '\n\n' + history,
45 | include_package_data=True,
46 | keywords='spreadsheets_to_dataframes',
47 | name='spreadsheets_to_dataframes',
48 | packages=find_packages(include=['spreadsheets_to_dataframes', 'spreadsheets_to_dataframes.*']),
49 | setup_requires=setup_requirements,
50 | test_suite='tests',
51 | tests_require=test_requirements,
52 | url='https://github.com/ryansmccoy/spreadsheets_to_dataframes',
53 | version='0.1.0',
54 | zip_safe=False,
55 | )
56 |
--------------------------------------------------------------------------------
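
A usage note: because of the entry_points block above, installing the package (an editable install during development) puts a spreadsheets_to_dataframes command on the PATH that dispatches to spreadsheets_to_dataframes.cli:main:

    pip install -e .
    spreadsheets_to_dataframes --help   # assumes the Click-based cli module from the cookiecutter template
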
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27, py35, py36, py37, flake8
3 |
4 | [travis]
5 | python =
6 | 3.7: py37
7 | 3.6: py36
8 | 3.5: py35
9 | 2.7: py27
10 |
11 | [testenv:flake8]
12 | basepython = python
13 | deps = flake8
14 | commands = flake8 spreadsheets_to_dataframes
15 |
16 | [testenv]
17 | setenv =
18 | PYTHONPATH = {toxinidir}
19 | deps =
20 | -r{toxinidir}/requirements_dev.txt
21 | ; If you want to make tox run the tests with the same versions, create a
22 | ; requirements.txt with the pinned versions and uncomment the following line:
23 | ; -r{toxinidir}/requirements.txt
24 | commands =
25 | pip install -U pip
26 | pytest --basetemp={envtmpdir}
27 |
28 |
--------------------------------------------------------------------------------
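
A usage note: plain tox runs every environment in envlist (each one installs requirements_dev.txt and runs pytest, per the [testenv] section), while a single environment can be selected with -e; the py27/py35 entries only run where those interpreters are installed:

    tox             # full envlist
    tox -e flake8   # lint only
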