├── code ├── data │ └── raw │ │ └── AmazonBooks.xlsx ├── images │ └── readme.md ├── st_simple_1.py ├── dash_simple_app_1.py ├── dash_simple_app_2.py ├── dash_html_gen.py ├── st_simple_2.py ├── st_simple_sidebar.py ├── dash_full_app.py ├── ch6-exercise-2.ipynb ├── ch6-exercise-1.ipynb ├── ch3-exercise-2.ipynb └── ch3-exercise-3.ipynb ├── readme_resources └── python-data-visualization.jpg ├── requirements.piptools ├── LICENSE ├── .gitignore ├── README.md └── requirements.txt /code/data/raw/AmazonBooks.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkpython/python-data-visualization/HEAD/code/data/raw/AmazonBooks.xlsx -------------------------------------------------------------------------------- /readme_resources/python-data-visualization.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talkpython/python-data-visualization/HEAD/readme_resources/python-data-visualization.jpg -------------------------------------------------------------------------------- /code/images/readme.md: -------------------------------------------------------------------------------- 1 | ## Placeholder 2 | 3 | This file is here just so git will create the images folder. The notebooks may save output here and the folder must exist first. 
4 | -------------------------------------------------------------------------------- /code/st_simple_1.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | import streamlit as st 4 | import plotly.express as px 5 | 6 | src_file = Path.cwd() / "data" / "raw" / "EPA_fuel_economy_summary.csv" 7 | df = pd.read_csv(src_file) 8 | 9 | fig = px.histogram( 10 | df, 11 | x="fuelCost08", 12 | color="class_summary", 13 | labels={"fuelCost08": "Annual Fuel Cost"}, 14 | nbins=40, 15 | title="Fuel Cost Distribution", 16 | ) 17 | 18 | st.title("Simple Example") 19 | st.write(fig) 20 | 21 | -------------------------------------------------------------------------------- /code/dash_simple_app_1.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from dash import Dash, html, dcc 4 | import plotly.express as px 5 | 6 | app = Dash(__name__) 7 | 8 | src_file = Path.cwd() / "data" / "raw" / "EPA_fuel_economy_summary.csv" 9 | df = pd.read_csv(src_file) 10 | 11 | fig = px.histogram( 12 | df, 13 | x="fuelCost08", 14 | color="class_summary", 15 | labels={"fuelCost08": "Annual Fuel Cost"}, 16 | nbins=40, 17 | title="Fuel Cost Distribution", 18 | ) 19 | 20 | app.layout = html.Div(children=[ 21 | html.H1("Simple Histogram"), 22 | html.Div("Annual Fuel Cost Plot."), 23 | dcc.Graph(id="example-histogram", figure=fig), 24 | ]) 25 | 26 | if __name__ == "__main__": 27 | app.run_server(debug=True) 28 | 29 | 30 | -------------------------------------------------------------------------------- /requirements.piptools: -------------------------------------------------------------------------------- 1 | # This file contains the top-level dependencies, without pinned versions, for this course. 2 | # It is used to create the working requirements.txt file with transitive dependencies and fixed versions. 
3 | # You don't need to use this file or modify it, just install the dependencies for requirements.txt 4 | # But you can learn more here: https://pip-tools.readthedocs.io/en/latest/ 5 | # 6 | 7 | # Chapter 3 dependencies 8 | jupyter 9 | jupyterlab 10 | pandas 11 | numpy 12 | matplotlib 13 | statsmodels 14 | 15 | # Chapter 4 dependencies 16 | # no additional libs 17 | 18 | # Chapter 5 dependencies 19 | seaborn 20 | 21 | # Chapter 6 dependencies 22 | altair 23 | altair_data_server 24 | altair_saver 25 | openpyxl 26 | vegafusion 27 | vegafusion-python-embed 28 | vl-convert-python 29 | 30 | 31 | # Chapter 7 dependencies 32 | plotly 33 | kaleido 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Talk Python 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /code/dash_simple_app_2.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | 4 | from dash import Dash, html, dcc, Input, Output 5 | import plotly.express as px 6 | 7 | app = Dash(__name__) 8 | 9 | src_file = Path.cwd() / "data" / "raw" / "EPA_fuel_economy_summary.csv" 10 | df = pd.read_csv(src_file) 11 | fuel_types = df["fuel_type_summary"].unique() 12 | 13 | app.layout = html.Div( 14 | children=[ 15 | html.H1("Simple Histogram"), 16 | html.Div("Annual Fuel Cost Plot."), 17 | dcc.Graph(id="histogram"), 18 | dcc.Dropdown( 19 | id="fuel_id", 20 | options=[{"label": i, "value": i} for i in fuel_types], 21 | value=[i for i in fuel_types], 22 | multi=True, 23 | ), 24 | ] 25 | ) 26 | 27 | @app.callback(Output("histogram", "figure"), Input("fuel_id", "value")) 28 | def update_output(fuel_list): 29 | filtered_df = df[df["fuel_type_summary"].isin(fuel_list)] 30 | fig = px.histogram( 31 | filtered_df, 32 | x="fuelCost08", 33 | color="class_summary", 34 | labels={"fuelCost08": "Annual Fuel Cost"}, 35 | nbins=40, 36 | title="Fuel Cost Distribution", 37 | ) 38 | return fig 39 | 40 | if __name__ == "__main__": 41 | app.run_server(debug=True) 42 | 43 | 44 | -------------------------------------------------------------------------------- /code/dash_html_gen.py: -------------------------------------------------------------------------------- 1 | from dash import Dash, html, dcc 2 | import plotly.express as px 3 | 4 | external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"] 5 | app = Dash(__name__, external_stylesheets=external_stylesheets) 6 | 7 | app.layout = html.Div( 8 | [ 9 | html.H1("Simple HTML Only Site"), 10 | html.H2("TalkPython Training"), 11 | html.Div( 12 | [ 13 | html.P("Annual Fuel Cost Plot", 14 | className="my-p-class", 15 | id="my-p-id") 16 | ], 17 | style={ 18 | "color": 
"green", 19 | "fontSize": 18 20 | }, 21 | ), 22 | dcc.Markdown(""" 23 | #### Dash Supports Markdown 24 | 25 | You can write simple text and format it with markup like 26 | **bold text** and *italics*, [links](http://commonmark.org/help). 27 | You can also use: 28 | * lists 29 | * inline `code` snippets 30 | * and more 31 | """), 32 | ], 33 | style={ 34 | "margin-left": "25px", 35 | "width": "55%", 36 | "backgroundColor": "lightgray" 37 | }, 38 | ) 39 | 40 | 41 | if __name__ == "__main__": 42 | app.run_server(debug=True) 43 | 44 | 45 | -------------------------------------------------------------------------------- /code/st_simple_2.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | import streamlit as st 4 | import plotly.express as px 5 | import altair as alt 6 | 7 | @st.cache() 8 | def load_data(): 9 | src_file = Path.cwd() / "data" / "raw" / "EPA_fuel_economy_summary.csv" 10 | raw_df = pd.read_csv(src_file) 11 | return raw_df 12 | 13 | # Load data and determine valid values 14 | df = load_data() 15 | min_year = int(df["year"].min()) 16 | max_year = int(df["year"].max()) 17 | valid_makes = sorted(df["make"].unique()) 18 | 19 | # Setup the UI 20 | st.title("Simple Example") 21 | make = st.multiselect("Select a make:", valid_makes) 22 | year_range = st.slider( 23 | label="Year range", 24 | min_value=min_year, 25 | max_value=max_year, 26 | value=(min_year, max_year), 27 | ) 28 | 29 | # Filter data based on inputs 30 | year_filter = df["year"].between(year_range[0], year_range[1]) 31 | make_filter = df["make"].isin(make) 32 | 33 | plot_df = df[make_filter & year_filter] 34 | 35 | avg_fuel_economy = plot_df["fuelCost08"].mean().round(0) 36 | st.metric("Average", avg_fuel_economy) 37 | 38 | # Plot the data 39 | fig = px.histogram( 40 | plot_df, 41 | x="fuelCost08", 42 | color="class_summary", 43 | labels={"fuelCost08": "Annual Fuel Cost"}, 44 | nbins=40, 45 | title="Fuel Cost 
Distribution", 46 | ) 47 | 48 | altair_chart = ( 49 | alt.Chart(plot_df).mark_tick().encode(y="fuel_type_summary", x="barrels08") 50 | ) 51 | 52 | # Display the output results 53 | st.write(fig) 54 | st.write(altair_chart) 55 | 56 | st.write("Sample data", plot_df.head(10)) -------------------------------------------------------------------------------- /code/st_simple_sidebar.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | import streamlit as st 4 | import plotly.express as px 5 | import altair as alt 6 | 7 | 8 | @st.cache() 9 | def load_data(): 10 | src_file = Path.cwd() / "data" / "raw" / "EPA_fuel_economy_summary.csv" 11 | raw_df = pd.read_csv(src_file) 12 | return raw_df 13 | 14 | 15 | # Load data and determine valid values 16 | df = load_data() 17 | min_year = int(df["year"].min()) 18 | max_year = int(df["year"].max()) 19 | 20 | # Add ALL as an option to make it easier to select all 21 | valid_makes = ["ALL"] + sorted(df["make"].unique()) 22 | 23 | # Get the top 5 as the default 24 | default_makes = df["make"].value_counts().nlargest(5).index.tolist() 25 | 26 | # Setup the UI 27 | st.title("Simple Sidebar Example") 28 | make = st.sidebar.multiselect("Select a make:", valid_makes, default=default_makes) 29 | year_range = st.sidebar.slider( 30 | label="Year range", 31 | min_value=min_year, 32 | max_value=max_year, 33 | value=(min_year, max_year), 34 | ) 35 | 36 | # Filter data based on inputs 37 | year_filter = df["year"].between(year_range[0], year_range[1]) 38 | if "ALL" in make: 39 | # Dummy filter to include all makes 40 | make_filter = True 41 | else: 42 | make_filter = df["make"].isin(make) 43 | 44 | plot_df = df[make_filter & year_filter] 45 | 46 | avg_fuel_economy = plot_df["fuelCost08"].mean().round(0) 47 | st.sidebar.metric("Average", avg_fuel_economy) 48 | 49 | # Plot the data 50 | fig = px.histogram( 51 | plot_df, 52 | x="fuelCost08", 53 | 
color="class_summary", 54 | labels={"fuelCost08": "Annual Fuel Cost"}, 55 | nbins=40, 56 | title="Fuel Cost Distribution", 57 | ) 58 | 59 | altair_chart = ( 60 | alt.Chart(plot_df) 61 | .mark_tick() 62 | .encode(y="fuel_type_summary", x="barrels08") 63 | .properties(width=600) 64 | ) 65 | 66 | # Display the output results 67 | st.write(fig) 68 | st.write(altair_chart) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # PyCharm projects 132 | .idea/ 133 | 134 | # Ignore some of the output generated from the notebooks. 
135 | code/images/*.png 136 | code/images/*.jpg 137 | code/images/*.svg 138 | code/images/*.pdf 139 | seaborn_heatmap.svg 140 | histogram.svg 141 | customization_example.svg 142 | plotlyhistogram.svg 143 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python Data Visualization course 2 | Code and examples from our [**Python Data Visualization course**](https://training.talkpython.fm/courses/python-data-visualization). 3 | 4 | [![](./readme_resources/python-data-visualization.jpg)](https://training.talkpython.fm/courses/python-data-visualization) 5 | 6 | Have you ever been confused by all the different python plotting libraries? Have you tried to make a "simple" plot and gotten stuck and been unable to move forward? Do you want to make sophisticated, interactive data visualizations in python? If you answer yes to any of these questions, then this course is for you. 7 | 8 | ## What's this course about and how is it different? 9 | 10 | The python data visualization landscape has many different libraries. They are all powerful and useful but it can be confusing to determine what works best for you. This course is unique because you will learn about many of the most popular python visualization libraries. You will start by learning how to use each library to build simple visualizations. You will also explore more complex usage and identify the scenarios where each library shines. 11 | 12 | By the end of this course, you will have a basic working knowledge of how to visualize data in python using multiple libraries. You will also learn which library is best for you and your coding style. Along the way, you'll learn general visualization concepts to make your plots more effective. 13 | 14 | In addition to the overview material, we will cover some of the more complex, interactive visualization dashboard technologies. 
15 | 16 | ## What topics are covered 17 | 18 | In this course, you will: 19 | 20 | - Review the python visualization landscape 21 | - Explore core visualization concepts 22 | - Use matplotlib to build and customize visualizations 23 | - Build and customize simple plots with pandas 24 | - Learn about seaborn and use it for statistical visualizations 25 | - Create visualizations using Altair 26 | - Generate interactive plots using the Plotly library 27 | - Design interactive dashboards using Streamlit 28 | - Construct highly custom and flexible dashboards using Plotly's Dash framework 29 | 30 | View the full [**course outline**](https://training.talkpython.fm/courses/python-data-visualization#course_outline). 31 | 32 | ## Who is this course for? 33 | 34 | Developers and Data Analysts that have some experience with python but have not developed a competency in a python visualization library. This course is also helpful for those that feel restricted by their current plotting tools and wish to explore other options. 35 | 36 | **Note**: All software used during this course, including editors, Python language, etc., are 100% free and open source. You won't have to buy anything to take the course. 37 | 38 | ## Take the course 39 | 40 | Data science is one of the hottest topics of the year and data visualization is a core skillset needed to properly communicate your results and discoveries. **Take this course** to get good at a wide variety of modern Python-based visualization libraries. 
41 | -------------------------------------------------------------------------------- /code/dash_full_app.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | 4 | from dash import Dash, html, dcc, Input, Output, dash_table 5 | import plotly.express as px 6 | 7 | external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"] 8 | app = Dash(__name__, external_stylesheets=external_stylesheets) 9 | 10 | styles = {"pre": {"border": "thin lightgrey solid", "overflowX": "scroll"}} 11 | 12 | src_file = Path.cwd() / "data" / "raw" / "EPA_fuel_economy_summary.csv" 13 | df = pd.read_csv(src_file) 14 | 15 | # Define the input parameters 16 | min_year = df["year"].min() 17 | max_year = df["year"].max() 18 | all_years = df["year"].unique() 19 | transmission_types = df["transmission"].unique() 20 | 21 | data_table_cols = [ 22 | "make", 23 | "model", 24 | "year", 25 | "transmission", 26 | "drive", 27 | "class_summary", 28 | "cylinders", 29 | "displ", 30 | "fuelCost08", 31 | ] 32 | 33 | # Need to keep track of button clicks to see if there is a change 34 | total_clicks = 0 35 | 36 | app.layout = html.Div( 37 | [ 38 | html.H1("Fuel Cost Analysis"), 39 | html.Div([ 40 | html.P("Talk Python Training Example"), 41 | dcc.Graph(id="histogram-with-slider", 42 | config={"displayModeBar": False}), 43 | dcc.Graph(id="scatter-plot"), 44 | html.Label("Year Range"), 45 | dcc.RangeSlider( 46 | id="year-slider", 47 | min=min_year, 48 | max=max_year, 49 | value=(min_year, max_year), 50 | marks={str(year): str(year) 51 | for year in all_years}, 52 | ), 53 | html.Label("Transmission type"), 54 | dcc.Checklist( 55 | id="transmission-list", 56 | options=[{ 57 | "label": i, 58 | "value": i 59 | } for i in transmission_types], 60 | value=transmission_types, 61 | labelStyle={"display": "inline-block"}, 62 | ), 63 | html.Hr(), 64 | html.Button("Reset selections", id="reset", n_clicks=0), 65 | 
html.H3(id="selected_count"), 66 | dash_table.DataTable( 67 | id="data-table", 68 | data=[], 69 | page_size=10, 70 | columns=[{ 71 | "name": i, 72 | "id": i 73 | } for i in data_table_cols], 74 | ), 75 | ]), 76 | ], 77 | style={"margin-bottom": "150px"}, 78 | ) 79 | 80 | 81 | @app.callback( 82 | Output("histogram-with-slider", "figure"), 83 | Output("scatter-plot", "figure"), 84 | Output("data-table", "data"), 85 | Output("selected_count", "children"), 86 | Input("year-slider", "value"), 87 | Input("transmission-list", "value"), 88 | Input("scatter-plot", "selectedData"), 89 | Input("reset", "n_clicks"), 90 | ) 91 | def update_figure(year_range, transmission_list, selectedData, n_clicks): 92 | # Global variables may cause unexepcted behavior in multi-user setup 93 | global total_clicks 94 | filtered_df = df[df["year"].between(year_range[0], year_range[1]) 95 | & df["transmission"].isin(transmission_list)] 96 | 97 | fig_hist = px.histogram( 98 | filtered_df, 99 | x="fuelCost08", 100 | color="class_summary", 101 | labels={"fuelCost08": "Annual Fuel Cost"}, 102 | nbins=40, 103 | ) 104 | 105 | fig_scatter = px.scatter( 106 | filtered_df, 107 | x="displ", 108 | y="fuelCost08", 109 | hover_data=[filtered_df.index, "make", "model", "year"], 110 | ) 111 | 112 | fig_scatter.update_layout(clickmode="event", uirevision=True) 113 | fig_scatter.update_traces(selected_marker_color="red") 114 | 115 | if n_clicks > total_clicks: 116 | # From here - https://community.plotly.com/t/applying-only-newest-selectedpoints-in-multiple-graphs-or-clearing-selection/31881 117 | fig_scatter.update_traces(selected_marker_color=None) 118 | total_clicks = n_clicks 119 | selectedData = None 120 | 121 | if selectedData: 122 | points = selectedData["points"] 123 | index_list = [ 124 | points[x]["customdata"][0] for x in range(0, len(points)) 125 | ] 126 | filtered_df = df[df.index.isin(index_list)] 127 | num_points_label = f"Showing {len(points)} selected points:" 128 | else: 129 | num_points_label 
= "No points selected - showing top 10 only" 130 | filtered_df = filtered_df.head(10) 131 | 132 | return fig_hist, fig_scatter, filtered_df.to_dict( 133 | "records"), num_points_label 134 | 135 | 136 | if __name__ == "__main__": 137 | app.run_server(debug=True) 138 | -------------------------------------------------------------------------------- /code/ch6-exercise-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3cebc250", 6 | "metadata": {}, 7 | "source": [ 8 | "## Chapter 6 Altair Data Visualization\n", 9 | "Exercise 2" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "909ae279", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from pathlib import Path\n", 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import altair as alt" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "70efbb1c", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# data_server seems to have been removed but there doesn't\n", 33 | "# seem to be an announcement (or we missed it). 
\n", 34 | "# Moving to vegafusion will make it all sign again.\n", 35 | "\n", 36 | "# alt.data_transformers.enable('data_server')\n", 37 | "\n", 38 | "alt.data_transformers.enable('vegafusion')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "3c6745dd", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "src_file = Path.cwd() / 'data' / 'raw' / 'EPA_fuel_economy_summary.csv'\n", 49 | "df = pd.read_csv(src_file)\n", 50 | "df.head()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "55cae40c", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "alt.Chart(df).mark_circle(size=50).encode(\n", 61 | " x='displ',\n", 62 | " y='fuelCost08',\n", 63 | " tooltip=['make', 'model', 'year'],\n", 64 | ").interactive()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "38f2ceca", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "chart1 = alt.Chart(df).mark_tick().encode(\n", 75 | " y='fuel_type_summary',\n", 76 | " x='barrels08'\n", 77 | ")\n", 78 | "chart2 = alt.Chart(df).mark_bar().encode(\n", 79 | " alt.X('barrels08:Q', bin=True),\n", 80 | " alt.Y('count()')\n", 81 | ")\n", 82 | "chart1 | chart2" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "538830f6", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "chart2 & chart1" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "3bdfad56", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "alt.hconcat(chart1, chart2)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "46778465", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "alt.vconcat(chart1, chart2)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "98137673", 119 | "metadata": {}, 120 | "outputs": [], 121 | 
"source": [ 122 | "alt.Chart(df).mark_circle(size=50).encode(\n", 123 | " x='displ',\n", 124 | " y='fuelCost08',\n", 125 | " color='class_summary:N',\n", 126 | " tooltip=['make', 'model', 'year'],\n", 127 | ").facet(row='class_summary:N')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "d6c7f7dc", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "alt.Chart(df).mark_circle(size=50).encode(\n", 138 | " x='displ',\n", 139 | " y='fuelCost08',\n", 140 | " color='class_summary:N',\n", 141 | " tooltip=['make', 'model', 'year'],\n", 142 | ").facet('class_summary:N', columns=2)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "071e7c16", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "base_chart = alt.Chart(df).mark_circle(size=50).encode(\n", 153 | " x='displ',\n", 154 | " y='fuelCost08',\n", 155 | " color='class_summary:N',\n", 156 | " tooltip=['make', 'model', 'year'],\n", 157 | ")\n", 158 | "\n", 159 | "base_chart.facet('class_summary:N', columns=2)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "9b35b7e1", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "bars = alt.Chart(df).mark_bar().encode(\n", 170 | " x='mean(fuelCost08):Q',\n", 171 | " y='year:O'\n", 172 | ")\n", 173 | "bars" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "1ac960a3", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "rule = alt.Chart(df).mark_rule(color='red').encode(\n", 184 | " x='mean(fuelCost08):Q'\n", 185 | ")\n", 186 | "bars + rule" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "ab415527", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "text = bars.mark_text(\n", 197 | " align='left', dx=3).encode(text=alt.Text('mean(fuelCost08):Q', format=',.0f'))\n", 198 | 
"(bars + rule + text)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "92297ff2", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "(bars + rule + text).properties(width=700)" 209 | ] 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3 (ipykernel)", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.12.0" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 5 233 | } 234 | -------------------------------------------------------------------------------- /code/ch6-exercise-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "24828c85", 6 | "metadata": {}, 7 | "source": [ 8 | "## Chapter 6 Altair Data Visualization\n", 9 | "Exercise 1" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "id": "c44a7038", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from pathlib import Path\n", 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import altair as alt" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "afa93459", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "src_file = Path.cwd() / 'data' / 'raw' / 'EPA_fuel_economy_summary.csv'" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "2190f191", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "df = pd.read_csv(src_file)\n", 43 | "df.head()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "cbff6887", 50 | 
"metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "alt.data_transformers.names()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "39e254bc", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# data_server seems to have been removed but there doesn't\n", 64 | "# seem to be an announcement (or we missed it). \n", 65 | "# Moving to vegafusion will make it all sign again.\n", 66 | "\n", 67 | "# alt.data_transformers.enable('data_server')\n", 68 | "\n", 69 | "alt.data_transformers.enable('vegafusion')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "9de36e98", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "alt.Chart(df).mark_circle().encode(\n", 80 | " x='displ',\n", 81 | " y='fuelCost08',\n", 82 | ")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "59e5f555", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "alt.Chart(df).mark_point().encode(\n", 93 | " x='displ',\n", 94 | " y='fuelCost08',\n", 95 | ")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "2209f687", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "alt.Chart(df).mark_point().encode(\n", 106 | " x='displ',\n", 107 | " y='fuelCost08',\n", 108 | " color='drive',\n", 109 | " shape='drive',\n", 110 | ")" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "a0410ea4", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "alt.Chart(df).mark_bar().encode(\n", 121 | " x='fuelCost08',\n", 122 | " y='count()',\n", 123 | ")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "fe20ec30", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "alt.Chart(df).mark_tick().encode(\n", 134 | " y='fuel_type_summary',\n", 135 | " x='barrels08'\n", 136 | ")" 137 | ] 138 | }, 139 | 
{ 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "id": "67e239c2", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "alt.Chart(df).mark_boxplot().encode(\n", 147 | " x='year',\n", 148 | " y='fuelCost08'\n", 149 | ")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "84d63b73", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "alt.Chart(df).mark_boxplot().encode(\n", 160 | " x='year:O',\n", 161 | " y='fuelCost08:Q'\n", 162 | ")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "57c40450", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "alt.Chart(df).mark_bar().encode(\n", 173 | " x='mean(fuelCost08)',\n", 174 | " y='year'\n", 175 | ")" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "e345aa0f", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "alt.Chart(df).mark_bar().encode(\n", 186 | " x='mean(fuelCost08)',\n", 187 | " y='year:O'\n", 188 | ")" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "2b1de1b7", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "alt.Chart(df).mark_bar().encode(\n", 199 | " x='fuelCost08',\n", 200 | " y='count()',\n", 201 | ")" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "id": "a656ef1b", 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "alt.Chart(df).mark_bar().encode(\n", 212 | " alt.X('fuelCost08', type='quantitative', bin=True),\n", 213 | " alt.Y(aggregate='count', type='quantitative'),\n", 214 | ")" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "id": "38989baa", 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "alt.Chart(df).mark_bar().encode(\n", 225 | " alt.X('fuelCost08:Q', bin=alt.Bin(extent=[0,5000], step=250)),\n", 226 | 
" alt.Y('count()'),\n", 227 | ")" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "b149d438", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "alt.Chart(df).mark_point().encode(\n", 238 | " alt.X('displ', type='quantitative'),\n", 239 | " alt.Y('fuelCost08'),\n", 240 | " alt.Color('cylinders', type='ordinal')\n", 241 | ")" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "63a8b75d", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "alt.Chart(df).mark_point().encode(\n", 252 | " alt.X('displ', type='quantitative'),\n", 253 | " alt.Y('fuelCost08'),\n", 254 | " alt.Color('cylinders', type='quantitative')\n", 255 | ")" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "id": "d21d2d2d", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "alt.Chart(df).mark_point().encode(\n", 266 | " alt.X('displ', type='quantitative'),\n", 267 | " alt.Y('fuelCost08'),\n", 268 | " alt.Color('cylinders', type='nominal')\n", 269 | ")" 270 | ] 271 | } 272 | ], 273 | "metadata": { 274 | "kernelspec": { 275 | "display_name": "dataviz", 276 | "language": "python", 277 | "name": "dataviz" 278 | }, 279 | "language_info": { 280 | "codemirror_mode": { 281 | "name": "ipython", 282 | "version": 3 283 | }, 284 | "file_extension": ".py", 285 | "mimetype": "text/x-python", 286 | "name": "python", 287 | "nbconvert_exporter": "python", 288 | "pygments_lexer": "ipython3", 289 | "version": "3.12.0" 290 | } 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 5 294 | } 295 | -------------------------------------------------------------------------------- /code/ch3-exercise-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "90ec1037", 6 | "metadata": {}, 7 | "source": [ 8 | "### Chapter 3 Matplotlib Data 
Visualization\n", 9 | "\n", 10 | "Exercise 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "cd0de61c", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from pathlib import Path\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "from matplotlib import ticker" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "735b302c", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "src_file = Path.cwd() / 'data' / 'raw' / 'EPA_fuel_economy.csv'\n", 35 | "image_dir = Path.cwd() / 'images'" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "de1064cd", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "df = pd.read_csv(src_file)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "5830f9e5", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "df.head()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "4535ce51", 61 | "metadata": {}, 62 | "source": [ 63 | "## Additional Plot Types\n", 64 | "Going beyond the histogram and boxplot" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "9f3e1b91", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "avg_by_year = df.groupby(['year'], as_index=False).agg({'comb08': 'mean'}).round(2)\n", 75 | "avg_by_year" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "c81563ac", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "fig, ax1 = plt.subplots()\n", 86 | "ax1.plot(avg_by_year['year'], avg_by_year['comb08']);" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "059234a4", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "fig, ax1 = plt.subplots()\n", 97 | "ax1.plot(avg_by_year['year'], 
avg_by_year['comb08'])\n", 98 | "ax1.set_xticks(np.arange(2000, 2022, 2));" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "f2f60ebb", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "fig, ax1 = plt.subplots()\n", 109 | "ax1.plot(avg_by_year['year'], avg_by_year['comb08'])\n", 110 | "ax1.xaxis.set_major_formatter(ticker.StrMethodFormatter(\"{x:0.0f}\"));" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "fb6b7a19", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "fig, ax1 = plt.subplots()\n", 121 | "ax1.bar(avg_by_year['year'], avg_by_year['comb08']);" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "682ac912", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "fig, ax1 = plt.subplots()\n", 132 | "ax1.bar(avg_by_year['year'], avg_by_year['comb08'])\n", 133 | "ax1.set_xticks(np.arange(2000, 2022, 2));" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "7194f9c0", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "fig, ax1 = plt.subplots()\n", 144 | "ax1.barh(avg_by_year['year'], avg_by_year['comb08']);" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "a49a7fba", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "fig, ax1 = plt.subplots()\n", 155 | "ax1.scatter(x=df['fuelCost08'], y=df['displ'], alpha=0.5, c=df['cylinders'])\n", 156 | "ax1.set(xlabel='Fuel Cost ($)', ylabel='Displacement')" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "0b814095", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "fig, ax1 = plt.subplots(figsize=(9,7))\n", 167 | "ax1.scatter(x=df['fuelCost08'], y=df['displ'], alpha=0.5, c=df['cylinders'])\n", 168 | "ax1.set_xlabel('Fuel Cost', size=14)\n", 169 | 
"ax1.set_ylabel('Displacement', size=14)\n", 170 | "ax1.xaxis.set_major_formatter('${x:,.0f}')\n", 171 | "ax1.tick_params(axis='x', labelrotation=45, labelsize=14)\n", 172 | "ax1.tick_params(axis='y', labelsize=14)\n", 173 | "ax1.axvline(3500, color='black', linestyle=':')\n", 174 | "ax1.annotate('Target of $3500', xy=(3500,2), size=16)\n", 175 | "ax1.grid(True)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "cfc2e4ca", 181 | "metadata": {}, 182 | "source": [ 183 | "## Using Styles" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "9375d640", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "plt.style.available" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "90b02bbc", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "plt.style.use('ggplot')" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "4b158548", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "fig, ax1 = plt.subplots()\n", 214 | "ax1.scatter(x=df['fuelCost08'], y=df['displ'], alpha=0.5, c=df['cylinders'])\n", 215 | "ax1.set(xlabel='Fuel Cost ($)', ylabel='Displacement')" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "34968e08", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "from matplotlib import style\n", 226 | "\n", 227 | "print(plt.style.available)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "92f287e1", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# Note, styles seaborn, seaborn-dark have been renamed to seaborn-v0_8, seaborn-v0_8-dark.\n", 238 | "# Hence the code change below.\n", 239 | "\n", 240 | "for style in ['Solarize_Light2', 'bmh', 'ggplot', 'fivethirtyeight', 'seaborn-v0_8', 'seaborn-v0_8-dark']:\n", 241 | " with 
plt.style.context(style):\n", 242 | " fig, ax1 = plt.subplots()\n", 243 | " ax1.scatter(x=df['fuelCost08'], y=df['displ'], alpha=0.5, c=df['cylinders'])\n", 244 | " ax1.set(title = f'style - {style}', xlabel='Fuel Cost ($)', ylabel='Displacement')" 245 | ] 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "Python 3 (ipykernel)", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.11.6" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 5 269 | } 270 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile requirements.piptools 6 | # 7 | altair==5.2.0 8 | # via 9 | # -r requirements.piptools 10 | # altair-data-server 11 | # altair-saver 12 | # altair-viewer 13 | # vegafusion 14 | altair-data-server==0.4.1 15 | # via 16 | # -r requirements.piptools 17 | # altair-saver 18 | # altair-viewer 19 | altair-saver==0.5.0 20 | # via -r requirements.piptools 21 | altair-viewer==0.4.0 22 | # via altair-saver 23 | anyio==4.1.0 24 | # via jupyter-server 25 | appnope==0.1.3 26 | # via ipykernel 27 | argon2-cffi==23.1.0 28 | # via jupyter-server 29 | argon2-cffi-bindings==21.2.0 30 | # via argon2-cffi 31 | arrow==1.3.0 32 | # via isoduration 33 | asttokens==2.4.1 34 | # via stack-data 35 | async-lru==2.0.4 36 | # via jupyterlab 37 | attrs==23.1.0 38 | # via 39 | # jsonschema 40 | # outcome 41 | # referencing 42 | # trio 43 | babel==2.13.1 44 | # via jupyterlab-server 45 | beautifulsoup4==4.12.2 46 | 
# via nbconvert 47 | bleach==6.1.0 48 | # via nbconvert 49 | certifi==2023.11.17 50 | # via 51 | # requests 52 | # selenium 53 | cffi==1.16.0 54 | # via argon2-cffi-bindings 55 | charset-normalizer==3.3.2 56 | # via requests 57 | comm==0.2.0 58 | # via 59 | # ipykernel 60 | # ipywidgets 61 | contourpy==1.2.0 62 | # via matplotlib 63 | cycler==0.12.1 64 | # via matplotlib 65 | debugpy==1.8.0 66 | # via ipykernel 67 | decorator==5.1.1 68 | # via ipython 69 | defusedxml==0.7.1 70 | # via nbconvert 71 | et-xmlfile==1.1.0 72 | # via openpyxl 73 | executing==2.0.1 74 | # via stack-data 75 | fastjsonschema==2.19.0 76 | # via nbformat 77 | fonttools==4.46.0 78 | # via matplotlib 79 | fqdn==1.5.1 80 | # via jsonschema 81 | h11==0.14.0 82 | # via wsproto 83 | idna==3.6 84 | # via 85 | # anyio 86 | # jsonschema 87 | # requests 88 | # trio 89 | ipykernel==6.27.1 90 | # via 91 | # jupyter 92 | # jupyter-console 93 | # jupyterlab 94 | # qtconsole 95 | ipython==8.18.1 96 | # via 97 | # ipykernel 98 | # ipywidgets 99 | # jupyter-console 100 | ipywidgets==8.1.1 101 | # via jupyter 102 | isoduration==20.11.0 103 | # via jsonschema 104 | jedi==0.19.1 105 | # via ipython 106 | jinja2==3.1.2 107 | # via 108 | # altair 109 | # jupyter-server 110 | # jupyterlab 111 | # jupyterlab-server 112 | # nbconvert 113 | json5==0.9.14 114 | # via jupyterlab-server 115 | jsonpointer==2.4 116 | # via jsonschema 117 | jsonschema[format-nongpl]==4.20.0 118 | # via 119 | # altair 120 | # jupyter-events 121 | # jupyterlab-server 122 | # nbformat 123 | jsonschema-specifications==2023.11.2 124 | # via jsonschema 125 | jupyter==1.0.0 126 | # via -r requirements.piptools 127 | jupyter-client==8.6.0 128 | # via 129 | # ipykernel 130 | # jupyter-console 131 | # jupyter-server 132 | # nbclient 133 | # qtconsole 134 | jupyter-console==6.6.3 135 | # via jupyter 136 | jupyter-core==5.5.0 137 | # via 138 | # ipykernel 139 | # jupyter-client 140 | # jupyter-console 141 | # jupyter-server 142 | # jupyterlab 143 | # 
nbclient 144 | # nbconvert 145 | # nbformat 146 | # qtconsole 147 | jupyter-events==0.9.0 148 | # via jupyter-server 149 | jupyter-lsp==2.2.1 150 | # via jupyterlab 151 | jupyter-server==2.11.1 152 | # via 153 | # jupyter-lsp 154 | # jupyterlab 155 | # jupyterlab-server 156 | # notebook 157 | # notebook-shim 158 | jupyter-server-terminals==0.4.4 159 | # via jupyter-server 160 | jupyterlab==4.0.9 161 | # via 162 | # -r requirements.piptools 163 | # notebook 164 | jupyterlab-pygments==0.3.0 165 | # via nbconvert 166 | jupyterlab-server==2.25.2 167 | # via 168 | # jupyterlab 169 | # notebook 170 | jupyterlab-widgets==3.0.9 171 | # via ipywidgets 172 | kaleido==0.2.1 173 | # via -r requirements.piptools 174 | kiwisolver==1.4.5 175 | # via matplotlib 176 | markupsafe==2.1.3 177 | # via 178 | # jinja2 179 | # nbconvert 180 | matplotlib==3.8.2 181 | # via 182 | # -r requirements.piptools 183 | # seaborn 184 | matplotlib-inline==0.1.6 185 | # via 186 | # ipykernel 187 | # ipython 188 | mistune==3.0.2 189 | # via nbconvert 190 | nbclient==0.9.0 191 | # via nbconvert 192 | nbconvert==7.11.0 193 | # via 194 | # jupyter 195 | # jupyter-server 196 | nbformat==5.9.2 197 | # via 198 | # jupyter-server 199 | # nbclient 200 | # nbconvert 201 | nest-asyncio==1.5.8 202 | # via ipykernel 203 | notebook==7.0.6 204 | # via jupyter 205 | notebook-shim==0.2.3 206 | # via 207 | # jupyterlab 208 | # notebook 209 | numpy==1.26.2 210 | # via 211 | # -r requirements.piptools 212 | # altair 213 | # contourpy 214 | # matplotlib 215 | # pandas 216 | # patsy 217 | # pyarrow 218 | # scipy 219 | # seaborn 220 | # statsmodels 221 | openpyxl==3.1.2 222 | # via -r requirements.piptools 223 | outcome==1.3.0.post0 224 | # via trio 225 | overrides==7.4.0 226 | # via jupyter-server 227 | packaging==23.2 228 | # via 229 | # altair 230 | # ipykernel 231 | # jupyter-server 232 | # jupyterlab 233 | # jupyterlab-server 234 | # matplotlib 235 | # nbconvert 236 | # plotly 237 | # qtconsole 238 | # qtpy 239 | # 
statsmodels 240 | pandas==2.1.3 241 | # via 242 | # -r requirements.piptools 243 | # altair 244 | # seaborn 245 | # statsmodels 246 | # vegafusion 247 | pandocfilters==1.5.0 248 | # via nbconvert 249 | parso==0.8.3 250 | # via jedi 251 | patsy==0.5.4 252 | # via statsmodels 253 | pexpect==4.9.0 254 | # via ipython 255 | pillow==10.1.0 256 | # via matplotlib 257 | platformdirs==4.0.0 258 | # via jupyter-core 259 | plotly==5.18.0 260 | # via -r requirements.piptools 261 | portpicker==1.6.0 262 | # via altair-data-server 263 | prometheus-client==0.19.0 264 | # via jupyter-server 265 | prompt-toolkit==3.0.41 266 | # via 267 | # ipython 268 | # jupyter-console 269 | protobuf==4.25.1 270 | # via vegafusion 271 | psutil==5.9.6 272 | # via 273 | # ipykernel 274 | # portpicker 275 | # vegafusion 276 | ptyprocess==0.7.0 277 | # via 278 | # pexpect 279 | # terminado 280 | pure-eval==0.2.2 281 | # via stack-data 282 | pyarrow==14.0.1 283 | # via vegafusion 284 | pycparser==2.21 285 | # via cffi 286 | pygments==2.17.2 287 | # via 288 | # ipython 289 | # jupyter-console 290 | # nbconvert 291 | # qtconsole 292 | pyparsing==3.1.1 293 | # via matplotlib 294 | pysocks==1.7.1 295 | # via urllib3 296 | python-dateutil==2.8.2 297 | # via 298 | # arrow 299 | # jupyter-client 300 | # matplotlib 301 | # pandas 302 | python-json-logger==2.0.7 303 | # via jupyter-events 304 | pytz==2023.3.post1 305 | # via pandas 306 | pyyaml==6.0.1 307 | # via jupyter-events 308 | pyzmq==25.1.1 309 | # via 310 | # ipykernel 311 | # jupyter-client 312 | # jupyter-console 313 | # jupyter-server 314 | # qtconsole 315 | qtconsole==5.5.1 316 | # via jupyter 317 | qtpy==2.4.1 318 | # via qtconsole 319 | referencing==0.31.1 320 | # via 321 | # jsonschema 322 | # jsonschema-specifications 323 | # jupyter-events 324 | requests==2.31.0 325 | # via jupyterlab-server 326 | rfc3339-validator==0.1.4 327 | # via 328 | # jsonschema 329 | # jupyter-events 330 | rfc3986-validator==0.1.1 331 | # via 332 | # jsonschema 333 | 
# jupyter-events 334 | rpds-py==0.13.2 335 | # via 336 | # jsonschema 337 | # referencing 338 | scipy==1.11.4 339 | # via statsmodels 340 | seaborn==0.13.0 341 | # via -r requirements.piptools 342 | selenium==4.15.2 343 | # via altair-saver 344 | send2trash==1.8.2 345 | # via jupyter-server 346 | six==1.16.0 347 | # via 348 | # asttokens 349 | # bleach 350 | # patsy 351 | # python-dateutil 352 | # rfc3339-validator 353 | sniffio==1.3.0 354 | # via 355 | # anyio 356 | # trio 357 | sortedcontainers==2.4.0 358 | # via trio 359 | soupsieve==2.5 360 | # via beautifulsoup4 361 | stack-data==0.6.3 362 | # via ipython 363 | statsmodels==0.14.0 364 | # via -r requirements.piptools 365 | tenacity==8.2.3 366 | # via plotly 367 | terminado==0.18.0 368 | # via 369 | # jupyter-server 370 | # jupyter-server-terminals 371 | tinycss2==1.2.1 372 | # via nbconvert 373 | toolz==0.12.0 374 | # via altair 375 | tornado==6.4 376 | # via 377 | # altair-data-server 378 | # ipykernel 379 | # jupyter-client 380 | # jupyter-server 381 | # jupyterlab 382 | # notebook 383 | # terminado 384 | traitlets==5.14.0 385 | # via 386 | # comm 387 | # ipykernel 388 | # ipython 389 | # ipywidgets 390 | # jupyter-client 391 | # jupyter-console 392 | # jupyter-core 393 | # jupyter-events 394 | # jupyter-server 395 | # jupyterlab 396 | # matplotlib-inline 397 | # nbclient 398 | # nbconvert 399 | # nbformat 400 | # qtconsole 401 | trio==0.23.1 402 | # via 403 | # selenium 404 | # trio-websocket 405 | trio-websocket==0.11.1 406 | # via selenium 407 | types-python-dateutil==2.8.19.14 408 | # via arrow 409 | tzdata==2023.3 410 | # via pandas 411 | uri-template==1.3.0 412 | # via jsonschema 413 | urllib3[socks]==2.1.0 414 | # via 415 | # requests 416 | # selenium 417 | # urllib3 418 | vegafusion==1.4.5 419 | # via -r requirements.piptools 420 | vegafusion-python-embed==1.4.5 421 | # via -r requirements.piptools 422 | vl-convert-python==1.2.0 423 | # via -r requirements.piptools 424 | wcwidth==0.2.12 425 | # via 
prompt-toolkit 426 | webcolors==1.13 427 | # via jsonschema 428 | webencodings==0.5.1 429 | # via 430 | # bleach 431 | # tinycss2 432 | websocket-client==1.6.4 433 | # via jupyter-server 434 | widgetsnbextension==4.0.9 435 | # via ipywidgets 436 | wsproto==1.2.0 437 | # via trio-websocket 438 | 439 | # The following packages are considered to be unsafe in a requirements file: 440 | # setuptools 441 | -------------------------------------------------------------------------------- /code/ch3-exercise-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2659304d", 6 | "metadata": {}, 7 | "source": [ 8 | "### Chapter 3 Matplotlib Data Visualization\n", 9 | "\n", 10 | "Exercise 3" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "8be87053", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from pathlib import Path\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import matplotlib as mpl\n", 25 | "from matplotlib import ticker\n", 26 | "import statsmodels.formula.api as smf" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "7672c4e6", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "src_file = Path.cwd() / 'data' / 'raw' / 'EPA_fuel_economy.csv'\n", 37 | "image_dir = Path.cwd() / 'images'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "id": "bd06540b", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.read_csv(src_file)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "id": "fe1c5dec", 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "
\n", 60 | "\n", 73 | "\n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | "
makemodelyearcylinderstranydisplVClassco2barrels08fuelCost08fuelTypehighway08city08comb08
0AcuraNSX20006.0Automatic 4-spd3.0Two Seaters-118.3116672600Premium221518
1AcuraNSX20006.0Manual 6-spd3.2Two Seaters-118.3116672600Premium221518
2BMWM Coupe20006.0Manual 5-spd3.2Two Seaters-117.3478952500Premium231719
3BMWZ3 Coupe20006.0Automatic 4-spd2.8Two Seaters-117.3478952500Premium241719
4BMWZ3 Coupe20006.0Manual 5-spd2.8Two Seaters-117.3478952500Premium241719
\n", 181 | "
" 182 | ], 183 | "text/plain": [ 184 | " make model year cylinders trany displ VClass co2 \\\n", 185 | "0 Acura NSX 2000 6.0 Automatic 4-spd 3.0 Two Seaters -1 \n", 186 | "1 Acura NSX 2000 6.0 Manual 6-spd 3.2 Two Seaters -1 \n", 187 | "2 BMW M Coupe 2000 6.0 Manual 5-spd 3.2 Two Seaters -1 \n", 188 | "3 BMW Z3 Coupe 2000 6.0 Automatic 4-spd 2.8 Two Seaters -1 \n", 189 | "4 BMW Z3 Coupe 2000 6.0 Manual 5-spd 2.8 Two Seaters -1 \n", 190 | "\n", 191 | " barrels08 fuelCost08 fuelType highway08 city08 comb08 \n", 192 | "0 18.311667 2600 Premium 22 15 18 \n", 193 | "1 18.311667 2600 Premium 22 15 18 \n", 194 | "2 17.347895 2500 Premium 23 17 19 \n", 195 | "3 17.347895 2500 Premium 24 17 19 \n", 196 | "4 17.347895 2500 Premium 24 17 19 " 197 | ] 198 | }, 199 | "execution_count": 4, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "df.head()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 5, 211 | "id": "d0b63dde", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "%matplotlib inline" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 6, 221 | "id": "01181bed", 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/html": [ 227 | "
\n", 228 | "\n", 241 | "\n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | "
yearfuelCost08
020002184.94
120012201.48
220022229.38
320032273.99
420042264.57
520052271.01
620062287.00
720072284.15
820082285.68
920092183.23
1020102116.50
1120112103.86
1220122066.26
1320131996.78
1420141987.84
1520151943.05
1620161894.37
1720171894.86
1820181905.42
1920191900.22
2020201920.10
\n", 357 | "
" 358 | ], 359 | "text/plain": [ 360 | " year fuelCost08\n", 361 | "0 2000 2184.94\n", 362 | "1 2001 2201.48\n", 363 | "2 2002 2229.38\n", 364 | "3 2003 2273.99\n", 365 | "4 2004 2264.57\n", 366 | "5 2005 2271.01\n", 367 | "6 2006 2287.00\n", 368 | "7 2007 2284.15\n", 369 | "8 2008 2285.68\n", 370 | "9 2009 2183.23\n", 371 | "10 2010 2116.50\n", 372 | "11 2011 2103.86\n", 373 | "12 2012 2066.26\n", 374 | "13 2013 1996.78\n", 375 | "14 2014 1987.84\n", 376 | "15 2015 1943.05\n", 377 | "16 2016 1894.37\n", 378 | "17 2017 1894.86\n", 379 | "18 2018 1905.42\n", 380 | "19 2019 1900.22\n", 381 | "20 2020 1920.10" 382 | ] 383 | }, 384 | "execution_count": 6, 385 | "metadata": {}, 386 | "output_type": "execute_result" 387 | } 388 | ], 389 | "source": [ 390 | "avg_by_year = df.groupby(['year'], as_index=False).agg({'fuelCost08': 'mean'}).round(2)\n", 391 | "avg_by_year" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 7, 397 | "id": "02d213d7", 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "mpg_model = smf.ols(\"fuelCost08 ~ year\", data=avg_by_year).fit()" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 8, 407 | "id": "343ec311", 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "0 2325.850476\n", 414 | "1 2303.716333\n", 415 | "2 2281.582190\n", 416 | "3 2259.448048\n", 417 | "4 2237.313905\n", 418 | "5 2215.179762\n", 419 | "6 2193.045619\n", 420 | "7 2170.911476\n", 421 | "8 2148.777333\n", 422 | "9 2126.643190\n", 423 | "10 2104.509048\n", 424 | "11 2082.374905\n", 425 | "12 2060.240762\n", 426 | "13 2038.106619\n", 427 | "14 2015.972476\n", 428 | "15 1993.838333\n", 429 | "16 1971.704190\n", 430 | "17 1949.570048\n", 431 | "18 1927.435905\n", 432 | "19 1905.301762\n", 433 | "20 1883.167619\n", 434 | "dtype: float64" 435 | ] 436 | }, 437 | "execution_count": 8, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | 
"source": [ 443 | "mpg_model.fittedvalues" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 9, 449 | "id": "8bf69ac9", 450 | "metadata": {}, 451 | "outputs": [ 452 | { 453 | "data": { 454 | "text/html": [ 455 | "\n", 456 | "\n", 457 | "\n", 458 | " \n", 459 | "\n", 460 | "\n", 461 | " \n", 462 | "\n", 463 | "\n", 464 | " \n", 465 | "\n", 466 | "\n", 467 | " \n", 468 | "\n", 469 | "\n", 470 | " \n", 471 | "\n", 472 | "\n", 473 | " \n", 474 | "\n", 475 | "\n", 476 | " \n", 477 | "\n", 478 | "\n", 479 | " \n", 480 | "\n", 481 | "\n", 482 | " \n", 483 | "\n", 484 | "
OLS Regression Results
Dep. Variable: fuelCost08 R-squared: 0.795
Model: OLS Adj. R-squared: 0.784
Method: Least Squares F-statistic: 73.69
Date: Sat, 09 Oct 2021 Prob (F-statistic): 5.79e-08
Time: 11:20:05 Log-Likelihood: -118.43
No. Observations: 21 AIC: 240.9
Df Residuals: 19 BIC: 242.9
Df Model: 1
Covariance Type: nonrobust
\n", 485 | "\n", 486 | "\n", 487 | " \n", 488 | "\n", 489 | "\n", 490 | " \n", 491 | "\n", 492 | "\n", 493 | " \n", 494 | "\n", 495 | "
coef std err t P>|t| [0.025 0.975]
Intercept 4.659e+04 5182.756 8.990 0.000 3.57e+04 5.74e+04
year -22.1341 2.578 -8.584 0.000 -27.531 -16.737
\n", 496 | "\n", 497 | "\n", 498 | " \n", 499 | "\n", 500 | "\n", 501 | " \n", 502 | "\n", 503 | "\n", 504 | " \n", 505 | "\n", 506 | "\n", 507 | " \n", 508 | "\n", 509 | "
Omnibus: 0.027 Durbin-Watson: 0.286
Prob(Omnibus): 0.986 Jarque-Bera (JB): 0.137
Skew: 0.063 Prob(JB): 0.934
Kurtosis: 2.624 Cond. No. 6.67e+05


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.67e+05. This might indicate that there are
strong multicollinearity or other numerical problems." 510 | ], 511 | "text/plain": [ 512 | "\n", 513 | "\"\"\"\n", 514 | " OLS Regression Results \n", 515 | "==============================================================================\n", 516 | "Dep. Variable: fuelCost08 R-squared: 0.795\n", 517 | "Model: OLS Adj. R-squared: 0.784\n", 518 | "Method: Least Squares F-statistic: 73.69\n", 519 | "Date: Sat, 09 Oct 2021 Prob (F-statistic): 5.79e-08\n", 520 | "Time: 11:20:05 Log-Likelihood: -118.43\n", 521 | "No. Observations: 21 AIC: 240.9\n", 522 | "Df Residuals: 19 BIC: 242.9\n", 523 | "Df Model: 1 \n", 524 | "Covariance Type: nonrobust \n", 525 | "==============================================================================\n", 526 | " coef std err t P>|t| [0.025 0.975]\n", 527 | "------------------------------------------------------------------------------\n", 528 | "Intercept 4.659e+04 5182.756 8.990 0.000 3.57e+04 5.74e+04\n", 529 | "year -22.1341 2.578 -8.584 0.000 -27.531 -16.737\n", 530 | "==============================================================================\n", 531 | "Omnibus: 0.027 Durbin-Watson: 0.286\n", 532 | "Prob(Omnibus): 0.986 Jarque-Bera (JB): 0.137\n", 533 | "Skew: 0.063 Prob(JB): 0.934\n", 534 | "Kurtosis: 2.624 Cond. No. 6.67e+05\n", 535 | "==============================================================================\n", 536 | "\n", 537 | "Notes:\n", 538 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 539 | "[2] The condition number is large, 6.67e+05. 
This might indicate that there are\n", 540 | "strong multicollinearity or other numerical problems.\n", 541 | "\"\"\"" 542 | ] 543 | }, 544 | "execution_count": 9, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "mpg_model.summary()" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 10, 556 | "id": "a715f8b7", 557 | "metadata": {}, 558 | "outputs": [ 559 | { 560 | "data": { 561 | "image/png": "\n", 562 | "text/plain": [ 563 | "
" 564 | ] 565 | }, 566 | "metadata": { 567 | "needs_background": "light" 568 | }, 569 | "output_type": "display_data" 570 | } 571 | ], 572 | "source": [ 573 | "fig, ax = plt.subplots()\n", 574 | "ax.scatter(x=avg_by_year['year'], y=avg_by_year['fuelCost08'])\n", 575 | "ax.plot(avg_by_year['year'], mpg_model.fittedvalues);" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 11, 581 | "id": "9a3a36f7", 582 | "metadata": {}, 583 | "outputs": [ 584 | { 585 | "data": { 586 | "image/png": "\n", 587 | "text/plain": [ 588 | "
" 589 | ] 590 | }, 591 | "metadata": { 592 | "needs_background": "light" 593 | }, 594 | "output_type": "display_data" 595 | } 596 | ], 597 | "source": [ 598 | "fig, ax = plt.subplots()\n", 599 | "ax.scatter(x=avg_by_year['year'], y=avg_by_year['fuelCost08'])\n", 600 | "ax.plot(avg_by_year['year'], mpg_model.fittedvalues)\n", 601 | "ax.set_xlim((2010,2020))\n", 602 | "ax.set_ylim((1800,2200));" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 13, 608 | "id": "f49c42e4", 609 | "metadata": {}, 610 | "outputs": [ 611 | { 612 | "data": { 613 | "text/plain": [ 614 | "1970.0" 615 | ] 616 | }, 617 | "execution_count": 13, 618 | "metadata": {}, 619 | "output_type": "execute_result" 620 | } 621 | ], 622 | "source": [ 623 | "df_2010 = df.query('year >= 2010').copy()\n", 624 | "avg_fuel_cost = df_2010['fuelCost08'].mean().round(0)\n", 625 | "avg_fuel_cost" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 14, 631 | "id": "a24166c0", 632 | "metadata": {}, 633 | "outputs": [ 634 | { 635 | "data": { 636 | "image/png": "\n", 637 | "text/plain": [ 638 | "
" 639 | ] 640 | }, 641 | "metadata": {}, 642 | "output_type": "display_data" 643 | } 644 | ], 645 | "source": [ 646 | "mpl.style.use('ggplot')\n", 647 | "fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, \n", 648 | " figsize=(12,6))\n", 649 | "\n", 650 | "ax1.scatter(x=avg_by_year['year'], \n", 651 | " y=avg_by_year['fuelCost08'])\n", 652 | "ax1.plot(avg_by_year['year'], \n", 653 | " mpg_model.fittedvalues, \n", 654 | " color='forestgreen', linestyle='--')\n", 655 | "\n", 656 | "ax1.set(xlabel='Year', ylabel='Fuel Cost', \n", 657 | " ylim=(1850, 2200), xlim=(2010,2020))\n", 658 | "ax1.yaxis.set_major_formatter('${x:,.0f}')\n", 659 | "ax1.axhline(avg_fuel_cost, linestyle=':', color='orange')\n", 660 | "ax1.annotate(f'${avg_fuel_cost}', xy=(2017, avg_fuel_cost))\n", 661 | "\n", 662 | "ax2.hist(df_2010['fuelCost08'], color = \"skyblue\", ec=\"white\")\n", 663 | "ax2.xaxis.set_major_formatter('${x:,.0f}')\n", 664 | "ax2.set(xlabel='Fuel Costs', ylabel='Num autos')\n", 665 | "ax2.axvline(avg_fuel_cost, linestyle=':')\n", 666 | "ax2.annotate(f'${avg_fuel_cost}', xy=(avg_fuel_cost, 3500))\n", 667 | "\n", 668 | "fig.suptitle('EPA Estimated FuelCosts', \n", 669 | " weight='bold', size=14)\n", 670 | "fig.savefig(image_dir/'line_hist.svg', \n", 671 | " transparent=False, dpi=200, bbox_inches=\"tight\")" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "id": "7416766b", 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [] 681 | } 682 | ], 683 | "metadata": { 684 | "kernelspec": { 685 | "display_name": "Python 3 (ipykernel)", 686 | "language": "python", 687 | "name": "python3" 688 | }, 689 | "language_info": { 690 | "codemirror_mode": { 691 | "name": "ipython", 692 | "version": 3 693 | }, 694 | "file_extension": ".py", 695 | "mimetype": "text/x-python", 696 | "name": "python", 697 | "nbconvert_exporter": "python", 698 | "pygments_lexer": "ipython3", 699 | "version": "3.8.11" 700 | } 701 | }, 702 | "nbformat": 4, 703 | 
"nbformat_minor": 5 704 | } 705 | --------------------------------------------------------------------------------