├── .RData
├── .Rhistory
├── .Rproj.user
    ├── 3B0EFA5
    │   ├── pcs
    │   │   ├── debug-breakpoints.pper
    │   │   ├── files-pane.pper
    │   │   ├── source-pane.pper
    │   │   ├── windowlayoutstate.pper
    │   │   └── workbench-pane.pper
    │   ├── persistent-state
    │   ├── rmd-outputs
    │   ├── saved_source_markers
    │   ├── sdb
    │   │   ├── per
    │   │   │   └── t
    │   │   │   │   ├── C00A79D2
    │   │   │   │   ├── EEFE0FD5
    │   │   │   │   ├── F1EE8CA4
    │   │   │   │   └── FE94A69D
    │   │   ├── prop
    │   │   │   ├── 32602BA4
    │   │   │   ├── 3B0BAF17
    │   │   │   ├── 460054D1
    │   │   │   ├── 8DD8A556
    │   │   │   ├── 945C141F
    │   │   │   ├── C5350C47
    │   │   │   ├── D635D4D3
    │   │   │   ├── DA516F3B
    │   │   │   ├── E7318A37
    │   │   │   ├── ED483BAF
    │   │   │   ├── EE8F787
    │   │   │   ├── FFC79A6C
    │   │   │   └── INDEX
    │   │   └── s-21D18981
    │   │   │   ├── 206A0EDA
    │   │   │   ├── 3F887430
    │   │   │   ├── D09CC3F1
    │   │   │   ├── EB01FDAC
    │   │   │   └── lock_file
    │   └── session-persistent-state
    └── shared
    │   └── notebooks
    │       └── paths
├── .gitignore
├── Data Visualization - Part 1.Rmd
├── Data Visualization - Part 1._pub.html
├── Data Visualization - Part 1.md
├── Data Visualization - Part 2.Rmd
├── Data Visualization - Part 2._pub.html
├── Data Visualization - Part 2.md
├── Data Visualization - Part 3.Rmd
├── Data Visualization - Part 3._pub.html
├── Data Visualization - Part 3.md
├── Data Visualization - Tropical Storms.Rmd
├── Data Visualization Lesson.Rmd
├── Data Visualization Lesson._pub.html
├── Data Visualization Lesson.md
├── Data-Visualization-Lesson.Rproj
├── Data_Visualization_-_Part_1.html
├── Data_Visualization_-_Part_1.pdf
├── Data_Visualization_-_Part_2.html
├── Data_Visualization_-_Part_2.pdf
├── Data_Visualization_-_Part_3.html
├── Data_Visualization_-_Part_3_files
    └── figure-html
    │   ├── unnamed-chunk-2-1.png
    │   ├── unnamed-chunk-3-1.png
    │   ├── unnamed-chunk-4-1.png
    │   └── unnamed-chunk-5-1.png
├── Data_Visualization_-_Tropical_Storms.Rmd
├── Data_Visualization_-_Tropical_Storms.html
├── Data_Visualization_-_Tropical_Storms_files
    └── figure-html
    │   ├── unnamed-chunk-2-1.png
    │   ├── unnamed-chunk-5-1.png
    │   ├── unnamed-chunk-7-1.png
    │   └── unnamed-chunk-8-1.png
├── Data_Visualization_Lesson.html
├── Monthly Crude Oil Production by State 1981 - Nov 2016.csv
├── Oil_Production_By_State.html
├── README.md
├── data
    └── Historical_Tropical_Storm_Tracks.csv
├── data_preparation.R
├── figure
    ├── titlePhoto-1.png
    ├── unnamed-chunk-1-1.png
    ├── unnamed-chunk-10-1.png
    ├── unnamed-chunk-11-1.png
    ├── unnamed-chunk-12-1.png
    ├── unnamed-chunk-13-1.png
    ├── unnamed-chunk-14-1.png
    ├── unnamed-chunk-15-1.png
    ├── unnamed-chunk-16-1.png
    ├── unnamed-chunk-17-1.png
    ├── unnamed-chunk-18-1.png
    ├── unnamed-chunk-19-1.png
    ├── unnamed-chunk-2-1.png
    ├── unnamed-chunk-20-1.png
    ├── unnamed-chunk-3-1.png
    ├── unnamed-chunk-4-1.png
    ├── unnamed-chunk-5-1.png
    ├── unnamed-chunk-6-1.png
    ├── unnamed-chunk-7-1.png
    ├── unnamed-chunk-8-1.png
    └── unnamed-chunk-9-1.png
├── ggmapTemp.png
├── hurricane_leaflet.html
├── images
    ├── bad-pie1-fix.png
    ├── bad-pie1.png
    ├── chart_vs_text.png
    ├── lie_chart_bad.png
    ├── lie_chart_fixed.png
    ├── tg_tb_tu.jpg
    ├── tg_tb_tu.xcf
    ├── title_photo.png
    ├── title_photo_2.png
    └── title_photo_3.png
└── m.html


/.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/.RData


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/pcs/debug-breakpoints.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "debugBreakpointsState" : {
3 |         "breakpoints" : [
4 |         ]
5 |     }
6 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/pcs/files-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "path" : "~/Documents/GitHub/Data-Visualization-Lesson",
3 |     "sortOrder" : [
4 |         {
5 |             "ascending" : true,
6 |             "columnIndex" : 2
7 |         }
8 |     ]
9 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/pcs/source-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "activeTab" : 3
3 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/pcs/windowlayoutstate.pper:
--------------------------------------------------------------------------------
 1 | {
 2 |     "left" : {
 3 |         "panelheight" : 740,
 4 |         "splitterpos" : 309,
 5 |         "topwindowstate" : "NORMAL",
 6 |         "windowheight" : 778
 7 |     },
 8 |     "right" : {
 9 |         "panelheight" : 740,
10 |         "splitterpos" : 465,
11 |         "topwindowstate" : "NORMAL",
12 |         "windowheight" : 778
13 |     }
14 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/pcs/workbench-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 |     "TabSet1" : 0,
3 |     "TabSet2" : 3,
4 |     "TabZoom" : {
5 |     }
6 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/persistent-state:
--------------------------------------------------------------------------------
 1 | build-last-errors="[]"
 2 | build-last-errors-base-dir=""
 3 | build-last-outputs="[]"
 4 | compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}"
 5 | console_procs="[]"
 6 | files.monitored-path=""
 7 | find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":true,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}"
 8 | imageDirtyState="1"
 9 | saveActionState="-1"
10 | 


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/rmd-outputs:
--------------------------------------------------------------------------------
 1 | ~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html
 2 | ~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html
 3 | ~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html
 4 | ~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html
 5 | ~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/saved_source_markers:
--------------------------------------------------------------------------------
1 | {"active_set":"","sets":[]}


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/per/t/C00A79D2:
--------------------------------------------------------------------------------
 1 | {
 2 |     "collab_server" : "",
 3 |     "contents" : "library(knitr)\n# Set figure dimensions\n#opts_chunk$set(fig.width=5, fig.height=5)\n# Set figures to upload to imgur.com\n#opts_knit$set(upload.fun = imgur_upload, base.url = NULL)\nopts_knit$set(upload.fun = function(file){library(RWordPress);uploadFile(file)$url;})\n\nrmd.file <- \"Data Visualization - Part 3.Rmd\"\n# Knit the .Rmd file\nknit(rmd.file)\n# Set up input/ output files\nmarkdown.file <- gsub(pattern = \"Rmd$\", replacement = \"md\", x = rmd.file)\nhtml.file <- gsub(pattern = \"md$\", replacement = \"_pub.html\", x = markdown.file)\n\nlibrary(markdown)\n# Removes 'yaml' information\nmarkdownToHTML(file = markdown.file, output = html.file, fragment.only = TRUE)\n\nlibrary(RWordPress)\n# Set your WP username, password, and your site URL\noptions(WordpressLogin = c(stoltzmaniac = 'ejkDD$$ckckslppzzzekAABV'),\n        WordpressURL = 'https://stoltzmaniac.com/xmlrpc.php')\n# Create a line-by-line text vector\ntext = paste(readLines(html.file), collapse = \"\\n\")\n# Send to Worpdress\nnewPost(list(description = text, title = \"Data Visualization - Part 3\"), publish = FALSE)\n",
 4 |     "created" : 1491592764133.000,
 5 |     "dirty" : false,
 6 |     "encoding" : "UTF-8",
 7 |     "folds" : "",
 8 |     "hash" : "2697624838",
 9 |     "id" : "C00A79D2",
10 |     "lastKnownWriteTime" : 1491595519,
11 |     "last_content_update" : 1491595519472,
12 |     "path" : "~/Desktop/uploading to wp.R",
13 |     "project_path" : null,
14 |     "properties" : {
15 |         "tempName" : "Untitled1"
16 |     },
17 |     "relative_order" : 4,
18 |     "source_on_save" : false,
19 |     "source_window" : "",
20 |     "type" : "r_source"
21 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/per/t/EEFE0FD5:
--------------------------------------------------------------------------------
 1 | {
 2 |     "collab_server" : "",
 3 |     "contents" : "---\ntitle: \"Data Visualization - Part 3\"\nauthor: \"Scott Stoltzman\"\ndate: \"April 7, 2017\"\noutput: html_document\n---\n\n\n### What Type of Data Visualization Do You Choose (if any)?  \n\nDetermining whether or not you need a visualization is ***step one***. While it seems silly, this is probably something everyone (including myself) should be doing more often. A lot of times, it seems like a great way to showcase the amount of work you have been doing, but winds up being completely ineffective and could potentially harm what you're doing. Once you determine that you actually need to visualize your data, you should have a rough idea of the options to look at. This post will explain and demonstrate some of the common types of charts and plots.  \n\n\n```{r, echo=FALSE,results='hide', warning=FALSE, message=FALSE}\nlibrary(png)\nlibrary(grid)\n```\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/title_photo_3.png\")\ngrid.raster(img)\n```\n\nThis is Part 3 in a series about Data visualization:  \n\n* [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1?utm_medium=SERIES&utm_source=DATA_VISUALIZATION_TOP)\n* [Data Visualization - Part 2](https://www.stoltzmaniac.com/data-visualization-part-2?utm_medium=SERIES&utm_source=DATA_VISUALIZATION_TOP)\n\n#### Determine whether or not you actually need a visualization in the first place.\n\nLike the best practices I listed in [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1?utm_medium=SERIES&utm_source=DATA_VISUALIZATION_MID_ARTICLE), make sure your visualizations:\n\n- Are clearly illustrating a relevant point  \n- Are tailored to the appropriate audience  \n- Are tailored to the presentation medium  \n- Are memorable to those who care about the material  \n- Are increasing the understanding of the subject matter  \n  \nIf these don't seem possible, ***you probably don't need a data visualization.***  \n\n#### If 
you do need one, what's a good first step to take?\n\nTake a look at the forum in which you're presenting, it matters! If you are writing for a scientific journal, it will be different than presenting live to a thousand person audience. Think about a Ted Talk compared to the Journal of Physics.  \n\nPoint being: **consider your audience!**  \n\nLet's talk about a high-level presentation. Everyone has seen a slideshow with fancy charts that add zero value. Do not be the person presenting something that way! Providing useless content will confuse the audience and/or lead to boredom.\n\nIf your point is to show year-over-year change of a single metric - show it as a simple number on the page in big bold font rather than a chart.\n\nIn this made up example, I am displaying revenue over the last few years (***note: be more specific*** when it comes to what type of revenue you're talking about).  \n\nWhich of the following makes more sense to put on a slide?\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/chart_vs_text.png\")\ngrid.raster(img)\n```\n\nIf you agree with me, the one on the right will be much easier for people to understand in a presentation. It gets the point across without requiring processing which will allow people to focus on what is important. Any additional nuggets you would like to point out can be spoken to. \n\nNow, let's talk about publishing content that isn't for academic use but will reach the public (i.e. newspapers, magazines, blogs, etc.). These types of charts can cover a wide range of topics so we'll have to stick to the basics. We're going to look at displaying information which is interesting and adds value.  
\n\nHere is a great example from [Junk Charts](http://junkcharts.typepad.com/junk_charts/2017/04/what-does-lying-politicians-have-in-common-with-rainbow-colors.html) in which the author of the original [Daily Kos Article](http://www.dailykos.com/story/2016/8/7/1556666/-Three-lessons-from-the-rise-of-Donald-Trump) is showing a type of \"lie detector\" chart. The chart does a number of things well: it illustrates a relevant point, it is appropriate to the audience and medium, and really helps to understand the subject matter better. However, the original chart is too colorful which takes away from its effectiveness. Junk Charts took it to the next level by simplifying the colors and axes.  \n\n\n#### Original Version (Daily Kos)\n```{r, fig.align='center',echo=FALSE,fig.height=6,fig.show='hold'}\nimg = readPNG(\"images/lie_chart_bad.png\")\ngrid.raster(img)\n```  \n\n#### Modified Version (Junk Charts)\n```{r, fig.align='center',echo=FALSE,fig.height=6,fig.show='hold'}\nimg = readPNG(\"images/lie_chart_fixed.png\")\ngrid.raster(img)\n```  \n\nBy merely looking at this chart you can see how it is ranked, a sense of scale, the comparison between people, and clearly labeled names. Fantastic work!  \n\nRather than going over more examples of work others are doing, please visit [Chart Porn](http://chartporn.org/) (don't worry about the name, it's a great data visualization site) and [Junk Charts](http://junkcharts.typepad.com/). They have phenomenal examples of what to do (and what not to do) when publishing to the public.\n\n#### You have a point, now what?  \n\nThere is no rulebook as to how to display your data. However, as you have seen, there are both great and poor options. 
The choice is up to you - so think long and hard before making a decision (and you can always try a number of them out on people before publishing).\n\n**Ask yourself the following questions to help drive your decision:**  \n\n- Are you making a comparison?\n- Are you finding a relationship?\n- Are you showing a distribution?\n- Are you finding a trend over time?\n- Are you showing composition?\n  \nOnce you know which question you are asking, it will keep your mind focused on the outcome and will quickly narrow down your charting options.\n\n#### Rule of Thumb  \n\n- **Trend:** Column, Line  \n- **Comparison:** Area, Bar, Bullet, Column, Line, Scatter  \n- **Relationship:** Line, Scatter  \n- **Distribution:** Bar, Boxplot, Column  \n- **Composition:** Donut, Pie, Stacked Bar, Stacked Column  \n  \nObviously, there are plenty of choices beyond these, so don't hesitate to use what works best. I will go over some of these basics and show some comparisons of poor charting techniques vs. slightly better ones.\n\nFor this project, I'll use some oil production data that I found while digging through http://data.world (pretty great site). The data can be found [here](http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm)  \n\nLet's load up some libraries and get started.\n\n```{r libraryPrep, results='hide', warning=FALSE, message=FALSE}\nlibrary(ggplot2)\nlibrary(dplyr)\nlibrary(tidyr)\nlibrary(lubridate)\nlibrary(scales)\n```\n\n\n\n```{r dataLoading, results='hide', warning=FALSE, message=FALSE}\n#Custom data preparation\n#GitHub (linked to at bottom of this post)\nsource('data_preparation.R')\ndata = getData()\n```\n\n```{r}\nhead(data)\n```\n\n----  \n\n## Trend - Line Chart\n\n**Objective:** Visualize a trend in oil production in the US from 1981 - 2016 by year. I want to illustrate the changes over the time period. 
This is a very high-level view and only shows us a decline followed by a ramp up at the end of the period.\n\n#### Poor Version  \nThe x-axis is a disaster and the y-axis isn't formatted well. While it gets the point across, it's still worthless.\n\n\n```{r,fig.align='center', fig.width=4}\ndf = data %>% \n  group_by(Year) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) \np + geom_line(stat='identity') + \n  ggtitle('Oil Production Over Time') + \n  theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + \n  xlab('') + ylab('')\n```\n\n#### Better Version  \nThe title gives us a much better understanding of what we're looking at. The chart is slightly wider and the axes are formatted to be legible.\n\n```{r,fig.align='center', fig.width=12}\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) \np + geom_line(stat='identity') + \n  ggtitle('Thousand Barrel Oil Production By Year in the U.S.') +\n  theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + \n  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + \n  scale_y_continuous(labels = comma)\n```\n\n\n----\n\n## Comparison - Line Chart  \n\n**Objective**: Identify which states affected the trend the most. Evaluate them simultaneously in order to paint the picture and compare their trends over the time period. From this visual you can see the top states are Alaska, California, Louisiana, Oklahoma, Texas and Wyoming. Texas seems to break the mold quite drastically and drove the spike which occurred after 2010.\n\n#### Poor Version  \nThere are far too many colors going on here. Everything at the bottom of the chart is relatively useless and takes our focus away from the big players. 
\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n  group_by(Location, Year) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\ndf$Year = as.numeric(df$Year)\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location))\np + geom_line(stat='identity') + \n  ggtitle(paste('Oil Production By Year By State in the U.S.')) + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n#### Better Version  \nThis focuses attention on the top producing states. It compares them to each other and shows the trend per state as well. Using facet_wrap() tends to be used in what's known as \"small multiples\" - this is a technique which helps to break up the visual components of the data into easy-to-understand pieces which make intuitive sense.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\nn=6 #Arbitrary at first, after trying a few, this made the most sense\ntopN = data %>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  arrange(-ThousandBarrel) %>%\n  top_n(n)\n\ndf = data %>%\n  filter(Location %in% topN$Location) %>%\n  group_by(Year,Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\ndf$Year = as.numeric(df$Year)\ndf$Location = as.factor(df$Location)\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1))\np + geom_line(stat='identity') + \n  ggtitle(paste('Top',as.character(n),'States - Oil Production By Year in the U.S.')) + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + \n  facet_wrap(~Location) + \n  scale_y_continuous(labels = comma) \n\n```\n\n----\n\n## Relationship - Scatter Plot\n\n**Objective**: Check to see if data from Alaska and California is correlated. While this isn't extremely interesting, it does allow us to use this same data set (sorry). 
The charts indicate that there appears to be a strong positive correlation between the two states.\n\n#### Poor Version  \nLots of completely irrelevant data! The size of the point should have nothing to do with the year.  \n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\nstatesList = c('Alaska','California')\ndf = data %>%\n  filter(Location %in% statesList) %>%\n  spread(Location,ThousandBarrel) %>%\n  select(Alaska,California,Month,Year)\n\np = ggplot(df,aes(x=Alaska,y=California,col=Month,size=Year))\np + geom_point() + \n  scale_y_continuous(labels = comma) +\n  scale_x_continuous(labels = comma) +\n  ggtitle('Oil Production - CA vs. AK') + \n  theme(plot.title = element_text(hjust = 0.5))\n\n```\n\n#### Better Version  \nThe points are all the same size and a trend line helps to visualize the relationship. While it can sometimes be misleading, it makes sense with our current data.  \n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n  filter(Location %in% statesList) %>%\n  spread(Location,ThousandBarrel) %>%\n  select(Alaska,California,Year)\n\np = ggplot(df,aes(x=Alaska,y=California))\np + geom_point() + \n  scale_y_continuous(labels = comma) +\n  scale_x_continuous(labels = comma) +\n  ggtitle('Monthly Thousand Barrel Oil Production 1981-2016 CA vs. AK') + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  geom_smooth(method='lm')\n\n```\n\n## Distribution - Boxplot \n\n**Objective**: Examine the range of production by state (per year) to give us an idea of the variance. While the sums and means are nice, it's quite important to have an idea of distributions. While it was semi-apparent in the line charts, the variance of Texas is huge compared to the others! \n\n\n#### Poor Version  \nAlphabetical order doesn't add any value, names are overlapping on top of each other. 
While you can tell who the big players are, this visual does not add the value it should.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n  group_by(Year,Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Location,y=ThousandBarrel))\np + geom_boxplot() + \n  ggtitle('Distribution of Oil Production by State')\n\n```\n\n\n#### Better Version  \nThis gives a nice ranking to the plot while still showing their distributions. We could take this a step further and separate out the big players from the small players (I'll leave that up to you).\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\np = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel))\np + geom_boxplot() + \n  scale_y_continuous(labels = comma) +\n  ggtitle('Distribution of Annual Oil Production By State (1981 - 2016)') + \n  coord_flip()\n```\n\n\n## Composition - Stacked Bar \n\n**Objective**: Check out the composition of total production by state. It's interesting to see how the composition was relatively similar across decades until the 2010's. Texas was 50% of the output!\n\n\n#### Poor Version  \nMy favorite, the beautiful pie chart! There's nothing better than this... (no need for further commentary).\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))\n\ndf$ThousandBarrel = round(100*df$ThousandBarrel,0)\n\nlibrary(plotrix)\npie(x=df$ThousandBarrel,labels=df$Location,explode=0.1,col=rainbow(nrow(df)),main='Percentage of Oil Production by State')\n\n```\n\n\n#### Better Version  \nThe 1980's and 2010's will be missing years in terms of a \"decade\" due to the data provided (and it's only 2017). While the percentage labels are slightly off center, it's certainly much better than the pie chart. 
It's not quite \"apples-to-apples\" for a comparison because I created different decades, but you get the idea.\n\nI also created an \"Other\" category in order to simplify the output. When you are doing comparisons, it's typically a good idea to find a way to reduce the number of variables in the output while not removing data by dropping it completely - **do this carefully and transparently!**\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndata$Decade = '1980s'\ndata$Decade[data$Year >= 1990] = '1990s'\ndata$Decade[data$Year >= 2000] = '2000s'\ndata$Decade[data$Year >= 2010] = '2010s'\ndata$Decade = as.factor(data$Decade)\n\ntop5 = data %>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  arrange(-ThousandBarrel) %>%\n  top_n(5) %>%\n  select(Location)\n\ntop5List = top5$Location\n\ndata$State = \"Other\"\n\nfor(i in 1:length(top5List)){\n  data$State[data$Location == top5List[i]] = top5List[i]\n}\n\ndf = data %>%\n  group_by(Decade,State) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))\n\ndf$ThousandBarrel = round(df$ThousandBarrel,3)\ndf$text = paste(round(100*df$ThousandBarrel,0),'%', sep='')\n\np = ggplot(df,aes(x=Decade,y=ThousandBarrel,col=reorder(State,ThousandBarrel),fill=reorder(State,ThousandBarrel)))\np + geom_bar(stat='identity') + \n  geom_text(aes(label=text),col='Black',size = 4, hjust = 0.5, vjust = 3, position = \"stack\") + \n  scale_y_continuous(labels = percent) +\n  ggtitle('Percentage of Top Oil Producing States by Decade') + \n  guides(fill=guide_legend(title='State'),col=guide_legend(title='State')) + \n  theme(plot.title = element_text(hjust = 0.5))\n\n```\n\n\n\n\n### Some other fun concepts are below!  \nSome of them are nice, others are terrible! I won't comment on any of them, but I felt it was necessary to include some other ideas I toyed around with. 
\n\nHave fun with your data visualizations: be creative, think outside the box, use tools other than computers if it makes sense, fail often but learn quickly. I'm sure I'll think of a thousand better ways to have illustrated the concepts in this post by tomorrow, so I'll make updates as I think of them!\n\nNow it's your turn!\n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)\n\n\n```{r,fig.height=4}\ndf = data %>% \n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  arrange(-ThousandBarrel)\np = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel))\np + geom_bar(stat='identity') + \n  ggtitle('Oil Production 1981 - 2016 By Location') + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  coord_flip()\n```\n\n\n\n\n\n```{r,fig.height=4}\ntop10 = data %>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  arrange(-ThousandBarrel) %>%\n  top_n(10)\nprint(top10)\n\ndf = data %>% \n  group_by(Location,Year) %>%\n  filter(Location %in% top10$Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) \np = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location,fill=Location))\np + geom_bar(stat='identity') + \n  ggtitle('Oil Production - Top 10 States') + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n\n\n```{r, fig.height=4}\ndf = data %>%\n  filter(Year == 1990)%>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\ndf$Location = tolower(df$Location)\n\n#Add States without data\nStates = data.frame(Location = tolower(as.character(state.name)))\nmissingStates = States$Location[!(States$Location %in% df$Location)]\nappendData = data.frame(Location=missingStates,ThousandBarrel=0)\ndf = rbind(df,appendData)\n\nstates_map <- map_data(\"state\")\n\nggplot(df, aes(map_id = Location)) + \n    geom_map(aes(fill=ThousandBarrel), 
map = states_map) +\n    expand_limits(x = states_map$long, y = states_map$lat)\n\n```\n\n\n```{r, fig.height=4}\ndf = data %>% \n  filter(Location == 'Texas') %>%\n  group_by(Year,Month) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Month,y=ThousandBarrel))\np + geom_line(stat='identity',aes(group=Year,col=Year)) + \n  ggtitle('Oil Production By Year in the U.S.') + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n\n\n\n\n\n\n",
 4 |     "created" : 1491578298512.000,
 5 |     "dirty" : false,
 6 |     "encoding" : "UTF-8",
 7 |     "folds" : "",
 8 |     "hash" : "985338938",
 9 |     "id" : "EEFE0FD5",
10 |     "lastKnownWriteTime" : 1491596055,
11 |     "last_content_update" : 1491596055925,
12 |     "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 3.Rmd",
13 |     "project_path" : "Data Visualization - Part 3.Rmd",
14 |     "properties" : {
15 |         "last_setup_crc32" : "",
16 |         "tempName" : "Untitled1"
17 |     },
18 |     "relative_order" : 1,
19 |     "source_on_save" : false,
20 |     "source_window" : "",
21 |     "type" : "r_markdown"
22 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/per/t/F1EE8CA4:
--------------------------------------------------------------------------------
 1 | {
 2 |     "collab_server" : "",
 3 |     "contents" : "---\ntitle: \"Data Visualization - Part 2\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 14, 2017\"\noutput:\n  html_document: default\nsubtitle: The Good, The Bad and The Ugly\n---\n\n----  \n\n# Data Visualization - Part 2\n\n## A Quick Overview of the ggplot2 Package in R  \n\nWhile it will be important to focus on theory, I want to explain the ggplot2 package because I will be using it throughout the rest of this series. Knowing how it works will keep the focus on the results rather than the code. It's an incredibly powerful package and once you wrap your head around what it's doing, your life will change for the better! There are a lot of tools out there which provide better charts, graphs and ease of use (i.e. plot.ly, d3.js, Qlik, Tableau), but ggplot2 is still a fantastic resource and I use it all of the time. \n\nIn case you missed it, here's a link to [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1/)\n\n```{r, fig.align='center',echo=FALSE}\nlibrary(png)\nlibrary(grid)\nimg = readPNG(\"images/title_photo_2.png\")\ngrid.raster(img)\n```\n\n\n### Why would you use ggplot2?  \n1. More robust plotting than the base plot package\n2. Better control over aesthetics - colors, axes, background, etc.\n3. Layering\n4. Variable Mapping (aes)\n5. Automatic aggregation of data\n6. Built in formulas & plotting (geom_smooth)\n7. The list goes on and on...  \n\nBasically, ggplot2 allows for a lot more customization of plots with a lot less code (the rest of it is behind the scenes). Once you are used to the syntax, there's no going back. It's faster and easier.\n\n### Why wouldn't you use ggplot2?  \n1. A bit of a learning curve\n2. Lack of user interactivity with the plots  \n\nFundamentally, ggplot2 gives the user the ability to start a plot and layer everything in. There are many ways to accomplish the same thing, so figure out what makes sense for you and stick to it.  
\n\n**A Basic Example: Unemployment Over Time**  \n\n```{r,results='hide', warning=FALSE, message=FALSE}\nlibrary(dplyr)\nlibrary(ggplot2)\n\n# Load the economics data from ggplot2\ndata(economics,package='ggplot2')\n```\n\n```{r}\n# Take a look at the format of the data\nhead(economics)\n```\n\n\n```{r, fig.height = 4}\n# Create the plot\nggplot(data = economics) + geom_line(aes(x = date, y = unemploy))\n```\n\n\n\n### What happened to get that?  \n\n- `ggplot(economics)` loaded the data frame\n- `+` tells ggplot() that there is more to be added to the plot\n- `geom_line()` defined the type of plot\n- `aes(x = date, y = unemploy)` mapped the variables\n\nThe `aes()` portion is what typically throws new users off but is my favorite feature of ggplot2. In simple terms, this is what \"auto-magically\" brings your plot to life. You are telling ggplot2, \"I want 'date' to be on the x-axis and 'unemploy' to be on the y-axis.\" It's pretty straightforward in this case but there are more complex use cases as well.\n\n***Side Note:*** you could have achieved the same result by mapping the variables in the ggplot() function rather than in geom_line():\n`ggplot(data = economics, aes(x = date, y = unemploy)) + geom_line()`\n\n### Here's the basic formula for success:\n\n- Everything in ggplot2 starts with `ggplot(data)` and utilizes `+` to add on every element thereafter\n- Include your data frame (economics) in a ggplot function: `ggplot(data = economics)`  \n- Input the type of plot you would like (i.e. 
line chart of unemployment over time): `+ geom_line(aes(x = date, y = unemploy))`\n    - \"geom\" stands for \"geometric object\" and determines the type of object (there can be more than one type per plot)\n    - There are ***a lot*** of types of geometric objects - check them out [here](http://docs.ggplot2.org/current/)\n- Add in layers and utilize `fill` and `col` parameters within `aes()`\n\n\nI'll go through some of the examples from the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). I will be using their examples but I will also explain what's going on.  \n\n**Note:** I believe the intention of the author of the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html) was to illustrate how to use ggplot2 rather than doing a full demonstration of what important data visualization techniques are - so keep that in mind as I go through these examples. Some of the visuals do not line up with my best practices addressed in my [first post on data visualization](https://www.stoltzmaniac.com/data-visualization-part-1/).\n\n\nAs usual, some packages must be loaded.  \n\n```{r, results='hide', warning=FALSE, message=FALSE}\nlibrary(reshape2)\nlibrary(lubridate)\nlibrary(dplyr)\nlibrary(tidyr)\nlibrary(ggplot2)\nlibrary(scales)\nlibrary(gridExtra)\n```\n\n### The Scatterplot  \n\nThis is one of the most visually powerful tool for data analysis. However, you have to be careful when using it because it's primarily used by people doing analysis and not reporting (depending on what industry you're in).\n\nThe author of this chart was looking for a correlation between area and population. 
\n\n```{r}\n# Use the \"midwest\"\" data from ggplot2\ndata(\"midwest\", package = \"ggplot2\")\n\nhead(midwest)\n```\n\n#### Here's the most basic version of the scatter plot  \n\nThis can be called by `geom_point()` in ggplot2\n\n```{r, warning=FALSE, fig.align='center',fig.height = 4}\n# Scatterplot\nggplot(data = midwest, aes(x = area, y = poptotal)) + geom_point()  #ggplot\n```\n\n#### Here's  version with some additional features  \n\nWhile the addition of the size of the points and color don't add value, it does show the level of customization that's possible with ggplot2.\n\n```{r, warning=FALSE,message=FALSE,fig.height = 4}\nggplot(data = midwest, aes(x = area, y = poptotal)) + \ngeom_point(aes(col=state, size=popdensity)) + \n  geom_smooth(method=\"loess\", se=F) + \n  xlim(c(0, 0.1)) + \n  ylim(c(0, 500000)) + \n  labs(subtitle=\"Area Vs Population\", \n       y=\"Population\", \n       x=\"Area\", \n       title=\"Scatterplot\", \n       caption = \"Source: midwest\")\n```\n\n#### Explanation:  \n\n`ggplot(data = midwest, aes(x = area, y = poptotal)) + `    \nInputs the data and maps x and y variables as area and poptotal.  \n\n`geom_point(aes(col=state, size=popdensity)) + `  \nCreates a scatterplot and maps the color and size of points to state and popdensity.  \n\n`  geom_smooth(method=\"loess\", se=F) + `  \nCreates a smoothing curve to fit the data. `method` is the type of fit and `se` determines whether or not to show error bars.\n\n`  xlim(c(0, 0.1)) + `  \nSets the x-axis limits.  \n\n`  ylim(c(0, 500000)) + `  \nSets the y-axis limits.  \n\n`labs(subtitle=\"Area Vs Population\",`  \n\n`       y=\"Population\",`  \n\n`       x=\"Area\",`  \n\n`       title=\"Scatterplot\",`  \n\n`       caption = \"Source: midwest\")`  \nChanges the labels of the subtitle, y-axis, x-axis, title and caption.\n\nNotice that the legend was automatically created and placed on the lefthand side. 
This is also highly customizable and can be changed easily.\n\n\n### The Density Plot  \n\nDensity plots are a great way to see how data is distributed. They are similar to histograms in a sense, but show values in terms of percentage of the total. In this example, the author used the mpg data set and is looking to see the different distributions of City Mileage based off of the number of cylinders the car has.\n\n```{r}\n# Examine the mpg data set\nhead(mpg)\n```\n\n#### Sample Density Plot\n\n```{r,fig.height = 4}\ng = ggplot(mpg, aes(cty))\ng + geom_density(aes(fill=factor(cyl)), alpha=0.8) + \n    labs(title=\"Density plot\", \n         subtitle=\"City Mileage Grouped by Number of cylinders\",\n         caption=\"Source: mpg\",\n         x=\"City Mileage\",\n         fill=\"# Cylinders\")\n\n```\n\nYou'll notice one immediate difference here. The author decided to create a the object `g` to equal `ggplot(mpg, aes(cty))` - this is a nice trick and will save you some time if you plan on keeping `ggplot(mpg, aes(cty))` as the fundamental plot and simply exploring other visualizations on top of it. It is also handy if you need to save the output of a chart to an image file.\n\n`ggplot(mpg, aes(cty))` loads the mpg data and `aes(cty)` assumes `aes(x = cty)`  \n\n`g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + `  \n`geom_density` kicks off a density plot and the mapping of `cyl` is used for colors. `alpha` is the transparency/opacity of the area under the curve.\n\n`    labs(title=\"Density plot\",`  \n\n`         subtitle=\"City Mileage Grouped by Number of cylinders\",`  \n\n`         caption=\"Source: mpg\",`  \n\n`         x=\"City Mileage\",`  \n\n`         fill=\"# Cylinders\")`  \nLabeling is cleaned up at the end.\n\n\n#### How would you use your new knowledge to see the density by class instead of by number of cylinders?  
\n\n***Hint: *** `g = ggplot(mpg, aes(cty))` has already been established.\n\n```{r,fig.height = 4}\ng + geom_density(aes(fill=factor(class)), alpha=0.8) + \n    labs(title=\"Density plot\", \n         subtitle=\"City Mileage Grouped by Class\",\n         caption=\"Source: mpg\",\n         x=\"City Mileage\",\n         fill=\"Class\")\n```\nNotice how I didn't have to write out `ggplot()` again because it was already stored in the object `g`.\n\n### The Histogram  \n\nHow could we show the city mileage in a histogram?\n\n```{r,fig.height = 4}\ng = ggplot(mpg,aes(cty))\ng + geom_histogram(bins=20) +\n    labs(title=\"Histogram\", \n         caption=\"Source: mpg\",\n         x=\"City Mileage\")\n```  \n\n`geom_histogram(bins=20)` plots the histogram. If `bins` isn't set, ggplot2 will automatically set one.\n\n\n### The Bar/Column Chart  \n\nFor all intensive purposes, bar and column charts are essentially the same. Technically, the term \"column chart\" can be used when the bars run vertically. The author of this chart was simply looking at the frequency of the vehicles listed in the data set.\n\n```{r}\n#Data Preparation\nfreqtable <- table(mpg$manufacturer)\ndf <- as.data.frame.table(freqtable)\nhead(df)\n```\n\n\n```{r,fig.height = 4}\n#Set a theme\ntheme_set(theme_classic())\n\ng <- ggplot(df, aes(Var1, Freq))\ng + geom_bar(stat=\"identity\", width = 0.5, fill=\"tomato2\") + \n      labs(title=\"Bar Chart\", \n           subtitle=\"Manufacturer of vehicles\", \n           caption=\"Source: Frequency of Manufacturers from 'mpg' dataset\") +\n      theme(axis.text.x = element_text(angle=65, vjust=0.6))\n```\n\nThe addition of `theme_set(theme_classic())` adds a preset theme to the chart. You can create your own or select from a large list of themes. 
This can help set your work apart from others and save a lot of time.\n\nHowever, theme_set() is different than the `theme(axis.text.x = element_text(angle=65, vjust=0.6))` the one used inside the plot itself in this case. The author decided to tilt the text along the x-axis. `vjust=0.6` changes how far it is spaced away from the axis line.\n\nWithin `geom_bar()` there is another new piece of information: `stat=\"identity\"` which tells ggplot to use the actual value of `Freq`.\n\nYou may also notice that ggplot arranged all of the data in alphabetical order based off of the manufacturer. If you want to change the order, it's best to use the `reorder()` function. This next chart will use the `Freq` and `coord_flip()` to orient the chart differently. \n\n```{r,fig.height = 4}\ng <- ggplot(df, aes(reorder(Var1,Freq), Freq))\ng + geom_bar(stat=\"identity\", width = 0.5, fill=\"tomato2\") + \n      labs(title=\"Bar Chart\", \n           x = 'Manufacturer',\n           subtitle=\"Manufacturer of vehicles\", \n           caption=\"Source: Frequency of Manufacturers from 'mpg' dataset\") +\n      theme(axis.text.x = element_text(angle=65, vjust=0.6)) + \n  coord_flip()\n```\n\nLet's continue with bar charts - what if we wanted to see what `hwy` looked like by `manufacturer` and in terms of `cyl`?\n\n```{r,fig.height = 4}\ng = ggplot(mpg,aes(x=manufacturer,y=hwy,col=factor(cyl),fill=factor(cyl)))\ng + geom_bar(stat='identity', position='dodge') + \n  theme(axis.text.x = element_text(angle=65, vjust=0.6))\n```\n\n`position='dodge'` had to be used because the default setting is to stack the bars, `'dodge'` places them side by side for comparison.  \n\nDespite the fact that the chart did what I wanted, it is very difficult to read due to how many manufacturers there are. 
This is where the `facet_wrap()` feature comes in handy.\n\n```{r}\ntheme_set(theme_bw())\n\ng = ggplot(mpg,aes(x=factor(cyl),y=hwy,col=factor(cyl),fill=factor(cyl)))\ng + geom_bar(stat='identity', position='dodge') + \n  facet_wrap(~manufacturer)\n```\nThis created a much nicer view of the information. It \"auto-magically\" split everything out by manufacturer!\n\n\n### Spatial Plots\n\nAnother nice feature of ggplot2 is the integration with maps and spatial plotting. In this simple example, I wanted to plot a few cities in Colorado and draw a border around them. Other than the addition of the map, ggplot simply places the dots directly on the locations via their longitude and latitude \"auto-magically.\"\n\nThis map is created with `ggmap` which utilizes Google Maps API.\n\n```{r, warning=FALSE, message=FALSE}\nlibrary(ggmap)\nlibrary(ggalt)\n\nfoco <-  geocode(\"Fort Collins, CO\")  # get longitude and latitude\n\n# Get the Map ----------------------------------------------\ncolo_map <- qmap(\"Colorado, United States\",zoom = 7, source = \"google\")   \n\n# Get Coordinates for Places ---------------------\ncolo_places <- c(\"Fort Collins, CO\",\n                    \"Denver, CO\",\n                    \"Grand Junction, CO\",\n                    \"Durango, CO\",\n                    \"Pueblo, CO\")\n\nplaces_loc <- geocode(colo_places)  # get longitudes and latitudes\n\n\n# Plot Open Street Map -------------------------------------\ncolo_map + geom_point(aes(x=lon, y=lat),\n                             data = places_loc, \n                             alpha = 0.7, \n                             size = 7, \n                             color = \"tomato\") + \n                  geom_encircle(aes(x=lon, y=lat),\n                                data = places_loc, size = 2, color = \"blue\")\n```\n\n### Final Thoughts  \n\nI hope you learned a lot about the basics of ggplot2 in this. It's extremely powerful but yet easy to use once you get the hang of it. 
The best way to really learn it is to try it out. Find some data on your own and try to manipulate it and get it plotted. Without a doubt, you will have all kinds of errors pop up, data you expect to be plotted won't show up, colors and fills will be different, etc. However, your visualizations will be leveled-up!\n\n### Coming soon:  \n\n- Determining whether or not you need a visualization  \n- Choosing the type of plot to use depending on the use case  \n- Visualization beyond the standard charts and graphs  \n\n\nI made some modifications to the code, but almost all of the examples here were from [Top 50 ggplot2 Visualizations - The Master List ](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). \n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)",
 4 |     "created" : 1491578353227.000,
 5 |     "dirty" : true,
 6 |     "encoding" : "UTF-8",
 7 |     "folds" : "",
 8 |     "hash" : "3000572754",
 9 |     "id" : "F1EE8CA4",
10 |     "lastKnownWriteTime" : 1490212163,
11 |     "last_content_update" : 1491581505392,
12 |     "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 2.Rmd",
13 |     "project_path" : "Data Visualization - Part 2.Rmd",
14 |     "properties" : {
15 |         "last_setup_crc32" : "",
16 |         "tempName" : "Untitled1"
17 |     },
18 |     "relative_order" : 2,
19 |     "source_on_save" : false,
20 |     "source_window" : "",
21 |     "type" : "r_markdown"
22 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/per/t/FE94A69D:
--------------------------------------------------------------------------------
 1 | {
 2 |     "collab_server" : "",
 3 |     "contents" : "---\ntitle: \"Data Visualization - Part 1\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 14, 2017\"\noutput:\n  pdf_document: default\n  html_document: default\nsubtitle: The Good, The Bad and The Ugly\n---\n\n```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE}\nlibrary(png)\nlibrary(grid)\n```\n----  \n\n# Introduction to Data Visualization\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/title_photo.png\")\ngrid.raster(img)\n```\n\nThe topic of data visualization is very popular in the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach \n$7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate.  \n\nIn short, are you adding value to your work or are you simply adding this to make it seem ***less boring?*** Let's take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy.\n\n---- \n\nI have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post.\n\n### One author at Vox wanted to show the cause of death in all of Shakespeare\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/bad-pie1.png\")\ngrid.raster(img)\n```\n    \n\n**Is this not insane!?!?!**  \n\nUsing a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. 
While wordles are not ideal for any work requiring exact proportions, it does make for a great visual in this article. [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html).\n  \n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/bad-pie1-fix.png\")\ngrid.raster(img)\n```\n----  \n\nTo be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, I'll walk through my theory and not worry too much about aesthetics (save that for a time when you're getting paid).\n\n----\n\n### The Good, The Bad, The Ugly  \n\n**\"The Good\" visualizations:**  \n\n- Clearly illustrate a point  \n- Are tailored to the appropriate audience  \n    - Analysts may want detail  \n    - Executives may want a high-level view  \n- Are tailored to the presentation medium  \n    - A piece in an academic journal can be analyzed slowly and carefully  \n    - A slide in front of 5,000 people in a conference will be glanced at quickly  \n- Are memorable to those who care about the material  \n- Make an impact which increases the understanding of the subject matter  \n\n**\"The Bad\" visualizations:**  \n\n- Are difficult to interpret  \n- Are unintentionally misleading  \n- Contain redundant and boring information  \n\n**\"The Ugly\" visualizations:**  \n\n- Are almost impossible to interpret  \n- Are filled with completely worthless information  \n- Are intentionally created to mislead the audience  \n- Are inaccurate  \n\n### Coming soon:  \n\n- Determining whether or not you need a visualization  \n- Choosing the type of plot to use depending on the use case  \n- Introduction to the ggplot2 in R and how it works  \n- Visualization beyond the standard charts and graphs  \n\nAs 
always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)",
 4 |     "created" : 1491581474797.000,
 5 |     "dirty" : false,
 6 |     "encoding" : "UTF-8",
 7 |     "folds" : "",
 8 |     "hash" : "1337660815",
 9 |     "id" : "FE94A69D",
10 |     "lastKnownWriteTime" : 1489685647,
11 |     "last_content_update" : 1489685647,
12 |     "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 1.Rmd",
13 |     "project_path" : "Data Visualization - Part 1.Rmd",
14 |     "properties" : {
15 |         "last_setup_crc32" : "",
16 |         "tempName" : "Untitled1"
17 |     },
18 |     "relative_order" : 3,
19 |     "source_on_save" : false,
20 |     "source_window" : "",
21 |     "type" : "r_markdown"
22 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/32602BA4:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/3B0BAF17:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/460054D1:
--------------------------------------------------------------------------------
1 | {
2 |     "last_setup_crc32" : "BEDB844B56df664a",
3 |     "tempName" : "Untitled1"
4 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/8DD8A556:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/945C141F:
--------------------------------------------------------------------------------
1 | {
2 |     "last_setup_crc32" : "",
3 |     "tempName" : "Untitled1"
4 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/C5350C47:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/D635D4D3:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/DA516F3B:
--------------------------------------------------------------------------------
1 | {
2 |     "tempName" : "Untitled1"
3 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/E7318A37:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/ED483BAF:
--------------------------------------------------------------------------------
1 | {
2 |     "last_setup_crc32" : "",
3 |     "tempName" : "Untitled1"
4 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/EE8F787:
--------------------------------------------------------------------------------
1 | {
2 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/FFC79A6C:
--------------------------------------------------------------------------------
1 | {
2 |     "last_setup_crc32" : "",
3 |     "tempName" : "Untitled1"
4 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/prop/INDEX:
--------------------------------------------------------------------------------
 1 | ~%2FDesktop%2Fbookdown-demo-master%2Findex.Rmd="E7318A37"
 2 | ~%2FDesktop%2Fuploading%20to%20wp.R="DA516F3B"
 3 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FData%20Visualization%20-%20Part%201.Rmd="FFC79A6C"
 4 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FData%20Visualization%20-%20Part%202.Rmd="945C141F"
 5 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FData%20Visualization%20-%20Part%203.Rmd="ED483BAF"
 6 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FData%20Visualization%20Lesson.Rmd="460054D1"
 7 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FOil%20Production%20By%20State.Rmd="D635D4D3"
 8 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2Fdata_preparation.R="C5350C47"
 9 | ~%2FDocuments%2FGitHub%2FDenver-Crime-Analysis%2FPart%2001%20-%20Crime%20Analysis.Rmd="EE8F787"
10 | ~%2FDocuments%2FGitHub%2FDenver-Crime-Analysis%2FPart%2003%20-%20Crime%20Analysis%20adapted%20for%20Ghost.Rmd="3B0BAF17"
11 | ~%2FDocuments%2FGitHub%2FDenver-Crime-Analysis%2FPart%2003%20-%20Crime%20Analysis.Rmd="8DD8A556"
12 | ~%2FDocuments%2FGitHub%2Fsc-paid-search-reporting%2FOtter%20Products%20-%20Paid%20Search%20Report.Rmd="32602BA4"
13 | 


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/s-21D18981/206A0EDA:
--------------------------------------------------------------------------------
 1 | {
 2 |     "collab_server" : "",
 3 |     "contents" : "---\ntitle: \"Data Visualization - Part 2\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 14, 2017\"\noutput:\n  html_document: default\nsubtitle: The Good, The Bad and The Ugly\n---\n\n----  \n\n# Data Visualization - Part 2\n\n## A Quick Overview of the ggplot2 Package in R  \n\nWhile it will be important to focus on theory, I want to explain the ggplot2 package because I will be using it throughout the rest of this series. Knowing how it works will keep the focus on the results rather than the code. It's an incredibly powerful package and once you wrap your head around what it's doing, your life will change for the better! There are a lot of tools out there which provide better charts, graphs and ease of use (i.e. plot.ly, d3.js, Qlik, Tableau), but ggplot2 is still a fantastic resource and I use it all of the time. \n\nIn case you missed it, here's a link to [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1/)\n\n```{r, fig.align='center',echo=FALSE}\nlibrary(png)\nlibrary(grid)\nimg = readPNG(\"images/title_photo_2.png\")\ngrid.raster(img)\n```\n\n\n### Why would you use ggplot2?  \n1. More robust plotting than the base plot package\n2. Better control over aesthetics - colors, axes, background, etc.\n3. Layering\n4. Variable Mapping (aes)\n5. Automatic aggregation of data\n6. Built in formulas & plotting (geom_smooth)\n7. The list goes on and on...  \n\nBasically, ggplot2 allows for a lot more customization of plots with a lot less code (the rest of it is behind the scenes). Once you are used to the syntax, there's no going back. It's faster and easier.\n\n### Why wouldn't you use ggplot2?  \n1. A bit of a learning curve\n2. Lack of user interactivity with the plots  \n\nFundamentally, ggplot2 gives the user the ability to start a plot and layer everything in. There are many ways to accomplish the same thing, so figure out what makes sense for you and stick to it.  
\n\n**A Basic Example: Unemployment Over Time**  \n\n```{r,results='hide', warning=FALSE, message=FALSE}\nlibrary(dplyr)\nlibrary(ggplot2)\n\n# Load the economics data from ggplot2\ndata(economics,package='ggplot2')\n```\n\n```{r}\n# Take a look at the format of the data\nhead(economics)\n```\n\n\n```{r, fig.height = 4}\n# Create the plot\nggplot(data = economics) + geom_line(aes(x = date, y = unemploy))\n```\n\n\n\n### What happened to get that?  \n\n- `ggplot(economics)` loaded the data frame\n- `+` tells ggplot() that there is more to be added to the plot\n- `geom_line()` defined the type of plot\n- `aes(x = date, y = unemploy)` mapped the variables\n\nThe `aes()` portion is what typically throws new users off but is my favorite feature of ggplot2. In simple terms, this is what \"auto-magically\" brings your plot to life. You are telling ggplot2, \"I want 'date' to be on the x-axis and 'unemploy' to be on the y-axis.\" It's pretty straightforward in this case but there are more complex use cases as well.\n\n***Side Note:*** you could have achieved the same result by mapping the variables in the ggplot() function rather than in geom_line():\n`ggplot(data = economics, aes(x = date, y = unemploy)) + geom_line()`\n\n### Here's the basic formula for success:\n\n- Everything in ggplot2 starts with `ggplot(data)` and utilizes `+` to add on every element thereafter\n- Include your data frame (economics) in a ggplot function: `ggplot(data = economics)`  \n- Input the type of plot you would like (i.e. 
line chart of unemployment over time): `+ geom_line(aes(x = date, y = unemploy))`\n    - \"geom\" stands for \"geometric object\" and determines the type of object (there can be more than one type per plot)\n    - There are ***a lot*** of types of geometric objects - check them out [here](http://docs.ggplot2.org/current/)\n- Add in layers and utilize `fill` and `col` parameters within `aes()`\n\n\nI'll go through some of the examples from the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). I will be using their examples but I will also explain what's going on.  \n\n**Note:** I believe the intention of the author of the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html) was to illustrate how to use ggplot2 rather than doing a full demonstration of what important data visualization techniques are - so keep that in mind as I go through these examples. Some of the visuals do not line up with my best practices addressed in my [first post on data visualization](https://www.stoltzmaniac.com/data-visualization-part-1/).\n\n\nAs usual, some packages must be loaded.  \n\n```{r, results='hide', warning=FALSE, message=FALSE}\nlibrary(reshape2)\nlibrary(lubridate)\nlibrary(dplyr)\nlibrary(tidyr)\nlibrary(ggplot2)\nlibrary(scales)\nlibrary(gridExtra)\n```\n\n### The Scatterplot  \n\nThis is one of the most visually powerful tool for data analysis. However, you have to be careful when using it because it's primarily used by people doing analysis and not reporting (depending on what industry you're in).\n\nThe author of this chart was looking for a correlation between area and population. 
\n\n```{r}\n# Use the \"midwest\"\" data from ggplot2\ndata(\"midwest\", package = \"ggplot2\")\n\nhead(midwest)\n```\n\n#### Here's the most basic version of the scatter plot  \n\nThis can be called by `geom_point()` in ggplot2\n\n```{r, warning=FALSE, fig.align='center',fig.height = 4}\n# Scatterplot\nggplot(data = midwest, aes(x = area, y = poptotal)) + geom_point()  #ggplot\n```\n\n#### Here's  version with some additional features  \n\nWhile the addition of the size of the points and color don't add value, it does show the level of customization that's possible with ggplot2.\n\n```{r, warning=FALSE,message=FALSE,fig.height = 4}\nggplot(data = midwest, aes(x = area, y = poptotal)) + \ngeom_point(aes(col=state, size=popdensity)) + \n  geom_smooth(method=\"loess\", se=F) + \n  xlim(c(0, 0.1)) + \n  ylim(c(0, 500000)) + \n  labs(subtitle=\"Area Vs Population\", \n       y=\"Population\", \n       x=\"Area\", \n       title=\"Scatterplot\", \n       caption = \"Source: midwest\")\n```\n\n#### Explanation:  \n\n`ggplot(data = midwest, aes(x = area, y = poptotal)) + `    \nInputs the data and maps x and y variables as area and poptotal.  \n\n`geom_point(aes(col=state, size=popdensity)) + `  \nCreates a scatterplot and maps the color and size of points to state and popdensity.  \n\n`  geom_smooth(method=\"loess\", se=F) + `  \nCreates a smoothing curve to fit the data. `method` is the type of fit and `se` determines whether or not to show error bars.\n\n`  xlim(c(0, 0.1)) + `  \nSets the x-axis limits.  \n\n`  ylim(c(0, 500000)) + `  \nSets the y-axis limits.  \n\n`labs(subtitle=\"Area Vs Population\",`  \n\n`       y=\"Population\",`  \n\n`       x=\"Area\",`  \n\n`       title=\"Scatterplot\",`  \n\n`       caption = \"Source: midwest\")`  \nChanges the labels of the subtitle, y-axis, x-axis, title and caption.\n\nNotice that the legend was automatically created and placed on the lefthand side. 
This is also highly customizable and can be changed easily.\n\n\n### The Density Plot  \n\nDensity plots are a great way to see how data is distributed. They are similar to histograms in a sense, but show values in terms of percentage of the total. In this example, the author used the mpg data set and is looking to see the different distributions of City Mileage based off of the number of cylinders the car has.\n\n```{r}\n# Examine the mpg data set\nhead(mpg)\n```\n\n#### Sample Density Plot\n\n```{r,fig.height = 4}\ng = ggplot(mpg, aes(cty))\ng + geom_density(aes(fill=factor(cyl)), alpha=0.8) + \n    labs(title=\"Density plot\", \n         subtitle=\"City Mileage Grouped by Number of cylinders\",\n         caption=\"Source: mpg\",\n         x=\"City Mileage\",\n         fill=\"# Cylinders\")\n\n```\n\nYou'll notice one immediate difference here. The author decided to create a the object `g` to equal `ggplot(mpg, aes(cty))` - this is a nice trick and will save you some time if you plan on keeping `ggplot(mpg, aes(cty))` as the fundamental plot and simply exploring other visualizations on top of it. It is also handy if you need to save the output of a chart to an image file.\n\n`ggplot(mpg, aes(cty))` loads the mpg data and `aes(cty)` assumes `aes(x = cty)`  \n\n`g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + `  \n`geom_density` kicks off a density plot and the mapping of `cyl` is used for colors. `alpha` is the transparency/opacity of the area under the curve.\n\n`    labs(title=\"Density plot\",`  \n\n`         subtitle=\"City Mileage Grouped by Number of cylinders\",`  \n\n`         caption=\"Source: mpg\",`  \n\n`         x=\"City Mileage\",`  \n\n`         fill=\"# Cylinders\")`  \nLabeling is cleaned up at the end.\n\n\n#### How would you use your new knowledge to see the density by class instead of by number of cylinders?  
\n\n***Hint: *** `g = ggplot(mpg, aes(cty))` has already been established.\n\n```{r,fig.height = 4}\ng + geom_density(aes(fill=factor(class)), alpha=0.8) + \n    labs(title=\"Density plot\", \n         subtitle=\"City Mileage Grouped by Class\",\n         caption=\"Source: mpg\",\n         x=\"City Mileage\",\n         fill=\"Class\")\n```\nNotice how I didn't have to write out `ggplot()` again because it was already stored in the object `g`.\n\n### The Histogram  \n\nHow could we show the city mileage in a histogram?\n\n```{r,fig.height = 4}\ng = ggplot(mpg,aes(cty))\ng + geom_histogram(bins=20) +\n    labs(title=\"Histogram\", \n         caption=\"Source: mpg\",\n         x=\"City Mileage\")\n```  \n\n`geom_histogram(bins=20)` plots the histogram. If `bins` isn't set, ggplot2 will automatically set one.\n\n\n### The Bar/Column Chart  \n\nFor all intensive purposes, bar and column charts are essentially the same. Technically, the term \"column chart\" can be used when the bars run vertically. The author of this chart was simply looking at the frequency of the vehicles listed in the data set.\n\n```{r}\n#Data Preparation\nfreqtable <- table(mpg$manufacturer)\ndf <- as.data.frame.table(freqtable)\nhead(df)\n```\n\n\n```{r,fig.height = 4}\n#Set a theme\ntheme_set(theme_classic())\n\ng <- ggplot(df, aes(Var1, Freq))\ng + geom_bar(stat=\"identity\", width = 0.5, fill=\"tomato2\") + \n      labs(title=\"Bar Chart\", \n           subtitle=\"Manufacturer of vehicles\", \n           caption=\"Source: Frequency of Manufacturers from 'mpg' dataset\") +\n      theme(axis.text.x = element_text(angle=65, vjust=0.6))\n```\n\nThe addition of `theme_set(theme_classic())` adds a preset theme to the chart. You can create your own or select from a large list of themes. 
This can help set your work apart from others and save a lot of time.\n\nHowever, theme_set() is different than the `theme(axis.text.x = element_text(angle=65, vjust=0.6))` the one used inside the plot itself in this case. The author decided to tilt the text along the x-axis. `vjust=0.6` changes how far it is spaced away from the axis line.\n\nWithin `geom_bar()` there is another new piece of information: `stat=\"identity\"` which tells ggplot to use the actual value of `Freq`.\n\nYou may also notice that ggplot arranged all of the data in alphabetical order based off of the manufacturer. If you want to change the order, it's best to use the `reorder()` function. This next chart will use the `Freq` and `coord_flip()` to orient the chart differently. \n\n```{r,fig.height = 4}\ng <- ggplot(df, aes(reorder(Var1,Freq), Freq))\ng + geom_bar(stat=\"identity\", width = 0.5, fill=\"tomato2\") + \n      labs(title=\"Bar Chart\", \n           x = 'Manufacturer',\n           subtitle=\"Manufacturer of vehicles\", \n           caption=\"Source: Frequency of Manufacturers from 'mpg' dataset\") +\n      theme(axis.text.x = element_text(angle=65, vjust=0.6)) + \n  coord_flip()\n```\n\nLet's continue with bar charts - what if we wanted to see what `hwy` looked like by `manufacturer` and in terms of `cyl`?\n\n```{r,fig.height = 4}\ng = ggplot(mpg,aes(x=manufacturer,y=hwy,col=factor(cyl),fill=factor(cyl)))\ng + geom_bar(stat='identity', position='dodge') + \n  theme(axis.text.x = element_text(angle=65, vjust=0.6))\n```\n\n`position='dodge'` had to be used because the default setting is to stack the bars, `'dodge'` places them side by side for comparison.  \n\nDespite the fact that the chart did what I wanted, it is very difficult to read due to how many manufacturers there are. 
This is where the `facet_wrap()` feature comes in handy.\n\n```{r}\ntheme_set(theme_bw())\n\ng = ggplot(mpg,aes(x=factor(cyl),y=hwy,col=factor(cyl),fill=factor(cyl)))\ng + geom_bar(stat='identity', position='dodge') + \n  facet_wrap(~manufacturer)\n```\nThis created a much nicer view of the information. It \"auto-magically\" split everything out by manufacturer!\n\n\n### Spatial Plots\n\nAnother nice feature of ggplot2 is the integration with maps and spatial plotting. In this simple example, I wanted to plot a few cities in Colorado and draw a border around them. Other than the addition of the map, ggplot simply places the dots directly on the locations via their longitude and latitude \"auto-magically.\"\n\nThis map is created with `ggmap` which utilizes Google Maps API.\n\n```{r, warning=FALSE, message=FALSE}\nlibrary(ggmap)\nlibrary(ggalt)\n\nfoco <-  geocode(\"Fort Collins, CO\")  # get longitude and latitude\n\n# Get the Map ----------------------------------------------\ncolo_map <- qmap(\"Colorado, United States\",zoom = 7, source = \"google\")   \n\n# Get Coordinates for Places ---------------------\ncolo_places <- c(\"Fort Collins, CO\",\n                    \"Denver, CO\",\n                    \"Grand Junction, CO\",\n                    \"Durango, CO\",\n                    \"Pueblo, CO\")\n\nplaces_loc <- geocode(colo_places)  # get longitudes and latitudes\n\n\n# Plot Open Street Map -------------------------------------\ncolo_map + geom_point(aes(x=lon, y=lat),\n                             data = places_loc, \n                             alpha = 0.7, \n                             size = 7, \n                             color = \"tomato\") + \n                  geom_encircle(aes(x=lon, y=lat),\n                                data = places_loc, size = 2, color = \"blue\")\n```\n\n### Final Thoughts  \n\nI hope you learned a lot about the basics of ggplot2 in this. It's extremely powerful but yet easy to use once you get the hang of it. 
The best way to really learn it is to try it out. Find some data on your own and try to manipulate it and get it plotted. Without a doubt, you will have all kinds of errors pop up, data you expect to be plotted won't show up, colors and fills will be different, etc. However, your visualizations will be leveled-up!\n\n### Coming soon:  \n\n- Determining whether or not you need a visualization  \n- Choosing the type of plot to use depending on the use case  \n- Visualization beyond the standard charts and graphs  \n\n\nI made some modifications to the code, but almost all of the examples here were from [Top 50 ggplot2 Visualizations - The Master List ](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). \n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)",
 4 |     "created" : 1489621938889.000,
 5 |     "dirty" : false,
 6 |     "encoding" : "UTF-8",
 7 |     "folds" : "",
 8 |     "hash" : "3000572754",
 9 |     "id" : "206A0EDA",
10 |     "lastKnownWriteTime" : 1490212163,
11 |     "last_content_update" : 1490212163,
12 |     "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 2.Rmd",
13 |     "project_path" : "Data Visualization - Part 2.Rmd",
14 |     "properties" : {
15 |         "last_setup_crc32" : "",
16 |         "tempName" : "Untitled1"
17 |     },
18 |     "relative_order" : 2,
19 |     "source_on_save" : false,
20 |     "source_window" : "",
21 |     "type" : "r_markdown"
22 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/s-21D18981/3F887430:
--------------------------------------------------------------------------------
 1 | {
 2 |     "collab_server" : "",
 3 |     "contents" : "library(knitr)\n# Set figure dimensions\n#opts_chunk$set(fig.width=5, fig.height=5)\n# Set figures to upload to imgur.com\nopts_knit$set(upload.fun = imgur_upload, base.url = NULL)\n\nrmd.file <- \"Data Visualization - Part 2.Rmd\"\n# Knit the .Rmd file\nknit(rmd.file)\n# Set up input/ output files\nmarkdown.file <- gsub(pattern = \"Rmd$\", replacement = \"md\", x = rmd.file)\nhtml.file <- gsub(pattern = \"md$\", replacement = \"_pub.html\", x = markdown.file)\n\nlibrary(markdown)\n# Removes 'yaml' information\nmarkdownToHTML(file = markdown.file, output = html.file, fragment.only = TRUE)\n\nlibrary(RWordPress)\n# Read the WP credentials from the environment; never hard-code a password in a script\nwp.user <- Sys.getenv(\"WORDPRESS_USER\", unset = \"stoltzmaniac\")\nwp.password <- Sys.getenv(\"WORDPRESS_PASSWORD\")\nif (!nzchar(wp.password)) stop(\"Set the WORDPRESS_PASSWORD environment variable before publishing\")\n# Set your WP login (the password, named by its username) and your site URL\noptions(WordpressLogin = setNames(wp.password, wp.user),\n        WordpressURL = 'https://stoltzmaniac.com/xmlrpc.php')\n# Create a line-by-line text vector\ntext = paste(readLines(html.file), collapse = \"\\n\")\n# Send to WordPress\nnewPost(list(description = text, title = \"Data Visualization - Part 2\"), publish = FALSE)\n",
 4 |     "created" : 1489416648044.000,
 5 |     "dirty" : false,
 6 |     "encoding" : "UTF-8",
 7 |     "folds" : "",
 8 |     "hash" : "4101462966",
 9 |     "id" : "3F887430",
10 |     "lastKnownWriteTime" : 1489621861,
11 |     "last_content_update" : 1489621861,
12 |     "path" : "~/Desktop/uploading to wp.R",
13 |     "project_path" : null,
14 |     "properties" : {
15 |         "tempName" : "Untitled1"
16 |     },
17 |     "relative_order" : 2,
18 |     "source_on_save" : false,
19 |     "source_window" : "",
20 |     "type" : "r_source"
21 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/s-21D18981/D09CC3F1:
--------------------------------------------------------------------------------
 1 | {
 2 |     "collab_server" : "",
 3 |     "contents" : "---\ntitle: \"Data Visualization - Part 3\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 28, 2017\"\noutput: html_document\n---\n\n```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE}\nlibrary(png)\nlibrary(grid)\n```\n\n### Do You Actually Need a Visualization?\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/title_photo_3.png\")\ngrid.raster(img)\n```\n\n#### Know what you are trying to achieve before starting\n\nLet's load up some libraries and get started.\n\n```{r libraryPrep, results='hide', warning=FALSE, message=FALSE}\nlibrary(ggplot2)\nlibrary(dplyr)\nlibrary(tidyr)\nlibrary(lubridate)\nlibrary(scales)\n```\n\n## Decide on what you're trying to accomplish first.  \n\nAsk yourself the following questions to help drive your decision:  \n\n- Are you making a comparison?\n- Are you finding a relationship?\n- Are you showing a distribution?\n- Are you finding a trend over time?\n- Are you showing composition?\n  \nOnce you know which question you are asking, it will keep your mind focused on the outcome and will quickly narrow down your charting options.\n\n#### Rule of Thumb  \n\n- **Trend: ** Column, Line  \n- **Comparison: ** Area, Bar, Bullet, Column, Line, Scatter  \n- **Relationship: ** Line, Scatter  \n- **Distribution: ** Bar, Boxplot, Column  \n- **Composition: ** Donut, Pie, Stacked Bar, Stacked Column  \n  \nObviously, there are choices beyond these and you need to make your choice wisely. \n\nSide Note: I ***hate*** donut and pie charts! When used properly, they're terriffic! However, I'm very used to gagging every time one appears on a projector screen due to how frequently they're used inappropriately.\n\nFor this project, I'll use some oil production data that I found while digging through http://data.world (pretty great site). 
The data can be found [here](http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm)  \n\n\n```{r dataLoading, results='hide', warning=FALSE, message=FALSE}\n#Custom data preparation\n#GitHub (linked to at bottom of this post)\nsource('data_preparation.R')\ndata = getData()\n```\n\n```{r}\nhead(data)\n```\n\n----  \n\n## Trend - Line Chart\n\n**Objective:** Visualize a trend in oil production in the US from 1981 - 2016 by year. I want to illustrate the changes over the time period. This is a very high-level view and only shows us a decline followed by a ramp up at the end of the period. This is ***exploratory visualization.***\n\nI decided to use a line chart to show the trend over time. When using discrete data you should use a column chart to avoid any confusion that in between these years the data actually was simply linear. However, it paints a much clearer picture this way and is not misleading.\n\n#### Poor Version  \nThe x-axis is a disaster and the y-axis isn't formatted well. While it gets the point across, it's still worthless.\n\n\n```{r,fig.align='center', fig.width=4}\ndf = data %>% \n  group_by(Year) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) \np + geom_line(stat='identity') + \n  ggtitle('Oil Production Over Time') + \n  theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + \n  xlab('') + ylab('')\n```\n\n#### Better Version  \nThe title gives us a much better understanding of what we're looking at. 
The chart is slightly wider and the axes are formatted to be legible.\n\n```{r,fig.align='center', fig.width=12}\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) \np + geom_line(stat='identity') + \n  ggtitle('Thousand Barrel Oil Production By Year in the U.S.') +\n  theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + \n  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + \n  scale_y_continuous(labels = comma)\n```\n\n\n----\n\n## Comparison - Line Chart  \n\n**Objective**: Identify which states affected the trend the most. Evaluate them simultaneously in order to paint the picture and compare their trends over the time period. From this visual you can see the top states are Alaska, California, Louisiana, Oklahoma, Texas and Wyoming. Texas seems to break the mold quite drastically and drove the spike which occurred after 2010. This is another example of ***exploratory visualization.***\n\n#### Poor Version  \nThere are far too many colors going on here. Everything at the bottom of the chart is relatively useless and takes our focus away from the big players. \n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n  group_by(Location, Year) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\ndf$Year = as.numeric(df$Year)\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location))\np + geom_line(stat='identity') + \n  ggtitle(paste('Oil Production By Year By State in the U.S.')) + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n#### Better Version  \nThis focuses attention on the top producing states. 
It compares them to each other and shows the trend per state as well.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\nn=6 #Arbitrary at first, after trying a few, this made the most sense\ntopN = data %>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  arrange(-ThousandBarrel) %>%\n  top_n(n)\n\ndf = data %>%\n  filter(Location %in% topN$Location) %>%\n  group_by(Year,Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\ndf$Year = as.numeric(df$Year)\ndf$Location = as.factor(df$Location)\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1))\np + geom_line(stat='identity') + \n  ggtitle(paste('Top',as.character(n),'States - Oil Production By Year in the U.S.')) + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + \n  facet_wrap(~Location) + \n  scale_y_continuous(labels = comma) \n\n```\n\n----\n\n## Relationship - Scatter Plot\n\n**Objective**: See if  data from Alaska and California is correlated (This probably isn't important but it allows us to use the same data).\n\n### Which of these views would you rather see?\n\n#### Poor Version  \nLots of completely irrelevant data! Size of the point should have nothing to do with the year.  \n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\nstatesList = c('Alaska','California')\ndf = data %>%\n  filter(Location %in% statesList) %>%\n  spread(Location,ThousandBarrel) %>%\n  select(Alaska,California,Month,Year)\n\np = ggplot(df,aes(x=Alaska,y=California,col=Month,size=Year))\np + geom_point() + \n  scale_y_continuous(labels = comma) +\n  scale_x_continuous(labels = comma) +\n  ggtitle('Oil Production - CA vs. AK') + \n  theme(plot.title = element_text(hjust = 0.5))\n\n```\n\n#### Better Version  \nThe trend line is nice because it helps to visualize the relationship even more. While it can sometimes be misleading, it makes sense with our current data.  
\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n  filter(Location %in% statesList) %>%\n  spread(Location,ThousandBarrel) %>%\n  select(Alaska,California,Year)\n\np = ggplot(df,aes(x=Alaska,y=California))\np + geom_point() + \n  scale_y_continuous(labels = comma) +\n  scale_x_continuous(labels = comma) +\n  ggtitle('Monthly Thousand Barrel Oil Production 1981-2016 CA vs. AK') + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  geom_smooth(method='lm')\n\n```\n\n\n\n\n## Distribution - Boxplot \n\n**Objective**: Examine the range of production by state and year over the time period to give us an idea of the variance.\n\n### Which of these views would you rather see?\n\n#### Poor Version  \n\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n  group_by(Year,Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Location,y=ThousandBarrel))\np + geom_boxplot() + \n  ggtitle('Distribution of Oil Production by State')\n\n```\n\n\n#### Better Version  \nThis gives a nice ranking to the plot while still showing their distributions. While it was semi-apparent in the line charts, the variance of Texas is huge compared to the others! We could take this a step further and separate out the big players from the smaller players.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\np = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel))\np + geom_boxplot() + \n  scale_y_continuous(labels = comma) +\n  ggtitle('Distribution of Annual Oil Production By State (1981 - 2016)') + \n  coord_flip()\n```\n\n\n## Composition - Stacked Bar \n\n**Objective**: Check out the composition of total production by state.\n\n### Which of these views would you rather see?\n\n#### Poor Version  \nMy favorite, the beautiful pie chart! 
There's nothing better than this...\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))\n\ndf$ThousandBarrel = round(100*df$ThousandBarrel,0)\n\nlibrary(plotrix)\npie(x=df$ThousandBarrel,labels=df$Location,explode=0.1,col=rainbow(nrow(df)),main='Percentage of Oil Production by State')\n\n```\n\n\n#### Better Version  \nThe 1980's and 2010's will be missing years in terms of a \"decade\" due to the data provided (and it's only 2017). While the percentage labels are slightly off center, it's certainly much better than the pie chart. It's not quite \"apples-to-apples\" for a comparison because I created different decades, but you get the idea.\n\nI also created an \"Other\" category in order to simplify the output. When you are doing comparisons, it's typically a good idea to find a way to reduce the number of variables in the output while not removing data by dropping it completely.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndata$Decade = '1980s'\ndata$Decade[data$Year >= 1990] = '1990s'\ndata$Decade[data$Year >= 2000] = '2000s'\ndata$Decade[data$Year >= 2010] = '2010s'\ndata$Decade = as.factor(data$Decade)\n\ntop5 = data %>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  arrange(-ThousandBarrel) %>%\n  top_n(5) %>%\n  select(Location)\n\ntop5List = top5$Location\n\ndata$State = \"Other\"\n\nfor(i in 1:length(top5List)){\n  data$State[data$Location == top5List[i]] = top5List[i]\n}\n\ndf = data %>%\n  group_by(Decade,State) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))\n\ndf$ThousandBarrel = round(df$ThousandBarrel,3)\ndf$text = paste(round(100*df$ThousandBarrel,0),'%', sep='')\n\np = 
ggplot(df,aes(x=Decade,y=ThousandBarrel,col=reorder(State,ThousandBarrel),fill=reorder(State,ThousandBarrel)))\np + geom_bar(stat='identity') + \n  geom_text(aes(label=text),col='Black',size = 4, hjust = 0.5, vjust = 3, position = \"stack\") + \n  scale_y_continuous(labels = percent) +\n  ggtitle('Percentage of Top Oil Producing States by Decade') + \n  guides(fill=guide_legend(title='State'),col=guide_legend(title='State')) + \n  theme(plot.title = element_text(hjust = 0.5))\n\n```\n\n\n\n\n### Some other fun concepts are below!  \nSome of them are nice, others are terrible! I won't comment on any of them, but I felt it was necessary to include some other ideas I toyed around with. \n\nHave fun with your data visualizations. The charts I showed here are extremely simple. Being creative by using things other than R wind up making visuals people can remember. There are plenty of examples around, but they all tend to follow basic principles of design. There are ***A TON*** of good books out there on this topic. 
\n\nNow it's your turn!\n\n\n```{r}\ndf = data %>% \n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  arrange(-ThousandBarrel)\np = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel))\np + geom_bar(stat='identity') + \n  ggtitle('Oil Production 1981 - 2016 By Location') + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  coord_flip()\n```\n\n\n\n\n\n```{r}\ntop10 = data %>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n  arrange(-ThousandBarrel) %>%\n  top_n(10)\nprint(top10)\n\ndf = data %>% \n  group_by(Location,Year) %>%\n  filter(Location %in% top10$Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel)) \np = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location,fill=Location))\np + geom_bar(stat='identity') + \n  ggtitle('Oil Production - Top 10 States') + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n\n\n```{r, fig.width=10}\ndf = data %>%\n  filter(Year == 1990)%>%\n  group_by(Location) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\ndf$Location = tolower(df$Location)\n\n#Add States without data\nStates = data.frame(Location = tolower(as.character(state.name)))\nmissingStates = States$Location[!(States$Location %in% df$Location)]\nappendData = data.frame(Location=missingStates,ThousandBarrel=0)\ndf = rbind(df,appendData)\n\nstates_map <- map_data(\"state\")\n\nggplot(df, aes(map_id = Location)) + \n    geom_map(aes(fill=ThousandBarrel), map = states_map) +\n    expand_limits(x = states_map$long, y = states_map$lat)\n\n```\n\n\n```{r}\ndf = data %>% \n  filter(Location == 'Texas') %>%\n  group_by(Year,Month) %>%\n  summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Month,y=ThousandBarrel))\np + geom_line(stat='identity',aes(group=Year,col=Year)) + \n  ggtitle('Oil Production By Year in the U.S.') + \n  theme(plot.title = element_text(hjust = 0.5)) + \n  
theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)",
 4 |     "created" : 1490662963331.000,
 5 |     "dirty" : false,
 6 |     "encoding" : "UTF-8",
 7 |     "folds" : "",
 8 |     "hash" : "289843358",
 9 |     "id" : "D09CC3F1",
10 |     "lastKnownWriteTime" : 1490664979,
11 |     "last_content_update" : 1490664979460,
12 |     "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 3.Rmd",
13 |     "project_path" : "Data Visualization - Part 3.Rmd",
14 |     "properties" : {
15 |         "last_setup_crc32" : "BEDB844B56df664a",
16 |         "tempName" : "Untitled1"
17 |     },
18 |     "relative_order" : 4,
19 |     "source_on_save" : false,
20 |     "source_window" : "",
21 |     "type" : "r_markdown"
22 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/s-21D18981/EB01FDAC:
--------------------------------------------------------------------------------
 1 | {
 2 |     "collab_server" : "",
 3 |     "contents" : "---\ntitle: \"Data Visualization - Part 1\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 14, 2017\"\noutput:\n  pdf_document: default\n  html_document: default\nsubtitle: The Good, The Bad and The Ugly\n---\n\n```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE}\nlibrary(png)\nlibrary(grid)\n```\n----  \n\n# Introduction to Data Visualization\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/title_photo.png\")\ngrid.raster(img)\n```\n\nThe topic of data visualization is very popular in the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach \n$7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate.  \n\nIn short, are you adding value to your work or are you simply adding this to make it seem ***less boring?*** Let's take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy.\n\n---- \n\nI have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post.\n\n### One author at Vox wanted to show the cause of death in all of Shakespeare\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/bad-pie1.png\")\ngrid.raster(img)\n```\n    \n\n**Is this not insane!?!?!**  \n\nUsing a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. 
While wordles are not ideal for any work requiring exact proportions, it does make for a great visual in this article. [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html).\n  \n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/bad-pie1-fix.png\")\ngrid.raster(img)\n```\n----  \n\nTo be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, I'll walk through my theory and not worry too much about aesthetics (save that for a time when you're getting paid).\n\n----\n\n### The Good, The Bad, The Ugly  \n\n**\"The Good\" visualizations:**  \n\n- Clearly illustrate a point  \n- Are tailored to the appropriate audience  \n    - Analysts may want detail  \n    - Executives may want a high-level view  \n- Are tailored to the presentation medium  \n    - A piece in an academic journal can be analyzed slowly and carefully  \n    - A slide in front of 5,000 people in a conference will be glanced at quickly  \n- Are memorable to those who care about the material  \n- Make an impact which increases the understanding of the subject matter  \n\n**\"The Bad\" visualizations:**  \n\n- Are difficult to interpret  \n- Are unintentionally misleading  \n- Contain redundant and boring information  \n\n**\"The Ugly\" visualizations:**  \n\n- Are almost impossible to interpret  \n- Are filled with completely worthless information  \n- Are intentionally created to mislead the audience  \n- Are inaccurate  \n\n### Coming soon:  \n\n- Determining whether or not you need a visualization  \n- Choosing the type of plot to use depending on the use case  \n- Introduction to the ggplot2 in R and how it works  \n- Visualization beyond the standard charts and graphs  \n\nAs 
always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)",
 4 |     "created" : 1489685598305.000,
 5 |     "dirty" : false,
 6 |     "encoding" : "UTF-8",
 7 |     "folds" : "",
 8 |     "hash" : "1337660815",
 9 |     "id" : "EB01FDAC",
10 |     "lastKnownWriteTime" : 1489685647,
11 |     "last_content_update" : 1489685647034,
12 |     "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 1.Rmd",
13 |     "project_path" : "Data Visualization - Part 1.Rmd",
14 |     "properties" : {
15 |         "last_setup_crc32" : "",
16 |         "tempName" : "Untitled1"
17 |     },
18 |     "relative_order" : 3,
19 |     "source_on_save" : false,
20 |     "source_window" : "",
21 |     "type" : "r_markdown"
22 | }


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/sdb/s-21D18981/lock_file:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/.Rproj.user/3B0EFA5/sdb/s-21D18981/lock_file


--------------------------------------------------------------------------------
/.Rproj.user/3B0EFA5/session-persistent-state:
--------------------------------------------------------------------------------
1 | virtual-session-id="BEDB844B"
2 | 


--------------------------------------------------------------------------------
/.Rproj.user/shared/notebooks/paths:
--------------------------------------------------------------------------------
1 | /Users/stoltzmanconsulting/Documents/Git-Repositories/GitHub/Data-Science-Certificate/Data-Visualization-Lesson/Data Visualization - Part 3.Rmd="95580727"
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 


--------------------------------------------------------------------------------
/Data Visualization - Part 1.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Data Visualization - Part 1"
 3 | author: "Scott Stoltzman"
 4 | date: "March 14, 2017"
 5 | output:
 6 |   pdf_document: default
 7 |   html_document: default
 8 | subtitle: The Good, The Bad and The Ugly
 9 | ---
10 | 
11 | ```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE}
12 | library(png)
13 | library(grid)
14 | ```
15 | ----  
16 | 
17 | # Introduction to Data Visualization
18 | 
19 | ```{r, fig.align='center',echo=FALSE}
20 | img <- readPNG("images/title_photo.png")  # load the title image from disk
21 | grid.raster(img)                          # draw it on the current graphics device
22 | ```
23 | 
24 | The topic of data visualization is very popular in the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach 
25 | $7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate.  
26 | 
27 | In short, are you adding value to your work or are you simply adding this to make it seem ***less boring?*** Let's take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy.
28 | 
29 | ---- 
30 | 
31 | I have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post.
32 | 
33 | ### One author at Vox wanted to show the causes of death in all of Shakespeare's plays
34 | 
35 | ```{r, fig.align='center',echo=FALSE}
36 | img <- readPNG("images/bad-pie1.png")  # load the pie-chart example image from disk
37 | grid.raster(img)                       # draw it on the current graphics device
38 | ```
39 |     
40 | 
41 | **Is this not insane!?!?!**  
42 | 
43 | Using a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. While wordles are not ideal for any work requiring exact proportions, one does make for a great visual in this article. [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html).
44 |   
45 | 
46 | ```{r, fig.align='center',echo=FALSE}
47 | img <- readPNG("images/bad-pie1-fix.png")  # load the reworked example image from disk
48 | grid.raster(img)                           # draw it on the current graphics device
49 | ```
50 | ----  
51 | 
52 | To be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, I'll walk through my theory and not worry too much about aesthetics (save that for a time when you're getting paid).
53 | 
54 | ----
55 | 
56 | ### The Good, The Bad, The Ugly  
57 | 
58 | **"The Good" visualizations:**  
59 | 
60 | - Clearly illustrate a point  
61 | - Are tailored to the appropriate audience  
62 |     - Analysts may want detail  
63 |     - Executives may want a high-level view  
64 | - Are tailored to the presentation medium  
65 |     - A piece in an academic journal can be analyzed slowly and carefully  
66 |     - A slide in front of 5,000 people at a conference will be glanced at quickly  
67 | - Are memorable to those who care about the material  
68 | - Make an impact which increases the understanding of the subject matter  
69 | 
70 | **"The Bad" visualizations:**  
71 | 
72 | - Are difficult to interpret  
73 | - Are unintentionally misleading  
74 | - Contain redundant and boring information  
75 | 
76 | **"The Ugly" visualizations:**  
77 | 
78 | - Are almost impossible to interpret  
79 | - Are filled with completely worthless information  
80 | - Are intentionally created to mislead the audience  
81 | - Are inaccurate  
82 | 
83 | ### Coming soon:  
84 | 
85 | - Determining whether or not you need a visualization  
86 | - Choosing the type of plot to use depending on the use case  
87 | - Introduction to the ggplot2 in R and how it works  
88 | - Visualization beyond the standard charts and graphs  
89 | 
90 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)


--------------------------------------------------------------------------------
/Data Visualization - Part 1._pub.html:
--------------------------------------------------------------------------------
 1 | <hr/>
 2 | 
 3 | <h1>Introduction to Data Visualization</h1>
 4 | 
 5 | <p><img src="http://i.imgur.com/UffpkhM.png" title="plot of chunk unnamed-chunk-1" alt="plot of chunk unnamed-chunk-1" style="display: block; margin: auto;" /></p>
 6 | 
 7 | <p>The topic of data visualization is very popular in the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach 
 8 | $7 Billion by the end of 2022 according to <a href="https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry">Mordor Intelligence.</a> While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it&#39;s appropriate.  </p>
 9 | 
10 | <p>In short, are you adding value to your work or are you simply adding this to make it seem <strong><em>less boring?</em></strong> Let&#39;s take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy.</p>
11 | 
12 | <hr/>
13 | 
14 | <p>I have to give credit to <a href="http://junkcharts.typepad.com/">Junk Charts</a> - it inspired a lot of this post.</p>
15 | 
16 | <h3>One author at Vox wanted to show the cause of death in all of Shakespeare</h3>
17 | 
18 | <p><img src="http://i.imgur.com/A58rWDW.png" title="plot of chunk unnamed-chunk-2" alt="plot of chunk unnamed-chunk-2" style="display: block; margin: auto;" /></p>
19 | 
20 | <p><strong>Is this not insane!?!?!</strong>  </p>
21 | 
22 | <p>Using a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. While wordles are not ideal for any work requiring exact proportions, it does make for a great visual in this article. <a href="http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html">Junk Charts Article</a>.</p>
23 | 
24 | <h2><img src="http://i.imgur.com/lb1ZM3r.png" title="plot of chunk unnamed-chunk-3" alt="plot of chunk unnamed-chunk-3" style="display: block; margin: auto;" /></h2>
25 | 
26 | <p>To be clear, I&#39;m not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don&#39;t take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, I&#39;ll walk through my theory and not worry too much about aesthetics (save that for a time when you&#39;re getting paid).</p>
27 | 
28 | <hr/>
29 | 
30 | <h3>The Good, The Bad, The Ugly</h3>
31 | 
32 | <p><strong>&ldquo;The Good&rdquo; visualizations:</strong>  </p>
33 | 
34 | <ul>
35 | <li>Clearly illustrate a point<br/></li>
36 | <li>Are tailored to the appropriate audience<br/>
37 | 
38 | <ul>
39 | <li>Analysts may want detail<br/></li>
40 | <li>Executives may want a high-level view<br/></li>
41 | </ul></li>
42 | <li>Are tailored to the presentation medium<br/>
43 | 
44 | <ul>
45 | <li>A piece in an academic journal can be analyzed slowly and carefully<br/></li>
46 | <li>A slide in front of 5,000 people in a conference will be glanced at quickly<br/></li>
47 | </ul></li>
48 | <li>Are memorable to those who care about the material<br/></li>
49 | <li>Make an impact which increases the understanding of the subject matter<br/></li>
50 | </ul>
51 | 
52 | <p><strong>&ldquo;The Bad&rdquo; visualizations:</strong>  </p>
53 | 
54 | <ul>
55 | <li>Are difficult to interpret<br/></li>
56 | <li>Are unintentionally misleading<br/></li>
57 | <li>Contain redundant and boring information<br/></li>
58 | </ul>
59 | 
60 | <p><strong>&ldquo;The Ugly&rdquo; visualizations:</strong>  </p>
61 | 
62 | <ul>
63 | <li>Are almost impossible to interpret<br/></li>
64 | <li>Are filled with completely worthless information<br/></li>
65 | <li>Are intentionally created to mislead the audience<br/></li>
66 | <li>Are inaccurate<br/></li>
67 | </ul>
68 | 
69 | <h3>Coming soon:</h3>
70 | 
71 | <ul>
72 | <li>Determining whether or not you need a visualization<br/></li>
73 | <li>Choosing the type of plot to use depending on the use case<br/></li>
74 | <li>Introduction to the ggplot2 in R and how it works<br/></li>
75 | <li>Visualization beyond the standard charts and graphs<br/></li>
76 | </ul>
77 | 
78 | <p>As always, the code used in this post is on my <a href="https://github.com/stoltzmaniac/Data-Visualization-Lesson">GitHub</a></p>
79 | 
80 | 


--------------------------------------------------------------------------------
/Data Visualization - Part 1.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Data Visualization - Part 1"
 3 | subtitle: "The Good, The Bad and The Ugly"
 4 | author: "Scott Stoltzman"
 5 | date: "March 14, 2017"
 6 | output: html_document
 7 | ---
 8 | 
 9 | 
10 | ----  
11 | 
12 | # Introduction to Data Visualization
13 | 
14 | <img src="http://i.imgur.com/UffpkhM.png" title="plot of chunk unnamed-chunk-1" alt="plot of chunk unnamed-chunk-1" style="display: block; margin: auto;" />
15 | 
16 | The topic of data visualization is very popular in the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach 
17 | $7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate.  
18 | 
19 | In short, are you adding value to your work or are you simply adding this to make it seem ***less boring?*** Let's take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy.
20 | 
21 | ---- 
22 | 
23 | I have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post.
24 | 
25 | ### One author at Vox wanted to show the cause of death in all of Shakespeare
26 | 
27 | <img src="http://i.imgur.com/A58rWDW.png" title="plot of chunk unnamed-chunk-2" alt="plot of chunk unnamed-chunk-2" style="display: block; margin: auto;" />
28 |     
29 | 
30 | **Is this not insane!?!?!**  
31 | 
32 | Using a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. While wordles are not ideal for any work requiring exact proportions, it does make for a great visual in this article. [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html).
33 |   
34 | 
35 | <img src="http://i.imgur.com/lb1ZM3r.png" title="plot of chunk unnamed-chunk-3" alt="plot of chunk unnamed-chunk-3" style="display: block; margin: auto;" />
   | 
36 | ----  
37 | 
38 | To be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, I'll walk through my theory and not worry too much about aesthetics (save that for a time when you're getting paid).
39 | 
40 | ----
41 | 
42 | ### The Good, The Bad, The Ugly  
43 | 
44 | **"The Good" visualizations:**  
45 | 
46 | - Clearly illustrate a point  
47 | - Are tailored to the appropriate audience  
48 |     - Analysts may want detail  
49 |     - Executives may want a high-level view  
50 | - Are tailored to the presentation medium  
51 |     - A piece in an academic journal can be analyzed slowly and carefully  
52 |     - A slide in front of 5,000 people in a conference will be glanced at quickly  
53 | - Are memorable to those who care about the material  
54 | - Make an impact which increases the understanding of the subject matter  
55 | 
56 | **"The Bad" visualizations:**  
57 | 
58 | - Are difficult to interpret  
59 | - Are unintentionally misleading  
60 | - Contain redundant and boring information  
61 | 
62 | **"The Ugly" visualizations:**  
63 | 
64 | - Are almost impossible to interpret  
65 | - Are filled with completely worthless information  
66 | - Are intentionally created to mislead the audience  
67 | - Are inaccurate  
68 | 
69 | ### Coming soon:  
70 | 
71 | - Determining whether or not you need a visualization  
72 | - Choosing the type of plot to use depending on the use case  
73 | - Introduction to the ggplot2 in R and how it works  
74 | - Visualization beyond the standard charts and graphs  
75 | 
76 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)
77 | 


--------------------------------------------------------------------------------
/Data Visualization - Part 2.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Data Visualization - Part 2"
  3 | author: "Scott Stoltzman"
  4 | date: "March 14, 2017"
  5 | output:
  6 |   html_document: default
  7 | subtitle: The Good, The Bad and The Ugly
  8 | ---
  9 | 
 10 | ----  
 11 | 
 12 | # Data Visualization - Part 2
 13 | 
 14 | ## A Quick Overview of the ggplot2 Package in R  
 15 | 
 16 | While it will be important to focus on theory, I want to explain the ggplot2 package because I will be using it throughout the rest of this series. Knowing how it works will keep the focus on the results rather than the code. It's an incredibly powerful package and once you wrap your head around what it's doing, your life will change for the better! There are a lot of tools out there which provide better charts, graphs and ease of use (i.e. plot.ly, d3.js, Qlik, Tableau), but ggplot2 is still a fantastic resource and I use it all of the time. 
 17 | 
 18 | In case you missed it, here's a link to [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1/)
 19 | 
 20 | ```{r, fig.align='center',echo=FALSE}
 21 | library(png)  # provides readPNG()
 22 | library(grid)  # provides grid.raster()
 23 | img = readPNG("images/title_photo_2.png")  # read the title image for this post into `img`
 24 | grid.raster(img)  # draw `img` as this chunk's centered figure (code hidden via echo=FALSE)
 25 | ```
 26 | 
 27 | 
 28 | ### Why would you use ggplot2?  
 29 | 1. More robust plotting than the base plot package
 30 | 2. Better control over aesthetics - colors, axes, background, etc.
 31 | 3. Layering
 32 | 4. Variable Mapping (aes)
 33 | 5. Automatic aggregation of data
 34 | 6. Built in formulas & plotting (geom_smooth)
 35 | 7. The list goes on and on...  
 36 | 
 37 | Basically, ggplot2 allows for a lot more customization of plots with a lot less code (the rest of it is behind the scenes). Once you are used to the syntax, there's no going back. It's faster and easier.
 38 | 
 39 | ### Why wouldn't you use ggplot2?  
 40 | 1. A bit of a learning curve
 41 | 2. Lack of user interactivity with the plots  
 42 | 
 43 | Fundamentally, ggplot2 gives the user the ability to start a plot and layer everything in. There are many ways to accomplish the same thing, so figure out what makes sense for you and stick to it.  
 44 | 
 45 | **A Basic Example: Unemployment Over Time**  
 46 | 
 47 | ```{r,results='hide', warning=FALSE, message=FALSE}
 48 | library(dplyr)
 49 | library(ggplot2)  # plotting package used for every chart in this post
 50 | 
 51 | # Load the `economics` data set that ships with ggplot2
 52 | data(economics,package='ggplot2')
 53 | ```
 54 | 
 55 | ```{r}
 56 | # Preview the first rows of `economics` to see its columns and their types
 57 | head(economics)
 58 | ```
 59 | 
 60 | 
 61 | ```{r, fig.height = 4}
 62 | # Create the plot: a line chart with `date` on the x-axis and `unemploy` on the y-axis
 63 | ggplot(data = economics) + geom_line(aes(x = date, y = unemploy))
 64 | ```
 65 | 
 66 | 
 67 | 
 68 | ### What happened to get that?  
 69 | 
 70 | - `ggplot(economics)` loaded the data frame
 71 | - `+` tells ggplot() that there is more to be added to the plot
 72 | - `geom_line()` defined the type of plot
 73 | - `aes(x = date, y = unemploy)` mapped the variables
 74 | 
 75 | The `aes()` portion is what typically throws new users off but is my favorite feature of ggplot2. In simple terms, this is what "auto-magically" brings your plot to life. You are telling ggplot2, "I want 'date' to be on the x-axis and 'unemploy' to be on the y-axis." It's pretty straightforward in this case but there are more complex use cases as well.
 76 | 
 77 | ***Side Note:*** you could have achieved the same result by mapping the variables in the ggplot() function rather than in geom_line():
 78 | `ggplot(data = economics, aes(x = date, y = unemploy)) + geom_line()`
 79 | 
 80 | ### Here's the basic formula for success:
 81 | 
 82 | - Everything in ggplot2 starts with `ggplot(data)` and utilizes `+` to add on every element thereafter
 83 | - Include your data frame (economics) in a ggplot function: `ggplot(data = economics)`  
 84 | - Input the type of plot you would like (i.e. line chart of unemployment over time): `+ geom_line(aes(x = date, y = unemploy))`
 85 |     - "geom" stands for "geometric object" and determines the type of object (there can be more than one type per plot)
 86 |     - There are ***a lot*** of types of geometric objects - check them out [here](http://docs.ggplot2.org/current/)
 87 | - Add in layers and utilize `fill` and `col` parameters within `aes()`
 88 | 
 89 | 
 90 | I'll go through some of the examples from the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). I will be using their examples but I will also explain what's going on.  
 91 | 
 92 | **Note:** I believe the intention of the author of the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html) was to illustrate how to use ggplot2 rather than doing a full demonstration of what important data visualization techniques are - so keep that in mind as I go through these examples. Some of the visuals do not line up with my best practices addressed in my [first post on data visualization](https://www.stoltzmaniac.com/data-visualization-part-1/).
 93 | 
 94 | 
 95 | As usual, some packages must be loaded.  
 96 | 
 97 | ```{r, results='hide', warning=FALSE, message=FALSE}
 98 | library(reshape2)
 99 | library(lubridate)
100 | library(dplyr)  # already attached in an earlier chunk; attaching again is harmless
101 | library(tidyr)
102 | library(ggplot2)  # already attached in an earlier chunk; attaching again is harmless
103 | library(scales)
104 | library(gridExtra)
105 | ```
106 | 
107 | ### The Scatterplot  
108 | 
109 | This is one of the most visually powerful tools for data analysis. However, you have to be careful when using it because it's primarily used by people doing analysis and not reporting (depending on what industry you're in).
110 | 
111 | The author of this chart was looking for a correlation between area and population. 
112 | 
113 | ```{r}
114 | # Use the "midwest" data set that ships with ggplot2
115 | data("midwest", package = "ggplot2")
116 | 
117 | head(midwest)  # preview the first rows
118 | ```
119 | 
120 | #### Here's the most basic version of the scatter plot  
121 | 
122 | This can be called by `geom_point()` in ggplot2
123 | 
124 | ```{r, warning=FALSE, fig.align='center',fig.height = 4}
125 | # Scatterplot: one point per row of `midwest`, with `area` on x and `poptotal` on y
126 | ggplot(data = midwest, aes(x = area, y = poptotal)) + geom_point()  # geom_point() draws the points
127 | ```
128 | 
129 | #### Here's a version with some additional features  
130 | 
131 | While the addition of the size of the points and color don't add value, it does show the level of customization that's possible with ggplot2.
132 | 
133 | ```{r, warning=FALSE,message=FALSE,fig.height = 4}
134 | ggplot(data = midwest, aes(x = area, y = poptotal)) +  # base plot: area on x, total population on y
135 | geom_point(aes(col=state, size=popdensity)) +  # points colored by state and sized by popdensity
136 |   geom_smooth(method="loess", se=F) +  # loess smoothing curve; se=F hides the confidence band
137 |   xlim(c(0, 0.1)) +  # x-axis limited to 0-0.1; rows outside the limits are dropped (the chunk sets warning=FALSE)
138 |   ylim(c(0, 500000)) +  # y-axis limited to 0-500,000
139 |   labs(subtitle="Area Vs Population", 
140 |        y="Population", 
141 |        x="Area", 
142 |        title="Scatterplot", 
143 |        caption = "Source: midwest")
144 | ```
145 | 
146 | #### Explanation:  
147 | 
148 | `ggplot(data = midwest, aes(x = area, y = poptotal)) + `    
149 | Inputs the data and maps x and y variables as area and poptotal.  
150 | 
151 | `geom_point(aes(col=state, size=popdensity)) + `  
152 | Creates a scatterplot and maps the color and size of points to state and popdensity.  
153 | 
154 | `  geom_smooth(method="loess", se=F) + `  
155 | Creates a smoothing curve to fit the data. `method` is the type of fit and `se` determines whether or not to show the confidence band around the curve.
156 | 
157 | `  xlim(c(0, 0.1)) + `  
158 | Sets the x-axis limits.  
159 | 
160 | `  ylim(c(0, 500000)) + `  
161 | Sets the y-axis limits.  
162 | 
163 | `labs(subtitle="Area Vs Population",`  
164 | 
165 | `       y="Population",`  
166 | 
167 | `       x="Area",`  
168 | 
169 | `       title="Scatterplot",`  
170 | 
171 | `       caption = "Source: midwest")`  
172 | Changes the labels of the subtitle, y-axis, x-axis, title and caption.
173 | 
174 | Notice that the legend was automatically created and placed on the right-hand side. This is also highly customizable and can be changed easily.
175 | 
176 | 
177 | ### The Density Plot  
178 | 
179 | Density plots are a great way to see how data is distributed. They are similar to histograms in a sense, but show values in terms of percentage of the total. In this example, the author used the mpg data set and is looking to see the different distributions of City Mileage based off of the number of cylinders the car has.
180 | 
181 | ```{r}
182 | # Examine the first rows of the mpg data set (available because ggplot2 is attached)
183 | head(mpg)
184 | ```
185 | 
186 | #### Sample Density Plot
187 | 
188 | ```{r,fig.height = 4}
189 | g = ggplot(mpg, aes(cty))  # base plot stored in `g`; `cty` is mapped to the x-axis
190 | g + geom_density(aes(fill=factor(cyl)), alpha=0.8) +  # one semi-transparent density curve per `cyl` value
191 |     labs(title="Density plot", 
192 |          subtitle="City Mileage Grouped by Number of cylinders",
193 |          caption="Source: mpg",
194 |          x="City Mileage",
195 |          fill="# Cylinders")  # fill= sets the title of the fill legend
196 | 
197 | ```
198 | 
199 | You'll notice one immediate difference here. The author decided to create the object `g` to equal `ggplot(mpg, aes(cty))` - this is a nice trick and will save you some time if you plan on keeping `ggplot(mpg, aes(cty))` as the fundamental plot and simply exploring other visualizations on top of it. It is also handy if you need to save the output of a chart to an image file.
200 | 
201 | `ggplot(mpg, aes(cty))` loads the mpg data and `aes(cty)` assumes `aes(x = cty)`  
202 | 
203 | `g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + `  
204 | `geom_density` kicks off a density plot and the mapping of `cyl` is used for colors. `alpha` is the transparency/opacity of the area under the curve.
205 | 
206 | `    labs(title="Density plot",`  
207 | 
208 | `         subtitle="City Mileage Grouped by Number of cylinders",`  
209 | 
210 | `         caption="Source: mpg",`  
211 | 
212 | `         x="City Mileage",`  
213 | 
214 | `         fill="# Cylinders")`  
215 | Labeling is cleaned up at the end.
216 | 
217 | 
218 | #### How would you use your new knowledge to see the density by class instead of by number of cylinders?  
219 | 
220 | ***Hint:*** `g = ggplot(mpg, aes(cty))` has already been established.
221 | 
222 | ```{r,fig.height = 4}
223 | g + geom_density(aes(fill=factor(class)), alpha=0.8) +  # reuse `g`; fill by `class` instead of `cyl`
224 |     labs(title="Density plot", 
225 |          subtitle="City Mileage Grouped by Class",
226 |          caption="Source: mpg",
227 |          x="City Mileage",
228 |          fill="Class")  # fill= sets the title of the fill legend
229 | ```
230 | Notice how I didn't have to write out `ggplot()` again because it was already stored in the object `g`.
231 | 
232 | ### The Histogram  
233 | 
234 | How could we show the city mileage in a histogram?
235 | 
236 | ```{r,fig.height = 4}
237 | g = ggplot(mpg,aes(cty))  # rebuild the base plot with `cty` on the x-axis
238 | g + geom_histogram(bins=20) +  # histogram of `cty` split into 20 bins
239 |     labs(title="Histogram", 
240 |          caption="Source: mpg",
241 |          x="City Mileage")
242 | ```  
243 | 
244 | `geom_histogram(bins=20)` plots the histogram. If `bins` isn't set, ggplot2 falls back to a default of 30 bins and prints a message suggesting you pick a better value.
245 | 
246 | 
247 | ### The Bar/Column Chart  
248 | 
249 | For all intents and purposes, bar and column charts are essentially the same. Technically, the term "column chart" can be used when the bars run vertically. The author of this chart was simply looking at the frequency of the vehicles listed in the data set.
250 | 
251 | ```{r}
252 | # Data preparation: count the rows per manufacturer, then turn the counts into a data frame
253 | freqtable <- table(mpg$manufacturer)
254 | df <- as.data.frame.table(freqtable)  # resulting columns are Var1 (manufacturer) and Freq (count)
255 | head(df)
256 | ```
257 | 
258 | 
259 | ```{r,fig.height = 4}
260 | # Set a theme: theme_set() changes the default theme for every plot drawn after this point
261 | theme_set(theme_classic())
262 | 
263 | g <- ggplot(df, aes(Var1, Freq))  # manufacturer on x, count on y
264 | g + geom_bar(stat="identity", width = 0.5, fill="tomato2") +  # stat="identity": use Freq as the bar height as-is
265 |       labs(title="Bar Chart", 
266 |            subtitle="Manufacturer of vehicles", 
267 |            caption="Source: Frequency of Manufacturers from 'mpg' dataset") +
268 |       theme(axis.text.x = element_text(angle=65, vjust=0.6))  # tilt the x-axis labels for this plot only
269 | ```
270 | 
271 | The addition of `theme_set(theme_classic())` adds a preset theme to the chart. You can create your own or select from a large list of themes. This can help set your work apart from others and save a lot of time.
272 | 
273 | However, `theme_set()` is different from the `theme(axis.text.x = element_text(angle=65, vjust=0.6))` call used inside the plot itself: `theme_set()` changes the default for every later plot, while `theme()` only adjusts the plot it is added to. In this case, the author decided to tilt the text along the x-axis. `vjust=0.6` changes how far it is spaced away from the axis line.
274 | 
275 | Within `geom_bar()` there is another new piece of information: `stat="identity"` which tells ggplot to use the actual value of `Freq`.
276 | 
277 | You may also notice that ggplot arranged all of the data in alphabetical order based off of the manufacturer. If you want to change the order, it's best to use the `reorder()` function. This next chart will use the `Freq` and `coord_flip()` to orient the chart differently. 
278 | 
279 | ```{r,fig.height = 4}
280 | g <- ggplot(df, aes(reorder(Var1,Freq), Freq))  # order manufacturers by Freq instead of alphabetically
281 | g + geom_bar(stat="identity", width = 0.5, fill="tomato2") +  # stat="identity": use Freq as the bar height as-is
282 |       labs(title="Bar Chart", 
283 |            x = 'Manufacturer',
284 |            subtitle="Manufacturer of vehicles", 
285 |            caption="Source: Frequency of Manufacturers from 'mpg' dataset") +
286 |       theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 
287 |   coord_flip()  # swap the axes so the bars run horizontally
288 | ```
289 | 
290 | Let's continue with bar charts - what if we wanted to see what `hwy` looked like by `manufacturer` and in terms of `cyl`?
291 | 
292 | ```{r,fig.height = 4}
293 | g = ggplot(mpg,aes(x=manufacturer,y=hwy,col=factor(cyl),fill=factor(cyl)))  # hwy by manufacturer; outline and fill both keyed to cyl
294 | g + geom_bar(stat='identity', position='dodge') +  # dodge: bars for different cyl values sit side by side
295 |   theme(axis.text.x = element_text(angle=65, vjust=0.6))  # tilt the x-axis labels
296 | ```
297 | 
298 | `position='dodge'` had to be used because the default setting is to stack the bars, `'dodge'` places them side by side for comparison.  
299 | 
300 | Despite the fact that the chart did what I wanted, it is very difficult to read due to how many manufacturers there are. This is where the `facet_wrap()` feature comes in handy.
301 | 
302 | ```{r}
303 | theme_set(theme_bw())  # switch the default theme to theme_bw() from here on
304 | 
305 | g = ggplot(mpg,aes(x=factor(cyl),y=hwy,col=factor(cyl),fill=factor(cyl)))  # cyl moves to the x-axis this time
306 | g + geom_bar(stat='identity', position='dodge') + 
307 |   facet_wrap(~manufacturer)  # one panel per manufacturer
308 | ```
309 | This created a much nicer view of the information. It "auto-magically" split everything out by manufacturer!
310 | 
311 | 
312 | ### Spatial Plots
313 | 
314 | Another nice feature of ggplot2 is the integration with maps and spatial plotting. In this simple example, I wanted to plot a few cities in Colorado and draw a border around them. Other than the addition of the map, ggplot simply places the dots directly on the locations via their longitude and latitude "auto-magically."
315 | 
316 | This map is created with `ggmap` which utilizes Google Maps API.
317 | 
318 | ```{r, warning=FALSE, message=FALSE}
319 | library(ggmap)  # geocode() and qmap()
320 | library(ggalt)  # geom_encircle()
321 | 
322 | foco <-  geocode("Fort Collins, CO")  # get longitude and latitude (NOTE(review): `foco` is not used again in this chunk)
323 | 
324 | # Get the Map ----------------------------------------------
325 | colo_map <- qmap("Colorado, United States",zoom = 7, source = "google")  # basemap requested from Google at zoom level 7
326 | 
327 | # Get Coordinates for Places ---------------------
328 | colo_places <- c("Fort Collins, CO",
329 |                     "Denver, CO",
330 |                     "Grand Junction, CO",
331 |                     "Durango, CO",
332 |                     "Pueblo, CO")
333 | 
334 | places_loc <- geocode(colo_places)  # get longitudes and latitudes
335 | 
336 | 
337 | # Plot the places on the Google basemap --------------------
338 | colo_map + geom_point(aes(x=lon, y=lat),  # one dot per geocoded place
339 |                              data = places_loc, 
340 |                              alpha = 0.7, 
341 |                              size = 7, 
342 |                              color = "tomato") + 
343 |                   geom_encircle(aes(x=lon, y=lat),  # outline drawn around the set of points
344 |                                 data = places_loc, size = 2, color = "blue")
345 | ```
346 | 
347 | ### Final Thoughts  
348 | 
349 | I hope you learned a lot about the basics of ggplot2 in this post. It's extremely powerful yet easy to use once you get the hang of it. The best way to really learn it is to try it out. Find some data on your own and try to manipulate it and get it plotted. Without a doubt, you will have all kinds of errors pop up, data you expect to be plotted won't show up, colors and fills will be different, etc. However, your visualizations will be leveled-up!
350 | 
351 | ### Coming soon:  
352 | 
353 | - Determining whether or not you need a visualization  
354 | - Choosing the type of plot to use depending on the use case  
355 | - Visualization beyond the standard charts and graphs  
356 | 
357 | 
358 | I made some modifications to the code, but almost all of the examples here were from [Top 50 ggplot2 Visualizations - The Master List ](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). 
359 | 
360 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)


--------------------------------------------------------------------------------
/Data Visualization - Part 2.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Data Visualization - Part 2"
  3 | author: "Scott Stoltzman"
  4 | date: "March 14, 2017"
  5 | output:
  6 |   html_document: default
  7 | subtitle: The Good, The Bad and The Ugly
  8 | ---
  9 | 
 10 | ----  
 11 | 
 12 | # Data Visualization - Part 2
 13 | 
 14 | ## A Quick Overview of the ggplot2 Package in R  
 15 | 
 16 | While it will be important to focus on theory, I want to explain the ggplot2 package because I will be using it throughout the rest of this series. Knowing how it works will keep the focus on the results rather than the code. It's an incredibly powerful package and once you wrap your head around what it's doing, your life will change for the better! There are a lot of tools out there which provide better charts, graphs and ease of use (i.e. plot.ly, d3.js, Qlik, Tableau), but ggplot2 is still a fantastic resource and I use it all of the time. 
 17 | 
 18 | In case you missed it, here's a link to [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1/)
 19 | 
 20 | <img src="http://i.imgur.com/4MX4rii.png" title="plot of chunk unnamed-chunk-1" alt="plot of chunk unnamed-chunk-1" style="display: block; margin: auto;" />
 21 | 
 22 | 
 23 | ### Why would you use ggplot2?  
 24 | 1. More robust plotting than the base plot package
 25 | 2. Better control over aesthetics - colors, axes, background, etc.
 26 | 3. Layering
 27 | 4. Variable Mapping (aes)
 28 | 5. Automatic aggregation of data
 29 | 6. Built in formulas & plotting (geom_smooth)
 30 | 7. The list goes on and on...  
 31 | 
 32 | Basically, ggplot2 allows for a lot more customization of plots with a lot less code (the rest of it is behind the scenes). Once you are used to the syntax, there's no going back. It's faster and easier.
 33 | 
 34 | ### Why wouldn't you use ggplot2?  
 35 | 1. A bit of a learning curve
 36 | 2. Lack of user interactivity with the plots  
 37 | 
 38 | Fundamentally, ggplot2 gives the user the ability to start a plot and layer everything in. There are many ways to accomplish the same thing, so figure out what makes sense for you and stick to it.  
 39 | 
 40 | **A Basic Example: Unemployment Over Time**  
 41 | 
 42 | 
 43 | ```r
 44 | library(dplyr)
 45 | library(ggplot2)  # plotting package used for every chart in this post
 46 | 
 47 | # Load the `economics` data set that ships with ggplot2
 48 | data(economics,package='ggplot2')
 49 | ```
 50 | 
 51 | 
 52 | ```r
 53 | # Preview the first rows of `economics` to see its columns and their types
 54 | head(economics)
 55 | ```
 56 | 
 57 | ```
 58 | ## # A tibble: 6 × 6
 59 | ##         date   pce    pop psavert uempmed unemploy
 60 | ##       <date> <dbl>  <int>   <dbl>   <dbl>    <int>
 61 | ## 1 1967-07-01 507.4 198712    12.5     4.5     2944
 62 | ## 2 1967-08-01 510.5 198911    12.5     4.7     2945
 63 | ## 3 1967-09-01 516.3 199113    11.7     4.6     2958
 64 | ## 4 1967-10-01 512.9 199311    12.5     4.9     3143
 65 | ## 5 1967-11-01 518.1 199498    12.5     4.7     3066
 66 | ## 6 1967-12-01 525.8 199657    12.1     4.8     3018
 67 | ```
 68 | 
 69 | 
 70 | 
 71 | ```r
 72 | # Create the plot: a line chart with `date` on the x-axis and `unemploy` on the y-axis
 73 | ggplot(data = economics) + geom_line(aes(x = date, y = unemploy))
 74 | ```
 75 | 
 76 | ![plot of chunk unnamed-chunk-4](http://i.imgur.com/BXzLJQ8.png)
 77 | 
 78 | 
 79 | 
 80 | ### What happened to get that?  
 81 | 
 82 | - `ggplot(economics)` loaded the data frame
 83 | - `+` tells ggplot() that there is more to be added to the plot
 84 | - `geom_line()` defined the type of plot
 85 | - `aes(x = date, y = unemploy)` mapped the variables
 86 | 
 87 | The `aes()` portion is what typically throws new users off but is my favorite feature of ggplot2. In simple terms, this is what "auto-magically" brings your plot to life. You are telling ggplot2, "I want 'date' to be on the x-axis and 'unemploy' to be on the y-axis." It's pretty straightforward in this case but there are more complex use cases as well.
 88 | 
 89 | ***Side Note:*** you could have achieved the same result by mapping the variables in the ggplot() function rather than in geom_line():
 90 | `ggplot(data = economics, aes(x = date, y = unemploy)) + geom_line()`
 91 | 
 92 | ### Here's the basic formula for success:
 93 | 
 94 | - Everything in ggplot2 starts with `ggplot(data)` and utilizes `+` to add on every element thereafter
 95 | - Include your data frame (economics) in a ggplot function: `ggplot(data = economics)`  
 96 | - Input the type of plot you would like (i.e. line chart of unemployment over time): `+ geom_line(aes(x = date, y = unemploy))`
 97 |     - "geom" stands for "geometric object" and determines the type of object (there can be more than one type per plot)
 98 |     - There are ***a lot*** of types of geometric objects - check them out [here](http://docs.ggplot2.org/current/)
 99 | - Add in layers and utilize `fill` and `col` parameters within `aes()`
100 | 
101 | 
102 | I'll go through some of the examples from the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). I will be using their examples but I will also explain what's going on.  
103 | 
104 | **Note:** I believe the intention of the author of the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html) was to illustrate how to use ggplot2 rather than doing a full demonstration of what important data visualization techniques are - so keep that in mind as I go through these examples. Some of the visuals do not line up with my best practices addressed in my [first post on data visualization](https://www.stoltzmaniac.com/data-visualization-part-1/).
105 | 
106 | 
107 | As usual, some packages must be loaded.  
108 | 
109 | 
110 | ```r
111 | library(reshape2)
112 | library(lubridate)
113 | library(dplyr)
114 | library(tidyr)
115 | library(ggplot2)
116 | library(scales)
117 | library(gridExtra)
118 | ```
119 | 
120 | ### The Scatterplot  
121 | 
122 | This is one of the most visually powerful tools for data analysis. However, you have to be careful when using it because it's primarily used by people doing analysis and not reporting (depending on what industry you're in).
123 | 
124 | The author of this chart was looking for a correlation between area and population. 
125 | 
126 | 
127 | ```r
128 | # Use the "midwest" data from ggplot2
129 | data("midwest", package = "ggplot2")
130 | 
131 | head(midwest)
132 | ```
133 | 
134 | ```
135 | ## # A tibble: 6 × 28
136 | ##     PID    county state  area poptotal popdensity popwhite popblack
137 | ##   <int>     <chr> <chr> <dbl>    <int>      <dbl>    <int>    <int>
138 | ## 1   561     ADAMS    IL 0.052    66090  1270.9615    63917     1702
139 | ## 2   562 ALEXANDER    IL 0.014    10626   759.0000     7054     3496
140 | ## 3   563      BOND    IL 0.022    14991   681.4091    14477      429
141 | ## 4   564     BOONE    IL 0.017    30806  1812.1176    29344      127
142 | ## 5   565     BROWN    IL 0.018     5836   324.2222     5264      547
143 | ## 6   566    BUREAU    IL 0.050    35688   713.7600    35157       50
144 | ## # ... with 20 more variables: popamerindian <int>, popasian <int>,
145 | ## #   popother <int>, percwhite <dbl>, percblack <dbl>, percamerindan <dbl>,
146 | ## #   percasian <dbl>, percother <dbl>, popadults <int>, perchsd <dbl>,
147 | ## #   percollege <dbl>, percprof <dbl>, poppovertyknown <int>,
148 | ## #   percpovertyknown <dbl>, percbelowpoverty <dbl>,
149 | ## #   percchildbelowpovert <dbl>, percadultpoverty <dbl>,
150 | ## #   percelderlypoverty <dbl>, inmetro <int>, category <chr>
151 | ```
152 | 
153 | #### Here's the most basic version of the scatter plot  
154 | 
155 | This can be called by `geom_point()` in ggplot2
156 | 
157 | 
158 | ```r
159 | # Scatterplot
160 | ggplot(data = midwest, aes(x = area, y = poptotal)) + geom_point()  #ggplot
161 | ```
162 | 
163 | <img src="http://i.imgur.com/TaneATX.png" title="plot of chunk unnamed-chunk-7" alt="plot of chunk unnamed-chunk-7" style="display: block; margin: auto;" />
164 | 
165 | #### Here's a version with some additional features  
166 | 
167 | While the addition of the size of the points and color don't add value, it does show the level of customization that's possible with ggplot2.
168 | 
169 | 
170 | ```r
171 | ggplot(data = midwest, aes(x = area, y = poptotal)) + 
172 | geom_point(aes(col=state, size=popdensity)) + 
173 |   geom_smooth(method="loess", se=F) + 
174 |   xlim(c(0, 0.1)) + 
175 |   ylim(c(0, 500000)) + 
176 |   labs(subtitle="Area Vs Population", 
177 |        y="Population", 
178 |        x="Area", 
179 |        title="Scatterplot", 
180 |        caption = "Source: midwest")
181 | ```
182 | 
183 | ![plot of chunk unnamed-chunk-8](http://i.imgur.com/JACxp6k.png)
184 | 
185 | #### Explanation:  
186 | 
187 | `ggplot(data = midwest, aes(x = area, y = poptotal)) + `    
188 | Inputs the data and maps x and y variables as area and poptotal.  
189 | 
190 | `geom_point(aes(col=state, size=popdensity)) + `  
191 | Creates a scatterplot and maps the color and size of points to state and popdensity.  
192 | 
193 | `  geom_smooth(method="loess", se=F) + `  
194 | Creates a smoothing curve to fit the data. `method` is the type of fit and `se` determines whether or not to show the confidence interval band around the curve.
195 | 
196 | `  xlim(c(0, 0.1)) + `  
197 | Sets the x-axis limits.  
198 | 
199 | `  ylim(c(0, 500000)) + `  
200 | Sets the y-axis limits.  
201 | 
202 | `labs(subtitle="Area Vs Population",`  
203 | 
204 | `       y="Population",`  
205 | 
206 | `       x="Area",`  
207 | 
208 | `       title="Scatterplot",`  
209 | 
210 | `       caption = "Source: midwest")`  
211 | Changes the labels of the subtitle, y-axis, x-axis, title and caption.
212 | 
213 | Notice that the legend was automatically created and placed on the right-hand side. This is also highly customizable and can be changed easily.
214 | 
215 | 
216 | ### The Density Plot  
217 | 
218 | Density plots are a great way to see how data is distributed. They are similar to histograms in a sense, but show values in terms of percentage of the total. In this example, the author used the mpg data set and is looking to see the different distributions of City Mileage based off of the number of cylinders the car has.
219 | 
220 | 
221 | ```r
222 | # Examine the mpg data set
223 | head(mpg)
224 | ```
225 | 
226 | ```
227 | ## # A tibble: 6 × 11
228 | ##   manufacturer model displ  year   cyl      trans   drv   cty   hwy    fl
229 | ##          <chr> <chr> <dbl> <int> <int>      <chr> <chr> <int> <int> <chr>
230 | ## 1         audi    a4   1.8  1999     4   auto(l5)     f    18    29     p
231 | ## 2         audi    a4   1.8  1999     4 manual(m5)     f    21    29     p
232 | ## 3         audi    a4   2.0  2008     4 manual(m6)     f    20    31     p
233 | ## 4         audi    a4   2.0  2008     4   auto(av)     f    21    30     p
234 | ## 5         audi    a4   2.8  1999     6   auto(l5)     f    16    26     p
235 | ## 6         audi    a4   2.8  1999     6 manual(m5)     f    18    26     p
236 | ## # ... with 1 more variables: class <chr>
237 | ```
238 | 
239 | #### Sample Density Plot
240 | 
241 | 
242 | ```r
243 | g = ggplot(mpg, aes(cty))
244 | g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + 
245 |     labs(title="Density plot", 
246 |          subtitle="City Mileage Grouped by Number of cylinders",
247 |          caption="Source: mpg",
248 |          x="City Mileage",
249 |          fill="# Cylinders")
250 | ```
251 | 
252 | ![plot of chunk unnamed-chunk-10](http://i.imgur.com/k2injTT.png)
253 | 
254 | You'll notice one immediate difference here. The author decided to create the object `g` to equal `ggplot(mpg, aes(cty))` - this is a nice trick and will save you some time if you plan on keeping `ggplot(mpg, aes(cty))` as the fundamental plot and simply exploring other visualizations on top of it. It is also handy if you need to save the output of a chart to an image file.
255 | 
256 | `ggplot(mpg, aes(cty))` loads the mpg data and `aes(cty)` assumes `aes(x = cty)`  
257 | 
258 | `g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + `  
259 | `geom_density` kicks off a density plot and the mapping of `cyl` is used for colors. `alpha` is the transparency/opacity of the area under the curve.
260 | 
261 | `    labs(title="Density plot",`  
262 | 
263 | `         subtitle="City Mileage Grouped by Number of cylinders",`  
264 | 
265 | `         caption="Source: mpg",`  
266 | 
267 | `         x="City Mileage",`  
268 | 
269 | `         fill="# Cylinders")`  
270 | Labeling is cleaned up at the end.
271 | 
272 | 
273 | #### How would you use your new knowledge to see the density by class instead of by number of cylinders?  
274 | 
275 | ***Hint:*** `g = ggplot(mpg, aes(cty))` has already been established.
276 | 
277 | 
278 | ```r
279 | g + geom_density(aes(fill=factor(class)), alpha=0.8) + 
280 |     labs(title="Density plot", 
281 |          subtitle="City Mileage Grouped by Class",
282 |          caption="Source: mpg",
283 |          x="City Mileage",
284 |          fill="Class")
285 | ```
286 | 
287 | ![plot of chunk unnamed-chunk-11](http://i.imgur.com/Kq7TY54.png)
288 | Notice how I didn't have to write out `ggplot()` again because it was already stored in the object `g`.
289 | 
290 | ### The Histogram  
291 | 
292 | How could we show the city mileage in a histogram?
293 | 
294 | 
295 | ```r
296 | g = ggplot(mpg,aes(cty))
297 | g + geom_histogram(bins=20) +
298 |     labs(title="Histogram", 
299 |          caption="Source: mpg",
300 |          x="City Mileage")
301 | ```
302 | 
303 | ![plot of chunk unnamed-chunk-12](http://i.imgur.com/rZVtc1G.png)
304 | 
305 | `geom_histogram(bins=20)` plots the histogram. If `bins` isn't set, ggplot2 will default to 30 bins and print a message suggesting you pick a better value.
306 | 
307 | 
308 | ### The Bar/Column Chart  
309 | 
310 | For all intents and purposes, bar and column charts are essentially the same. Technically, the term "column chart" can be used when the bars run vertically. The author of this chart was simply looking at the frequency of the vehicles listed in the data set.
311 | 
312 | 
313 | ```r
314 | #Data Preparation
315 | freqtable <- table(mpg$manufacturer)
316 | df <- as.data.frame.table(freqtable)
317 | head(df)
318 | ```
319 | 
320 | ```
321 | ##        Var1 Freq
322 | ## 1      audi   18
323 | ## 2 chevrolet   19
324 | ## 3     dodge   37
325 | ## 4      ford   25
326 | ## 5     honda    9
327 | ## 6   hyundai   14
328 | ```
329 | 
330 | 
331 | 
332 | ```r
333 | #Set a theme
334 | theme_set(theme_classic())
335 | 
336 | g <- ggplot(df, aes(Var1, Freq))
337 | g + geom_bar(stat="identity", width = 0.5, fill="tomato2") + 
338 |       labs(title="Bar Chart", 
339 |            subtitle="Manufacturer of vehicles", 
340 |            caption="Source: Frequency of Manufacturers from 'mpg' dataset") +
341 |       theme(axis.text.x = element_text(angle=65, vjust=0.6))
342 | ```
343 | 
344 | ![plot of chunk unnamed-chunk-14](http://i.imgur.com/OtF2saP.png)
345 | 
346 | The addition of `theme_set(theme_classic())` adds a preset theme to the chart. You can create your own or select from a large list of themes. This can help set your work apart from others and save a lot of time.
347 | 
348 | However, `theme_set()` is different from the `theme(axis.text.x = element_text(angle=65, vjust=0.6))` call used inside the plot itself: `theme_set()` changes the default theme for every plot that follows, while `theme()` only adjusts the plot it is added to. In this case, the author decided to tilt the text along the x-axis. `vjust=0.6` changes how far it is spaced away from the axis line.
349 | 
350 | Within `geom_bar()` there is another new piece of information: `stat="identity"` which tells ggplot to use the actual value of `Freq`.
351 | 
352 | You may also notice that ggplot arranged all of the data in alphabetical order based off of the manufacturer. If you want to change the order, it's best to use the `reorder()` function. This next chart will use the `Freq` and `coord_flip()` to orient the chart differently. 
353 | 
354 | 
355 | ```r
356 | g <- ggplot(df, aes(reorder(Var1,Freq), Freq))
357 | g + geom_bar(stat="identity", width = 0.5, fill="tomato2") + 
358 |       labs(title="Bar Chart", 
359 |            x = 'Manufacturer',
360 |            subtitle="Manufacturer of vehicles", 
361 |            caption="Source: Frequency of Manufacturers from 'mpg' dataset") +
362 |       theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 
363 |   coord_flip()
364 | ```
365 | 
366 | ![plot of chunk unnamed-chunk-15](http://i.imgur.com/lQkbQjO.png)
367 | 
368 | Let's continue with bar charts - what if we wanted to see what `hwy` looked like by `manufacturer` and in terms of `cyl`?
369 | 
370 | 
371 | ```r
372 | g = ggplot(mpg,aes(x=manufacturer,y=hwy,col=factor(cyl),fill=factor(cyl)))
373 | g + geom_bar(stat='identity', position='dodge') + 
374 |   theme(axis.text.x = element_text(angle=65, vjust=0.6))
375 | ```
376 | 
377 | ![plot of chunk unnamed-chunk-16](http://i.imgur.com/eLaSXr7.png)
378 | 
379 | `position='dodge'` had to be used because the default setting is to stack the bars, `'dodge'` places them side by side for comparison.  
380 | 
381 | Despite the fact that the chart did what I wanted, it is very difficult to read due to how many manufacturers there are. This is where the `facet_wrap()` feature comes in handy.
382 | 
383 | 
384 | ```r
385 | theme_set(theme_bw())
386 | 
387 | g = ggplot(mpg,aes(x=factor(cyl),y=hwy,col=factor(cyl),fill=factor(cyl)))
388 | g + geom_bar(stat='identity', position='dodge') + 
389 |   facet_wrap(~manufacturer)
390 | ```
391 | 
392 | ![plot of chunk unnamed-chunk-17](http://i.imgur.com/wpsQt81.png)
393 | This created a much nicer view of the information. It "auto-magically" split everything out by manufacturer!
394 | 
395 | 
396 | ### Spatial Plots
397 | 
398 | Another nice feature of ggplot2 is the integration with maps and spatial plotting. In this simple example, I wanted to plot a few cities in Colorado and draw a border around them. Other than the addition of the map, ggplot simply places the dots directly on the locations via their longitude and latitude "auto-magically."
399 | 
400 | This map is created with `ggmap`, which utilizes the Google Maps API.
401 | 
402 | 
403 | ```r
404 | library(ggmap)
405 | library(ggalt)
406 | 
407 | foco <-  geocode("Fort Collins, CO")  # get longitude and latitude
408 | 
409 | # Get the Map ----------------------------------------------
410 | colo_map <- qmap("Colorado, United States",zoom = 7, source = "google")   
411 | 
412 | # Get Coordinates for Places ---------------------
413 | colo_places <- c("Fort Collins, CO",
414 |                     "Denver, CO",
415 |                     "Grand Junction, CO",
416 |                     "Durango, CO",
417 |                     "Pueblo, CO")
418 | 
419 | places_loc <- geocode(colo_places)  # get longitudes and latitudes
420 | 
421 | 
422 | # Plot on the Google map -----------------------------------
423 | colo_map + geom_point(aes(x=lon, y=lat),
424 |                              data = places_loc, 
425 |                              alpha = 0.7, 
426 |                              size = 7, 
427 |                              color = "tomato") + 
428 |                   geom_encircle(aes(x=lon, y=lat),
429 |                                 data = places_loc, size = 2, color = "blue")
430 | ```
431 | 
432 | ![plot of chunk unnamed-chunk-18](http://i.imgur.com/rmhVRiD.png)
433 | 
434 | ### Final Thoughts  
435 | 
436 | I hope you learned a lot about the basics of ggplot2 in this. It's extremely powerful but yet easy to use once you get the hang of it. The best way to really learn it is to try it out. Find some data on your own and try to manipulate it and get it plotted. Without a doubt, you will have all kinds of errors pop up, data you expect to be plotted won't show up, colors and fills will be different, etc. However, your visualizations will be leveled-up!
437 | 
438 | ### Coming soon:  
439 | 
440 | - Determining whether or not you need a visualization  
441 | - Choosing the type of plot to use depending on the use case  
442 | - Visualization beyond the standard charts and graphs  
443 | 
444 | 
445 | I made some modifications to the code, but almost all of the examples here were from [Top 50 ggplot2 Visualizations - The Master List ](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). 
446 | 
447 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)
448 | 


--------------------------------------------------------------------------------
/Data Visualization - Tropical Storms.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Tropical Storm Data"
  3 | author: "Scott Stoltzman"
  4 | date: "9/12/2017"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, fig.align = 'center')
 10 | ```
 11 | ```{r load_libraries}
 12 | library(tidyverse)
 13 | library(ggthemes)
 14 | library(ggmap)
 15 | library(htmlwidgets)
 16 | ```
 17 | 
 18 | # Exploratory Data Analysis of Tropical Storms in R  
 19 | 
 20 | The disastrous impact of recent hurricanes, Harvey and Irma, generated a large influx of data within the online community. I was curious as to what the history of hurricanes and tropical storms looked like so I found a data set on [data.world](https://data.world/dhs/historical-tropical-storm) and started some basic Exploratory data analysis (EDA).  
 21 | 
 22 | 
 23 | EDA is crucial to starting any project. Through EDA you can start to identify errors & inconsistencies in your data, find interesting patterns, see correlations and start to develop hypotheses to test. For most people, basic spreadsheets and charts are pretty handy and provide a great place to start. They are an easy-to-use method to manipulate and visualize your data quickly. Data scientists may cringe at the idea of using a graphical user interface (GUI) to kick-off the EDA process but the reality is clear, those tools are very effective and efficient when used properly. However, if you're reading this, you're probably trying to take EDA to the next level. The best way to learn is to get your hands dirty, let's get started.  
 24 | 
 25 | 
 26 | The original source of the data can be found at [DHS.gov](https://hifld-dhs-gii.opendata.arcgis.com/datasets/3ea21accbfab4ed8b14ede2e802cc2ec_0).  
 27 | 
 28 | 
 29 | ----
 30 | 
 31 | 
 32 | #### Step 1:  Take a look at your data set and see how it is laid out  
 33 | 
 34 | 
 35 | ```{r read_data}
 36 | # data source https://data.world/dhs/historical-tropical-storm
 37 | data = read_csv('data/Historical_Tropical_Storm_Tracks.csv')
 38 | knitr::kable(head(data))
 39 | ```
 40 |   
 41 |   
 42 | Fortunately, this is a tidy data set which will make life easier and appears to be cleaned up substantially. The column names are relatively straightforward with the exception of "ID" columns.
 43 | 
 44 | The description as given by [DHS.gov](https://hifld-dhs-gii.opendata.arcgis.com/datasets/3ea21accbfab4ed8b14ede2e802cc2ec_0): 
 45 | 
 46 | >This dataset represents Historical North Atlantic and Eastern North Pacific Tropical Cyclone Tracks with 6-hourly (0000, 0600, 1200, 1800 UTC) center locations and intensities for all subtropical depressions and storms, extratropical storms, tropical lows, waves, disturbances, depressions and storms, and all hurricanes, from 1851 through 2008. These data are intended for geographic display and analysis at the national level, and for large regional areas. The data should be displayed and analyzed at scales appropriate for 1:2,000,000-scale data.  
 47 | 
 48 | 
 49 | #### Step 2:  View some descriptive statistics  
 50 | 
 51 | ```{r}
 52 | knitr::kable(summary(data %>% select(YEAR, 
 53 |                                      MONTH, 
 54 |                                      DAY,
 55 |                                      WIND_KTS,
 56 |                                      PRESSURE)))
 57 | ```
 58 | 
 59 | 
 60 | We can confirm that this particular data had storms from 1851 - 2010, that means the data goes back roughly 100 years before naming storms started! We can also see that the minimum pressure values are 0, which likely means it could not be measured (due to the fact zero pressure is not possible in this case). We can see that there are recorded months from January to December along with days extending from 1 to 31. Whenever you see all of the dates laid out that way, you can smile and think to yourself, "if I need to, I can put dates in an easy to use format such as YYYY-mm-dd (2017-09-12)!"  
 61 | 
 62 | 
 63 | #### Step 3: Make a basic plot  
 64 | 
 65 | 
 66 | ```{r}
 67 | df = data %>%
 68 |   filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>%
 69 |   group_by(YEAR) %>%
 70 |   summarise(Distinct_Storms = n_distinct(NAME))
 71 | 
 72 | p = ggplot(df, aes(x = YEAR, y = Distinct_Storms)) + theme_economist()
 73 | p + geom_line(size = 1.1) + 
 74 |   ggtitle("Number of Storms Per Year") + 
 75 |   geom_smooth(method='lm', se = FALSE) + 
 76 |   ylab("Storms")
 77 | ```
 78 |   
 79 | 
 80 | This is a great illustration of our data set and we can easily notice an upward trend in the number of storms over time. Before we go running to tell the world that the number of storms per year is growing, we need to drill down a bit deeper. The trend could simply be caused by more types of storms being recorded in the data set over time (we know there are hurricanes, tropical storms, waves, etc.). However, we should keep it in mind when we start to develop hypotheses.  
 81 | 
 82 | 
 83 | **You will notice the data starts at 1950 rather than 1851.** I made this choice because storms were not named until this point so it would be difficult to try and count the unique storms per year. It could likely be done by finding a way to utilize the "ID" columns. However, this is a preliminary analysis so I didn't want to dig too deep.  
 84 | 
 85 | 
 86 | #### Step 4: Make some calculations
 87 | 
 88 | ```{r}
 89 | pct.diff = function(x){round((x-dplyr::lag(x))/dplyr::lag(x),2)}  # fractional change vs. the previous element, rounded to 2 places (first element is NA); dplyr::lag is named explicitly because stats::lag does not shift a plain vector
 90 | act.diff = function(x){round((x-dplyr::lag(x)),2)}  # absolute change vs. the previous element, rounded to 2 places (first element is NA); same explicit dplyr::lag as pct.diff
 91 | df = data %>%
 92 |   arrange(YEAR) %>%
 93 |   filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>%
 94 |   group_by(YEAR) %>%
 95 |   summarise(Distinct_Storms = n_distinct(NAME)) %>%
 96 |   mutate(Distinct_Storms_Change = act.diff(Distinct_Storms),
 97 |          Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) %>%
 98 |   na.omit() %>%
 99 |   arrange(YEAR)
100 | df$YEAR = factor(df$YEAR)
101 | knitr::kable(head(df,10))
102 | ```
103 |   
104 |   
105 | In this case, we can see the number of storms, nominal change and percentage change per year. These calculations help to shed light on what the growth rate looks like each year.  So we can use another summary table:  
106 | 
107 | 
108 | ```{r}
109 | knitr::kable(summary(df %>% select(-YEAR)))
110 | ```
111 |   
112 | From the table we can state the following for the given time period:  
113 | 
114 |   * The mean number of storms is 23 per year (with a minimum of 6 and maximum of 43)
115 |   * The mean change in the number of storms per year is 0.34 (with a minimum of -15 and maximum of 16)
116 |   * The mean percent change in the number of storms per year is 6% (with a minimum of -42% and maximum of 114%)
117 | 
118 | Again, we have to be careful because these numbers are in aggregate and may not tell the whole story. Dividing these into groups of storms is likely much more meaningful.  
119 | 
120 | 
121 | 
122 | 
123 | #### Step 5: Make a more interesting plot  
124 | 
125 | 
126 | ```{r}
127 | df = data %>%
128 |   filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>%
129 |   filter(grepl("H", CAT)) %>%
130 |   group_by(YEAR,CAT) %>%
131 |   summarise(Distinct_Storms = n_distinct(NAME))
132 | df$CAT = factor(df$CAT)
133 | 
134 | p = ggplot(df, aes(x = YEAR, y = Distinct_Storms, col = CAT)) + theme_economist()
135 | p + geom_line(size = 1.1) + 
136 |   scale_color_brewer(direction = -1, palette = "Spectral") + 
137 |   ggtitle("Number of Storms Per Year By Category (H)") + 
138 |   facet_wrap(~CAT, scales = "free_x") + 
139 |   geom_smooth(method = 'lm', se = FALSE, col = 'black') +
140 |   theme(axis.text.x = element_text(angle=90), legend.position = 'none') + 
141 |   ylab('Storms')
142 | ```
143 |   
144 | 
145 | Because I was most interested in hurricanes, I filtered out only the data which was classified as "H (1-5)." By utilizing a data visualization technique called "small multiples" I was able to pull out the different types and view them within the same graph. While this is possible to do in tables and spreadsheets, it's much easier to visualize this way. By holding the axes constant, we can see the majority of the storms are classified as H1 and then it appears to consistently drop down toward H5 (with very few actually being classified as H5). We can also see that most have an upward trend from 1950 - 2010. The steepest appears to be H1 (but it also flattens out over the last decade).  
146 | 
147 | 
148 | #### Step 5: Make a filtered calculation  
149 | 
150 | ```{r}
151 | df = data %>%
152 |   arrange(YEAR) %>%
153 |   filter(grepl("H", CAT)) %>%
154 |   filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>%
155 |   group_by(YEAR) %>%
156 |   summarise(Distinct_Storms = n_distinct(NAME)) %>%
157 |   mutate(Distinct_Storms_Change = act.diff(Distinct_Storms),
158 |          Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) %>%
159 |   na.omit() %>%
160 |   arrange(YEAR)
161 | knitr::kable(summary(df %>% select(-YEAR)))
162 | ```
163 | 
164 | 
165 | Now we are looking strictly at hurricane data (classified as H1-H5):  
166 | 
167 |   * The mean number of hurricanes is 13 per year (with a minimum of 4 and maximum of 24)
168 |   * The mean change in the number of hurricanes per year is 0.05 (with a minimum of -11 and maximum of 10)
169 |   * The mean percent change in the number of hurricanes per year is 8% (with a minimum of -56% and maximum of 180%)  
170 |     
171 | While it doesn't really make sense to say "we got an average growth of 0.05 hurricanes per year between 1950 and 2010" ... it may make sense to say "we saw an average growth of 8% per year in the number of hurricanes between 1950 and 2010."  
172 | 
173 | That's a great thing to put in quotes!
174 | 
175 | > During EDA we discovered an average growth of 8% per year in the number of hurricanes between 1950 and 2010.  
176 | 
177 | Be ready, as soon as you make a statement like that, you will likely have to explain how you arrived at that conclusion. That's where having an RMarkdown notebook and data online in a repository will help you out! Reproducible research is all of the hype right now.   
178 |   
179 | 
180 | #### Step 5: Try visualizing your statements  
181 | 
182 | ```{r}
183 | 
184 | df = data %>%
185 |   filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>%
186 |   filter(grepl("H", CAT)) %>%
187 |   group_by(YEAR) %>%
188 |   summarise(Distinct_Storms = n_distinct(NAME)) %>%
189 |   mutate(Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms))
190 | 
191 | p = ggplot(df,aes(x = Distinct_Storms_Pct_Change)) + theme_economist()
192 | 
193 | p1 = p + geom_histogram(bins = 20) +  # left panel: histogram of the year-over-year % change, 20 bins
194 |   ggtitle("YoY % Change Histogram") +  # titled as a histogram so it is distinguishable from the density panel (p2)
195 |   scale_x_continuous(labels = scales::percent) +  # show the fractional change as a percentage
196 |   ylab('') + xlab('YoY % Change in Hurricanes')
197 | 
198 | p2 = p + geom_density(fill='darkgrey',alpha=0.5) +
199 |   ggtitle("YoY % Change Density") +
200 |   scale_x_continuous(labels = scales::percent) +
201 |   ylab('') + xlab('YoY % Change in Hurricanes')
202 | 
203 | gridExtra::grid.arrange(p1,p2,ncol=2)
204 | ```
205 | 
206 | 
207 | A histogram and/or density plot is a great way to visualize the distribution of the data you are making statements about. This plot helps to show that we are looking at a right-skewed distribution with substantial variance. Knowing that we have n = 58 (meaning 58 years after being aggregated), it's not surprising that our histogram looks sparse and our density plot has an unusual shape. At this point, you can make a decision to jot this down, research it in depth and then attack it with full force.  
208 | 
209 | 
210 | However, that's not what we're covering in this post.  
211 | 
212 | 
213 | #### Step 6: Plot another aspect of your data
214 | 
215 | 
216 | ```{r}
217 | big_map <- get_googlemap(c(lon=-95, lat=30), zoom = 4, maptype = "terrain")
218 | ggmap(big_map, extent='panel') + 
219 |   geom_point(data = data, mapping = aes(x = LONG, y = LAT),col='red',alpha=0.1)
220 | ```
221 | 
222 | 
223 | 60K pieces of data can get out of hand quickly, we need to back this down into manageable chunks. Building on the knowledge from our last exploration, we should be able to think of a way to cut this down to get some better information. The concept of small multiples could come in handy again! Splitting the data up by type of storm could prove to be invaluable. We can also tell that we are missing
224 | 
225 | -----
226 | 
227 | ```{r}
228 | df = data %>% filter(grepl("H", CAT))  # keep only rows whose CAT contains "H" (the hurricane categories)
229 | ggmap(big_map) +  # base map fetched in the previous chunk
230 |   geom_density_2d(data = df, mapping = aes(x = LONG, y = LAT), size = 0.5) +  # 2D density contour lines of storm positions
231 |   stat_density2d(data = df,  # same density, drawn as filled polygons
232 |     aes(x = LONG, y = LAT, fill = ..level.., alpha = ..level..), size = 0.1,  # fill and transparency both follow the density level
233 |     bins = 20, geom = "polygon") + scale_fill_gradient(low = "green", high = "red",
234 |     guide = "none") + scale_alpha(range = c(0.1, 0.5), guide = "none") +  # "none" hides both legends; guide = FALSE is deprecated in current ggplot2
235 |   facet_wrap(~CAT)  # one panel per category (small multiples)
236 | ```
237 |   
238 | 
239 | After filtering the data down to hurricanes and utilizing a heatmap rather than plotting individual points we can get a better handle on what is happening where. The H4 and H5 sections are probably the most interesting. It appears as if H4 storms are more frequent on the West coast of Mexico whereas the H5 are most frequent in the Gulf of Mexico.  
240 | 
241 | 
242 | Because we're still in EDA mode, we'll continue with another plot.  
243 | 
244 | 
245 | ```{r}
246 | df = data %>% filter(!grepl("H", CAT) & !grepl("W", CAT))  # drop rows whose CAT contains "H" or "W"
247 | ggmap(big_map) +  # base map fetched in an earlier chunk
248 |   geom_density_2d(data = df, mapping = aes(x = LONG, y = LAT), size = 0.5) +  # 2D density contour lines of storm positions
249 |   stat_density2d(data = df,  # same density, drawn as filled polygons
250 |     aes(x = LONG, y = LAT, fill = ..level.., alpha = ..level..), size = 0.1,  # fill and transparency both follow the density level
251 |     bins = 20, geom = "polygon") + scale_fill_gradient(low = "green", high = "red",
252 |     guide = "none") + scale_alpha(range = c(0.1, 0.5), guide = "none") +  # "none" hides both legends; guide = FALSE is deprecated in current ggplot2
253 |   facet_wrap(~CAT)  # one panel per remaining category
254 | ```
255 | 
256 | 
257 | Here are some of the other storms from the data set. We can see that TD, TS and L have large geographical spreads. The E, SS, and SD storms are concentrated further North toward New England.  
258 | 
259 | Digging into this type of data and building probabilistic models is a fascinating field. The actuarial sciences are extremely difficult and insurance companies really need good models. Having mapped this data, it's pretty clear you could dig in and find out what parts of the country should expect what types of storms (and you've also known this just from being alive for 10+ years). More hypotheses could be formed about location at this stage and could be tested!  
260 | 
261 | 
262 | #### Step 7: Look for a relationship
263 | 
264 | 
265 | ```{r}
266 | df = data %>% 
267 |   filter(PRESSURE > 0) %>%
268 |   filter(grepl("H", CAT)) %>%
269 |   group_by(CAT,YEAR,MONTH,DAY,LAT,LONG) %>%
270 |   summarise(MEAN_WIND_KTS = mean(WIND_KTS), MEAN_PRESSURE = mean(PRESSURE)) %>%
271 |   arrange(MEAN_WIND_KTS)
272 | df$CAT = factor(df$CAT)
273 | 
274 | p = ggplot(df,aes(x=MEAN_WIND_KTS, y = MEAN_PRESSURE, fill = CAT)) + theme_economist()
275 | p + 
276 |   geom_hex(alpha = 0.8) +
277 |   scale_fill_brewer(direction = -1, palette = "Spectral") + 
278 |   scale_y_continuous(labels = scales::comma)+ 
279 |   theme(legend.position = 'right') + 
280 |   ggtitle("Wind KTS vs. Pressure by Category (H)")
281 | ```
282 |   
283 |    
284 | What is the relationship between WIND_KTS and PRESSURE? This chart helps us to see that PRESSURE and WIND_KTS are likely negatively correlated (the lower the pressure, the higher the wind speed). We can also see that the WIND_KTS is essentially the predictor in the data set which can perfectly predict how a storm is classified. Well, it turns out, that's basically the distinguishing feature when scientists are determining how to categorize these storms!  
285 | 
286 | 
287 | #### Step ........
288 | 
289 | The rest is up to you! This is a great data set and there are a lot more pieces of information lurking within it. I want people to do their own EDA and send me anything interesting! 
290 | 
291 | Some food for thought:  
292 | 
293 |   * What was the most common name for a hurricane?
294 |   * Do the names actually follow an alphabetical pattern through time? (This one is tricky)
295 |   * Can we merge this data with FEMA, charitable donations, or other aid data?
296 | 
297 |   
298 | To get you started on the first one, here's the Top 10 most common names for tropical storms. Why do you think it's Florence?
299 | 
300 | 
301 | ```{r}
302 | # Distinct years each storm name was used (NOTNAMED and SUBTROP1 entries excluded), most-used first
303 | top_names = data %>%
304 |   filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>%
305 |   group_by(NAME) %>%
306 |   summarise(Years_Used = n_distinct(YEAR)) %>% arrange(desc(Years_Used))  # top_n() below ranks by Years_Used explicitly, so no "Selecting by" message is printed
307 | p = ggplot(top_n(top_names, 10, Years_Used), aes(x = reorder(NAME, Years_Used), y = Years_Used)) + theme_economist()
308 | p + geom_bar(stat='identity') + coord_flip() + xlab('') + ggtitle('Most Used Tropical Storm Names')
309 | ```
310 | 
311 | 
312 | Thank you for reading, I hope this helps you with your own data. The code is all written in R and is located on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson). You can also find other data visualization posts and usages of ggplot2 on my blog [Stoltzmaniac](https://www.stoltzmaniac.com?utm_campaign=bottom_of_tropical_storm_post)
313 | 


--------------------------------------------------------------------------------
/Data Visualization Lesson.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Data Visualization - Introduction"
  3 | subtitle: "The Good, The Bad and The Ugly"
  4 | author: "Scott Stoltzman"
  5 | date: "March 10, 2017"
  6 | output: html_document
  7 | ---
  8 | 
  9 | ```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE}
 10 | library(png)
 11 | library(grid)
 12 | ```
 13 | ----  
 14 | 
 15 | # A Lesson on Data Visualization - Introduction  
 16 | 
 17 | ```{r, fig.align='center',echo=FALSE}
 18 | img = readPNG("images/title_photo.png")  # read the title image from the images/ folder
 19 | grid.raster(img)  # draw the image as the chunk's figure output
 20 | ```
 21 | 
 22 | The topic of data visualization is still very popular in the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach 
 23 | $7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate.  
 24 | 
 25 | This is the first post of many on how to utilize data visualization techniques effectively. I will focus primarily on use cases in R but visualizations go well beyond charts and graphs.
 26 | 
 27 | ----
 28 | 
 29 | ## Questions to ask before starting your project:  
 30 | 
 31 | 1. Is the information complex enough to require any visual aid?  
 32 | 2. Do you understand your audience well enough for this to provide value?
 33 | 3. Can you tailor the material to your audience?
 34 | 4. Will anyone learn anything by looking at it?
 35 | 5. Will it accurately reflect the data?
 36 | 
 37 | In short, are you adding value to your work or are you simply adding this to make it seem ***less boring?***
 38 | 
 39 | This introduction should whet your appetite if you're interested in learning more. This is a broad view of some data visualization techniques in R using ggplot2.
 40 | 
 41 | ---- 
 42 | 
 43 | ## Let's take a look at some examples  
 44 | 
 45 | I have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post.
 46 | 
 47 | ### One author wanted to show the cause of death in all of Shakespeare
 48 | **Could you imagine a worse way to show this??**  
 49 | 
 50 | ```{r, fig.align='center',echo=FALSE}
 51 | img = readPNG("images/bad-pie1.png")  # read the example pie chart image from the images/ folder
 52 | grid.raster(img)  # draw the image as the chunk's figure output
 53 | ```
 54 |     
 55 | 
 56 | **Is this not insane!?!?!**  
 57 | 
 58 | Using a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. I hate wordles but due to the fact that the article wasn't trying to show the exact proportions, a wordle easily illustrates the point (a point made by [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html)).
 59 |   
 60 | 
 61 | ```{r, fig.align='center',echo=FALSE}
 62 | img = readPNG("images/bad-pie1-fix.png")  # read the alternative (wordle) image from the images/ folder
 63 | grid.raster(img)  # draw the image as the chunk's figure output
 64 | ```
 65 | ----  
 66 | 
 67 | To be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, we'll look at the theory and not worry too much about aesthetics (save that for a time when you're getting paid).
 68 | 
 69 | 
 70 | Bad visualizations can be:  
 71 | 
 72 | - Difficult or impossible to interpret
 73 | - Filled with completely worthless information
 74 | - Misleading (intentionally or unintentionally)
 75 | - Redundant and boring
 76 | - Inaccurate
 77 | 
 78 | Let's load up some libraries and get started.
 79 | 
 80 | ```{r libraryPrep, results='hide', warning=FALSE, message=FALSE}
 81 | library(ggplot2)
 82 | library(dplyr)
 83 | library(tidyr)
 84 | library(lubridate)
 85 | library(scales)
 86 | ```
 87 | 
 88 | ## Decide on what you're trying to accomplish first.  
 89 | 
 90 | Ask yourself the following questions to help drive your decision:  
 91 | 
 92 | - Are you making a comparison?
 93 | - Are you finding a relationship?
 94 | - Are you showing a distribution?
 95 | - Are you finding a trend over time?
 96 | - Are you showing composition?
 97 |   
 98 | Once you know which question you are asking, it will keep your mind focused on the outcome and will quickly narrow down your charting options.
 99 | 
100 | #### Rule of Thumb  
101 | 
102 | - **Trend: ** Column, Line  
103 | - **Comparison: ** Area, Bar, Bullet, Column, Line, Scatter  
104 | - **Relationship: ** Line, Scatter  
105 | - **Distribution: ** Bar, Boxplot, Column  
106 | - **Composition: ** Donut, Pie, Stacked Bar, Stacked Column  
107 |   
108 | Obviously, there are choices beyond these and you need to think through your choice wisely. 
109 | 
110 | Side Note: I ***hate*** donut and pie charts! When used properly, they're terrific! However, I'm very used to gagging every time one appears on a projector screen due to how frequently they're used inappropriately.
111 | 
112 | For this project, I'll use some oil production data that I found while digging through http://data.world (pretty great site). The data can be found [here](http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm)  
113 | 
114 | 
115 | ```{r dataLoading, results='hide', warning=FALSE, message=FALSE}
116 | # Custom data preparation: data_preparation.R lives in the GitHub repo linked at the bottom of this post
117 | # and presumably defines getData(); its result is stored as `data` and used by every chunk below.
118 | source('data_preparation.R')
119 | data = getData()
120 | ```
121 | 
122 | ```{r}
123 | head(data)  # preview the first rows of the loaded data
124 | ```
125 | 
126 | ----  
127 | 
128 | ## Trend - Line Chart
129 | 
130 | **Objective:** See what the oil production in the US looked like from 1981 - 2016 by year. I want to illustrate the changes over the time period. This is a very high-level view and only shows us a decline followed by a ramp up at the end of the period.
131 | 
132 | I decided to use a line chart to show the trend over time. When using discrete data you should generally use a column chart, so that nobody assumes the values changed linearly between the plotted years. However, a line paints a much clearer picture here and is not misleading.
133 | 
134 | ### Which of these views would you rather see?
135 | 
136 | #### Poor Version  
137 | The x-axis is a disaster and the y-axis isn't formatted well. While it gets the point across, it's still almost worthless.
138 | 
139 | 
140 | ```{r,fig.align='center', fig.width=8}
141 | df = data %>% 
142 |   group_by(Year) %>%
143 |   summarise(ThousandBarrel = sum(ThousandBarrel))  # total production per Year across all locations
144 | # "Poor" example on purpose: vague title, blank axis labels, default tick formatting. group=1 joins all years into one line.
145 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) 
146 | p + geom_line(stat='identity') + 
147 |   ggtitle('Oil Production Over Time') + 
148 |   theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + 
149 |   xlab('') + ylab('')
150 | ```
151 | 
152 | #### Better Version  
153 | The title gives us a much better understanding of what we're looking at. The chart is slightly wider and the axes are formatted to be legible.
154 | 
155 | ```{r,fig.align='center', fig.width=12}
156 | # Reuses the yearly totals in `df`; adds a descriptive title, comma-formatted y-axis and rotated x labels
157 | p = ggplot(df, aes(x = Year, y = ThousandBarrel, group = 1))
158 | p + geom_line() +
159 |   scale_y_continuous(labels = comma) +
160 |   labs(title = 'Thousand Barrel Oil Production By Year in the U.S.') +
161 |   theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5), axis.text.x = element_text(angle = 90, hjust = 1))
162 | ```
163 | 
164 | 
165 | ## Comparison - Line Chart  
166 | 
167 | **Objective**: Identify which states affected the trend the most. Evaluate them simultaneously in order to paint the picture and compare them.  
168 | 
169 | From this visual you can see the top states are Alaska, California, Louisiana, Oklahoma, Texas and Wyoming. Texas seems to break the mold quite drastically and drove the spike which occurred after 2010.
170 | 
171 | ### Which of these views would you rather see?
172 | 
173 | #### Poor Version  
174 | There are far too many colors going on here. Everything at the bottom of the chart is relatively useless and takes our focus away from the big players. 
175 | 
176 | ```{r,warning=FALSE,fig.width=10,message=FALSE}
177 | # Yearly production for every location (all of them are kept, hence the cluttered "poor" chart)
178 | df = data %>%
179 |   group_by(Location, Year) %>%
180 |   summarise(ThousandBarrel = sum(ThousandBarrel))
181 | # Convert via as.character(): as.numeric() on a factor would return level codes instead of the years
182 | df$Year = as.numeric(as.character(df$Year))
183 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location))
184 | p + geom_line(stat='identity') + 
185 |   ggtitle(paste('Oil Production By Year By State in the U.S.')) + 
186 |   theme(plot.title = element_text(hjust = 0.5)) + 
187 |   theme(axis.text.x = element_text(angle = 90, hjust = 1))
188 | ```
189 | 
190 | #### Better Version  
191 | This focuses attention on the top producing states. It compares them to each other and shows the trend per state as well.
192 | 
193 | ```{r,warning=FALSE,fig.width=10,message=FALSE}
194 | n=6 #Arbitrary at first, after trying a few, this made the most sense
195 | topN = data %>%
196 |   group_by(Location) %>%
197 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
198 |   arrange(desc(ThousandBarrel)) %>%
199 |   top_n(n, ThousandBarrel)  # rank explicitly by total production instead of letting top_n() pick the last column
200 | # Yearly totals restricted to the n highest-producing locations
201 | df = data %>%
202 |   filter(Location %in% topN$Location) %>%
203 |   group_by(Year,Location) %>%
204 |   summarise(ThousandBarrel = sum(ThousandBarrel))
205 | # Convert via as.character(): as.numeric() on a factor would return level codes instead of the years
206 | df$Year = as.numeric(as.character(df$Year))
207 | df$Location = as.factor(df$Location)
208 | 
209 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1))
210 | p + geom_line(stat='identity') + 
211 |   ggtitle(paste('Top',as.character(n),'States - Oil Production By Year in the U.S.')) + 
212 |   theme(plot.title = element_text(hjust = 0.5)) + 
213 |   theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
214 |   facet_wrap(~Location) + 
215 |   scale_y_continuous(labels = comma) 
216 | 
217 | ```
218 | 
219 | 
220 | 
221 | ## Relationship - Scatter Plot
222 | 
223 | **Objective**: See if Alaska and California data is correlated (This probably isn't important but it allows us to use the same data).
224 | 
225 | ### Which of these views would you rather see?
226 | 
227 | #### Poor Version  
228 | Lots of completely irrelevant data! Size of the point should have nothing to do with the year.  
229 | 
230 | ```{r,warning=FALSE,fig.width=10,message=FALSE}
231 | statesList = c('Alaska','California')  # the two states compared; also used by the "better" chunk below
232 | df = data %>%
233 |   filter(Location %in% statesList) %>%
234 |   spread(Location,ThousandBarrel) %>%
235 |   select(Alaska,California,Month,Year)
236 | # "Poor" example on purpose: colour is mapped to Month and point size to Year, neither of which helps the comparison
237 | p = ggplot(df,aes(x=Alaska,y=California,col=Month,size=Year))
238 | p + geom_point() + 
239 |   scale_y_continuous(labels = comma) +
240 |   scale_x_continuous(labels = comma) +
241 |   ggtitle('Oil Production - CA vs. AK') + 
242 |   theme(plot.title = element_text(hjust = 0.5))
243 | 
244 | ```
245 | 
246 | #### Better Version  
247 | The trend line is nice because it helps to visualize the relationship even more. While it can sometimes be misleading, it makes sense with our current data.  
248 | 
249 | ```{r,warning=FALSE,fig.width=10,message=FALSE}
250 | # Alaska and California production side by side (one column per state) for the states in statesList
251 | df = data %>%
252 |   filter(Location %in% statesList) %>%
253 |   spread(Location, ThousandBarrel) %>%
254 |   select(Alaska, California, Year)
255 | p = ggplot(df, aes(x = Alaska, y = California))
256 | p + geom_point() +
257 |   geom_smooth(method = 'lm') +  # linear trend line drawn over the points
258 |   scale_x_continuous(labels = comma) +
259 |   scale_y_continuous(labels = comma) +
260 |   labs(title = 'Monthly Thousand Barrel Oil Production 1981-2016 CA vs. AK') +
261 |   theme(plot.title = element_text(hjust = 0.5))
262 | 
263 | ```
264 | 
265 | 
266 | 
267 | 
268 | ## Distribution - Boxplot 
269 | 
270 | **Objective**: Examine the range of production by state and year over the time period to give us an idea of the variance.
271 | 
272 | ### Which of these views would you rather see?
273 | 
274 | #### Poor Version  
275 | 
276 | 
277 | ```{r,warning=FALSE,fig.width=10,message=FALSE}
278 | df = data %>%
279 |   group_by(Year,Location) %>%
280 |   summarise(ThousandBarrel = sum(ThousandBarrel))  # one total per Year and Location; `df` is reused by the next chunk
281 | # "Poor" example on purpose: locations appear in their default order and the y-axis is unformatted
282 | p = ggplot(df,aes(x=Location,y=ThousandBarrel))
283 | p + geom_boxplot() + 
284 |   ggtitle('Distribution of Oil Production by State')
285 | 
286 | ```
287 | 
288 | 
289 | #### Better Version  
290 | This gives a nice ranking to the plot while still showing their distributions. While it was semi-apparent in the line charts, the variance of Texas is huge compared to the others! We could take this a step further and separate out the big players from the smaller players.
291 | 
292 | ```{r,warning=FALSE,fig.width=10,message=FALSE}
293 | p = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel))  # reorder() ranks the locations by their ThousandBarrel values
294 | p + geom_boxplot() + 
295 |   scale_y_continuous(labels = comma) +
296 |   ggtitle('Distribution of Annual Oil Production By State (1981 - 2016)') + 
297 |   coord_flip()  # flip the axes so the location names run down the vertical axis
298 | ```
299 | 
300 | 
301 | ## Composition - Stacked Bar 
302 | 
303 | **Objective**: Check out the composition of total production by state.
304 | 
305 | ### Which of these views would you rather see?
306 | 
307 | #### Poor Version  
308 | My favorite, the beautiful pie chart! There's nothing better than this...
309 | 
310 | ```{r,warning=FALSE,fig.width=10,message=FALSE}
311 | # Each location's share of total production, as a whole-number percentage
312 | df = data %>%
313 |   group_by(Location) %>%
314 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
315 |   mutate(ThousandBarrel = round(100 * (ThousandBarrel / sum(ThousandBarrel)), 0))
316 | 
317 | # `explode` removed: it is not an argument of graphics::pie(), so it had no effect beyond raising (suppressed) warnings
318 | library(plotrix)
319 | pie(x=df$ThousandBarrel,labels=df$Location,col=rainbow(nrow(df)),main='Percentage of Oil Production by State')
320 | 
321 | ```
322 | 
323 | 
324 | #### Better Version  
325 | The 1980's and 2010's will be missing years in terms of a "decade" due to the data provided (and it's only 2017). While the percentage labels are slightly off center, it's certainly much better than the pie chart. It's not quite "apples-to-apples" for a comparison because I created different decades, but you get the idea.
326 | 
327 | I also created an "Other" category in order to simplify the output. When you are doing comparisons, it's typically a good idea to find a way to reduce the number of variables in the output while not removing data by dropping it completely.
328 | 
329 | ```{r,warning=FALSE,fig.width=10,message=FALSE}
330 | # Decade label per row. Year goes through as.character() so the comparison is numeric
331 | # whether the column is stored as character, factor or number; anything before 1990 is '1980s'.
332 | yearNum = as.numeric(as.character(data$Year))
333 | data$Decade = cut(yearNum, breaks = c(-Inf, 1990, 2000, 2010, Inf), right = FALSE,
334 |                   labels = c('1980s', '1990s', '2000s', '2010s'))
335 | 
336 | top5 = data %>%
337 |   group_by(Location) %>%
338 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
339 |   arrange(desc(ThousandBarrel)) %>%
340 |   top_n(5, ThousandBarrel) %>%
341 |   select(Location)
342 | # as.character() so a factor Location yields names rather than integer level codes
343 | top5List = as.character(top5$Location)
344 | 
345 | # Vectorised replacement for the former 1:length() loop (which misiterates on an empty list):
346 | # top-five locations keep their own name, every other location is collapsed into "Other".
347 | data$State = ifelse(data$Location %in% top5List, as.character(data$Location), "Other")
348 | 
349 | # Share of each decade's total contributed by each State bucket (summarise() leaves the result grouped by Decade)
350 | 
351 | df = data %>%
352 |   group_by(Decade,State) %>%
353 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
354 |   mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))
355 | 
356 | df$ThousandBarrel = round(df$ThousandBarrel,3)
357 | df$text = paste(round(100*df$ThousandBarrel,0),'%', sep='')
358 | 
359 | p = ggplot(df,aes(x=Decade,y=ThousandBarrel,col=reorder(State,ThousandBarrel),fill=reorder(State,ThousandBarrel)))
360 | p + geom_bar(stat='identity') + 
361 |   geom_text(aes(label=text),col='Black',size = 4, hjust = 0.5, vjust = 3, position = "stack") + 
362 |   scale_y_continuous(labels = percent) +
363 |   ggtitle('Percentage of Top Oil Producing States by Decade') + 
364 |   guides(fill=guide_legend(title='State'),col=guide_legend(title='State')) + 
365 |   theme(plot.title = element_text(hjust = 0.5))
366 | 
367 | ```
368 | 
369 | 
370 | 
371 | 
372 | ### Some other fun concepts are below!  
373 | Some of them are nice, others are terrible! I won't comment on any of them, but I felt it was necessary to include some other ideas I toyed around with. 
374 | 
375 | Have fun with your data visualizations. The charts I showed here are extremely simple. Being creative by using things other than R winds up making visuals people can remember. There are plenty of examples around, but they all tend to follow basic principles of design. There are ***A TON*** of good books out there on this topic. 
376 | 
377 | Now it's your turn!
378 | 
379 | 
380 | ```{r}
381 | # Total production per location over the whole data set, drawn as ranked horizontal bars
382 | df = data %>%
383 |   group_by(Location) %>%
384 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
385 |   arrange(desc(ThousandBarrel))
386 | p = ggplot(df, aes(x = reorder(Location, ThousandBarrel), y = ThousandBarrel))
387 | p + geom_col() +
388 |   labs(title = 'Oil Production 1981 - 2016 By Location') +
389 |   theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
390 | ```
391 | 
392 | 
393 | 
394 | 
395 | 
396 | ```{r}
397 | # Ten largest producers; top_n() is given its ranking column explicitly so it does not
398 | # print a "Selecting by ThousandBarrel" message into the rendered document.
399 | top10 = data %>%
400 |   group_by(Location) %>%
401 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
402 |   arrange(desc(ThousandBarrel)) %>% top_n(10, ThousandBarrel)
403 | print(top10)
404 | df = data %>% 
405 |   filter(Location %in% top10$Location) %>%
406 |   group_by(Location,Year) %>%
407 |   summarise(ThousandBarrel = sum(ThousandBarrel)) 
408 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location,fill=Location))
409 | p + geom_bar(stat='identity') + 
410 |   ggtitle('Oil Production - Top 10 States') + 
411 |   theme(plot.title = element_text(hjust = 0.5)) + 
412 |   theme(axis.text.x = element_text(angle = 90, hjust = 1))
413 | ```
414 | 
415 | 
416 | 
417 | ```{r, fig.width=10}
418 | # 1990 production per location; names are lower-cased, presumably to match the region ids in the state map
419 | df = data %>%
420 |   filter(Year == 1990) %>%
421 |   group_by(Location) %>%
422 |   summarise(ThousandBarrel = sum(ThousandBarrel))
423 | df$Location = tolower(df$Location)
424 | #Add States without data (as zero production); columns stay character so rbind() does not mix factor and character
425 | States = data.frame(Location = tolower(as.character(state.name)), stringsAsFactors = FALSE)
426 | missingStates = setdiff(States$Location, df$Location)
427 | # rep() keeps both columns the same length, so this also works when no state is missing
428 | appendData = data.frame(Location = missingStates, ThousandBarrel = rep(0, length(missingStates)), stringsAsFactors = FALSE)
429 | df = rbind(df,appendData)
430 | states_map <- map_data("state")
431 | 
432 | ggplot(df, aes(map_id = Location)) + 
433 |     geom_map(aes(fill=ThousandBarrel), map = states_map) +
434 |     expand_limits(x = states_map$long, y = states_map$lat)
435 | 
436 | ```
437 | 
438 | 
439 | ```{r}
440 | df = data %>% 
441 |   filter(Location == 'Texas') %>%
442 |   group_by(Year,Month) %>%
443 |   summarise(ThousandBarrel = sum(ThousandBarrel))
444 | # One line per Year across the months, Texas only; the title now says so instead of "By Year in the U.S."
445 | p = ggplot(df,aes(x=Month,y=ThousandBarrel))
446 | p + geom_line(stat='identity',aes(group=Year,col=Year)) + 
447 |   ggtitle('Texas Oil Production By Month, One Line Per Year') + 
448 |   theme(plot.title = element_text(hjust = 0.5)) + 
449 |   theme(axis.text.x = element_text(angle = 90, hjust = 1))
450 | ```
451 | 
452 | 
453 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)


--------------------------------------------------------------------------------
/Data Visualization Lesson.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Data Visualization"
  3 | subtitle: "The Good, The Bad and The Ugly"
  4 | author: "Scott Stoltzman"
  5 | date: "March 10, 2017"
  6 | output: html_document
  7 | ---
  8 | 
  9 | 
 10 | 
 11 | ----  
 12 | 
 13 | # Data Viz Ain't Easy
 14 | 
 15 | In almost every business meeting you'll attend, you will see a poorly designed chart, graph, or other visual representation of data. Most people simply lack the education required to emphasize a point.  
 16 | 
 17 | Bad visualizations can be:  
 18 | 
 19 | - Difficult or impossible to interpret
 20 | - Filled with completely worthless information
 21 | - Misleading (intentionally or unintentionally)
 22 | - Redundant and boring
 23 | - Inaccurate
 24 | 
 25 | 
 26 | I have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post.
 27 | 
 28 | ---- 
 29 | 
 30 | ## Let's take a look at some examples
 31 | 
 32 | ### Every Death in Shakespeare
 33 | **Could you imagine a worse way to show this??**  
 34 | 
 35 | <img src="http://i.imgur.com/4kF0q5F.png" title="plot of chunk unnamed-chunk-1" alt="plot of chunk unnamed-chunk-1" style="display: block; margin: auto;" />
 36 |     
 37 | 
 38 | **Is this not insane!?!?!**  
 39 | 
 40 | No one could ever glance at that and possibly want to read it. The only thing that would have made it worse would be if there had been a legend instead of data callouts. The author could easily have used a number of other tools to get the point across. I hate wordles but due to the fact that the article wasn't trying to show the exact proportions of type of deaths, a wordle easily illustrated the point.
 41 |   
 42 | 
 43 | <img src="http://i.imgur.com/dlDh5H8.png" title="plot of chunk unnamed-chunk-2" alt="plot of chunk unnamed-chunk-2" style="display: block; margin: auto;" />
 44 | This example came from this [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html)
 45 | 
 46 | ----  
 47 | 
 48 | To be clear, I'm not even close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't want to take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, we'll look at the theory and not worry too much about aesthetics (save that for a time when you're getting paid).
 49 | 
 50 | Let's load up some libraries and get started.
 51 | 
 52 | 
 53 | ```r
 54 | library(ggplot2)
 55 | library(dplyr)
 56 | library(tidyr)
 57 | library(lubridate)
 58 | library(scales)
 59 | ```
 60 | 
 61 | ## Decide on what you're trying to accomplish first.  
 62 | 
 63 | Ask yourself the following questions to help drive your decision:  
 64 | 
 65 | - Are you making a comparison?
 66 | - Are you finding a relationship?
 67 | - Are you showing a distribution?
 68 | - Are you finding a trend over time?
 69 | - Are you showing composition?
 70 |   
 71 | Once you know which question you are asking, it will keep your mind focused on the outcome and will quickly narrow down your charting options.
 72 | 
 73 | #### Rule of Thumb  
 74 | 
 75 | - **Trend: ** Column, Line  
 76 | - **Comparison: ** Area, Bar, Bullet, Column, Line, Scatter  
 77 | - **Relationship: ** Line, Scatter  
 78 | - **Distribution: ** Bar, Boxplot, Column  
 79 | - **Composition: ** Donut, Pie, Stacked Bar, Stacked Column  
 80 |   
 81 | Obviously, there are choices beyond these and you need to think through your choice wisely. 
 82 | 
 83 | Side Note: I ***hate*** donut and pie charts! When used properly, they're terrific! However, I'm very used to gagging every time one appears on a projector screen due to how frequently they're used inappropriately.
 84 | 
 85 | For this project, I'll use some oil production data that I found while digging through http://data.world (pretty great site). The data can be found [here](http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm)  
 86 | 
 87 | 
 88 | 
 89 | ```r
 90 | #Custom data preparation
 91 | #GitHub (linked to at bottom of this post)
 92 | source('data_preparation.R')
 93 | data = getData()
 94 | ```
 95 | 
 96 | 
 97 | ```r
 98 | head(data)
 99 | ```
100 | 
101 | ```
102 | ##   Location Month Year ThousandBarrel       Date
103 | ## 1  Alabama   Mar 2013            883 2013-03-01
104 | ## 2  Alabama   Apr 2013            844 2013-04-01
105 | ## 3  Alabama   May 2013            878 2013-05-01
106 | ## 4  Alabama   Feb 2013            809 2013-02-01
107 | ## 5  Alabama   Mar 1982           1687 1982-03-01
108 | ## 6  Alabama   Apr 1982           1567 1982-04-01
109 | ```
110 | 
111 | ----  
112 | 
113 | ## Trend - Line Chart
114 | 
115 | **Objective:** See what the oil production in the US looked like from 1981 - 2016 by year. I want to illustrate the changes over the time period. This is a very high-level view and only shows us a decline followed by a ramp up at the end of the period.
116 | 
117 | I decided to use a line chart to show the trend over time. When using discrete data you should generally use a column chart, so that nobody assumes the values changed linearly between the plotted years. However, a line paints a much clearer picture here and is not misleading.
118 | 
119 | ### Which of these views would you rather see?
120 | 
121 | #### Poor Version  
122 | The x-axis is a disaster and the y-axis isn't formatted well. While it gets the point across, it's still almost worthless.
123 | 
124 | 
125 | 
126 | ```r
127 | df = data %>% 
128 |   group_by(Year) %>%
129 |   summarise(ThousandBarrel = sum(ThousandBarrel))
130 | 
131 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) 
132 | p + geom_line(stat='identity') + 
133 |   ggtitle('Oil Production Over Time') + 
134 |   theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + 
135 |   xlab('') + ylab('')
136 | ```
137 | 
138 | <img src="http://i.imgur.com/QjuY0mP.png" title="plot of chunk unnamed-chunk-4" alt="plot of chunk unnamed-chunk-4" style="display: block; margin: auto;" />
139 | 
140 | #### Better Version  
141 | The title gives us a much better understanding of what we're looking at. The chart is slightly wider and the axes are formatted to be legible.
142 | 
143 | 
144 | ```r
145 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) 
146 | p + geom_line(stat='identity') + 
147 |   ggtitle('Thousand Barrel Oil Production By Year in the U.S.') +
148 |   theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + 
149 |   theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
150 |   scale_y_continuous(labels = comma)
151 | ```
152 | 
153 | <img src="http://i.imgur.com/DmOdVOK.png" title="plot of chunk unnamed-chunk-5" alt="plot of chunk unnamed-chunk-5" style="display: block; margin: auto;" />
154 | 
155 | 
156 | ## Comparison - Line Chart  
157 | 
158 | **Objective**: Identify which states affected the trend the most. Evaluate them simultaneously in order to paint the picture and compare them.  
159 | 
160 | From this visual you can see the top states are Alaska, California, Louisiana, Oklahoma, Texas and Wyoming. Texas seems to break the mold quite drastically and drove the spike which occurred after 2010.
161 | 
162 | ### Which of these views would you rather see?
163 | 
164 | #### Poor Version  
165 | There are far too many colors going on here. Everything at the bottom of the chart is relatively useless and takes our focus away from the big players. 
166 | 
167 | 
168 | ```r
169 | df = data %>%
170 |   group_by(Location, Year) %>%
171 |   summarise(ThousandBarrel = sum(ThousandBarrel))
172 | 
173 | df$Year = as.numeric(df$Year)
174 | 
175 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location))
176 | p + geom_line(stat='identity') + 
177 |   ggtitle(paste('Oil Production By Year By State in the U.S.')) + 
178 |   theme(plot.title = element_text(hjust = 0.5)) + 
179 |   theme(axis.text.x = element_text(angle = 90, hjust = 1))
180 | ```
181 | 
182 | ![plot of chunk unnamed-chunk-6](http://i.imgur.com/LdhZsPF.png)
183 | 
184 | #### Better Version  
185 | This focuses attention on the top producing states. It compares them to each other and shows the trend per state as well.
186 | 
187 | 
188 | ```r
189 | n=6 #Arbitrary at first, after trying a few, this made the most sense
190 | topN = data %>%
191 |   group_by(Location) %>%
192 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
193 |   arrange(-ThousandBarrel) %>%
194 |   top_n(n)
195 | 
196 | df = data %>%
197 |   filter(Location %in% topN$Location) %>%
198 |   group_by(Year,Location) %>%
199 |   summarise(ThousandBarrel = sum(ThousandBarrel))
200 | 
201 | df$Year = as.numeric(df$Year)
202 | df$Location = as.factor(df$Location)
203 | 
204 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1))
205 | p + geom_line(stat='identity') + 
206 |   ggtitle(paste('Top',as.character(n),'States - Oil Production By Year in the U.S.')) + 
207 |   theme(plot.title = element_text(hjust = 0.5)) + 
208 |   theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
209 |   facet_wrap(~Location) + 
210 |   scale_y_continuous(labels = comma) 
211 | ```
212 | 
213 | ![plot of chunk unnamed-chunk-7](http://i.imgur.com/Vkxymeb.png)
214 | 
215 | 
216 | 
217 | ## Relationship - Scatter Plot
218 | 
219 | **Objective**: See if Alaska and California data is correlated (This probably isn't important but it allows us to use the same data).
220 | 
221 | ### Which of these views would you rather see?
222 | 
223 | #### Poor Version  
224 | Lots of completely irrelevant data! Size of the point should have nothing to do with the year.  
225 | 
226 | 
227 | ```r
228 | statesList = c('Alaska','California')
229 | df = data %>%
230 |   filter(Location %in% statesList) %>%
231 |   spread(Location,ThousandBarrel) %>%
232 |   select(Alaska,California,Month,Year)
233 | 
234 | p = ggplot(df,aes(x=Alaska,y=California,col=Month,size=Year))
235 | p + geom_point() + 
236 |   scale_y_continuous(labels = comma) +
237 |   scale_x_continuous(labels = comma) +
238 |   ggtitle('Oil Production - CA vs. AK') + 
239 |   theme(plot.title = element_text(hjust = 0.5))
240 | ```
241 | 
242 | ![plot of chunk unnamed-chunk-8](http://i.imgur.com/ta0Uv6a.png)
243 | 
244 | #### Better Version  
245 | The trend line is nice because it helps to visualize the relationship even more. While it can sometimes be misleading, it makes sense with our current data.  
246 | 
247 | 
248 | ```r
249 | df = data %>%
250 |   filter(Location %in% statesList) %>%
251 |   spread(Location,ThousandBarrel) %>%
252 |   select(Alaska,California,Year)
253 | 
254 | p = ggplot(df,aes(x=Alaska,y=California))
255 | p + geom_point() + 
256 |   scale_y_continuous(labels = comma) +
257 |   scale_x_continuous(labels = comma) +
258 |   ggtitle('Monthly Thousand Barrel Oil Production 1981-2016 CA vs. AK') + 
259 |   theme(plot.title = element_text(hjust = 0.5)) + 
260 |   geom_smooth(method='lm')
261 | ```
262 | 
263 | ![plot of chunk unnamed-chunk-9](http://i.imgur.com/Kxtvyrc.png)
264 | 
265 | 
266 | 
267 | 
268 | ## Distribution - Boxplot 
269 | 
270 | **Objective**: Examine the range of production by state and year over the time period to give us an idea of the variance.
271 | 
272 | ### Which of these views would you rather see?
273 | 
274 | #### Poor Version  
275 | 
276 | 
277 | 
278 | ```r
279 | df <- data %>%
280 |   group_by(Year, Location) %>%
281 |   summarise(ThousandBarrel = sum(ThousandBarrel))
282 | # Annual totals per state, one box per Location in the default axis order.
283 | p <- ggplot(df, aes(x = Location, y = ThousandBarrel))
284 | p + geom_boxplot() +
285 |   ggtitle('Distribution of Oil Production by State')
286 | ```
287 | 
288 | ![plot of chunk unnamed-chunk-10](http://i.imgur.com/QYyBcNn.png)
289 | 
290 | 
291 | #### Better Version  
292 | This gives a nice ranking to the plot while still showing their distributions. While it was semi-apparent in the line charts, the variance of Texas is huge compared to the others! We could take this a step further and separate out the big players from the smaller players.
293 | 
294 | 
295 | ```r
296 | p <- ggplot(df, aes(x = reorder(Location, ThousandBarrel), y = ThousandBarrel))  # reuses df from the previous chunk; states ordered via reorder()
297 | p + geom_boxplot() +
298 |   coord_flip() +
299 |   scale_y_continuous(labels = comma) +
300 |   ggtitle('Distribution of Annual Oil Production By State (1981 - 2016)')
301 | ```
302 | 
303 | ![plot of chunk unnamed-chunk-11](http://i.imgur.com/AunupK4.png)
304 | 
305 | 
306 | ## Composition - Stacked Bar 
307 | 
308 | **Objective**: Check out the composition of total production by state.
309 | 
310 | ### Which of these views would you rather see?
311 | 
312 | #### Poor Version  
313 | My favorite, the beautiful pie chart! There's nothing better than this...
314 | 
315 | 
316 | ```r
317 | df = data %>%
318 |   group_by(Location) %>%
319 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
320 |   mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))
321 | # Each state's share of the total, converted to a whole-number percentage for the slices.
322 | df$ThousandBarrel = round(100*df$ThousandBarrel,0)
323 | # graphics::pie() has no 'explode' argument (that one belongs to plotrix::pie3D), so it is not passed.
324 | library(plotrix)
325 | pie(x=df$ThousandBarrel,labels=df$Location,col=rainbow(nrow(df)),main='Percentage of Oil Production by State')
326 | ```
327 | 
328 | ![plot of chunk unnamed-chunk-12](http://i.imgur.com/vv1q2FX.png)
329 | 
330 | 
331 | #### Better Version  
332 | The 1980's and 2010's will be missing years in terms of a "decade" due to the data provided (and it's only 2017). While the percentage labels are slightly off center, it's certainly much better than the pie chart. It's not quite "apples-to-apples" for a comparison because I created different decades, but you get the idea.
333 | 
334 | I also created an "Other" category in order to simplify the output. When you are doing comparisons, it's typically a good idea to find a way to reduce the number of variables in the output while not removing data by dropping it completely.
335 | 
336 | 
337 | ```r
338 | data$Decade = '1980s'
339 | data$Decade[data$Year >= 1990] = '1990s'
340 | data$Decade[data$Year >= 2000] = '2000s'
341 | data$Decade[data$Year >= 2010] = '2010s'
342 | data$Decade = as.factor(data$Decade)
343 | # Five largest producers over the whole period; the ranking column is named explicitly.
344 | top5 = data %>%
345 |   group_by(Location) %>%
346 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
347 |   arrange(-ThousandBarrel) %>%
348 |   top_n(5, ThousandBarrel) %>%
349 |   select(Location)
350 | 
351 | top5List = top5$Location
352 | 
353 | # Keep the top-5 state names and lump every other state into "Other".
354 | # Vectorised on purpose: a 1:length() loop would iterate over c(1, 0) if top5List were empty.
355 | data$State = ifelse(data$Location %in% top5List, as.character(data$Location), "Other")
356 | 
357 | 
358 | # Share of each State within its Decade (after summarise() the data is still grouped by Decade).
359 | df = data %>%
360 |   group_by(Decade,State) %>%
361 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
362 |   mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))
363 | 
364 | df$ThousandBarrel = round(df$ThousandBarrel,3)
365 | df$text = paste(round(100*df$ThousandBarrel,0),'%', sep='')
366 | 
367 | p = ggplot(df,aes(x=Decade,y=ThousandBarrel,col=reorder(State,ThousandBarrel),fill=reorder(State,ThousandBarrel)))
368 | p + geom_bar(stat='identity') + 
369 |   geom_text(aes(label=text),col='Black',size = 4, hjust = 0.5, vjust = 3, position = "stack") + 
370 |   scale_y_continuous(labels = percent) +
371 |   ggtitle('Percentage of Top Oil Producing States by Decade') + 
372 |   guides(fill=guide_legend(title='State'),col=guide_legend(title='State')) + 
373 |   theme(plot.title = element_text(hjust = 0.5))
374 | ```
375 | 
376 | ![plot of chunk unnamed-chunk-13](http://i.imgur.com/I7Cyf6Z.png)
377 | 
378 | 
379 | 
380 | 
381 | ### Some other fun concepts are below!  
382 | Some of them are nice, others are terrible! I won't comment on any of them, but I felt it was necessary to include some other ideas I toyed around with. 
383 | 
384 | Have fun with your data visualizations. The charts I showed here are extremely simple. Getting creative and using tools other than R can produce visuals that people remember. There are plenty of examples around, but they all tend to follow basic principles of design. There are ***A TON*** of good books out there on this topic. 
385 | 
386 | Now it's your turn!
387 | 
388 | 
389 | 
390 | ```r
391 | df <- data %>%
392 |   group_by(Location) %>%
393 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
394 |   arrange(desc(ThousandBarrel))  # largest total first
395 | p <- ggplot(df, aes(x = reorder(Location, ThousandBarrel), y = ThousandBarrel))
396 | p + geom_bar(stat = 'identity') +
397 |   coord_flip() +
398 |   ggtitle('Oil Production 1981 - 2016 By Location') +
399 |   theme(plot.title = element_text(hjust = 0.5))
400 | ```
401 | 
402 | ![plot of chunk unnamed-chunk-14](http://i.imgur.com/DwJ52D9.png)
403 | 
404 | 
405 | 
406 | 
407 | 
408 | 
409 | ```r
410 | top10 <- data %>%
411 |   group_by(Location) %>%
412 |   summarise(ThousandBarrel = sum(ThousandBarrel)) %>%
413 |   arrange(desc(ThousandBarrel)) %>%
414 |   top_n(10)  # no wt given, so top_n() selects by the last column (ThousandBarrel)
415 | ```
416 | 
417 | ```
418 | ## Selecting by ThousandBarrel
419 | ```
420 | 
421 | ```r
422 | print(top10)
423 | ```
424 | 
425 | ```
426 | ## # A tibble: 10 × 2
427 | ##       Location ThousandBarrel
428 | ##          <chr>          <dbl>
429 | ## 1        Texas       23447172
430 | ## 2       Alaska       15775279
431 | ## 3   California        9988225
432 | ## 4    Louisiana        4267246
433 | ## 5     Oklahoma        3701224
434 | ## 6      Wyoming        2894624
435 | ## 7       Kansas        1708873
436 | ## 8     Colorado        1288643
437 | ## 9         Utah         894657
438 | ## 10 Mississippi         861999
439 | ```
440 | 
441 | ```r
442 | df <- data %>%
443 |   filter(Location %in% top10$Location) %>%
444 |   group_by(Location, Year) %>%
445 |   summarise(ThousandBarrel = sum(ThousandBarrel))
446 | p <- ggplot(df, aes(x = Year, y = ThousandBarrel, col = Location, fill = Location))
447 | p + geom_bar(stat = 'identity') +
448 |   ggtitle('Oil Production - Top 10 States') +
449 |   theme(plot.title = element_text(hjust = 0.5),
450 |         axis.text.x = element_text(angle = 90, hjust = 1))
451 | ```
452 | 
453 | ![plot of chunk unnamed-chunk-15](http://i.imgur.com/Y21hxkY.png)
454 | 
455 | 
456 | 
457 | 
458 | ```r
459 | df <- data %>%
460 |   filter(Year == 1990) %>%
461 |   group_by(Location) %>%
462 |   summarise(ThousandBarrel = sum(ThousandBarrel))
463 | df$Location <- tolower(df$Location)  # presumably to match the lower-case region names of map_data("state")
464 | 
465 | # Give every state that has no 1990 rows an explicit zero so it still receives a fill value
466 | States <- data.frame(Location = tolower(as.character(state.name)))
467 | missingStates <- States$Location[!(States$Location %in% df$Location)]
468 | appendData <- data.frame(Location = missingStates, ThousandBarrel = 0)
469 | df <- rbind(df, appendData)
470 | 
471 | states_map <- map_data("state")
472 | 
473 | ggplot(df, aes(map_id = Location)) +
474 |   geom_map(aes(fill = ThousandBarrel), map = states_map) +
475 |   expand_limits(x = states_map$long, y = states_map$lat)
476 | ```
477 | 
478 | ![plot of chunk unnamed-chunk-16](http://i.imgur.com/jT1XTmF.png)
479 | 
480 | 
481 | 
482 | ```r
483 | df = data %>% 
484 |   filter(Location == 'Texas') %>%
485 |   group_by(Year,Month) %>%
486 |   summarise(ThousandBarrel = sum(ThousandBarrel))
487 | # One line per year across the months; the data is Texas only, so the title says so.
488 | p = ggplot(df,aes(x=Month,y=ThousandBarrel))
489 | p + geom_line(stat='identity',aes(group=Year,col=Year)) + 
490 |   ggtitle('Oil Production By Year in Texas') + 
491 |   theme(plot.title = element_text(hjust = 0.5)) + 
492 |   theme(axis.text.x = element_text(angle = 90, hjust = 1))
493 | ```
494 | 
495 | ![plot of chunk unnamed-chunk-17](http://i.imgur.com/4OwEmjV.png)
496 | 
497 | 
498 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)
499 | 


--------------------------------------------------------------------------------
/Data-Visualization-Lesson.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/Data_Visualization_-_Part_1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_1.pdf


--------------------------------------------------------------------------------
/Data_Visualization_-_Part_2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_2.pdf


--------------------------------------------------------------------------------
/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/Data_Visualization_-_Tropical_Storms.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Tropical Storm Data"
  3 | author: "Scott Stoltzman"
  4 | date: "9/12/2017"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, include=FALSE}
  9 | knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, fig.align = 'center')
 10 | ```
 11 | ```{r load_libraries}
 12 | library(tidyverse)
 13 | library(ggthemes)
 14 | library(ggmap)
 15 | library(htmlwidgets)
 16 | ```
 17 | 
 18 | # Exploratory Data Analysis of Tropical Storms in R  
 19 | 
 20 | The disastrous impact of recent hurricanes, Harvey and Irma, generated a large influx of data within the online community. I was curious as to what the history of hurricanes and tropical storms looked like so I found a data set on [data.world](https://data.world/dhs/historical-tropical-storm) and started some basic Exploratory data analysis (EDA).  
 21 | 
 22 | 
 23 | EDA is crucial to starting any project. Through EDA you can start to identify errors & inconsistencies in your data, find interesting patterns, see correlations and start to develop hypotheses to test. For most people, basic spreadsheets and charts are pretty handy and provide a great place to start. They are an easy-to-use method to manipulate and visualize your data quickly. Data scientists may cringe at the idea of using a graphical user interface (GUI) to kick-off the EDA process but the reality is clear, those tools are very effective and efficient when used properly. However, if you're reading this, you're probably trying to take EDA to the next level. The best way to learn is to get your hands dirty, let's get started.  
 24 | 
 25 | 
 26 | The original source of the data can be found at [DHS.gov](https://hifld-dhs-gii.opendata.arcgis.com/datasets/3ea21accbfab4ed8b14ede2e802cc2ec_0).  
 27 | 
 28 | 
 29 | ----
 30 | 
 31 | 
 32 | #### Step 1:  Take a look at your data set and see how it is laid out  
 33 | 
 34 | 
 35 | ```{r read_data}
 36 | # data source https://data.world/dhs/historical-tropical-storm
 37 | data <- read_csv('data/Historical_Tropical_Storm_Tracks.csv')
 38 | data %>% head() %>% knitr::kable()  # preview the first rows as a table
 39 | ```
 40 |   
 41 |   
 42 | Fortunately, this is a tidy data set which will make life easier and appears to be cleaned up substantially. The column names are relatively straightforward with the exception of "ID" columns.
 43 | 
 44 | The description as given by [DHS.gov](https://hifld-dhs-gii.opendata.arcgis.com/datasets/3ea21accbfab4ed8b14ede2e802cc2ec_0): 
 45 | 
 46 | >This dataset represents Historical North Atlantic and Eastern North Pacific Tropical Cyclone Tracks with 6-hourly (0000, 0600, 1200, 1800 UTC) center locations and intensities for all subtropical depressions and storms, extratropical storms, tropical lows, waves, disturbances, depressions and storms, and all hurricanes, from 1851 through 2008. These data are intended for geographic display and analysis at the national level, and for large regional areas. The data should be displayed and analyzed at scales appropriate for 1:2,000,000-scale data.  
 47 | 
 48 | 
 49 | #### Step 2:  View some descriptive statistics  
 50 | 
 51 | ```{r}
 52 | # Descriptive statistics for the date, wind-speed and pressure columns.
 53 | data %>%
 54 |   select(YEAR, MONTH, DAY, WIND_KTS, PRESSURE) %>%
 55 |   summary() %>%
 56 |   knitr::kable()
 57 | ```
 58 | 
 59 | 
 60 | We can confirm that this particular data had storms from 1851 - 2010, that means the data goes back roughly 100 years before naming storms started! We can also see that the minimum pressure values are 0, which likely means it could not be measured (due to the fact zero pressure is not possible in this case). We can see that there are recorded months from January to December along with days extending from 1 to 31. Whenever you see all of the dates laid out that way, you can smile and think to yourself, "if I need to, I can put dates in an easy to use format such as YYYY-mm-dd (2017-09-12)!"  
 61 | 
 62 | 
 63 | #### Step 3: Make a basic plot  
 64 | 
 65 | 
 66 | ```{r}
 67 | df <- data %>%
 68 |   filter(NAME != 'NOTNAMED', NAME != 'SUBTROP1') %>%
 69 |   group_by(YEAR) %>%
 70 |   summarise(Distinct_Storms = n_distinct(NAME))
 71 | # Yearly count of distinct storm names with a straight-line trend on top.
 72 | p <- ggplot(df, aes(x = YEAR, y = Distinct_Storms)) + theme_economist()
 73 | p + geom_line(size = 1.1) +
 74 |   geom_smooth(method = 'lm', se = FALSE) +
 75 |   ylab("Storms") +
 76 |   ggtitle("Number of Storms Per Year")
 77 | ```
 78 |   
 79 | 
 80 | This is a great illustration of our data set and we can easily notice an upward trend in the number of storms over time. Before we go running to tell the world that the number of storms per year is growing, we need to drill down a bit deeper. This could simply be because more types of storms (we know there are hurricanes, tropical storms, waves, etc.) were being recorded and added to the data set. However, we should keep it in mind when we start to develop hypotheses.  
 81 | 
 82 | 
 83 | **You will notice the data starts at 1950 rather than 1851.** I made this choice because storms were not named until this point so it would be difficult to try and count the unique storms per year. It could likely be done by finding a way to utilize the "ID" columns. However, this is a preliminary analysis so I didn't want to dig too deep.  
 84 | 
 85 | 
 86 | #### Step 4: Make some calculations
 87 | 
 88 | ```{r}
 89 | pct.diff <- function(x){ prev <- lag(x); round((x - prev) / prev, 2) }  # fractional change vs. lag(x), rounded to 2 decimals
 90 | act.diff <- function(x){ prev <- lag(x); round(x - prev, 2) }           # absolute change vs. lag(x), rounded to 2 decimals
 91 | df <- data %>%
 92 |   arrange(YEAR) %>%
 93 |   filter(NAME != 'NOTNAMED', NAME != 'SUBTROP1') %>%
 94 |   group_by(YEAR) %>%
 95 |   summarise(Distinct_Storms = n_distinct(NAME)) %>%
 96 |   mutate(Distinct_Storms_Change = act.diff(Distinct_Storms),
 97 |          Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) %>%
 98 |   na.omit() %>%   # drops rows with NA, e.g. the first year, which has no previous value
 99 |   arrange(YEAR)
100 | df$YEAR <- factor(df$YEAR)
101 | df %>% head(10) %>% knitr::kable()
102 | ```
103 |   
104 |   
105 | In this case, we can see the number of storms, nominal change and percentage change per year. These calculations help to shed light on what the growth rate looks like each year.  So we can use another summary table:  
106 | 
107 | 
108 | ```{r}
109 | knitr::kable(summary(df %>% select(-YEAR)))
110 | ```
111 |   
112 | From the table we can state the following for the given time period:  
113 | 
114 |   * The mean number of storms is 23 per year (with a minimum of 6 and maximum of 43)
115 |   * The mean change in the number of storms per year is 0.34 (with a minimum of -15 and maximum of 16)
116 |   * The mean percent change in the number of storms per year is 6% (with a minimum of -42% and maximum of 114%)
117 | 
118 | Again, we have to be careful because these numbers are in aggregate and may not tell the whole story. Dividing these into groups of storms is likely much more meaningful.  
119 | 
120 | 
121 | 
122 | 
123 | #### Step 5: Make a more interesting plot  
124 | 
125 | 
126 | ```{r}
127 | # Distinct named storms per year within each category whose CAT contains "H".
128 | df <- data %>%
129 |   filter(NAME != 'NOTNAMED', NAME != 'SUBTROP1', grepl("H", CAT)) %>%
130 |   group_by(YEAR, CAT) %>%
131 |   summarise(Distinct_Storms = n_distinct(NAME))
132 | df$CAT <- factor(df$CAT)
133 | 
134 | p <- ggplot(df, aes(x = YEAR, y = Distinct_Storms, col = CAT)) + theme_economist()
135 | p + geom_line(size = 1.1) +
136 |   geom_smooth(method = 'lm', se = FALSE, col = 'black') +
137 |   facet_wrap(~CAT, scales = "free_x") +
138 |   scale_color_brewer(direction = -1, palette = "Spectral") +
139 |   theme(axis.text.x = element_text(angle = 90), legend.position = 'none') +
140 |   ylab('Storms') +
141 |   ggtitle("Number of Storms Per Year By Category (H)")
142 | ```
143 |   
144 | 
145 | Because I was most interested in hurricanes, I filtered out only the data which was classified as "H (1-5)." By utilizing a data visualization technique called "small multiples" I was able to pull out the different types and view them within the same graph. While this is possible to do in tables and spreadsheets, it's much easier to visualize this way. By holding the axes constant, we can see the majority of the storms are classified as H1 and then it appears to consistently drop down toward H5 (with very few actually being classified as H5). We can also see that most have an upward trend from 1950 - 2010. The steepest appears to be H1 (but it also flattens out over the last decade).  
146 | 
147 | 
148 | #### Step 5: Make a filtered calculation  
149 | 
150 | ```{r}
151 | df <- data %>%
152 |   arrange(YEAR) %>%
153 |   filter(grepl("H", CAT),
154 |          NAME != 'NOTNAMED', NAME != 'SUBTROP1') %>%
155 |   group_by(YEAR) %>%
156 |   summarise(Distinct_Storms = n_distinct(NAME)) %>%
157 |   mutate(Distinct_Storms_Change = act.diff(Distinct_Storms),
158 |          Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) %>%
159 |   na.omit() %>%
160 |   arrange(YEAR)
161 | df %>% select(-YEAR) %>% summary() %>% knitr::kable()  # summary of every column except YEAR
162 | ```
163 | 
164 | 
165 | Now we are looking strictly at hurricane data (classified as H1-H5):  
166 | 
167 |   * The mean number of hurricanes is 13 per year (with a minimum of 4 and maximum of 24)
168 |   * The mean change in the number of hurricanes per year is 0.05 (with a minimum of -11 and maximum of 10)
169 |   * The mean percent change in the number of hurricanes per year is 8% (with a minimum of -56% and maximum of 180%)  
170 |     
171 | While it doesn't really make sense to say "we got an average growth of 0.05 hurricanes per year between 1950 and 2010" ... it may make sense to say "we saw an average growth of 8% per year in the number of hurricanes between 1950 and 2010."  
172 | 
173 | That's a great thing to put in quotes!
174 | 
175 | > During EDA we discovered an average growth of 8% per year in the number of hurricanes between 1950 and 2010.  
176 | 
177 | Be ready, as soon as you make a statement like that, you will likely have to explain how you arrived at that conclusion. That's where having an RMarkdown notebook and data online in a repository will help you out! Reproducible research is all of the hype right now.   
178 |   
179 | 
180 | #### Step 5: Try visualizing your statements  
181 | 
182 | ```{r}
183 | # Year-over-year % change in distinct hurricanes, shown as a histogram and a density plot side by side.
184 | df = data %>%
185 |   filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>%
186 |   filter(grepl("H", CAT)) %>%
187 |   group_by(YEAR) %>%
188 |   summarise(Distinct_Storms = n_distinct(NAME)) %>%
189 |   mutate(Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms))
190 | 
191 | p = ggplot(df,aes(x = Distinct_Storms_Pct_Change)) + theme_economist()
192 | # Left panel is the histogram, so it is titled as one rather than reusing the density title.
193 | p1 = p + geom_histogram(bins = 20) +
194 |   ggtitle("YoY % Change Histogram") +
195 |   scale_x_continuous(labels = scales::percent) +
196 |   ylab('') + xlab('YoY % Change in Hurricanes')
197 | 
198 | p2 = p + geom_density(fill='darkgrey',alpha=0.5) +
199 |   ggtitle("YoY % Change Density") +
200 |   scale_x_continuous(labels = scales::percent) +
201 |   ylab('') + xlab('YoY % Change in Hurricanes')
202 | 
203 | gridExtra::grid.arrange(p1,p2,ncol=2)
204 | ```
205 | 
206 | 
207 | A histogram and/or density plot is a great way to visualize the distribution of the data you are making statements about. This plot helps to show that we are looking at a right-skewed distribution with substantial variance. Knowing that we have n = 58 (meaning 58 years after being aggregated), it's not surprising that our histogram looks sparse and our density plot has an unusual shape. At this point, you can make a decision to jot this down, research it in depth and then attack it with full force.  
208 | 
209 | 
210 | However, that's not what we're covering in this post.  
211 | 
212 | 
213 | #### Step 6: Plot another aspect of your data
214 | 
215 | 
216 | ```{r}
217 | big_map <- get_googlemap(c(lon=-95, lat=30), zoom = 4, maptype = "terrain")  # base map; reused by the density-map chunks further down
218 | ggmap(big_map, extent='panel') + 
219 |   geom_point(data = data, mapping = aes(x = LONG, y = LAT),col='red',alpha=0.1)  # every track point in red, nearly transparent
220 | ```
221 | 
222 | 
223 | 60K pieces of data can get out of hand quickly, we need to back this down into manageable chunks. Building on the knowledge from our last exploration, we should be able to think of a way to cut this down to get some better information. The concept of small multiples could come in handy again! Splitting the data up by type of storm could prove to be invaluable. We can also tell that we are missing
224 | 
225 | -----
226 | 
227 | ```{r}
228 | df <- filter(data, grepl("H", CAT))  # rows whose CAT contains "H"
229 | ggmap(big_map) +
230 |   geom_density_2d(data = df, mapping = aes(x = LONG, y = LAT), size = 0.5) +
231 |   stat_density2d(data = df, geom = "polygon", bins = 20, size = 0.1,
232 |                  aes(x = LONG, y = LAT, fill = ..level.., alpha = ..level..)) +
233 |   scale_fill_gradient(low = "green", high = "red", guide = FALSE) +
234 |   scale_alpha(range = c(0.1, 0.5), guide = FALSE) +
235 |   facet_wrap(~CAT)
236 | ```
237 |   
238 | 
239 | After filtering the data down to hurricanes and utilizing a heatmap rather than plotting individual points we can get a better handle on what is happening where. The H4 and H5 sections are probably the most interesting. It appears as if H4 storms are more frequent on the West coast of Mexico whereas the H5 are most frequent in the Gulf of Mexico.  
240 | 
241 | 
242 | Because we're still in EDA mode, we'll continue with another plot.  
243 | 
244 | 
245 | ```{r}
246 | df <- filter(data, !grepl("H", CAT), !grepl("W", CAT))  # rows whose CAT contains neither "H" nor "W"
247 | ggmap(big_map) +
248 |   geom_density_2d(data = df, mapping = aes(x = LONG, y = LAT), size = 0.5) +
249 |   stat_density2d(data = df, geom = "polygon", bins = 20, size = 0.1,
250 |                  aes(x = LONG, y = LAT, fill = ..level.., alpha = ..level..)) +
251 |   scale_fill_gradient(low = "green", high = "red", guide = FALSE) +
252 |   scale_alpha(range = c(0.1, 0.5), guide = FALSE) +
253 |   facet_wrap(~CAT)
254 | ```
255 | 
256 | 
257 | Here are some of the other storms from the data set. We can see that TD, TS and L have large geographical spreads. The E, SS, and SD storms are concentrated further North toward New England.  
258 | 
259 | Digging into this type of data and building probabilistic models is a fascinating field. The actuarial sciences are extremely difficult and insurance companies really need good models. Having mapped this data, it's pretty clear you could dig in and find out what parts of the country should expect what types of storms (and you've also known this just from being alive for 10+ years). More hypotheses could be formed about location at this stage and could be tested!  
260 | 
261 | 
262 | #### Step 7: Look for a relationship
263 | 
264 | 
265 | ```{r}
266 | # Rows with PRESSURE of 0 or below are dropped before averaging; only CAT values containing "H" are kept.
267 | df <- data %>%
268 |   filter(PRESSURE > 0, grepl("H", CAT)) %>%
269 |   group_by(CAT, YEAR, MONTH, DAY, LAT, LONG) %>%
270 |   summarise(MEAN_WIND_KTS = mean(WIND_KTS), MEAN_PRESSURE = mean(PRESSURE)) %>%
271 |   arrange(MEAN_WIND_KTS)
272 | df$CAT <- factor(df$CAT)
273 | 
274 | p <- ggplot(df, aes(x = MEAN_WIND_KTS, y = MEAN_PRESSURE, fill = CAT)) + theme_economist()
275 | p +
276 |   geom_hex(alpha = 0.8) +
277 |   scale_y_continuous(labels = scales::comma) +
278 |   scale_fill_brewer(direction = -1, palette = "Spectral") +
279 |   ggtitle("Wind KTS vs. Pressure by Category (H)") +
280 |   theme(legend.position = 'right')
281 | ```
282 |   
283 |    
284 | What is the relationship between WIND_KTS and PRESSURE? This chart helps us to see that PRESSURE and WIND_KTS are likely negatively correlated. We can also see that the WIND_KTS is essentially the predictor in the data set which can perfectly predict how a storm is classified. Well, it turns out, that's basically the distinguishing feature when scientists are determining how to categorize these storms!  
285 | 
286 | 
287 | #### Step ........
288 | 
289 | The rest is up to you! This is a great data set and there are a lot more pieces of information lurking within it. I want people to do their own EDA and send me anything interesting! 
290 | 
291 | Some food for thought:  
292 | 
293 |   * What was the most common name for a hurricane?
294 |   * Do the names actually follow an alphabetical pattern through time? (This one is tricky)
295 |   * Can we merge this data with FEMA, charitable donations, or other aid data?
296 | 
297 |   
298 | To get you started on the first one, here are the Top 10 most common names for tropical storms. Why do you think it's Florence?
299 | 
300 | 
301 | ```{r}
302 | top_names <- data %>%
303 |   filter(NAME != 'NOTNAMED', NAME != 'SUBTROP1') %>%
304 |   group_by(NAME) %>%
305 |   summarise(Years_Used = n_distinct(YEAR)) %>%
306 |   arrange(desc(Years_Used))
307 | p <- ggplot(top_n(top_names, 10), aes(x = reorder(NAME, Years_Used), y = Years_Used)) + theme_economist()  # top_n() without wt selects by the last column, Years_Used
308 | p + geom_bar(stat = 'identity') + xlab('') + ggtitle('Most Used Tropical Storm Names') + coord_flip()
309 | ```
310 | 
311 | 
312 | Thank you for reading, I hope this helps you with your own data. The code is all written in R and is located on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson). You can also find other data visualization posts and usages of ggplot2 on my blog [Stoltzmaniac](https://www.stoltzmaniac.com?utm_campaign=bottom_of_tropical_storm_post)
313 | 


--------------------------------------------------------------------------------
/Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-8-1.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data-Visualization-Lesson


--------------------------------------------------------------------------------
/data_preparation.R:
--------------------------------------------------------------------------------
 1 | # This file will prepare the data
 2 | # Data was originally found on data.world
 3 | # http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm
 4 | 
 5 | library(dplyr)
 6 | library(tidyr)
 7 | library(stringr)
 8 | library(zoo)
 9 | 
10 | getData = function(){
11 |   # Reads the monthly crude-oil CSV and returns a long data frame for U.S. states with columns Location, Month, Year, ThousandBarrel and Date.
12 |   data = read.csv('Monthly Crude Oil Production by State 1981 - Nov 2016.csv')
13 |   data = data[data$Date != 'Back to Contents',]
14 |   data = data[data$Date != 'Sourcekey',]
15 |   data = data[data$Date != '',]
16 |   
17 |   #Convert data: wide -> long, then split the 'Month-Year' label and build a Date from it
18 |   df = data.frame(data %>% gather(Location,ThousandBarrel, -Date))
19 | 
20 |   df = data.frame(df %>% separate(Date,c('Month','Year'),"-"))
21 |   df$Date = as.Date(as.yearmon(paste(df$Month,df$Year)))
22 |   # read.csv() (check.names = TRUE) replaces spaces in headers with dots: strip the series suffix, then restore the spaces
23 |   df$Location = as.character(sub('.Field.Production.of.Crude.Oil..Thousand.Barrels.','',df$Location))
24 |   df$Location = gsub('.', ' ', df$Location, fixed = TRUE)  # so 'New.Mexico', 'North.Dakota', ... match state.name below
25 |   # Go through character first: as.numeric() on a factor would return level codes, not the values
26 |   if(!is.numeric(df$ThousandBarrel)) df$ThousandBarrel = as.numeric(as.character(df$ThousandBarrel))
27 |   States = data.frame(Location = as.character(state.name))
28 |   df = merge(df,States,by='Location')  # inner join: only rows whose Location is a U.S. state name survive
29 |   df$Month = factor(df$Month, levels = month.abb)
30 |   
31 |   return(df)
32 |   
33 | }
34 | 


--------------------------------------------------------------------------------
/figure/titlePhoto-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/titlePhoto-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-10-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-11-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-12-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-13-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-14-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-15-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-16-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-17-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-18-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-19-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-20-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-20-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-8-1.png


--------------------------------------------------------------------------------
/figure/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-9-1.png


--------------------------------------------------------------------------------
/ggmapTemp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/ggmapTemp.png


--------------------------------------------------------------------------------
/images/bad-pie1-fix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/bad-pie1-fix.png


--------------------------------------------------------------------------------
/images/bad-pie1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/bad-pie1.png


--------------------------------------------------------------------------------
/images/chart_vs_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/chart_vs_text.png


--------------------------------------------------------------------------------
/images/lie_chart_bad.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/lie_chart_bad.png


--------------------------------------------------------------------------------
/images/lie_chart_fixed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/lie_chart_fixed.png


--------------------------------------------------------------------------------
/images/tg_tb_tu.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/tg_tb_tu.jpg


--------------------------------------------------------------------------------
/images/tg_tb_tu.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/tg_tb_tu.xcf


--------------------------------------------------------------------------------
/images/title_photo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/title_photo.png


--------------------------------------------------------------------------------
/images/title_photo_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/title_photo_2.png


--------------------------------------------------------------------------------
/images/title_photo_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/title_photo_3.png


--------------------------------------------------------------------------------