├── .RData ├── .Rhistory ├── .Rproj.user ├── 3B0EFA5 │ ├── pcs │ │ ├── debug-breakpoints.pper │ │ ├── files-pane.pper │ │ ├── source-pane.pper │ │ ├── windowlayoutstate.pper │ │ └── workbench-pane.pper │ ├── persistent-state │ ├── rmd-outputs │ ├── saved_source_markers │ ├── sdb │ │ ├── per │ │ │ └── t │ │ │ │ ├── C00A79D2 │ │ │ │ ├── EEFE0FD5 │ │ │ │ ├── F1EE8CA4 │ │ │ │ └── FE94A69D │ │ ├── prop │ │ │ ├── 32602BA4 │ │ │ ├── 3B0BAF17 │ │ │ ├── 460054D1 │ │ │ ├── 8DD8A556 │ │ │ ├── 945C141F │ │ │ ├── C5350C47 │ │ │ ├── D635D4D3 │ │ │ ├── DA516F3B │ │ │ ├── E7318A37 │ │ │ ├── ED483BAF │ │ │ ├── EE8F787 │ │ │ ├── FFC79A6C │ │ │ └── INDEX │ │ └── s-21D18981 │ │ │ ├── 206A0EDA │ │ │ ├── 3F887430 │ │ │ ├── D09CC3F1 │ │ │ ├── EB01FDAC │ │ │ └── lock_file │ └── session-persistent-state └── shared │ └── notebooks │ └── paths ├── .gitignore ├── Data Visualization - Part 1.Rmd ├── Data Visualization - Part 1._pub.html ├── Data Visualization - Part 1.md ├── Data Visualization - Part 2.Rmd ├── Data Visualization - Part 2._pub.html ├── Data Visualization - Part 2.md ├── Data Visualization - Part 3.Rmd ├── Data Visualization - Part 3._pub.html ├── Data Visualization - Part 3.md ├── Data Visualization - Tropical Storms.Rmd ├── Data Visualization Lesson.Rmd ├── Data Visualization Lesson._pub.html ├── Data Visualization Lesson.md ├── Data-Visualization-Lesson.Rproj ├── Data_Visualization_-_Part_1.html ├── Data_Visualization_-_Part_1.pdf ├── Data_Visualization_-_Part_2.html ├── Data_Visualization_-_Part_2.pdf ├── Data_Visualization_-_Part_3.html ├── Data_Visualization_-_Part_3_files └── figure-html │ ├── unnamed-chunk-2-1.png │ ├── unnamed-chunk-3-1.png │ ├── unnamed-chunk-4-1.png │ └── unnamed-chunk-5-1.png ├── Data_Visualization_-_Tropical_Storms.Rmd ├── Data_Visualization_-_Tropical_Storms.html ├── Data_Visualization_-_Tropical_Storms_files └── figure-html │ ├── unnamed-chunk-2-1.png │ ├── unnamed-chunk-5-1.png │ ├── unnamed-chunk-7-1.png │ └── unnamed-chunk-8-1.png ├── 
Data_Visualization_Lesson.html ├── Monthly Crude Oil Production by State 1981 - Nov 2016.csv ├── Oil_Production_By_State.html ├── README.md ├── data └── Historical_Tropical_Storm_Tracks.csv ├── data_preparation.R ├── figure ├── titlePhoto-1.png ├── unnamed-chunk-1-1.png ├── unnamed-chunk-10-1.png ├── unnamed-chunk-11-1.png ├── unnamed-chunk-12-1.png ├── unnamed-chunk-13-1.png ├── unnamed-chunk-14-1.png ├── unnamed-chunk-15-1.png ├── unnamed-chunk-16-1.png ├── unnamed-chunk-17-1.png ├── unnamed-chunk-18-1.png ├── unnamed-chunk-19-1.png ├── unnamed-chunk-2-1.png ├── unnamed-chunk-20-1.png ├── unnamed-chunk-3-1.png ├── unnamed-chunk-4-1.png ├── unnamed-chunk-5-1.png ├── unnamed-chunk-6-1.png ├── unnamed-chunk-7-1.png ├── unnamed-chunk-8-1.png └── unnamed-chunk-9-1.png ├── ggmapTemp.png ├── hurricane_leaflet.html ├── images ├── bad-pie1-fix.png ├── bad-pie1.png ├── chart_vs_text.png ├── lie_chart_bad.png ├── lie_chart_fixed.png ├── tg_tb_tu.jpg ├── tg_tb_tu.xcf ├── title_photo.png ├── title_photo_2.png └── title_photo_3.png └── m.html /.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/.RData -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/pcs/debug-breakpoints.pper: -------------------------------------------------------------------------------- 1 | { 2 | "debugBreakpointsState" : { 3 | "breakpoints" : [ 4 | ] 5 | } 6 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/pcs/files-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "path" : "~/Documents/GitHub/Data-Visualization-Lesson", 3 | "sortOrder" : [ 4 | { 5 | "ascending" : true, 6 | "columnIndex" : 2 7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- 
/.Rproj.user/3B0EFA5/pcs/source-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "activeTab" : 3 3 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/pcs/windowlayoutstate.pper: -------------------------------------------------------------------------------- 1 | { 2 | "left" : { 3 | "panelheight" : 740, 4 | "splitterpos" : 309, 5 | "topwindowstate" : "NORMAL", 6 | "windowheight" : 778 7 | }, 8 | "right" : { 9 | "panelheight" : 740, 10 | "splitterpos" : 465, 11 | "topwindowstate" : "NORMAL", 12 | "windowheight" : 778 13 | } 14 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/pcs/workbench-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "TabSet1" : 0, 3 | "TabSet2" : 3, 4 | "TabZoom" : { 5 | } 6 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/persistent-state: -------------------------------------------------------------------------------- 1 | build-last-errors="[]" 2 | build-last-errors-base-dir="" 3 | build-last-outputs="[]" 4 | compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}" 5 | console_procs="[]" 6 | files.monitored-path="" 7 | find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":true,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}" 8 | imageDirtyState="1" 9 | saveActionState="-1" 10 | -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/rmd-outputs: -------------------------------------------------------------------------------- 1 | ~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html 2 | ~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html 3 | 
~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html 4 | ~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html 5 | ~/Documents/GitHub/Data-Visualization-Lesson/Data_Visualization_-_Part_3.html 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/saved_source_markers: -------------------------------------------------------------------------------- 1 | {"active_set":"","sets":[]} -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/per/t/C00A79D2: -------------------------------------------------------------------------------- 1 | { 2 | "collab_server" : "", 3 | "contents" : "library(knitr)\n# Set figure dimensions\n#opts_chunk$set(fig.width=5, fig.height=5)\n# Set figures to upload to imgur.com\n#opts_knit$set(upload.fun = imgur_upload, base.url = NULL)\nopts_knit$set(upload.fun = function(file){library(RWordPress);uploadFile(file)$url;})\n\nrmd.file <- \"Data Visualization - Part 3.Rmd\"\n# Knit the .Rmd file\nknit(rmd.file)\n# Set up input/ output files\nmarkdown.file <- gsub(pattern = \"Rmd$\", replacement = \"md\", x = rmd.file)\nhtml.file <- gsub(pattern = \"md$\", replacement = \"_pub.html\", x = markdown.file)\n\nlibrary(markdown)\n# Removes 'yaml' information\nmarkdownToHTML(file = markdown.file, output = html.file, fragment.only = TRUE)\n\nlibrary(RWordPress)\n# Set your WP username, password, and your site URL\noptions(WordpressLogin = c(stoltzmaniac = 'ejkDD$$ckckslppzzzekAABV'),\n WordpressURL = 'https://stoltzmaniac.com/xmlrpc.php')\n# Create a line-by-line text vector\ntext = paste(readLines(html.file), collapse = \"\\n\")\n# Send to Worpdress\nnewPost(list(description = text, title = \"Data Visualization - Part 3\"), publish = FALSE)\n", 4 | "created" : 1491592764133.000, 5 | "dirty" : false, 6 | "encoding" : "UTF-8", 7 | "folds" : "", 8 | "hash" : "2697624838", 9 | "id" : 
"C00A79D2", 10 | "lastKnownWriteTime" : 1491595519, 11 | "last_content_update" : 1491595519472, 12 | "path" : "~/Desktop/uploading to wp.R", 13 | "project_path" : null, 14 | "properties" : { 15 | "tempName" : "Untitled1" 16 | }, 17 | "relative_order" : 4, 18 | "source_on_save" : false, 19 | "source_window" : "", 20 | "type" : "r_source" 21 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/per/t/EEFE0FD5: -------------------------------------------------------------------------------- 1 | { 2 | "collab_server" : "", 3 | "contents" : "---\ntitle: \"Data Visualization - Part 3\"\nauthor: \"Scott Stoltzman\"\ndate: \"April 7, 2017\"\noutput: html_document\n---\n\n\n### What Type of Data Visualization Do You Choose (if any)? \n\nDetermining whether or not you need a visualization is ***step one***. While it seems silly, this is probably something everyone (including myself) should be doing more often. A lot of times, it seems like a great way to showcase the amount of work you have been doing, but winds up being completely ineffective and could potentially harm what you're doing. Once you determine that you actually need to visualize your data, you should have a rough idea of the options to look at. This post will explain and demonstrate some of the common types of charts and plots. 
\n\n\n```{r, echo=FALSE,results='hide', warning=FALSE, message=FALSE}\nlibrary(png)\nlibrary(grid)\n```\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/title_photo_3.png\")\ngrid.raster(img)\n```\n\nThis is Part 3 in a series about Data visualization: \n\n* [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1?utm_medium=SERIES&utm_source=DATA_VISUALIZATION_TOP)\n* [Data Visualization - Part 2](https://www.stoltzmaniac.com/data-visualization-part-2?utm_medium=SERIES&utm_source=DATA_VISUALIZATION_TOP)\n\n#### Determine whether or not you actually need a visualization in the first place.\n\nLike the best practices I listed in [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1?utm_medium=SERIES&utm_source=DATA_VISUALIZATION_MID_ARTICLE), make sure your visualizations:\n\n- Are clearly illustrating a relevant point \n- Are tailored to the appropriate audience \n- Are tailored to the presentation medium \n- Are memorable to those who care about the material \n- Are increasing the understanding of the subject matter \n \nIf these don't seem possible, ***you probably don't need a data visualization.*** \n\n#### If you do need one, what's a good first step to take?\n\nTake a look at the forum in which you're presenting, it matters! If you are writing for a scientific journal, it will be different than presenting live to a thousand person audience. Think about a Ted Talk compared to the Journal of Physics. \n\nPoint being: **consider your audience!** \n\nLet's talk about a high-level presentation. Everyone has seen a slideshow with fancy charts that add zero value. Do not be the person presenting something that way! 
Providing useless content will confuse the audience and/or lead to boredom.\n\nIf your point is to show year-over-year change of a single metric - show it as a simple number on the page in big bold font rather than a chart.\n\nIn this made up example, I am displaying revenue over the last few years (***note: be more specific*** when it comes to what type of revenue you're talking about). \n\nWhich of the following makes more sense to put on a slide?\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/chart_vs_text.png\")\ngrid.raster(img)\n```\n\nIf you agree with me, the one on the right will be much easier for people to understand in a presentation. It gets the point across without requiring processing which will allow people to focus on what is important. Any additional nuggets you would like to point out can be spoken to. \n\nNow, let's talk about publishing content that isn't for academic use but will reach the public (i.e. newspapers, magazines, blogs, etc.). These types of charts can cover a wide range of topics so we'll have to stick to the basics. We're going to look at displaying information which is interesting and adds value. \n\nHere is a great example from [Junk Charts](http://junkcharts.typepad.com/junk_charts/2017/04/what-does-lying-politicians-have-in-common-with-rainbow-colors.html) in which the author of the original [Daily Kos Article](http://www.dailykos.com/story/2016/8/7/1556666/-Three-lessons-from-the-rise-of-Donald-Trump) is showing a type of \"lie detector\" chart. The chart does a number of things well: it illustrates a relevant point, it is appropriate to the audience and medium, and really helps to understand the subject matter better. However, the original chart is too colorful which takes away from its effectiveness. Junk Charts took it to the next level by simplifying the colors and axes. 
\n\n\n#### Original Version (Daily Kos)\n```{r, fig.align='center',echo=FALSE,fig.height=6,fig.show='hold'}\nimg = readPNG(\"images/lie_chart_bad.png\")\ngrid.raster(img)\n``` \n\n#### Modified Version (Junk Charts)\n```{r, fig.align='center',echo=FALSE,fig.height=6,fig.show='hold'}\nimg = readPNG(\"images/lie_chart_fixed.png\")\ngrid.raster(img)\n``` \n\nBy merely looking at this chart you can see how it is ranked, a sense of scale, the comparison between people, and clearly labeled names. Fantastic work! \n\nRather than going over more examples of work others are doing, please visit [Chart Porn](http://chartporn.org/) (don't worry about the name, it's a great data visualization site) and [Junk Charts](http://junkcharts.typepad.com/). They have phenomenal examples of what to do (and what not to do) when publishing to the public.\n\n#### You have a point, now what? \n\nThere is no rulebook as to how to display your data. However, as you have seen, there are both great and poor options. The choice is up to you - so think long and hard before making a decision (and you can always try a number of them out on people before publishing).\n\n**Ask yourself the following questions to help drive your decision:** \n\n- Are you making a comparison?\n- Are you finding a relationship?\n- Are you showing a distribution?\n- Are you finding a trend over time?\n- Are you showing composition?\n \nOnce you know which question you are asking, it will keep your mind focused on the outcome and will quickly narrow down your charting options.\n\n#### Rule of Thumb \n\n- **Trend:** Column, Line \n- **Comparison:** Area, Bar, Bullet, Column, Line, Scatter \n- **Relationship:** Line, Scatter \n- **Distribution:** Bar, Boxplot, Column \n- **Composition:** Donut, Pie, Stacked Bar, Stacked Column \n \nObviously, there are plenty of choices beyond these, so don't hesitate to use what works best. I will go over some of these basics and show some comparisons of poor charting techniques vs. 
slightly better ones.\n\nFor this project, I'll use some oil production data that I found while digging through http://data.world (pretty great site). The data can be found [here](http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm) \n\nLet's load up some libraries and get started.\n\n```{r libraryPrep, results='hide', warning=FALSE, message=FALSE}\nlibrary(ggplot2)\nlibrary(dplyr)\nlibrary(tidyr)\nlibrary(lubridate)\nlibrary(scales)\n```\n\n\n\n```{r dataLoading, results='hide', warning=FALSE, message=FALSE}\n#Custom data preparation\n#GitHub (linked to at bottom of this post)\nsource('data_preparation.R')\ndata = getData()\n```\n\n```{r}\nhead(data)\n```\n\n---- \n\n## Trend - Line Chart\n\n**Objective:** Visualize a trend in oil production in the US from 1981 - 2016 by year. I want to illustrate the changes over the time period. This is a very high-level view and only shows us a decline followed by a ramp up at the end of the period.\n\n#### Poor Version \nThe x-axis is a disaster and the y-axis isn't formatted well. While it gets the point across, it's still worthless.\n\n\n```{r,fig.align='center', fig.width=4}\ndf = data %>% \n group_by(Year) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) \np + geom_line(stat='identity') + \n ggtitle('Oil Production Over Time') + \n theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + \n xlab('') + ylab('')\n```\n\n#### Better Version \nThe title gives us a much better understanding of what we're looking at. 
The chart is slightly wider and the axes are formatted to be legible.\n\n```{r,fig.align='center', fig.width=12}\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) \np + geom_line(stat='identity') + \n ggtitle('Thousand Barrel Oil Production By Year in the U.S.') +\n theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1)) + \n scale_y_continuous(labels = comma)\n```\n\n\n----\n\n## Comparison - Line Chart \n\n**Objective**: Identify which states affected the trend the most. Evaluate them simultaneously in order to paint the picture and compare their trends over the time period. From this visual you can see the top states are Alaska, California, Louisiana, Oklahoma, Texas and Wyoming. Texas seems to break the mold quite drastically and drove the spike which occurred after 2010.\n\n#### Poor Version \nThere are far too many colors going on here. Everything at the bottom of the chart is relatively useless and takes our focus away from the big players. \n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n group_by(Location, Year) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\ndf$Year = as.numeric(df$Year)\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location))\np + geom_line(stat='identity') + \n ggtitle(paste('Oil Production By Year By State in the U.S.')) + \n theme(plot.title = element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n#### Better Version \nThis focuses attention on the top producing states. It compares them to each other and shows the trend per state as well. 
Using facet_wrap() tends to be used in what's known as \"small multiples\" - this is a technique which helps to break up the visual components of the data into easy-to-understand pieces which make intuitive sense.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\nn=6 #Arbitrary at first, after trying a few, this made the most sense\ntopN = data %>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n arrange(-ThousandBarrel) %>%\n top_n(n)\n\ndf = data %>%\n filter(Location %in% topN$Location) %>%\n group_by(Year,Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\ndf$Year = as.numeric(df$Year)\ndf$Location = as.factor(df$Location)\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1))\np + geom_line(stat='identity') + \n ggtitle(paste('Top',as.character(n),'States - Oil Production By Year in the U.S.')) + \n theme(plot.title = element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1)) + \n facet_wrap(~Location) + \n scale_y_continuous(labels = comma) \n\n```\n\n----\n\n## Relationship - Scatter Plot\n\n**Objective**: Check to see if data from Alaska and California is correlated. While this isn't extremely interesting, it does allow us to use this same data set (sorry). The charts indicate that there appears to be a strong positive correlation between the two states.\n\n#### Poor Version \nLots of completely irrelevant data! The size of the point should have nothing to do with the year. \n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\nstatesList = c('Alaska','California')\ndf = data %>%\n filter(Location %in% statesList) %>%\n spread(Location,ThousandBarrel) %>%\n select(Alaska,California,Month,Year)\n\np = ggplot(df,aes(x=Alaska,y=California,col=Month,size=Year))\np + geom_point() + \n scale_y_continuous(labels = comma) +\n scale_x_continuous(labels = comma) +\n ggtitle('Oil Production - CA vs. 
AK') + \n theme(plot.title = element_text(hjust = 0.5))\n\n```\n\n#### Better Version \nThe points are all the same size and a trend line helps to visualize the relationship. While it can sometimes be misleading, it makes sense with our current data. \n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n filter(Location %in% statesList) %>%\n spread(Location,ThousandBarrel) %>%\n select(Alaska,California,Year)\n\np = ggplot(df,aes(x=Alaska,y=California))\np + geom_point() + \n scale_y_continuous(labels = comma) +\n scale_x_continuous(labels = comma) +\n ggtitle('Monthly Thousand Barrel Oil Production 1981-2016 CA vs. AK') + \n theme(plot.title = element_text(hjust = 0.5)) + \n geom_smooth(method='lm')\n\n```\n\n## Distribution - Boxplot \n\n**Objective**: Examine the range of production by state (per year) to give us an idea of the variance. While the sums and means are nice, it's quite important to have an idea of distributions. While it was semi-apparent in the line charts, the variance of Texas is huge compared to the others! \n\n\n#### Poor Version \nAlphabetical order doesn't add any value, names are overlapping on top of each other. While you can tell who the big players are, this visual does not add the value it should.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n group_by(Year,Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Location,y=ThousandBarrel))\np + geom_boxplot() + \n ggtitle('Distribution of Oil Production by State')\n\n```\n\n\n#### Better Version \nThis gives a nice ranking to the plot while still showing their distributions. 
We could take this a step further and separate out the big players from the small players (I'll leave that up to you).\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\np = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel))\np + geom_boxplot() + \n scale_y_continuous(labels = comma) +\n ggtitle('Distribution of Annual Oil Production By State (1981 - 2016)') + \n coord_flip()\n```\n\n\n## Composition - Stacked Bar \n\n**Objective**: Check out the composition of total production by state. It's interesting to see how the composition was relatively similar across decades until the 2010's. Texas was 50% of the output!\n\n\n#### Poor Version \nMy favorite, the beautiful pie chart! There's nothing better than this... (no need for further commentary).\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))\n\ndf$ThousandBarrel = round(100*df$ThousandBarrel,0)\n\nlibrary(plotrix)\npie(x=df$ThousandBarrel,labels=df$Location,explode=0.1,col=rainbow(nrow(df)),main='Percentage of Oil Production by State')\n\n```\n\n\n#### Better Version \nThe 1980's and 2010's will be missing years in terms of a \"decade\" due to the data provided (and it's only 2017). While the percentage labels are slightly off center, it's certainly much better than the pie chart. It's not quite \"apples-to-apples\" for a comparison because I created different decades, but you get the idea.\n\nI also created an \"Other\" category in order to simplify the output. 
When you are doing comparisons, it's typically a good idea to find a way to reduce the number of variables in the output while not removing data by dropping it completely - **do this carefully and transparently!**\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndata$Decade = '1980s'\ndata$Decade[data$Year >= 1990] = '1990s'\ndata$Decade[data$Year >= 2000] = '2000s'\ndata$Decade[data$Year >= 2010] = '2010s'\ndata$Decade = as.factor(data$Decade)\n\ntop5 = data %>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n arrange(-ThousandBarrel) %>%\n top_n(5) %>%\n select(Location)\n\ntop5List = top5$Location\n\ndata$State = \"Other\"\n\nfor(i in 1:length(top5List)){\n data$State[data$Location == top5List[i]] = top5List[i]\n}\n\ndf = data %>%\n group_by(Decade,State) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))\n\ndf$ThousandBarrel = round(df$ThousandBarrel,3)\ndf$text = paste(round(100*df$ThousandBarrel,0),'%', sep='')\n\np = ggplot(df,aes(x=Decade,y=ThousandBarrel,col=reorder(State,ThousandBarrel),fill=reorder(State,ThousandBarrel)))\np + geom_bar(stat='identity') + \n geom_text(aes(label=text),col='Black',size = 4, hjust = 0.5, vjust = 3, position = \"stack\") + \n scale_y_continuous(labels = percent) +\n ggtitle('Percentage of Top Oil Producing States by Decade') + \n guides(fill=guide_legend(title='State'),col=guide_legend(title='State')) + \n theme(plot.title = element_text(hjust = 0.5))\n\n```\n\n\n\n\n### Some other fun concepts are below! \nSome of them are nice, others are terrible! I won't comment on any of them, but I felt it was necessary to include some other ideas I toyed around with. \n\nHave fun with your data visualizations: be creative, think outside the box, use tools other than computers if it makes sense, fail often but learn quickly. 
I'm sure I'll think of a thousand better ways to have illustrated the concepts in this post by tomorrow, so I'll make updates as I think of them!\n\nNow it's your turn!\n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)\n\n\n```{r,fig.height=4}\ndf = data %>% \n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n arrange(-ThousandBarrel)\np = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel))\np + geom_bar(stat='identity') + \n ggtitle('Oil Production 1981 - 2016 By Location') + \n theme(plot.title = element_text(hjust = 0.5)) + \n coord_flip()\n```\n\n\n\n\n\n```{r,fig.height=4}\ntop10 = data %>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n arrange(-ThousandBarrel) %>%\n top_n(10)\nprint(top10)\n\ndf = data %>% \n group_by(Location,Year) %>%\n filter(Location %in% top10$Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) \np = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location,fill=Location))\np + geom_bar(stat='identity') + \n ggtitle('Oil Production - Top 10 States') + \n theme(plot.title = element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n\n\n```{r, fig.height=4}\ndf = data %>%\n filter(Year == 1990)%>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\ndf$Location = tolower(df$Location)\n\n#Add States without data\nStates = data.frame(Location = tolower(as.character(state.name)))\nmissingStates = States$Location[!(States$Location %in% df$Location)]\nappendData = data.frame(Location=missingStates,ThousandBarrel=0)\ndf = rbind(df,appendData)\n\nstates_map <- map_data(\"state\")\n\nggplot(df, aes(map_id = Location)) + \n geom_map(aes(fill=ThousandBarrel), map = states_map) +\n expand_limits(x = states_map$long, y = states_map$lat)\n\n```\n\n\n```{r, fig.height=4}\ndf = data %>% \n filter(Location == 'Texas') %>%\n group_by(Year,Month) 
%>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Month,y=ThousandBarrel))\np + geom_line(stat='identity',aes(group=Year,col=Year)) + \n ggtitle('Oil Production By Year in the U.S.') + \n theme(plot.title = element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n\n\n\n\n\n\n", 4 | "created" : 1491578298512.000, 5 | "dirty" : false, 6 | "encoding" : "UTF-8", 7 | "folds" : "", 8 | "hash" : "985338938", 9 | "id" : "EEFE0FD5", 10 | "lastKnownWriteTime" : 1491596055, 11 | "last_content_update" : 1491596055925, 12 | "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 3.Rmd", 13 | "project_path" : "Data Visualization - Part 3.Rmd", 14 | "properties" : { 15 | "last_setup_crc32" : "", 16 | "tempName" : "Untitled1" 17 | }, 18 | "relative_order" : 1, 19 | "source_on_save" : false, 20 | "source_window" : "", 21 | "type" : "r_markdown" 22 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/per/t/F1EE8CA4: -------------------------------------------------------------------------------- 1 | { 2 | "collab_server" : "", 3 | "contents" : "---\ntitle: \"Data Visualization - Part 2\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 14, 2017\"\noutput:\n html_document: default\nsubtitle: The Good, The Bad and The Ugly\n---\n\n---- \n\n# Data Visualization - Part 2\n\n## A Quick Overview of the ggplot2 Package in R \n\nWhile it will be important to focus on theory, I want to explain the ggplot2 package because I will be using it throughout the rest of this series. Knowing how it works will keep the focus on the results rather than the code. It's an incredibly powerful package and once you wrap your head around what it's doing, your life will change for the better! There are a lot of tools out there which provide better charts, graphs and ease of use (i.e. 
plot.ly, d3.js, Qlik, Tableau), but ggplot2 is still a fantastic resource and I use it all of the time. \n\nIn case you missed it, here's a link to [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1/)\n\n```{r, fig.align='center',echo=FALSE}\nlibrary(png)\nlibrary(grid)\nimg = readPNG(\"images/title_photo_2.png\")\ngrid.raster(img)\n```\n\n\n### Why would you use ggplot2? \n1. More robust plotting than the base plot package\n2. Better control over aesthetics - colors, axes, background, etc.\n3. Layering\n4. Variable Mapping (aes)\n5. Automatic aggregation of data\n6. Built in formulas & plotting (geom_smooth)\n7. The list goes on and on... \n\nBasically, ggplot2 allows for a lot more customization of plots with a lot less code (the rest of it is behind the scenes). Once you are used to the syntax, there's no going back. It's faster and easier.\n\n### Why wouldn't you use ggplot2? \n1. A bit of a learning curve\n2. Lack of user interactivity with the plots \n\nFundamentally, ggplot2 gives the user the ability to start a plot and layer everything in. There are many ways to accomplish the same thing, so figure out what makes sense for you and stick to it. \n\n**A Basic Example: Unemployment Over Time** \n\n```{r,results='hide', warning=FALSE, message=FALSE}\nlibrary(dplyr)\nlibrary(ggplot2)\n\n# Load the economics data from ggplot2\ndata(economics,package='ggplot2')\n```\n\n```{r}\n# Take a look at the format of the data\nhead(economics)\n```\n\n\n```{r, fig.height = 4}\n# Create the plot\nggplot(data = economics) + geom_line(aes(x = date, y = unemploy))\n```\n\n\n\n### What happened to get that? \n\n- `ggplot(economics)` loaded the data frame\n- `+` tells ggplot() that there is more to be added to the plot\n- `geom_line()` defined the type of plot\n- `aes(x = date, y = unemploy)` mapped the variables\n\nThe `aes()` portion is what typically throws new users off but is my favorite feature of ggplot2. 
In simple terms, this is what \"auto-magically\" brings your plot to life. You are telling ggplot2, \"I want 'date' to be on the x-axis and 'unemploy' to be on the y-axis.\" It's pretty straightforward in this case but there are more complex use cases as well.\n\n***Side Note:*** you could have achieved the same result by mapping the variables in the ggplot() function rather than in geom_line():\n`ggplot(data = economics, aes(x = date, y = unemploy)) + geom_line()`\n\n### Here's the basic formula for success:\n\n- Everything in ggplot2 starts with `ggplot(data)` and utilizes `+` to add on every element thereafter\n- Include your data frame (economics) in a ggplot function: `ggplot(data = economics)` \n- Input the type of plot you would like (i.e. line chart of unemployment over time): `+ geom_line(aes(x = date, y = unemploy))`\n - \"geom\" stands for \"geometric object\" and determines the type of object (there can be more than one type per plot)\n - There are ***a lot*** of types of geometric objects - check them out [here](http://docs.ggplot2.org/current/)\n- Add in layers and utilize `fill` and `col` parameters within `aes()`\n\n\nI'll go through some of the examples from the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). I will be using their examples but I will also explain what's going on. \n\n**Note:** I believe the intention of the author of the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html) was to illustrate how to use ggplot2 rather than doing a full demonstration of what important data visualization techniques are - so keep that in mind as I go through these examples. Some of the visuals do not line up with my best practices addressed in my [first post on data visualization](https://www.stoltzmaniac.com/data-visualization-part-1/).\n\n\nAs usual, some packages must be loaded. 
\n\n```{r, results='hide', warning=FALSE, message=FALSE}\nlibrary(reshape2)\nlibrary(lubridate)\nlibrary(dplyr)\nlibrary(tidyr)\nlibrary(ggplot2)\nlibrary(scales)\nlibrary(gridExtra)\n```\n\n### The Scatterplot \n\nThis is one of the most visually powerful tools for data analysis. However, you have to be careful when using it because it's primarily used by people doing analysis and not reporting (depending on what industry you're in).\n\nThe author of this chart was looking for a correlation between area and population. \n\n```{r}\n# Use the \"midwest\" data from ggplot2\ndata(\"midwest\", package = \"ggplot2\")\n\nhead(midwest)\n```\n\n#### Here's the most basic version of the scatter plot \n\nThis can be called by `geom_point()` in ggplot2\n\n```{r, warning=FALSE, fig.align='center',fig.height = 4}\n# Scatterplot\nggplot(data = midwest, aes(x = area, y = poptotal)) + geom_point() #ggplot\n```\n\n#### Here's a version with some additional features \n\nWhile the addition of the size of the points and color don't add value, it does show the level of customization that's possible with ggplot2.\n\n```{r, warning=FALSE,message=FALSE,fig.height = 4}\nggplot(data = midwest, aes(x = area, y = poptotal)) + \ngeom_point(aes(col=state, size=popdensity)) + \n geom_smooth(method=\"loess\", se=F) + \n xlim(c(0, 0.1)) + \n ylim(c(0, 500000)) + \n labs(subtitle=\"Area Vs Population\", \n y=\"Population\", \n x=\"Area\", \n title=\"Scatterplot\", \n caption = \"Source: midwest\")\n```\n\n#### Explanation: \n\n`ggplot(data = midwest, aes(x = area, y = poptotal)) + ` \nInputs the data and maps x and y variables as area and poptotal. \n\n`geom_point(aes(col=state, size=popdensity)) + ` \nCreates a scatterplot and maps the color and size of points to state and popdensity. \n\n` geom_smooth(method=\"loess\", se=F) + ` \nCreates a smoothing curve to fit the data. 
`method` is the type of fit and `se` determines whether or not to show the confidence band around the curve.\n\n` xlim(c(0, 0.1)) + ` \nSets the x-axis limits. \n\n` ylim(c(0, 500000)) + ` \nSets the y-axis limits. \n\n`labs(subtitle=\"Area Vs Population\",` \n\n` y=\"Population\",` \n\n` x=\"Area\",` \n\n` title=\"Scatterplot\",` \n\n` caption = \"Source: midwest\")` \nChanges the labels of the subtitle, y-axis, x-axis, title and caption.\n\nNotice that the legend was automatically created and placed on the right-hand side. This is also highly customizable and can be changed easily.\n\n\n### The Density Plot \n\nDensity plots are a great way to see how data is distributed. They are similar to histograms in a sense, but show values in terms of percentage of the total. In this example, the author used the mpg data set and is looking to see the different distributions of City Mileage based off of the number of cylinders the car has.\n\n```{r}\n# Examine the mpg data set\nhead(mpg)\n```\n\n#### Sample Density Plot\n\n```{r,fig.height = 4}\ng = ggplot(mpg, aes(cty))\ng + geom_density(aes(fill=factor(cyl)), alpha=0.8) + \n labs(title=\"Density plot\", \n subtitle=\"City Mileage Grouped by Number of cylinders\",\n caption=\"Source: mpg\",\n x=\"City Mileage\",\n fill=\"# Cylinders\")\n\n```\n\nYou'll notice one immediate difference here. The author decided to create the object `g` to equal `ggplot(mpg, aes(cty))` - this is a nice trick and will save you some time if you plan on keeping `ggplot(mpg, aes(cty))` as the fundamental plot and simply exploring other visualizations on top of it. It is also handy if you need to save the output of a chart to an image file.\n\n`ggplot(mpg, aes(cty))` loads the mpg data and `aes(cty)` assumes `aes(x = cty)` \n\n`g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + ` \n`geom_density` kicks off a density plot and the mapping of `cyl` is used for colors. 
`alpha` is the transparency/opacity of the area under the curve.\n\n` labs(title=\"Density plot\",` \n\n` subtitle=\"City Mileage Grouped by Number of cylinders\",` \n\n` caption=\"Source: mpg\",` \n\n` x=\"City Mileage\",` \n\n` fill=\"# Cylinders\")` \nLabeling is cleaned up at the end.\n\n\n#### How would you use your new knowledge to see the density by class instead of by number of cylinders? \n\n***Hint:*** `g = ggplot(mpg, aes(cty))` has already been established.\n\n```{r,fig.height = 4}\ng + geom_density(aes(fill=factor(class)), alpha=0.8) + \n labs(title=\"Density plot\", \n subtitle=\"City Mileage Grouped by Class\",\n caption=\"Source: mpg\",\n x=\"City Mileage\",\n fill=\"Class\")\n```\nNotice how I didn't have to write out `ggplot()` again because it was already stored in the object `g`.\n\n### The Histogram \n\nHow could we show the city mileage in a histogram?\n\n```{r,fig.height = 4}\ng = ggplot(mpg,aes(cty))\ng + geom_histogram(bins=20) +\n labs(title=\"Histogram\", \n caption=\"Source: mpg\",\n x=\"City Mileage\")\n``` \n\n`geom_histogram(bins=20)` plots the histogram. If `bins` isn't set, ggplot2 will default to 30 bins.\n\n\n### The Bar/Column Chart \n\nFor all intents and purposes, bar and column charts are essentially the same. Technically, the term \"column chart\" can be used when the bars run vertically. 
The author of this chart was simply looking at the frequency of the vehicles listed in the data set.\n\n```{r}\n#Data Preparation\nfreqtable <- table(mpg$manufacturer)\ndf <- as.data.frame.table(freqtable)\nhead(df)\n```\n\n\n```{r,fig.height = 4}\n#Set a theme\ntheme_set(theme_classic())\n\ng <- ggplot(df, aes(Var1, Freq))\ng + geom_bar(stat=\"identity\", width = 0.5, fill=\"tomato2\") + \n labs(title=\"Bar Chart\", \n subtitle=\"Manufacturer of vehicles\", \n caption=\"Source: Frequency of Manufacturers from 'mpg' dataset\") +\n theme(axis.text.x = element_text(angle=65, vjust=0.6))\n```\n\nThe addition of `theme_set(theme_classic())` adds a preset theme to the chart. You can create your own or select from a large list of themes. This can help set your work apart from others and save a lot of time.\n\nHowever, `theme_set()` is different from the `theme(axis.text.x = element_text(angle=65, vjust=0.6))` call used inside the plot itself: `theme_set()` changes the default theme for every plot that follows, while `theme()` only adjusts the plot it is added to. In this case, the author decided to tilt the text along the x-axis. `vjust=0.6` changes how far it is spaced away from the axis line.\n\nWithin `geom_bar()` there is another new piece of information: `stat=\"identity\"` which tells ggplot to use the actual value of `Freq`.\n\nYou may also notice that ggplot arranged all of the data in alphabetical order based off of the manufacturer. If you want to change the order, it's best to use the `reorder()` function. This next chart will use `reorder()` with `Freq` to sort the bars and `coord_flip()` to orient the chart differently. 
\n\n```{r,fig.height = 4}\ng <- ggplot(df, aes(reorder(Var1,Freq), Freq))\ng + geom_bar(stat=\"identity\", width = 0.5, fill=\"tomato2\") + \n labs(title=\"Bar Chart\", \n x = 'Manufacturer',\n subtitle=\"Manufacturer of vehicles\", \n caption=\"Source: Frequency of Manufacturers from 'mpg' dataset\") +\n theme(axis.text.x = element_text(angle=65, vjust=0.6)) + \n coord_flip()\n```\n\nLet's continue with bar charts - what if we wanted to see what `hwy` looked like by `manufacturer` and in terms of `cyl`?\n\n```{r,fig.height = 4}\ng = ggplot(mpg,aes(x=manufacturer,y=hwy,col=factor(cyl),fill=factor(cyl)))\ng + geom_bar(stat='identity', position='dodge') + \n theme(axis.text.x = element_text(angle=65, vjust=0.6))\n```\n\n`position='dodge'` had to be used because the default setting is to stack the bars, `'dodge'` places them side by side for comparison. \n\nDespite the fact that the chart did what I wanted, it is very difficult to read due to how many manufacturers there are. This is where the `facet_wrap()` feature comes in handy.\n\n```{r}\ntheme_set(theme_bw())\n\ng = ggplot(mpg,aes(x=factor(cyl),y=hwy,col=factor(cyl),fill=factor(cyl)))\ng + geom_bar(stat='identity', position='dodge') + \n facet_wrap(~manufacturer)\n```\nThis created a much nicer view of the information. It \"auto-magically\" split everything out by manufacturer!\n\n\n### Spatial Plots\n\nAnother nice feature of ggplot2 is the integration with maps and spatial plotting. In this simple example, I wanted to plot a few cities in Colorado and draw a border around them. 
Other than the addition of the map, ggplot simply places the dots directly on the locations via their longitude and latitude \"auto-magically.\"\n\nThis map is created with `ggmap` which utilizes Google Maps API.\n\n```{r, warning=FALSE, message=FALSE}\nlibrary(ggmap)\nlibrary(ggalt)\n\nfoco <- geocode(\"Fort Collins, CO\") # get longitude and latitude\n\n# Get the Map ----------------------------------------------\ncolo_map <- qmap(\"Colorado, United States\",zoom = 7, source = \"google\") \n\n# Get Coordinates for Places ---------------------\ncolo_places <- c(\"Fort Collins, CO\",\n \"Denver, CO\",\n \"Grand Junction, CO\",\n \"Durango, CO\",\n \"Pueblo, CO\")\n\nplaces_loc <- geocode(colo_places) # get longitudes and latitudes\n\n\n# Plot Open Street Map -------------------------------------\ncolo_map + geom_point(aes(x=lon, y=lat),\n data = places_loc, \n alpha = 0.7, \n size = 7, \n color = \"tomato\") + \n geom_encircle(aes(x=lon, y=lat),\n data = places_loc, size = 2, color = \"blue\")\n```\n\n### Final Thoughts \n\nI hope you learned a lot about the basics of ggplot2 in this. It's extremely powerful but yet easy to use once you get the hang of it. The best way to really learn it is to try it out. Find some data on your own and try to manipulate it and get it plotted. Without a doubt, you will have all kinds of errors pop up, data you expect to be plotted won't show up, colors and fills will be different, etc. However, your visualizations will be leveled-up!\n\n### Coming soon: \n\n- Determining whether or not you need a visualization \n- Choosing the type of plot to use depending on the use case \n- Visualization beyond the standard charts and graphs \n\n\nI made some modifications to the code, but almost all of the examples here were from [Top 50 ggplot2 Visualizations - The Master List ](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). 
\n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)", 4 | "created" : 1491578353227.000, 5 | "dirty" : true, 6 | "encoding" : "UTF-8", 7 | "folds" : "", 8 | "hash" : "3000572754", 9 | "id" : "F1EE8CA4", 10 | "lastKnownWriteTime" : 1490212163, 11 | "last_content_update" : 1491581505392, 12 | "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 2.Rmd", 13 | "project_path" : "Data Visualization - Part 2.Rmd", 14 | "properties" : { 15 | "last_setup_crc32" : "", 16 | "tempName" : "Untitled1" 17 | }, 18 | "relative_order" : 2, 19 | "source_on_save" : false, 20 | "source_window" : "", 21 | "type" : "r_markdown" 22 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/per/t/FE94A69D: -------------------------------------------------------------------------------- 1 | { 2 | "collab_server" : "", 3 | "contents" : "---\ntitle: \"Data Visualization - Part 1\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 14, 2017\"\noutput:\n pdf_document: default\n html_document: default\nsubtitle: The Good, The Bad and The Ugly\n---\n\n```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE}\nlibrary(png)\nlibrary(grid)\n```\n---- \n\n# Introduction to Data Visualization\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/title_photo.png\")\ngrid.raster(img)\n```\n\nThe topic of data visualization is very popular in the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach \n$7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. 
Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate. \n\nIn short, are you adding value to your work or are you simply adding this to make it seem ***less boring?*** Let's take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy.\n\n---- \n\nI have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post.\n\n### One author at Vox wanted to show the causes of death in all of Shakespeare's plays\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/bad-pie1.png\")\ngrid.raster(img)\n```\n \n\n**Is this not insane!?!?!** \n\nUsing a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. While wordles are not ideal for any work requiring exact proportions, they do make for a great visual in this article. [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html).\n \n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/bad-pie1-fix.png\")\ngrid.raster(img)\n```\n---- \n\nTo be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! 
So, I'll walk through my theory and not worry too much about aesthetics (save that for a time when you're getting paid).\n\n----\n\n### The Good, The Bad, The Ugly \n\n**\"The Good\" visualizations:** \n\n- Clearly illustrate a point \n- Are tailored to the appropriate audience \n - Analysts may want detail \n - Executives may want a high-level view \n- Are tailored to the presentation medium \n - A piece in an academic journal can be analyzed slowly and carefully \n - A slide in front of 5,000 people in a conference will be glanced at quickly \n- Are memorable to those who care about the material \n- Make an impact which increases the understanding of the subject matter \n\n**\"The Bad\" visualizations:** \n\n- Are difficult to interpret \n- Are unintentionally misleading \n- Contain redundant and boring information \n\n**\"The Ugly\" visualizations:** \n\n- Are almost impossible to interpret \n- Are filled with completely worthless information \n- Are intentionally created to mislead the audience \n- Are inaccurate \n\n### Coming soon: \n\n- Determining whether or not you need a visualization \n- Choosing the type of plot to use depending on the use case \n- Introduction to the ggplot2 in R and how it works \n- Visualization beyond the standard charts and graphs \n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)", 4 | "created" : 1491581474797.000, 5 | "dirty" : false, 6 | "encoding" : "UTF-8", 7 | "folds" : "", 8 | "hash" : "1337660815", 9 | "id" : "FE94A69D", 10 | "lastKnownWriteTime" : 1489685647, 11 | "last_content_update" : 1489685647, 12 | "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 1.Rmd", 13 | "project_path" : "Data Visualization - Part 1.Rmd", 14 | "properties" : { 15 | "last_setup_crc32" : "", 16 | "tempName" : "Untitled1" 17 | }, 18 | "relative_order" : 3, 19 | "source_on_save" : false, 20 | "source_window" : "", 21 | "type" : "r_markdown" 22 | } 
-------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/32602BA4: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/3B0BAF17: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/460054D1: -------------------------------------------------------------------------------- 1 | { 2 | "last_setup_crc32" : "BEDB844B56df664a", 3 | "tempName" : "Untitled1" 4 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/8DD8A556: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/945C141F: -------------------------------------------------------------------------------- 1 | { 2 | "last_setup_crc32" : "", 3 | "tempName" : "Untitled1" 4 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/C5350C47: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/D635D4D3: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/DA516F3B: -------------------------------------------------------------------------------- 1 | { 2 | "tempName" : "Untitled1" 3 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/E7318A37: 
-------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/ED483BAF: -------------------------------------------------------------------------------- 1 | { 2 | "last_setup_crc32" : "", 3 | "tempName" : "Untitled1" 4 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/EE8F787: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/FFC79A6C: -------------------------------------------------------------------------------- 1 | { 2 | "last_setup_crc32" : "", 3 | "tempName" : "Untitled1" 4 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/prop/INDEX: -------------------------------------------------------------------------------- 1 | ~%2FDesktop%2Fbookdown-demo-master%2Findex.Rmd="E7318A37" 2 | ~%2FDesktop%2Fuploading%20to%20wp.R="DA516F3B" 3 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FData%20Visualization%20-%20Part%201.Rmd="FFC79A6C" 4 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FData%20Visualization%20-%20Part%202.Rmd="945C141F" 5 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FData%20Visualization%20-%20Part%203.Rmd="ED483BAF" 6 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FData%20Visualization%20Lesson.Rmd="460054D1" 7 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2FOil%20Production%20By%20State.Rmd="D635D4D3" 8 | ~%2FDocuments%2FGitHub%2FData-Visualization-Lesson%2Fdata_preparation.R="C5350C47" 9 | ~%2FDocuments%2FGitHub%2FDenver-Crime-Analysis%2FPart%2001%20-%20Crime%20Analysis.Rmd="EE8F787" 10 | 
~%2FDocuments%2FGitHub%2FDenver-Crime-Analysis%2FPart%2003%20-%20Crime%20Analysis%20adapted%20for%20Ghost.Rmd="3B0BAF17" 11 | ~%2FDocuments%2FGitHub%2FDenver-Crime-Analysis%2FPart%2003%20-%20Crime%20Analysis.Rmd="8DD8A556" 12 | ~%2FDocuments%2FGitHub%2Fsc-paid-search-reporting%2FOtter%20Products%20-%20Paid%20Search%20Report.Rmd="32602BA4" 13 | -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/s-21D18981/206A0EDA: -------------------------------------------------------------------------------- 1 | { 2 | "collab_server" : "", 3 | "contents" : "---\ntitle: \"Data Visualization - Part 2\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 14, 2017\"\noutput:\n html_document: default\nsubtitle: The Good, The Bad and The Ugly\n---\n\n---- \n\n# Data Visualization - Part 2\n\n## A Quick Overview of the ggplot2 Package in R \n\nWhile it will be important to focus on theory, I want to explain the ggplot2 package because I will be using it throughout the rest of this series. Knowing how it works will keep the focus on the results rather than the code. It's an incredibly powerful package and once you wrap your head around what it's doing, your life will change for the better! There are a lot of tools out there which provide better charts, graphs and ease of use (i.e. plot.ly, d3.js, Qlik, Tableau), but ggplot2 is still a fantastic resource and I use it all of the time. \n\nIn case you missed it, here's a link to [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1/)\n\n```{r, fig.align='center',echo=FALSE}\nlibrary(png)\nlibrary(grid)\nimg = readPNG(\"images/title_photo_2.png\")\ngrid.raster(img)\n```\n\n\n### Why would you use ggplot2? \n1. More robust plotting than the base plot package\n2. Better control over aesthetics - colors, axes, background, etc.\n3. Layering\n4. Variable Mapping (aes)\n5. Automatic aggregation of data\n6. Built in formulas & plotting (geom_smooth)\n7. 
The list goes on and on... \n\nBasically, ggplot2 allows for a lot more customization of plots with a lot less code (the rest of it is behind the scenes). Once you are used to the syntax, there's no going back. It's faster and easier.\n\n### Why wouldn't you use ggplot2? \n1. A bit of a learning curve\n2. Lack of user interactivity with the plots \n\nFundamentally, ggplot2 gives the user the ability to start a plot and layer everything in. There are many ways to accomplish the same thing, so figure out what makes sense for you and stick to it. \n\n**A Basic Example: Unemployment Over Time** \n\n```{r,results='hide', warning=FALSE, message=FALSE}\nlibrary(dplyr)\nlibrary(ggplot2)\n\n# Load the economics data from ggplot2\ndata(economics,package='ggplot2')\n```\n\n```{r}\n# Take a look at the format of the data\nhead(economics)\n```\n\n\n```{r, fig.height = 4}\n# Create the plot\nggplot(data = economics) + geom_line(aes(x = date, y = unemploy))\n```\n\n\n\n### What happened to get that? \n\n- `ggplot(economics)` loaded the data frame\n- `+` tells ggplot() that there is more to be added to the plot\n- `geom_line()` defined the type of plot\n- `aes(x = date, y = unemploy)` mapped the variables\n\nThe `aes()` portion is what typically throws new users off but is my favorite feature of ggplot2. In simple terms, this is what \"auto-magically\" brings your plot to life. 
You are telling ggplot2, \"I want 'date' to be on the x-axis and 'unemploy' to be on the y-axis.\" It's pretty straightforward in this case but there are more complex use cases as well.\n\n***Side Note:*** you could have achieved the same result by mapping the variables in the ggplot() function rather than in geom_line():\n`ggplot(data = economics, aes(x = date, y = unemploy)) + geom_line()`\n\n### Here's the basic formula for success:\n\n- Everything in ggplot2 starts with `ggplot(data)` and utilizes `+` to add on every element thereafter\n- Include your data frame (economics) in a ggplot function: `ggplot(data = economics)` \n- Input the type of plot you would like (i.e. line chart of unemployment over time): `+ geom_line(aes(x = date, y = unemploy))`\n - \"geom\" stands for \"geometric object\" and determines the type of object (there can be more than one type per plot)\n - There are ***a lot*** of types of geometric objects - check them out [here](http://docs.ggplot2.org/current/)\n- Add in layers and utilize `fill` and `col` parameters within `aes()`\n\n\nI'll go through some of the examples from the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). I will be using their examples but I will also explain what's going on. \n\n**Note:** I believe the intention of the author of the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html) was to illustrate how to use ggplot2 rather than doing a full demonstration of what important data visualization techniques are - so keep that in mind as I go through these examples. Some of the visuals do not line up with my best practices addressed in my [first post on data visualization](https://www.stoltzmaniac.com/data-visualization-part-1/).\n\n\nAs usual, some packages must be loaded. 
\n\n```{r, results='hide', warning=FALSE, message=FALSE}\nlibrary(reshape2)\nlibrary(lubridate)\nlibrary(dplyr)\nlibrary(tidyr)\nlibrary(ggplot2)\nlibrary(scales)\nlibrary(gridExtra)\n```\n\n### The Scatterplot \n\nThis is one of the most visually powerful tools for data analysis. However, you have to be careful when using it because it's primarily used by people doing analysis and not reporting (depending on what industry you're in).\n\nThe author of this chart was looking for a correlation between area and population. \n\n```{r}\n# Use the \"midwest\" data from ggplot2\ndata(\"midwest\", package = \"ggplot2\")\n\nhead(midwest)\n```\n\n#### Here's the most basic version of the scatter plot \n\nThis can be called by `geom_point()` in ggplot2\n\n```{r, warning=FALSE, fig.align='center',fig.height = 4}\n# Scatterplot\nggplot(data = midwest, aes(x = area, y = poptotal)) + geom_point() #ggplot\n```\n\n#### Here's a version with some additional features \n\nWhile the addition of the size of the points and color don't add value, it does show the level of customization that's possible with ggplot2.\n\n```{r, warning=FALSE,message=FALSE,fig.height = 4}\nggplot(data = midwest, aes(x = area, y = poptotal)) + \ngeom_point(aes(col=state, size=popdensity)) + \n geom_smooth(method=\"loess\", se=F) + \n xlim(c(0, 0.1)) + \n ylim(c(0, 500000)) + \n labs(subtitle=\"Area Vs Population\", \n y=\"Population\", \n x=\"Area\", \n title=\"Scatterplot\", \n caption = \"Source: midwest\")\n```\n\n#### Explanation: \n\n`ggplot(data = midwest, aes(x = area, y = poptotal)) + ` \nInputs the data and maps x and y variables as area and poptotal. \n\n`geom_point(aes(col=state, size=popdensity)) + ` \nCreates a scatterplot and maps the color and size of points to state and popdensity. \n\n` geom_smooth(method=\"loess\", se=F) + ` \nCreates a smoothing curve to fit the data. 
`method` is the type of fit and `se` determines whether or not to show the confidence band around the curve.\n\n` xlim(c(0, 0.1)) + ` \nSets the x-axis limits. \n\n` ylim(c(0, 500000)) + ` \nSets the y-axis limits. \n\n`labs(subtitle=\"Area Vs Population\",` \n\n` y=\"Population\",` \n\n` x=\"Area\",` \n\n` title=\"Scatterplot\",` \n\n` caption = \"Source: midwest\")` \nChanges the labels of the subtitle, y-axis, x-axis, title and caption.\n\nNotice that the legend was automatically created and placed on the right-hand side. This is also highly customizable and can be changed easily.\n\n\n### The Density Plot \n\nDensity plots are a great way to see how data is distributed. They are similar to histograms in a sense, but show values in terms of percentage of the total. In this example, the author used the mpg data set and is looking to see the different distributions of City Mileage based off of the number of cylinders the car has.\n\n```{r}\n# Examine the mpg data set\nhead(mpg)\n```\n\n#### Sample Density Plot\n\n```{r,fig.height = 4}\ng = ggplot(mpg, aes(cty))\ng + geom_density(aes(fill=factor(cyl)), alpha=0.8) + \n labs(title=\"Density plot\", \n subtitle=\"City Mileage Grouped by Number of cylinders\",\n caption=\"Source: mpg\",\n x=\"City Mileage\",\n fill=\"# Cylinders\")\n\n```\n\nYou'll notice one immediate difference here. The author decided to create the object `g` to equal `ggplot(mpg, aes(cty))` - this is a nice trick and will save you some time if you plan on keeping `ggplot(mpg, aes(cty))` as the fundamental plot and simply exploring other visualizations on top of it. It is also handy if you need to save the output of a chart to an image file.\n\n`ggplot(mpg, aes(cty))` loads the mpg data and `aes(cty)` assumes `aes(x = cty)` \n\n`g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + ` \n`geom_density` kicks off a density plot and the mapping of `cyl` is used for colors. 
`alpha` is the transparency/opacity of the area under the curve.\n\n` labs(title=\"Density plot\",` \n\n` subtitle=\"City Mileage Grouped by Number of cylinders\",` \n\n` caption=\"Source: mpg\",` \n\n` x=\"City Mileage\",` \n\n` fill=\"# Cylinders\")` \nLabeling is cleaned up at the end.\n\n\n#### How would you use your new knowledge to see the density by class instead of by number of cylinders? \n\n***Hint:*** `g = ggplot(mpg, aes(cty))` has already been established.\n\n```{r,fig.height = 4}\ng + geom_density(aes(fill=factor(class)), alpha=0.8) + \n labs(title=\"Density plot\", \n subtitle=\"City Mileage Grouped by Class\",\n caption=\"Source: mpg\",\n x=\"City Mileage\",\n fill=\"Class\")\n```\nNotice how I didn't have to write out `ggplot()` again because it was already stored in the object `g`.\n\n### The Histogram \n\nHow could we show the city mileage in a histogram?\n\n```{r,fig.height = 4}\ng = ggplot(mpg,aes(cty))\ng + geom_histogram(bins=20) +\n labs(title=\"Histogram\", \n caption=\"Source: mpg\",\n x=\"City Mileage\")\n``` \n\n`geom_histogram(bins=20)` plots the histogram. If `bins` isn't set, ggplot2 will default to 30 bins.\n\n\n### The Bar/Column Chart \n\nFor all intents and purposes, bar and column charts are essentially the same. Technically, the term \"column chart\" can be used when the bars run vertically. 
The author of this chart was simply looking at the frequency of the vehicles listed in the data set.\n\n```{r}\n#Data Preparation\nfreqtable <- table(mpg$manufacturer)\ndf <- as.data.frame.table(freqtable)\nhead(df)\n```\n\n\n```{r,fig.height = 4}\n#Set a theme\ntheme_set(theme_classic())\n\ng <- ggplot(df, aes(Var1, Freq))\ng + geom_bar(stat=\"identity\", width = 0.5, fill=\"tomato2\") + \n labs(title=\"Bar Chart\", \n subtitle=\"Manufacturer of vehicles\", \n caption=\"Source: Frequency of Manufacturers from 'mpg' dataset\") +\n theme(axis.text.x = element_text(angle=65, vjust=0.6))\n```\n\nThe addition of `theme_set(theme_classic())` adds a preset theme to the chart. You can create your own or select from a large list of themes. This can help set your work apart from others and save a lot of time.\n\nHowever, `theme_set()` is different from the `theme(axis.text.x = element_text(angle=65, vjust=0.6))` call used inside the plot itself: `theme_set()` changes the default theme for every plot that follows, while `theme()` only adjusts the plot it is added to. In this case, the author decided to tilt the text along the x-axis. `vjust=0.6` changes how far it is spaced away from the axis line.\n\nWithin `geom_bar()` there is another new piece of information: `stat=\"identity\"` which tells ggplot to use the actual value of `Freq`.\n\nYou may also notice that ggplot arranged all of the data in alphabetical order based off of the manufacturer. If you want to change the order, it's best to use the `reorder()` function. This next chart will use `reorder()` with `Freq` to sort the bars and `coord_flip()` to orient the chart differently. 
\n\n```{r,fig.height = 4}\ng <- ggplot(df, aes(reorder(Var1,Freq), Freq))\ng + geom_bar(stat=\"identity\", width = 0.5, fill=\"tomato2\") + \n labs(title=\"Bar Chart\", \n x = 'Manufacturer',\n subtitle=\"Manufacturer of vehicles\", \n caption=\"Source: Frequency of Manufacturers from 'mpg' dataset\") +\n theme(axis.text.x = element_text(angle=65, vjust=0.6)) + \n coord_flip()\n```\n\nLet's continue with bar charts - what if we wanted to see what `hwy` looked like by `manufacturer` and in terms of `cyl`?\n\n```{r,fig.height = 4}\ng = ggplot(mpg,aes(x=manufacturer,y=hwy,col=factor(cyl),fill=factor(cyl)))\ng + geom_bar(stat='identity', position='dodge') + \n theme(axis.text.x = element_text(angle=65, vjust=0.6))\n```\n\n`position='dodge'` had to be used because the default setting is to stack the bars, `'dodge'` places them side by side for comparison. \n\nDespite the fact that the chart did what I wanted, it is very difficult to read due to how many manufacturers there are. This is where the `facet_wrap()` feature comes in handy.\n\n```{r}\ntheme_set(theme_bw())\n\ng = ggplot(mpg,aes(x=factor(cyl),y=hwy,col=factor(cyl),fill=factor(cyl)))\ng + geom_bar(stat='identity', position='dodge') + \n facet_wrap(~manufacturer)\n```\nThis created a much nicer view of the information. It \"auto-magically\" split everything out by manufacturer!\n\n\n### Spatial Plots\n\nAnother nice feature of ggplot2 is the integration with maps and spatial plotting. In this simple example, I wanted to plot a few cities in Colorado and draw a border around them. 
Other than the addition of the map, ggplot simply places the dots directly on the locations via their longitude and latitude \"auto-magically.\"\n\nThis map is created with `ggmap` which utilizes the Google Maps API.\n\n```{r, warning=FALSE, message=FALSE}\nlibrary(ggmap)\nlibrary(ggalt)\n\nfoco <- geocode(\"Fort Collins, CO\") # get longitude and latitude\n\n# Get the Map ----------------------------------------------\ncolo_map <- qmap(\"Colorado, United States\",zoom = 7, source = \"google\") \n\n# Get Coordinates for Places ---------------------\ncolo_places <- c(\"Fort Collins, CO\",\n \"Denver, CO\",\n \"Grand Junction, CO\",\n \"Durango, CO\",\n \"Pueblo, CO\")\n\nplaces_loc <- geocode(colo_places) # get longitudes and latitudes\n\n\n# Plot Open Street Map -------------------------------------\ncolo_map + geom_point(aes(x=lon, y=lat),\n data = places_loc, \n alpha = 0.7, \n size = 7, \n color = \"tomato\") + \n geom_encircle(aes(x=lon, y=lat),\n data = places_loc, size = 2, color = \"blue\")\n```\n\n### Final Thoughts \n\nI hope you learned a lot about the basics of ggplot2 in this post. It's extremely powerful yet easy to use once you get the hang of it. The best way to really learn it is to try it out. Find some data on your own and try to manipulate it and get it plotted. Without a doubt, you will have all kinds of errors pop up, data you expect to be plotted won't show up, colors and fills will be different, etc. However, your visualizations will be leveled-up!\n\n### Coming soon: \n\n- Determining whether or not you need a visualization \n- Choosing the type of plot to use depending on the use case \n- Visualization beyond the standard charts and graphs \n\n\nI made some modifications to the code, but almost all of the examples here were from [Top 50 ggplot2 Visualizations - The Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). 
\n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)", 4 | "created" : 1489621938889.000, 5 | "dirty" : false, 6 | "encoding" : "UTF-8", 7 | "folds" : "", 8 | "hash" : "3000572754", 9 | "id" : "206A0EDA", 10 | "lastKnownWriteTime" : 1490212163, 11 | "last_content_update" : 1490212163, 12 | "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 2.Rmd", 13 | "project_path" : "Data Visualization - Part 2.Rmd", 14 | "properties" : { 15 | "last_setup_crc32" : "", 16 | "tempName" : "Untitled1" 17 | }, 18 | "relative_order" : 2, 19 | "source_on_save" : false, 20 | "source_window" : "", 21 | "type" : "r_markdown" 22 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/s-21D18981/3F887430: -------------------------------------------------------------------------------- 1 | { 2 | "collab_server" : "", 3 | "contents" : "library(knitr)\n# Set figure dimensions\n#opts_chunk$set(fig.width=5, fig.height=5)\n# Set figures to upload to imgur.com\nopts_knit$set(upload.fun = imgur_upload, base.url = NULL)\n\nrmd.file <- \"Data Visualization - Part 2.Rmd\"\n# Knit the .Rmd file\nknit(rmd.file)\n# Set up input/ output files\nmarkdown.file <- gsub(pattern = \"Rmd$\", replacement = \"md\", x = rmd.file)\nhtml.file <- gsub(pattern = \"md$\", replacement = \"_pub.html\", x = markdown.file)\n\nlibrary(markdown)\n# Removes 'yaml' information\nmarkdownToHTML(file = markdown.file, output = html.file, fragment.only = TRUE)\n\nlibrary(RWordPress)\n# Set your WP username, password, and your site URL\noptions(WordpressLogin = c(stoltzmaniac = 'ejkDD$$ckckslppzzzekAABV'),\n WordpressURL = 'https://stoltzmaniac.com/xmlrpc.php')\n# Create a line-by-line text vector\ntext = paste(readLines(html.file), collapse = \"\\n\")\n# Send to Worpdress\nnewPost(list(description = text, title = \"Data Visualization - Part 2\"), publish = FALSE)\n", 4 | "created" : 
1489416648044.000, 5 | "dirty" : false, 6 | "encoding" : "UTF-8", 7 | "folds" : "", 8 | "hash" : "4101462966", 9 | "id" : "3F887430", 10 | "lastKnownWriteTime" : 1489621861, 11 | "last_content_update" : 1489621861, 12 | "path" : "~/Desktop/uploading to wp.R", 13 | "project_path" : null, 14 | "properties" : { 15 | "tempName" : "Untitled1" 16 | }, 17 | "relative_order" : 2, 18 | "source_on_save" : false, 19 | "source_window" : "", 20 | "type" : "r_source" 21 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/s-21D18981/D09CC3F1: -------------------------------------------------------------------------------- 1 | { 2 | "collab_server" : "", 3 | "contents" : "---\ntitle: \"Data Visualization - Part 3\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 28, 2017\"\noutput: html_document\n---\n\n```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE}\nlibrary(png)\nlibrary(grid)\n```\n\n### Do You Actually Need a Visualization?\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/title_photo_3.png\")\ngrid.raster(img)\n```\n\n#### Know what you are trying to achieve before starting\n\nLet's load up some libraries and get started.\n\n```{r libraryPrep, results='hide', warning=FALSE, message=FALSE}\nlibrary(ggplot2)\nlibrary(dplyr)\nlibrary(tidyr)\nlibrary(lubridate)\nlibrary(scales)\n```\n\n## Decide on what you're trying to accomplish first. 
\n\nAsk yourself the following questions to help drive your decision: \n\n- Are you making a comparison?\n- Are you finding a relationship?\n- Are you showing a distribution?\n- Are you finding a trend over time?\n- Are you showing composition?\n \nOnce you know which question you are asking, it will keep your mind focused on the outcome and will quickly narrow down your charting options.\n\n#### Rule of Thumb \n\n- **Trend:** Column, Line \n- **Comparison:** Area, Bar, Bullet, Column, Line, Scatter \n- **Relationship:** Line, Scatter \n- **Distribution:** Bar, Boxplot, Column \n- **Composition:** Donut, Pie, Stacked Bar, Stacked Column \n \nObviously, there are choices beyond these and you need to make your choice wisely. \n\nSide Note: I ***hate*** donut and pie charts! When used properly, they're terrific! However, I'm very used to gagging every time one appears on a projector screen due to how frequently they're used inappropriately.\n\nFor this project, I'll use some oil production data that I found while digging through http://data.world (pretty great site). The data can be found [here](http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm) \n\n\n```{r dataLoading, results='hide', warning=FALSE, message=FALSE}\n#Custom data preparation\n#GitHub (linked to at bottom of this post)\nsource('data_preparation.R')\ndata = getData()\n```\n\n```{r}\nhead(data)\n```\n\n---- \n\n## Trend - Line Chart\n\n**Objective:** Visualize a trend in oil production in the US from 1981 - 2016 by year. I want to illustrate the changes over the time period. This is a very high-level view and only shows us a decline followed by a ramp up at the end of the period. This is ***exploratory visualization.***\n\nI decided to use a line chart to show the trend over time. With discrete data you would normally use a column chart, to avoid implying that the values between these years changed linearly. 
However, it paints a much clearer picture this way and is not misleading.\n\n#### Poor Version \nThe x-axis is a disaster and the y-axis isn't formatted well. While it gets the point across, it's still worthless.\n\n\n```{r,fig.align='center', fig.width=4}\ndf = data %>% \n group_by(Year) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) \np + geom_line(stat='identity') + \n ggtitle('Oil Production Over Time') + \n theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + \n xlab('') + ylab('')\n```\n\n#### Better Version \nThe title gives us a much better understanding of what we're looking at. The chart is slightly wider and the axes are formatted to be legible.\n\n```{r,fig.align='center', fig.width=12}\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) \np + geom_line(stat='identity') + \n ggtitle('Thousand Barrel Oil Production By Year in the U.S.') +\n theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1)) + \n scale_y_continuous(labels = comma)\n```\n\n\n----\n\n## Comparison - Line Chart \n\n**Objective**: Identify which states affected the trend the most. Evaluate them simultaneously in order to paint the picture and compare their trends over the time period. From this visual you can see the top states are Alaska, California, Louisiana, Oklahoma, Texas and Wyoming. Texas seems to break the mold quite drastically and drove the spike which occurred after 2010. This is another example of ***exploratory visualization.***\n\n#### Poor Version \nThere are far too many colors going on here. Everything at the bottom of the chart is relatively useless and takes our focus away from the big players. 
\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n group_by(Location, Year) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\ndf$Year = as.numeric(df$Year)\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location))\np + geom_line(stat='identity') + \n ggtitle(paste('Oil Production By Year By State in the U.S.')) + \n theme(plot.title = element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n#### Better Version \nThis focuses attention on the top producing states. It compares them to each other and shows the trend per state as well.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\nn=6 #Arbitrary at first, after trying a few, this made the most sense\ntopN = data %>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n arrange(-ThousandBarrel) %>%\n top_n(n)\n\ndf = data %>%\n filter(Location %in% topN$Location) %>%\n group_by(Year,Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\ndf$Year = as.numeric(df$Year)\ndf$Location = as.factor(df$Location)\n\np = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1))\np + geom_line(stat='identity') + \n ggtitle(paste('Top',as.character(n),'States - Oil Production By Year in the U.S.')) + \n theme(plot.title = element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1)) + \n facet_wrap(~Location) + \n scale_y_continuous(labels = comma) \n\n```\n\n----\n\n## Relationship - Scatter Plot\n\n**Objective**: See if data from Alaska and California is correlated (This probably isn't important but it allows us to use the same data).\n\n### Which of these views would you rather see?\n\n#### Poor Version \nLots of completely irrelevant data! Size of the point should have nothing to do with the year. 
\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\nstatesList = c('Alaska','California')\ndf = data %>%\n filter(Location %in% statesList) %>%\n spread(Location,ThousandBarrel) %>%\n select(Alaska,California,Month,Year)\n\np = ggplot(df,aes(x=Alaska,y=California,col=Month,size=Year))\np + geom_point() + \n scale_y_continuous(labels = comma) +\n scale_x_continuous(labels = comma) +\n ggtitle('Oil Production - CA vs. AK') + \n theme(plot.title = element_text(hjust = 0.5))\n\n```\n\n#### Better Version \nThe trend line is nice because it helps to visualize the relationship even more. While it can sometimes be misleading, it makes sense with our current data. \n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n filter(Location %in% statesList) %>%\n spread(Location,ThousandBarrel) %>%\n select(Alaska,California,Year)\n\np = ggplot(df,aes(x=Alaska,y=California))\np + geom_point() + \n scale_y_continuous(labels = comma) +\n scale_x_continuous(labels = comma) +\n ggtitle('Monthly Thousand Barrel Oil Production 1981-2016 CA vs. AK') + \n theme(plot.title = element_text(hjust = 0.5)) + \n geom_smooth(method='lm')\n\n```\n\n\n\n\n## Distribution - Boxplot \n\n**Objective**: Examine the range of production by state and year over the time period to give us an idea of the variance.\n\n### Which of these views would you rather see?\n\n#### Poor Version \n\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n group_by(Year,Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Location,y=ThousandBarrel))\np + geom_boxplot() + \n ggtitle('Distribution of Oil Production by State')\n\n```\n\n\n#### Better Version \nThis gives a nice ranking to the plot while still showing their distributions. While it was semi-apparent in the line charts, the variance of Texas is huge compared to the others! 
We could take this a step further and separate out the big players from the smaller players.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\np = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel))\np + geom_boxplot() + \n scale_y_continuous(labels = comma) +\n ggtitle('Distribution of Annual Oil Production By State (1981 - 2016)') + \n coord_flip()\n```\n\n\n## Composition - Stacked Bar \n\n**Objective**: Check out the composition of total production by state.\n\n### Which of these views would you rather see?\n\n#### Poor Version \nMy favorite, the beautiful pie chart! There's nothing better than this...\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndf = data %>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))\n\ndf$ThousandBarrel = round(100*df$ThousandBarrel,0)\n\nlibrary(plotrix)\npie(x=df$ThousandBarrel,labels=df$Location,explode=0.1,col=rainbow(nrow(df)),main='Percentage of Oil Production by State')\n\n```\n\n\n#### Better Version \nThe 1980's and 2010's will be missing years in terms of a \"decade\" due to the data provided (and it's only 2017). While the percentage labels are slightly off center, it's certainly much better than the pie chart. It's not quite \"apples-to-apples\" for a comparison because I created different decades, but you get the idea.\n\nI also created an \"Other\" category in order to simplify the output. 
When you are doing comparisons, it's typically a good idea to find a way to reduce the number of variables in the output while not removing data by dropping it completely.\n\n```{r,warning=FALSE,fig.width=10,message=FALSE}\ndata$Decade = '1980s'\ndata$Decade[data$Year >= 1990] = '1990s'\ndata$Decade[data$Year >= 2000] = '2000s'\ndata$Decade[data$Year >= 2010] = '2010s'\ndata$Decade = as.factor(data$Decade)\n\ntop5 = data %>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n arrange(-ThousandBarrel) %>%\n top_n(5) %>%\n select(Location)\n\ntop5List = top5$Location\n\ndata$State = \"Other\"\n\nfor(i in 1:length(top5List)){\n data$State[data$Location == top5List[i]] = top5List[i]\n}\n\ndf = data %>%\n group_by(Decade,State) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel))\n\ndf$ThousandBarrel = round(df$ThousandBarrel,3)\ndf$text = paste(round(100*df$ThousandBarrel,0),'%', sep='')\n\np = ggplot(df,aes(x=Decade,y=ThousandBarrel,col=reorder(State,ThousandBarrel),fill=reorder(State,ThousandBarrel)))\np + geom_bar(stat='identity') + \n geom_text(aes(label=text),col='Black',size = 4, hjust = 0.5, vjust = 3, position = \"stack\") + \n scale_y_continuous(labels = percent) +\n ggtitle('Percentage of Top Oil Producing States by Decade') + \n guides(fill=guide_legend(title='State'),col=guide_legend(title='State')) + \n theme(plot.title = element_text(hjust = 0.5))\n\n```\n\n\n\n\n### Some other fun concepts are below! \nSome of them are nice, others are terrible! I won't comment on any of them, but I felt it was necessary to include some other ideas I toyed around with. \n\nHave fun with your data visualizations. The charts I showed here are extremely simple. Being creative and using tools other than R can wind up producing visuals people remember. There are plenty of examples around, but they all tend to follow basic principles of design. 
There are ***A TON*** of good books out there on this topic. \n\nNow it's your turn!\n\n\n```{r}\ndf = data %>% \n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n arrange(-ThousandBarrel)\np = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel))\np + geom_bar(stat='identity') + \n ggtitle('Oil Production 1981 - 2016 By Location') + \n theme(plot.title = element_text(hjust = 0.5)) + \n coord_flip()\n```\n\n\n\n\n\n```{r}\ntop10 = data %>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) %>%\n arrange(-ThousandBarrel) %>%\n top_n(10)\nprint(top10)\n\ndf = data %>% \n group_by(Location,Year) %>%\n filter(Location %in% top10$Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel)) \np = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location,fill=Location))\np + geom_bar(stat='identity') + \n ggtitle('Oil Production - Top 10 States') + \n theme(plot.title = element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n\n\n```{r, fig.width=10}\ndf = data %>%\n filter(Year == 1990)%>%\n group_by(Location) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\ndf$Location = tolower(df$Location)\n\n#Add States without data\nStates = data.frame(Location = tolower(as.character(state.name)))\nmissingStates = States$Location[!(States$Location %in% df$Location)]\nappendData = data.frame(Location=missingStates,ThousandBarrel=0)\ndf = rbind(df,appendData)\n\nstates_map <- map_data(\"state\")\n\nggplot(df, aes(map_id = Location)) + \n geom_map(aes(fill=ThousandBarrel), map = states_map) +\n expand_limits(x = states_map$long, y = states_map$lat)\n\n```\n\n\n```{r}\ndf = data %>% \n filter(Location == 'Texas') %>%\n group_by(Year,Month) %>%\n summarise(ThousandBarrel = sum(ThousandBarrel))\n\np = ggplot(df,aes(x=Month,y=ThousandBarrel))\np + geom_line(stat='identity',aes(group=Year,col=Year)) + \n ggtitle('Oil Production By Year in the U.S.') + \n theme(plot.title = 
element_text(hjust = 0.5)) + \n theme(axis.text.x = element_text(angle = 90, hjust = 1))\n```\n\n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)", 4 | "created" : 1490662963331.000, 5 | "dirty" : false, 6 | "encoding" : "UTF-8", 7 | "folds" : "", 8 | "hash" : "289843358", 9 | "id" : "D09CC3F1", 10 | "lastKnownWriteTime" : 1490664979, 11 | "last_content_update" : 1490664979460, 12 | "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 3.Rmd", 13 | "project_path" : "Data Visualization - Part 3.Rmd", 14 | "properties" : { 15 | "last_setup_crc32" : "BEDB844B56df664a", 16 | "tempName" : "Untitled1" 17 | }, 18 | "relative_order" : 4, 19 | "source_on_save" : false, 20 | "source_window" : "", 21 | "type" : "r_markdown" 22 | } -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/s-21D18981/EB01FDAC: -------------------------------------------------------------------------------- 1 | { 2 | "collab_server" : "", 3 | "contents" : "---\ntitle: \"Data Visualization - Part 1\"\nauthor: \"Scott Stoltzman\"\ndate: \"March 14, 2017\"\noutput:\n pdf_document: default\n html_document: default\nsubtitle: The Good, The Bad and The Ugly\n---\n\n```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE}\nlibrary(png)\nlibrary(grid)\n```\n---- \n\n# Introduction to Data Visualization\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/title_photo.png\")\ngrid.raster(img)\n```\n\nThe topic of data visualization is very popular in the data science community. 
The market size for visualization products is valued at $4 Billion and is projected to reach \n$7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate. \n\nIn short, are you adding value to your work or are you simply adding this to make it seem ***less boring?*** Let's take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy.\n\n---- \n\nI have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post.\n\n### One author at Vox wanted to show the cause of death in all of Shakespeare\n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/bad-pie1.png\")\ngrid.raster(img)\n```\n \n\n**Is this not insane!?!?!** \n\nUsing a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. While wordles are not ideal for any work requiring exact proportions, it does make for a great visual in this article. [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html).\n \n\n```{r, fig.align='center',echo=FALSE}\nimg = readPNG(\"images/bad-pie1-fix.png\")\ngrid.raster(img)\n```\n---- \n\nTo be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! 
So, I'll walk through my theory and not worry too much about aesthetics (save that for a time when you're getting paid).\n\n----\n\n### The Good, The Bad, The Ugly \n\n**\"The Good\" visualizations:** \n\n- Clearly illustrate a point \n- Are tailored to the appropriate audience \n - Analysts may want detail \n - Executives may want a high-level view \n- Are tailored to the presentation medium \n - A piece in an academic journal can be analyzed slowly and carefully \n - A slide in front of 5,000 people in a conference will be glanced at quickly \n- Are memorable to those who care about the material \n- Make an impact which increases the understanding of the subject matter \n\n**\"The Bad\" visualizations:** \n\n- Are difficult to interpret \n- Are unintentionally misleading \n- Contain redundant and boring information \n\n**\"The Ugly\" visualizations:** \n\n- Are almost impossible to interpret \n- Are filled with completely worthless information \n- Are intentionally created to mislead the audience \n- Are inaccurate \n\n### Coming soon: \n\n- Determining whether or not you need a visualization \n- Choosing the type of plot to use depending on the use case \n- Introduction to the ggplot2 in R and how it works \n- Visualization beyond the standard charts and graphs \n\nAs always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson)", 4 | "created" : 1489685598305.000, 5 | "dirty" : false, 6 | "encoding" : "UTF-8", 7 | "folds" : "", 8 | "hash" : "1337660815", 9 | "id" : "EB01FDAC", 10 | "lastKnownWriteTime" : 1489685647, 11 | "last_content_update" : 1489685647034, 12 | "path" : "~/Documents/GitHub/Data-Visualization-Lesson/Data Visualization - Part 1.Rmd", 13 | "project_path" : "Data Visualization - Part 1.Rmd", 14 | "properties" : { 15 | "last_setup_crc32" : "", 16 | "tempName" : "Untitled1" 17 | }, 18 | "relative_order" : 3, 19 | "source_on_save" : false, 20 | "source_window" : "", 21 | "type" : "r_markdown" 22 | } 
-------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/sdb/s-21D18981/lock_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/.Rproj.user/3B0EFA5/sdb/s-21D18981/lock_file -------------------------------------------------------------------------------- /.Rproj.user/3B0EFA5/session-persistent-state: -------------------------------------------------------------------------------- 1 | virtual-session-id="BEDB844B" 2 | -------------------------------------------------------------------------------- /.Rproj.user/shared/notebooks/paths: -------------------------------------------------------------------------------- 1 | /Users/stoltzmanconsulting/Documents/Git-Repositories/GitHub/Data-Science-Certificate/Data-Visualization-Lesson/Data Visualization - Part 3.Rmd="95580727" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /Data Visualization - Part 1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Visualization - Part 1" 3 | author: "Scott Stoltzman" 4 | date: "March 14, 2017" 5 | output: 6 | pdf_document: default 7 | html_document: default 8 | subtitle: The Good, The Bad and The Ugly 9 | --- 10 | 11 | ```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE} 12 | library(png) 13 | library(grid) 14 | ``` 15 | ---- 16 | 17 | # Introduction to Data Visualization 18 | 19 | ```{r, fig.align='center',echo=FALSE} 20 | img = readPNG("images/title_photo.png") 21 | grid.raster(img) 22 | ``` 23 | 24 | The topic of data visualization is very popular in 
the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach 25 | $7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate. 26 | 27 | In short, are you adding value to your work or are you simply adding this to make it seem ***less boring?*** Let's take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy. 28 | 29 | ---- 30 | 31 | I have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post. 32 | 33 | ### One author at Vox wanted to show the cause of death in all of Shakespeare 34 | 35 | ```{r, fig.align='center',echo=FALSE} 36 | img = readPNG("images/bad-pie1.png") 37 | grid.raster(img) 38 | ``` 39 | 40 | 41 | **Is this not insane!?!?!** 42 | 43 | Using a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. While wordles are not ideal for any work requiring exact proportions, one does make for a great visual in this article. [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html). 44 | 45 | 46 | ```{r, fig.align='center',echo=FALSE} 47 | img = readPNG("images/bad-pie1-fix.png") 48 | grid.raster(img) 49 | ``` 50 | ---- 51 | 52 | To be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. 
tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, I'll walk through my theory and not worry too much about aesthetics (save that for a time when you're getting paid). 53 | 54 | ---- 55 | 56 | ### The Good, The Bad, The Ugly 57 | 58 | **"The Good" visualizations:** 59 | 60 | - Clearly illustrate a point 61 | - Are tailored to the appropriate audience 62 | - Analysts may want detail 63 | - Executives may want a high-level view 64 | - Are tailored to the presentation medium 65 | - A piece in an academic journal can be analyzed slowly and carefully 66 | - A slide in front of 5,000 people at a conference will be glanced at quickly 67 | - Are memorable to those who care about the material 68 | - Make an impact which increases the understanding of the subject matter 69 | 70 | **"The Bad" visualizations:** 71 | 72 | - Are difficult to interpret 73 | - Are unintentionally misleading 74 | - Contain redundant and boring information 75 | 76 | **"The Ugly" visualizations:** 77 | 78 | - Are almost impossible to interpret 79 | - Are filled with completely worthless information 80 | - Are intentionally created to mislead the audience 81 | - Are inaccurate 82 | 83 | ### Coming soon: 84 | 85 | - Determining whether or not you need a visualization 86 | - Choosing the type of plot to use depending on the use case 87 | - Introduction to ggplot2 in R and how it works 88 | - Visualization beyond the standard charts and graphs 89 | 90 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson) -------------------------------------------------------------------------------- /Data Visualization - Part 1._pub.html: -------------------------------------------------------------------------------- 1 |
2 | 3 |

Introduction to Data Visualization

4 | 5 |

plot of chunk unnamed-chunk-1

6 | 7 |

The topic of data visualization is very popular in the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach 8 | $7 Billion by the end of 2022 according to Mordor Intelligence. While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate.

9 | 10 |

In short, are you adding value to your work or are you simply adding this to make it seem less boring? Let's take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy.

11 | 12 |
13 | 14 |

I have to give credit to Junk Charts - it inspired a lot of this post.

15 | 16 |

One author at Vox wanted to show the cause of death in all of Shakespeare

17 | 18 |

plot of chunk unnamed-chunk-2

19 | 20 |

Is this not insane!?!?!

21 | 22 |

Using a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. While wordles are not ideal for any work requiring exact proportions, one does make for a great visual in this article. Junk Charts Article.

23 | 24 |

plot of chunk unnamed-chunk-3

25 | 26 |

To be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, I'll walk through my theory and not worry too much about aesthetics (save that for a time when you're getting paid).

27 | 28 |
29 | 30 |

The Good, The Bad, The Ugly

31 | 32 |

“The Good” visualizations:

33 | 34 | 51 | 52 |

“The Bad” visualizations:

53 | 54 | 59 | 60 |

“The Ugly” visualizations:

61 | 62 | 68 | 69 |

Coming soon:

70 | 71 | 77 | 78 |

As always, the code used in this post is on my GitHub

79 | 80 | -------------------------------------------------------------------------------- /Data Visualization - Part 1.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Visualization - Part 1" 3 | subtitle: "The Good, The Bad and The Ugly" 4 | author: "Scott Stoltzman" 5 | date: "March 14, 2017" 6 | output: html_document 7 | --- 8 | 9 | 10 | ---- 11 | 12 | # Introduction to Data Visualization 13 | 14 | plot of chunk unnamed-chunk-1 15 | 16 | The topic of data visualization is very popular in the data science community. The market size for visualization products is valued at $4 Billion and is projected to reach 17 | $7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate. 18 | 19 | In short, are you adding value to your work or are you simply adding this to make it seem ***less boring?*** Let's take a look at some examples before going through the Stoltzmaniac Data Visualization Philosophy. 20 | 21 | ---- 22 | 23 | I have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post. 24 | 25 | ### One author at Vox wanted to show the cause of death in all of Shakespeare 26 | 27 | plot of chunk unnamed-chunk-2 28 | 29 | 30 | **Is this not insane!?!?!** 31 | 32 | Using a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. While wordles are not ideal for any work requiring exact proportions, it does make for a great visual in this article. 
[Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html). 33 | 34 | 35 | plot of chunk unnamed-chunk-3 36 | ---- 37 | 38 | To be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, I'll walk through my theory and not worry too much about aesthetics (save that for a time when you're getting paid). 39 | 40 | ---- 41 | 42 | ### The Good, The Bad, The Ugly 43 | 44 | **"The Good" visualizations:** 45 | 46 | - Clearly illustrate a point 47 | - Are tailored to the appropriate audience 48 | - Analysts may want detail 49 | - Executives may want a high-level view 50 | - Are tailored to the presentation medium 51 | - A piece in an academic journal can be analyzed slowly and carefully 52 | - A slide in front of 5,000 people in a conference will be glanced at quickly 53 | - Are memorable to those who care about the material 54 | - Make an impact which increases the understanding of the subject matter 55 | 56 | **"The Bad" visualizations:** 57 | 58 | - Are difficult to interpret 59 | - Are unintentionally misleading 60 | - Contain redundant and boring information 61 | 62 | **"The Ugly" visualizations:** 63 | 64 | - Are almost impossible to interpret 65 | - Are filled with completely worthless information 66 | - Are intentionally created to mislead the audience 67 | - Are inaccurate 68 | 69 | ### Coming soon: 70 | 71 | - Determining whether or not you need a visualization 72 | - Choosing the type of plot to use depending on the use case 73 | - Introduction to the ggplot2 in R and how it works 74 | - Visualization beyond the standard charts and graphs 75 | 76 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson) 77 
| -------------------------------------------------------------------------------- /Data Visualization - Part 2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Visualization - Part 2" 3 | author: "Scott Stoltzman" 4 | date: "March 14, 2017" 5 | output: 6 | html_document: default 7 | subtitle: The Good, The Bad and The Ugly 8 | --- 9 | 10 | ---- 11 | 12 | # Data Visualization - Part 2 13 | 14 | ## A Quick Overview of the ggplot2 Package in R 15 | 16 | While it will be important to focus on theory, I want to explain the ggplot2 package because I will be using it throughout the rest of this series. Knowing how it works will keep the focus on the results rather than the code. It's an incredibly powerful package and once you wrap your head around what it's doing, your life will change for the better! There are a lot of tools out there which provide better charts, graphs and ease of use (i.e. plot.ly, d3.js, Qlik, Tableau), but ggplot2 is still a fantastic resource and I use it all of the time. 17 | 18 | In case you missed it, here's a link to [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1/) 19 | 20 | ```{r, fig.align='center',echo=FALSE} 21 | library(png) 22 | library(grid) 23 | img = readPNG("images/title_photo_2.png") 24 | grid.raster(img) 25 | ``` 26 | 27 | 28 | ### Why would you use ggplot2? 29 | 1. More robust plotting than the base plot package 30 | 2. Better control over aesthetics - colors, axes, background, etc. 31 | 3. Layering 32 | 4. Variable Mapping (aes) 33 | 5. Automatic aggregation of data 34 | 6. Built in formulas & plotting (geom_smooth) 35 | 7. The list goes on and on... 36 | 37 | Basically, ggplot2 allows for a lot more customization of plots with a lot less code (the rest of it is behind the scenes). Once you are used to the syntax, there's no going back. It's faster and easier. 38 | 39 | ### Why wouldn't you use ggplot2? 40 | 1. 
A bit of a learning curve 41 | 2. Lack of user interactivity with the plots 42 | 43 | Fundamentally, ggplot2 gives the user the ability to start a plot and layer everything in. There are many ways to accomplish the same thing, so figure out what makes sense for you and stick to it. 44 | 45 | **A Basic Example: Unemployment Over Time** 46 | 47 | ```{r,results='hide', warning=FALSE, message=FALSE} 48 | library(dplyr) 49 | library(ggplot2) 50 | 51 | # Load the economics data from ggplot2 52 | data(economics,package='ggplot2') 53 | ``` 54 | 55 | ```{r} 56 | # Take a look at the format of the data 57 | head(economics) 58 | ``` 59 | 60 | 61 | ```{r, fig.height = 4} 62 | # Create the plot 63 | ggplot(data = economics) + geom_line(aes(x = date, y = unemploy)) 64 | ``` 65 | 66 | 67 | 68 | ### What happened to get that? 69 | 70 | - `ggplot(economics)` loaded the data frame 71 | - `+` tells ggplot() that there is more to be added to the plot 72 | - `geom_line()` defined the type of plot 73 | - `aes(x = date, y = unemploy)` mapped the variables 74 | 75 | The `aes()` portion is what typically throws new users off but is my favorite feature of ggplot2. In simple terms, this is what "auto-magically" brings your plot to life. You are telling ggplot2, "I want 'date' to be on the x-axis and 'unemploy' to be on the y-axis." It's pretty straightforward in this case but there are more complex use cases as well. 76 | 77 | ***Side Note:*** you could have achieved the same result by mapping the variables in the ggplot() function rather than in geom_line(): 78 | `ggplot(data = economics, aes(x = date, y = unemploy)) + geom_line()` 79 | 80 | ### Here's the basic formula for success: 81 | 82 | - Everything in ggplot2 starts with `ggplot(data)` and utilizes `+` to add on every element thereafter 83 | - Include your data frame (economics) in a ggplot function: `ggplot(data = economics)` 84 | - Input the type of plot you would like (i.e. 
line chart of unemployment over time): `+ geom_line(aes(x = date, y = unemploy))` 85 | - "geom" stands for "geometric object" and determines the type of object (there can be more than one type per plot) 86 | - There are ***a lot*** of types of geometric objects - check them out [here](http://docs.ggplot2.org/current/) 87 | - Add in layers and utilize `fill` and `col` parameters within `aes()` 88 | 89 | 90 | I'll go through some of the examples from the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). I will be using their examples but I will also explain what's going on. 91 | 92 | **Note:** I believe the intention of the author of the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html) was to illustrate how to use ggplot2 rather than doing a full demonstration of what important data visualization techniques are - so keep that in mind as I go through these examples. Some of the visuals do not line up with my best practices addressed in my [first post on data visualization](https://www.stoltzmaniac.com/data-visualization-part-1/). 93 | 94 | 95 | As usual, some packages must be loaded. 96 | 97 | ```{r, results='hide', warning=FALSE, message=FALSE} 98 | library(reshape2) 99 | library(lubridate) 100 | library(dplyr) 101 | library(tidyr) 102 | library(ggplot2) 103 | library(scales) 104 | library(gridExtra) 105 | ``` 106 | 107 | ### The Scatterplot 108 | 109 | This is one of the most visually powerful tools for data analysis. However, you have to be careful when using it because it's primarily used by people doing analysis and not reporting (depending on what industry you're in). 110 | 111 | The author of this chart was looking for a correlation between area and population.
112 | 113 | ```{r} 114 | # Use the "midwest"" data from ggplot2 115 | data("midwest", package = "ggplot2") 116 | 117 | head(midwest) 118 | ``` 119 | 120 | #### Here's the most basic version of the scatter plot 121 | 122 | This can be created with `geom_point()` in ggplot2 123 | 124 | ```{r, warning=FALSE, fig.align='center',fig.height = 4} 125 | # Scatterplot 126 | ggplot(data = midwest, aes(x = area, y = poptotal)) + geom_point() #ggplot 127 | ``` 128 | 129 | #### Here's a version with some additional features 130 | 131 | While mapping the size and color of the points doesn't add value here, it does show the level of customization that's possible with ggplot2. 132 | 133 | ```{r, warning=FALSE,message=FALSE,fig.height = 4} 134 | ggplot(data = midwest, aes(x = area, y = poptotal)) + 135 | geom_point(aes(col=state, size=popdensity)) + 136 | geom_smooth(method="loess", se=F) + 137 | xlim(c(0, 0.1)) + 138 | ylim(c(0, 500000)) + 139 | labs(subtitle="Area Vs Population", 140 | y="Population", 141 | x="Area", 142 | title="Scatterplot", 143 | caption = "Source: midwest") 144 | ``` 145 | 146 | #### Explanation: 147 | 148 | `ggplot(data = midwest, aes(x = area, y = poptotal)) + ` 149 | Inputs the data and maps x and y variables as area and poptotal. 150 | 151 | `geom_point(aes(col=state, size=popdensity)) + ` 152 | Creates a scatterplot and maps the color and size of points to state and popdensity. 153 | 154 | ` geom_smooth(method="loess", se=F) + ` 155 | Creates a smoothing curve to fit the data. `method` is the type of fit and `se` determines whether or not to show the confidence interval band around the curve. 156 | 157 | ` xlim(c(0, 0.1)) + ` 158 | Sets the x-axis limits. 159 | 160 | ` ylim(c(0, 500000)) + ` 161 | Sets the y-axis limits. 162 | 163 | `labs(subtitle="Area Vs Population",` 164 | 165 | ` y="Population",` 166 | 167 | ` x="Area",` 168 | 169 | ` title="Scatterplot",` 170 | 171 | ` caption = "Source: midwest")` 172 | Changes the labels of the subtitle, y-axis, x-axis, title and caption.
173 | 174 | Notice that the legend was automatically created and placed on the right-hand side. This is also highly customizable and can be changed easily. 175 | 176 | 177 | ### The Density Plot 178 | 179 | Density plots are a great way to see how data is distributed. They are similar to histograms in a sense, but show a smoothed estimate of the distribution, scaled so that the area under each curve totals 1, rather than raw counts. In this example, the author used the mpg data set to compare the distributions of City Mileage based on the number of cylinders the car has. 180 | 181 | ```{r} 182 | # Examine the mpg data set 183 | head(mpg) 184 | ``` 185 | 186 | #### Sample Density Plot 187 | 188 | ```{r,fig.height = 4} 189 | g = ggplot(mpg, aes(cty)) 190 | g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + 191 | labs(title="Density plot", 192 | subtitle="City Mileage Grouped by Number of cylinders", 193 | caption="Source: mpg", 194 | x="City Mileage", 195 | fill="# Cylinders") 196 | 197 | ``` 198 | 199 | You'll notice one immediate difference here. The author decided to create the object `g` to equal `ggplot(mpg, aes(cty))` - this is a nice trick and will save you some time if you plan on keeping `ggplot(mpg, aes(cty))` as the fundamental plot and simply exploring other visualizations on top of it. It is also handy if you need to save the output of a chart to an image file. 200 | 201 | `ggplot(mpg, aes(cty))` loads the mpg data and `aes(cty)` assumes `aes(x = cty)` 202 | 203 | `g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + ` 204 | `geom_density` kicks off a density plot and the mapping of `cyl` is used for colors. `alpha` is the transparency/opacity of the area under the curve. 205 | 206 | ` labs(title="Density plot",` 207 | 208 | ` subtitle="City Mileage Grouped by Number of cylinders",` 209 | 210 | ` caption="Source: mpg",` 211 | 212 | ` x="City Mileage",` 213 | 214 | ` fill="# Cylinders")` 215 | Labeling is cleaned up at the end.
216 | 217 | 218 | #### How would you use your new knowledge to see the density by class instead of by number of cylinders? 219 | 220 | ***Hint:*** `g = ggplot(mpg, aes(cty))` has already been established. 221 | 222 | ```{r,fig.height = 4} 223 | g + geom_density(aes(fill=factor(class)), alpha=0.8) + 224 | labs(title="Density plot", 225 | subtitle="City Mileage Grouped by Class", 226 | caption="Source: mpg", 227 | x="City Mileage", 228 | fill="Class") 229 | ``` 230 | Notice how I didn't have to write out `ggplot()` again because it was already stored in the object `g`. 231 | 232 | ### The Histogram 233 | 234 | How could we show the city mileage in a histogram? 235 | 236 | ```{r,fig.height = 4} 237 | g = ggplot(mpg,aes(cty)) 238 | g + geom_histogram(bins=20) + 239 | labs(title="Histogram", 240 | caption="Source: mpg", 241 | x="City Mileage") 242 | ``` 243 | 244 | `geom_histogram(bins=20)` plots the histogram. If `bins` isn't set, ggplot2 defaults to 30 bins and prints a message suggesting you pick a better value. 245 | 246 | 247 | ### The Bar/Column Chart 248 | 249 | For all intents and purposes, bar and column charts are essentially the same. Technically, the term "column chart" can be used when the bars run vertically. The author of this chart was simply looking at the frequency of the vehicles listed in the data set. 250 | 251 | ```{r} 252 | #Data Preparation 253 | freqtable <- table(mpg$manufacturer) 254 | df <- as.data.frame.table(freqtable) 255 | head(df) 256 | ``` 257 | 258 | 259 | ```{r,fig.height = 4} 260 | #Set a theme 261 | theme_set(theme_classic()) 262 | 263 | g <- ggplot(df, aes(Var1, Freq)) 264 | g + geom_bar(stat="identity", width = 0.5, fill="tomato2") + 265 | labs(title="Bar Chart", 266 | subtitle="Manufacturer of vehicles", 267 | caption="Source: Frequency of Manufacturers from 'mpg' dataset") + 268 | theme(axis.text.x = element_text(angle=65, vjust=0.6)) 269 | ``` 270 | 271 | The addition of `theme_set(theme_classic())` applies a preset theme to this chart and to every plot created after it.
You can create your own or select from a large list of themes. This can help set your work apart from others and save a lot of time. 272 | 273 | However, `theme_set()` is different from the `theme(axis.text.x = element_text(angle=65, vjust=0.6))` call used inside the plot itself: `theme_set()` changes the default theme for all plots, while `theme()` adjusts individual elements of this one plot. In this case, the author decided to tilt the text along the x-axis. `vjust=0.6` sets the vertical justification of the labels, which changes how far they are spaced away from the axis line. 274 | 275 | Within `geom_bar()` there is another new piece of information: `stat="identity"` which tells ggplot to use the actual value of `Freq`. 276 | 277 | You may also notice that ggplot arranged all of the data in alphabetical order by manufacturer. If you want to change the order, it's best to use the `reorder()` function. This next chart will order the manufacturers by `Freq` and use `coord_flip()` to orient the chart differently. 278 | 279 | ```{r,fig.height = 4} 280 | g <- ggplot(df, aes(reorder(Var1,Freq), Freq)) 281 | g + geom_bar(stat="identity", width = 0.5, fill="tomato2") + 282 | labs(title="Bar Chart", 283 | x = 'Manufacturer', 284 | subtitle="Manufacturer of vehicles", 285 | caption="Source: Frequency of Manufacturers from 'mpg' dataset") + 286 | theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 287 | coord_flip() 288 | ``` 289 | 290 | Let's continue with bar charts - what if we wanted to see what `hwy` looked like by `manufacturer` and in terms of `cyl`? 291 | 292 | ```{r,fig.height = 4} 293 | g = ggplot(mpg,aes(x=manufacturer,y=hwy,col=factor(cyl),fill=factor(cyl))) 294 | g + geom_bar(stat='identity', position='dodge') + 295 | theme(axis.text.x = element_text(angle=65, vjust=0.6)) 296 | ``` 297 | 298 | `position='dodge'` had to be used because the default setting is to stack the bars; `'dodge'` places them side by side for comparison. 299 | 300 | Despite the fact that the chart did what I wanted, it is very difficult to read due to how many manufacturers there are. This is where the `facet_wrap()` feature comes in handy.
301 | 302 | ```{r} 303 | theme_set(theme_bw()) 304 | 305 | g = ggplot(mpg,aes(x=factor(cyl),y=hwy,col=factor(cyl),fill=factor(cyl))) 306 | g + geom_bar(stat='identity', position='dodge') + 307 | facet_wrap(~manufacturer) 308 | ``` 309 | This created a much nicer view of the information. It "auto-magically" split everything out by manufacturer! 310 | 311 | 312 | ### Spatial Plots 313 | 314 | Another nice feature of ggplot2 is the integration with maps and spatial plotting. In this simple example, I wanted to plot a few cities in Colorado and draw a border around them. Other than the addition of the map, ggplot simply places the dots directly on the locations via their longitude and latitude "auto-magically." 315 | 316 | This map is created with `ggmap` which utilizes Google Maps API. 317 | 318 | ```{r, warning=FALSE, message=FALSE} 319 | library(ggmap) 320 | library(ggalt) 321 | 322 | foco <- geocode("Fort Collins, CO") # get longitude and latitude 323 | 324 | # Get the Map ---------------------------------------------- 325 | colo_map <- qmap("Colorado, United States",zoom = 7, source = "google") 326 | 327 | # Get Coordinates for Places --------------------- 328 | colo_places <- c("Fort Collins, CO", 329 | "Denver, CO", 330 | "Grand Junction, CO", 331 | "Durango, CO", 332 | "Pueblo, CO") 333 | 334 | places_loc <- geocode(colo_places) # get longitudes and latitudes 335 | 336 | 337 | # Plot Open Street Map ------------------------------------- 338 | colo_map + geom_point(aes(x=lon, y=lat), 339 | data = places_loc, 340 | alpha = 0.7, 341 | size = 7, 342 | color = "tomato") + 343 | geom_encircle(aes(x=lon, y=lat), 344 | data = places_loc, size = 2, color = "blue") 345 | ``` 346 | 347 | ### Final Thoughts 348 | 349 | I hope you learned a lot about the basics of ggplot2 in this. It's extremely powerful but yet easy to use once you get the hang of it. The best way to really learn it is to try it out. Find some data on your own and try to manipulate it and get it plotted. 
Without a doubt, you will have all kinds of errors pop up, data you expect to be plotted won't show up, colors and fills will be different, etc. However, your visualizations will be leveled-up! 350 | 351 | ### Coming soon: 352 | 353 | - Determining whether or not you need a visualization 354 | - Choosing the type of plot to use depending on the use case 355 | - Visualization beyond the standard charts and graphs 356 | 357 | 358 | I made some modifications to the code, but almost all of the examples here were from [Top 50 ggplot2 Visualizations - The Master List ](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). 359 | 360 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson) -------------------------------------------------------------------------------- /Data Visualization - Part 2.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Visualization - Part 2" 3 | author: "Scott Stoltzman" 4 | date: "March 14, 2017" 5 | output: 6 | html_document: default 7 | subtitle: The Good, The Bad and The Ugly 8 | --- 9 | 10 | ---- 11 | 12 | # Data Visualization - Part 2 13 | 14 | ## A Quick Overview of the ggplot2 Package in R 15 | 16 | While it will be important to focus on theory, I want to explain the ggplot2 package because I will be using it throughout the rest of this series. Knowing how it works will keep the focus on the results rather than the code. It's an incredibly powerful package and once you wrap your head around what it's doing, your life will change for the better! There are a lot of tools out there which provide better charts, graphs and ease of use (i.e. plot.ly, d3.js, Qlik, Tableau), but ggplot2 is still a fantastic resource and I use it all of the time. 
17 | 18 | In case you missed it, here's a link to [Data Visualization - Part 1](https://www.stoltzmaniac.com/data-visualization-part-1/) 19 | 20 | plot of chunk unnamed-chunk-1 21 | 22 | 23 | ### Why would you use ggplot2? 24 | 1. More robust plotting than the base plot package 25 | 2. Better control over aesthetics - colors, axes, background, etc. 26 | 3. Layering 27 | 4. Variable Mapping (aes) 28 | 5. Automatic aggregation of data 29 | 6. Built in formulas & plotting (geom_smooth) 30 | 7. The list goes on and on... 31 | 32 | Basically, ggplot2 allows for a lot more customization of plots with a lot less code (the rest of it is behind the scenes). Once you are used to the syntax, there's no going back. It's faster and easier. 33 | 34 | ### Why wouldn't you use ggplot2? 35 | 1. A bit of a learning curve 36 | 2. Lack of user interactivity with the plots 37 | 38 | Fundamentally, ggplot2 gives the user the ability to start a plot and layer everything in. There are many ways to accomplish the same thing, so figure out what makes sense for you and stick to it. 
39 | 40 | **A Basic Example: Unemployment Over Time** 41 | 42 | 43 | ```r 44 | library(dplyr) 45 | library(ggplot2) 46 | 47 | # Load the economics data from ggplot2 48 | data(economics,package='ggplot2') 49 | ``` 50 | 51 | 52 | ```r 53 | # Take a look at the format of the data 54 | head(economics) 55 | ``` 56 | 57 | ``` 58 | ## # A tibble: 6 × 6 59 | ## date pce pop psavert uempmed unemploy 60 | ## 61 | ## 1 1967-07-01 507.4 198712 12.5 4.5 2944 62 | ## 2 1967-08-01 510.5 198911 12.5 4.7 2945 63 | ## 3 1967-09-01 516.3 199113 11.7 4.6 2958 64 | ## 4 1967-10-01 512.9 199311 12.5 4.9 3143 65 | ## 5 1967-11-01 518.1 199498 12.5 4.7 3066 66 | ## 6 1967-12-01 525.8 199657 12.1 4.8 3018 67 | ``` 68 | 69 | 70 | 71 | ```r 72 | # Create the plot 73 | ggplot(data = economics) + geom_line(aes(x = date, y = unemploy)) 74 | ``` 75 | 76 | ![plot of chunk unnamed-chunk-4](http://i.imgur.com/BXzLJQ8.png) 77 | 78 | 79 | 80 | ### What happened to get that? 81 | 82 | - `ggplot(economics)` loaded the data frame 83 | - `+` tells ggplot() that there is more to be added to the plot 84 | - `geom_line()` defined the type of plot 85 | - `aes(x = date, y = unemploy)` mapped the variables 86 | 87 | The `aes()` portion is what typically throws new users off but is my favorite feature of ggplot2. In simple terms, this is what "auto-magically" brings your plot to life. You are telling ggplot2, "I want 'date' to be on the x-axis and 'unemploy' to be on the y-axis." It's pretty straightforward in this case but there are more complex use cases as well. 
88 | 89 | ***Side Note:*** you could have achieved the same result by mapping the variables in the ggplot() function rather than in geom_line(): 90 | `ggplot(data = economics, aes(x = date, y = unemploy)) + geom_line()` 91 | 92 | ### Here's the basic formula for success: 93 | 94 | - Everything in ggplot2 starts with `ggplot(data)` and utilizes `+` to add on every element thereafter 95 | - Include your data frame (economics) in a ggplot function: `ggplot(data = economics)` 96 | - Input the type of plot you would like (i.e. line chart of unemployment over time): `+ geom_line(aes(x = date, y = unemploy))` 97 | - "geom" stands for "geometric object" and determines the type of object (there can be more than one type per plot) 98 | - There are ***a lot*** of types of geometric objects - check them out [here](http://docs.ggplot2.org/current/) 99 | - Add in layers and utilize `fill` and `col` parameters within `aes()` 100 | 101 | 102 | I'll go through some of the examples from the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). I will be using their examples but I will also explain what's going on. 103 | 104 | **Note:** I believe the intention of the author of the [Top 50 ggplot2 Visualizations Master List](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html) was to illustrate how to use ggplot2 rather than doing a full demonstration of what important data visualization techniques are - so keep that in mind as I go through these examples. Some of the visuals do not line up with my best practices addressed in my [first post on data visualization](https://www.stoltzmaniac.com/data-visualization-part-1/). 105 | 106 | 107 | As usual, some packages must be loaded. 
108 | 109 | 110 | ```r 111 | library(reshape2) 112 | library(lubridate) 113 | library(dplyr) 114 | library(tidyr) 115 | library(ggplot2) 116 | library(scales) 117 | library(gridExtra) 118 | ``` 119 | 120 | ### The Scatterplot 121 | 122 | This is one of the most visually powerful tool for data analysis. However, you have to be careful when using it because it's primarily used by people doing analysis and not reporting (depending on what industry you're in). 123 | 124 | The author of this chart was looking for a correlation between area and population. 125 | 126 | 127 | ```r 128 | # Use the "midwest"" data from ggplot2 129 | data("midwest", package = "ggplot2") 130 | 131 | head(midwest) 132 | ``` 133 | 134 | ``` 135 | ## # A tibble: 6 × 28 136 | ## PID county state area poptotal popdensity popwhite popblack 137 | ## 138 | ## 1 561 ADAMS IL 0.052 66090 1270.9615 63917 1702 139 | ## 2 562 ALEXANDER IL 0.014 10626 759.0000 7054 3496 140 | ## 3 563 BOND IL 0.022 14991 681.4091 14477 429 141 | ## 4 564 BOONE IL 0.017 30806 1812.1176 29344 127 142 | ## 5 565 BROWN IL 0.018 5836 324.2222 5264 547 143 | ## 6 566 BUREAU IL 0.050 35688 713.7600 35157 50 144 | ## # ... 
with 20 more variables: popamerindian , popasian , 145 | ## # popother , percwhite , percblack , percamerindan , 146 | ## # percasian , percother , popadults , perchsd , 147 | ## # percollege , percprof , poppovertyknown , 148 | ## # percpovertyknown , percbelowpoverty , 149 | ## # percchildbelowpovert , percadultpoverty , 150 | ## # percelderlypoverty , inmetro , category 151 | ``` 152 | 153 | #### Here's the most basic version of the scatter plot 154 | 155 | This can be created with `geom_point()` in ggplot2 156 | 157 | 158 | ```r 159 | # Scatterplot 160 | ggplot(data = midwest, aes(x = area, y = poptotal)) + geom_point() #ggplot 161 | ``` 162 | 163 | plot of chunk unnamed-chunk-7 164 | 165 | #### Here's a version with some additional features 166 | 167 | While mapping the size and color of the points doesn't add value here, it does show the level of customization that's possible with ggplot2. 168 | 169 | 170 | ```r 171 | ggplot(data = midwest, aes(x = area, y = poptotal)) + 172 | geom_point(aes(col=state, size=popdensity)) + 173 | geom_smooth(method="loess", se=F) + 174 | xlim(c(0, 0.1)) + 175 | ylim(c(0, 500000)) + 176 | labs(subtitle="Area Vs Population", 177 | y="Population", 178 | x="Area", 179 | title="Scatterplot", 180 | caption = "Source: midwest") 181 | ``` 182 | 183 | ![plot of chunk unnamed-chunk-8](http://i.imgur.com/JACxp6k.png) 184 | 185 | #### Explanation: 186 | 187 | `ggplot(data = midwest, aes(x = area, y = poptotal)) + ` 188 | Inputs the data and maps x and y variables as area and poptotal. 189 | 190 | `geom_point(aes(col=state, size=popdensity)) + ` 191 | Creates a scatterplot and maps the color and size of points to state and popdensity. 192 | 193 | ` geom_smooth(method="loess", se=F) + ` 194 | Creates a smoothing curve to fit the data. `method` is the type of fit and `se` determines whether or not to show the confidence interval band around the curve. 195 | 196 | ` xlim(c(0, 0.1)) + ` 197 | Sets the x-axis limits. 198 | 199 | ` ylim(c(0, 500000)) + ` 200 | Sets the y-axis limits.
201 | 202 | `labs(subtitle="Area Vs Population",` 203 | 204 | ` y="Population",` 205 | 206 | ` x="Area",` 207 | 208 | ` title="Scatterplot",` 209 | 210 | ` caption = "Source: midwest")` 211 | Changes the labels of the subtitle, y-axis, x-axis, title and caption. 212 | 213 | Notice that the legend was automatically created and placed on the right-hand side. This is also highly customizable and can be changed easily. 214 | 215 | 216 | ### The Density Plot 217 | 218 | Density plots are a great way to see how data is distributed. They are similar to histograms in a sense, but show a smoothed estimate of the distribution, scaled so that the area under each curve totals 1, rather than raw counts. In this example, the author used the mpg data set to compare the distributions of City Mileage based on the number of cylinders the car has. 219 | 220 | 221 | ```r 222 | # Examine the mpg data set 223 | head(mpg) 224 | ``` 225 | 226 | ``` 227 | ## # A tibble: 6 × 11 228 | ## manufacturer model displ year cyl trans drv cty hwy fl 229 | ## 230 | ## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p 231 | ## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p 232 | ## 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p 233 | ## 4 audi a4 2.0 2008 4 auto(av) f 21 30 p 234 | ## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p 235 | ## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p 236 | ## # ... with 1 more variables: class 237 | ``` 238 | 239 | #### Sample Density Plot 240 | 241 | 242 | ```r 243 | g = ggplot(mpg, aes(cty)) 244 | g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + 245 | labs(title="Density plot", 246 | subtitle="City Mileage Grouped by Number of cylinders", 247 | caption="Source: mpg", 248 | x="City Mileage", 249 | fill="# Cylinders") 250 | ``` 251 | 252 | ![plot of chunk unnamed-chunk-10](http://i.imgur.com/k2injTT.png) 253 | 254 | You'll notice one immediate difference here.
The author decided to create the object `g` to equal `ggplot(mpg, aes(cty))` - this is a nice trick and will save you some time if you plan on keeping `ggplot(mpg, aes(cty))` as the fundamental plot and simply exploring other visualizations on top of it. It is also handy if you need to save the output of a chart to an image file. 255 | 256 | `ggplot(mpg, aes(cty))` loads the mpg data and `aes(cty)` assumes `aes(x = cty)` 257 | 258 | `g + geom_density(aes(fill=factor(cyl)), alpha=0.8) + ` 259 | `geom_density` kicks off a density plot and the mapping of `cyl` is used for colors. `alpha` is the transparency/opacity of the area under the curve. 260 | 261 | ` labs(title="Density plot",` 262 | 263 | ` subtitle="City Mileage Grouped by Number of cylinders",` 264 | 265 | ` caption="Source: mpg",` 266 | 267 | ` x="City Mileage",` 268 | 269 | ` fill="# Cylinders")` 270 | Labeling is cleaned up at the end. 271 | 272 | 273 | #### How would you use your new knowledge to see the density by class instead of by number of cylinders? 274 | 275 | ***Hint:*** `g = ggplot(mpg, aes(cty))` has already been established. 276 | 277 | 278 | ```r 279 | g + geom_density(aes(fill=factor(class)), alpha=0.8) + 280 | labs(title="Density plot", 281 | subtitle="City Mileage Grouped by Class", 282 | caption="Source: mpg", 283 | x="City Mileage", 284 | fill="Class") 285 | ``` 286 | 287 | ![plot of chunk unnamed-chunk-11](http://i.imgur.com/Kq7TY54.png) 288 | Notice how I didn't have to write out `ggplot()` again because it was already stored in the object `g`. 289 | 290 | ### The Histogram 291 | 292 | How could we show the city mileage in a histogram? 293 | 294 | 295 | ```r 296 | g = ggplot(mpg,aes(cty)) 297 | g + geom_histogram(bins=20) + 298 | labs(title="Histogram", 299 | caption="Source: mpg", 300 | x="City Mileage") 301 | ``` 302 | 303 | ![plot of chunk unnamed-chunk-12](http://i.imgur.com/rZVtc1G.png) 304 | 305 | `geom_histogram(bins=20)` plots the histogram.
If `bins` isn't set, ggplot2 defaults to 30 bins and prints a message suggesting you pick a better value. 306 | 307 | 308 | ### The Bar/Column Chart 309 | 310 | For all intents and purposes, bar and column charts are essentially the same. Technically, the term "column chart" can be used when the bars run vertically. The author of this chart was simply looking at the frequency of the vehicles listed in the data set. 311 | 312 | 313 | ```r 314 | #Data Preparation 315 | freqtable <- table(mpg$manufacturer) 316 | df <- as.data.frame.table(freqtable) 317 | head(df) 318 | ``` 319 | 320 | ``` 321 | ## Var1 Freq 322 | ## 1 audi 18 323 | ## 2 chevrolet 19 324 | ## 3 dodge 37 325 | ## 4 ford 25 326 | ## 5 honda 9 327 | ## 6 hyundai 14 328 | ``` 329 | 330 | 331 | 332 | ```r 333 | #Set a theme 334 | theme_set(theme_classic()) 335 | 336 | g <- ggplot(df, aes(Var1, Freq)) 337 | g + geom_bar(stat="identity", width = 0.5, fill="tomato2") + 338 | labs(title="Bar Chart", 339 | subtitle="Manufacturer of vehicles", 340 | caption="Source: Frequency of Manufacturers from 'mpg' dataset") + 341 | theme(axis.text.x = element_text(angle=65, vjust=0.6)) 342 | ``` 343 | 344 | ![plot of chunk unnamed-chunk-14](http://i.imgur.com/OtF2saP.png) 345 | 346 | The addition of `theme_set(theme_classic())` applies a preset theme to this chart and to every plot created after it. You can create your own or select from a large list of themes. This can help set your work apart from others and save a lot of time. 347 | 348 | However, `theme_set()` is different from the `theme(axis.text.x = element_text(angle=65, vjust=0.6))` call used inside the plot itself: `theme_set()` changes the default theme for all plots, while `theme()` adjusts individual elements of this one plot. In this case, the author decided to tilt the text along the x-axis. `vjust=0.6` sets the vertical justification of the labels, which changes how far they are spaced away from the axis line. 349 | 350 | Within `geom_bar()` there is another new piece of information: `stat="identity"` which tells ggplot to use the actual value of `Freq`. 351 | 352 | You may also notice that ggplot arranged all of the data in alphabetical order by manufacturer.
If you want to change the order, it's best to use the `reorder()` function. This next chart will use the `Freq` and `coord_flip()` to orient the chart differently. 353 | 354 | 355 | ```r 356 | g <- ggplot(df, aes(reorder(Var1,Freq), Freq)) 357 | g + geom_bar(stat="identity", width = 0.5, fill="tomato2") + 358 | labs(title="Bar Chart", 359 | x = 'Manufacturer', 360 | subtitle="Manufacturer of vehicles", 361 | caption="Source: Frequency of Manufacturers from 'mpg' dataset") + 362 | theme(axis.text.x = element_text(angle=65, vjust=0.6)) + 363 | coord_flip() 364 | ``` 365 | 366 | ![plot of chunk unnamed-chunk-15](http://i.imgur.com/lQkbQjO.png) 367 | 368 | Let's continue with bar charts - what if we wanted to see what `hwy` looked like by `manufacturer` and in terms of `cyl`? 369 | 370 | 371 | ```r 372 | g = ggplot(mpg,aes(x=manufacturer,y=hwy,col=factor(cyl),fill=factor(cyl))) 373 | g + geom_bar(stat='identity', position='dodge') + 374 | theme(axis.text.x = element_text(angle=65, vjust=0.6)) 375 | ``` 376 | 377 | ![plot of chunk unnamed-chunk-16](http://i.imgur.com/eLaSXr7.png) 378 | 379 | `position='dodge'` had to be used because the default setting is to stack the bars, `'dodge'` places them side by side for comparison. 380 | 381 | Despite the fact that the chart did what I wanted, it is very difficult to read due to how many manufacturers there are. This is where the `facet_wrap()` feature comes in handy. 382 | 383 | 384 | ```r 385 | theme_set(theme_bw()) 386 | 387 | g = ggplot(mpg,aes(x=factor(cyl),y=hwy,col=factor(cyl),fill=factor(cyl))) 388 | g + geom_bar(stat='identity', position='dodge') + 389 | facet_wrap(~manufacturer) 390 | ``` 391 | 392 | ![plot of chunk unnamed-chunk-17](http://i.imgur.com/wpsQt81.png) 393 | This created a much nicer view of the information. It "auto-magically" split everything out by manufacturer! 394 | 395 | 396 | ### Spatial Plots 397 | 398 | Another nice feature of ggplot2 is the integration with maps and spatial plotting. 
In this simple example, I wanted to plot a few cities in Colorado and draw a border around them. Other than the addition of the map, ggplot simply places the dots directly on the locations via their longitude and latitude "auto-magically." 399 | 400 | This map is created with `ggmap` which utilizes Google Maps API. 401 | 402 | 403 | ```r 404 | library(ggmap) 405 | library(ggalt) 406 | 407 | foco <- geocode("Fort Collins, CO") # get longitude and latitude 408 | 409 | # Get the Map ---------------------------------------------- 410 | colo_map <- qmap("Colorado, United States",zoom = 7, source = "google") 411 | 412 | # Get Coordinates for Places --------------------- 413 | colo_places <- c("Fort Collins, CO", 414 | "Denver, CO", 415 | "Grand Junction, CO", 416 | "Durango, CO", 417 | "Pueblo, CO") 418 | 419 | places_loc <- geocode(colo_places) # get longitudes and latitudes 420 | 421 | 422 | # Plot Open Street Map ------------------------------------- 423 | colo_map + geom_point(aes(x=lon, y=lat), 424 | data = places_loc, 425 | alpha = 0.7, 426 | size = 7, 427 | color = "tomato") + 428 | geom_encircle(aes(x=lon, y=lat), 429 | data = places_loc, size = 2, color = "blue") 430 | ``` 431 | 432 | ![plot of chunk unnamed-chunk-18](http://i.imgur.com/rmhVRiD.png) 433 | 434 | ### Final Thoughts 435 | 436 | I hope you learned a lot about the basics of ggplot2 in this. It's extremely powerful but yet easy to use once you get the hang of it. The best way to really learn it is to try it out. Find some data on your own and try to manipulate it and get it plotted. Without a doubt, you will have all kinds of errors pop up, data you expect to be plotted won't show up, colors and fills will be different, etc. However, your visualizations will be leveled-up! 
437 | 438 | ### Coming soon: 439 | 440 | - Determining whether or not you need a visualization 441 | - Choosing the type of plot to use depending on the use case 442 | - Visualization beyond the standard charts and graphs 443 | 444 | 445 | I made some modifications to the code, but almost all of the examples here were from [Top 50 ggplot2 Visualizations - The Master List ](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html). 446 | 447 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson) 448 | -------------------------------------------------------------------------------- /Data Visualization - Tropical Storms.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tropical Storm Data" 3 | author: "Scott Stoltzman" 4 | date: "9/12/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, fig.align = 'center') 10 | ``` 11 | ```{r load_libraries} 12 | library(tidyverse) 13 | library(ggthemes) 14 | library(ggmap) 15 | library(htmlwidgets) 16 | ``` 17 | 18 | # Exploratory Data Analysis of Tropical Storms in R 19 | 20 | The disastrous impact of recent hurricanes, Harvey and Irma, generated a large influx of data within the online community. I was curious as to what the history of hurricanes and tropical storms looked like so I found a data set on [data.world](https://data.world/dhs/historical-tropical-storm) and started some basic Exploratory data analysis (EDA). 21 | 22 | 23 | EDA is crucial to starting any project. Through EDA you can start to identify errors & inconsistencies in your data, find interesting patterns, see correlations and start to develop hypotheses to test. For most people, basic spreadsheets and charts are pretty handy and provide a great place to start. 
They are an easy-to-use method to manipulate and visualize your data quickly. Data scientists may cringe at the idea of using a graphical user interface (GUI) to kick off the EDA process, but the reality is clear: those tools are very effective and efficient when used properly. However, if you're reading this, you're probably trying to take EDA to the next level. The best way to learn is to get your hands dirty, so let's get started. 24 | 25 | 26 | The original source of the data can be found at [DHS.gov](https://hifld-dhs-gii.opendata.arcgis.com/datasets/3ea21accbfab4ed8b14ede2e802cc2ec_0). 27 | 28 | 29 | ---- 30 | 31 | 32 | #### Step 1: Take a look at your data set and see how it is laid out 33 | 34 | 35 | ```{r read_data} 36 | # data source https://data.world/dhs/historical-tropical-storm 37 | data = read_csv('data/Historical_Tropical_Storm_Tracks.csv') 38 | knitr::kable(head(data)) 39 | ``` 40 | 41 | 42 | Fortunately, this is a tidy data set which will make life easier and appears to be cleaned up substantially. The column names are relatively straightforward with the exception of "ID" columns. 43 | 44 | The description as given by [DHS.gov](https://hifld-dhs-gii.opendata.arcgis.com/datasets/3ea21accbfab4ed8b14ede2e802cc2ec_0): 45 | 46 | >This dataset represents Historical North Atlantic and Eastern North Pacific Tropical Cyclone Tracks with 6-hourly (0000, 0600, 1200, 1800 UTC) center locations and intensities for all subtropical depressions and storms, extratropical storms, tropical lows, waves, disturbances, depressions and storms, and all hurricanes, from 1851 through 2008. These data are intended for geographic display and analysis at the national level, and for large regional areas. The data should be displayed and analyzed at scales appropriate for 1:2,000,000-scale data.
47 | 48 | 49 | #### Step 2: View some descriptive statistics 50 | 51 | ```{r} 52 | knitr::kable(summary(data %>% select(YEAR, 53 | MONTH, 54 | DAY, 55 | WIND_KTS, 56 | PRESSURE))) 57 | ``` 58 | 59 | 60 | We can confirm that this particular data had storms from 1851 - 2010, that means the data goes back roughly 100 years before naming storms started! We can also see that the minimum pressure values are 0, which likely means it could not be measured (due to the fact zero pressure is not possible in this case). We can see that there are recorded months from January to December along with days extending from 1 to 31. Whenever you see all of the dates laid out that way, you can smile and think to yourself, "if I need to, I can put dates in an easy to use format such as YYYY-mm-dd (2017-09-12)!" 61 | 62 | 63 | #### Step 3: Make a basic plot 64 | 65 | 66 | ```{r} 67 | df = data %>% 68 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 69 | group_by(YEAR) %>% 70 | summarise(Distinct_Storms = n_distinct(NAME)) 71 | 72 | p = ggplot(df, aes(x = YEAR, y = Distinct_Storms)) + theme_economist() 73 | p + geom_line(size = 1.1) + 74 | ggtitle("Number of Storms Per Year") + 75 | geom_smooth(method='lm', se = FALSE) + 76 | ylab("Storms") 77 | ``` 78 | 79 | 80 | This is a great illustration of our data set and we can easily notice an upward trend in the number of storms over time. Before we go running to tell the world that the number of storms per year is growing, we need to drill down a bit deeper. This could simply be caused because more types of storms were added to the data set (we know there are hurricanes, tropical storms, waves, etc.) being recorded. However, we should keep it in mind when we start to develop hypotheses. 81 | 82 | 83 | **You will notice the data starts at 1950 rather than 1851.** I made this choice because storms were not named until this point so it would be difficult to try and count the unique storms per year. 
It could likely be done by finding a way to utilize the "ID" columns. However, this is a preliminary analysis so I didn't want to dig too deep. 84 | 85 | 86 | #### Step 4: Make some calculations 87 | 88 | ```{r} 89 | pct.diff = function(x){round((x-lag(x))/lag(x),2)} 90 | act.diff = function(x){round((x-lag(x)),2)} 91 | df = data %>% 92 | arrange(YEAR) %>% 93 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 94 | group_by(YEAR) %>% 95 | summarise(Distinct_Storms = n_distinct(NAME)) %>% 96 | mutate(Distinct_Storms_Change = act.diff(Distinct_Storms), 97 | Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) %>% 98 | na.omit() %>% 99 | arrange(YEAR) 100 | df$YEAR = factor(df$YEAR) 101 | knitr::kable(head(df,10)) 102 | ``` 103 | 104 | 105 | In this case, we can see the number of storms, nominal change and percentage change per year. These calculations help to shed light on what the growth rate looks like each year. So we can use another summary table: 106 | 107 | 108 | ```{r} 109 | knitr::kable(summary(df %>% select(-YEAR))) 110 | ``` 111 | 112 | From the table we can state the following for the given time period: 113 | 114 | * The mean number of storms is 23 per year (with a minimum of 6 and maximum of 43) 115 | * The mean change in the number of storms per year is 0.34 (with a minimum of -15 and maximum of 16) 116 | * The mean percent change in the number of storms per year is 6% (with a minimum of -42% and maximum of 114%) 117 | 118 | Again, we have to be careful because these numbers are in aggregate and may not tell the whole story. Dividing these into groups of storms is likely much more meaningful. 
119 | 120 | 121 | 122 | 123 | #### Step 5: Make a more interesting plot 124 | 125 | 126 | ```{r} 127 | df = data %>% 128 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 129 | filter(grepl("H", CAT)) %>% 130 | group_by(YEAR,CAT) %>% 131 | summarise(Distinct_Storms = n_distinct(NAME)) 132 | df$CAT = factor(df$CAT) 133 | 134 | p = ggplot(df, aes(x = YEAR, y = Distinct_Storms, col = CAT)) + theme_economist() 135 | p + geom_line(size = 1.1) + 136 | scale_color_brewer(direction = -1, palette = "Spectral") + 137 | ggtitle("Number of Storms Per Year By Category (H)") + 138 | facet_wrap(~CAT, scales = "free_x") + 139 | geom_smooth(method = 'lm', se = FALSE, col = 'black') + 140 | theme(axis.text.x = element_text(angle=90), legend.position = 'none') + 141 | ylab('Storms') 142 | ``` 143 | 144 | 145 | Because I was most interested in hurricanes, I filtered out only the data which was classified as "H (1-5)." By utilizing a data visualization technique called "small multiples" I was able to pull out the different types and view them within the same graph. While this is possible to do in tables and spreadsheets, it's much easier to visualize this way. By holding the axes constant, we can see the majority of the storms are classified as H1 and then it appears to consistently drop down toward H5 (with very few actually being classified as H5). We can also see that most have an upward trend from 1950 - 2010. The steepest appears to be H1 (but it also flattens out over the last decade). 
146 | 147 | 148 | #### Step 5: Make a filtered calculation 149 | 150 | ```{r} 151 | df = data %>% 152 | arrange(YEAR) %>% 153 | filter(grepl("H", CAT)) %>% 154 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 155 | group_by(YEAR) %>% 156 | summarise(Distinct_Storms = n_distinct(NAME)) %>% 157 | mutate(Distinct_Storms_Change = act.diff(Distinct_Storms), 158 | Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) %>% 159 | na.omit() %>% 160 | arrange(YEAR) 161 | knitr::kable(summary(df %>% select(-YEAR))) 162 | ``` 163 | 164 | 165 | Now we are looking strictly at hurricane data (classified as H1-H5): 166 | 167 | * The mean number of hurricanes is 13 per year (with a minimum of 4 and maximum of 24) 168 | * The mean change in the number of hurricanes per year is 0.05 (with a minimum of -11 and maximum of 10) 169 | * The mean percent change in the number of hurricanes per year is 8% (with a minimum of -56% and maximum of 180%) 170 | 171 | While it doesn't really make sense to say "we got an average growth of 0.05 hurricanes per year between 1950 and 2010" ... it may make sense to say "we saw an average growth of 8% per year in the number of hurricanes between 1950 and 2010." 172 | 173 | That's a great thing to put in quotes! 174 | 175 | > During EDA we discovered an average growth of 8% per year in the number of hurricanes between 1950 and 2010. 176 | 177 | Be ready: as soon as you make a statement like that, you will likely have to explain how you arrived at that conclusion. That's where having an RMarkdown notebook and data online in a repository will help you out! Reproducible research is all of the hype right now.
178 | 179 | 180 | #### Step 5: Try visualizing your statements 181 | 182 | ```{r} 183 | # Hurricanes (CAT contains "H") counted per year, plus the year-over-year % change; pct.diff() is defined in the Step 4 chunk 184 | df = data %>% 185 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 186 | filter(grepl("H", CAT)) %>% 187 | group_by(YEAR) %>% 188 | summarise(Distinct_Storms = n_distinct(NAME)) %>% 189 | mutate(Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) 190 | 191 | p = ggplot(df,aes(x = Distinct_Storms_Pct_Change)) + theme_economist() 192 | # Left panel: histogram of the year-over-year % change 193 | p1 = p + geom_histogram(bins = 20) + 194 | ggtitle("YoY % Change Histogram") + 195 | scale_x_continuous(labels = scales::percent) + 196 | ylab('') + xlab('YoY % Change in Hurricanes') 197 | # Right panel: density of the same variable 198 | p2 = p + geom_density(fill='darkgrey',alpha=0.5) + 199 | ggtitle("YoY % Change Density") + 200 | scale_x_continuous(labels = scales::percent) + 201 | ylab('') + xlab('YoY % Change in Hurricanes') 202 | 203 | gridExtra::grid.arrange(p1,p2,ncol=2) 204 | ``` 205 | 206 | 207 | A histogram and/or density plot is a great way to visualize the distribution of the data you are making statements about. This plot helps to show that we are looking at a right-skewed distribution with substantial variance. Knowing that we have n = 58 (meaning 58 years after being aggregated), it's not surprising that our histogram looks sparse and our density plot has an unusual shape. At this point, you can make a decision to jot this down, research it in depth and then attack it with full force. 208 | 209 | 210 | However, that's not what we're covering in this post. 211 | 212 | 213 | #### Step 6: Plot another aspect of your data 214 | 215 | 216 | ```{r} 217 | big_map <- get_googlemap(c(lon=-95, lat=30), zoom = 4, maptype = "terrain") 218 | ggmap(big_map, extent='panel') + 219 | geom_point(data = data, mapping = aes(x = LONG, y = LAT),col='red',alpha=0.1) 220 | ``` 221 | 222 | 223 | 60K pieces of data can get out of hand quickly, we need to back this down into manageable chunks.
Building on the knowledge from our last exploration, we should be able to think of a way to cut this down to get some better information. The concept of small multiples could come in handy again! Splitting the data up by type of storm could prove to be invaluable. We can also tell that we are missing 224 | 225 | ----- 226 | 227 | ```{r} 228 | df = data %>% filter(grepl("H", CAT)) 229 | ggmap(big_map) + 230 | geom_density_2d(data = df, mapping = aes(x = LONG, y = LAT), size = 0.5) + 231 | stat_density2d(data = df, 232 | aes(x = LONG, y = LAT, fill = ..level.., alpha = ..level..), size = 0.1, 233 | bins = 20, geom = "polygon") + scale_fill_gradient(low = "green", high = "red", 234 | guide = FALSE) + scale_alpha(range = c(0.1, 0.5), guide = FALSE) + 235 | facet_wrap(~CAT) 236 | ``` 237 | 238 | 239 | After filtering the data down to hurricanes and utilizing a heatmap rather than plotting individual points we can get a better handle on what is happening where. The H4 and H5 sections are probably the most interesting. It appears as if H4 storms are more frequent on the West coast of Mexico whereas the H5 are most frequent in the Gulf of Mexico. 240 | 241 | 242 | Because we're still in EDA mode, we'll continue with another plot. 243 | 244 | 245 | ```{r} 246 | df = data %>% filter(!grepl("H", CAT) & !grepl("W", CAT)) 247 | ggmap(big_map) + 248 | geom_density_2d(data = df, mapping = aes(x = LONG, y = LAT), size = 0.5) + 249 | stat_density2d(data = df, 250 | aes(x = LONG, y = LAT, fill = ..level.., alpha = ..level..), size = 0.1, 251 | bins = 20, geom = "polygon") + scale_fill_gradient(low = "green", high = "red", 252 | guide = FALSE) + scale_alpha(range = c(0.1, 0.5), guide = FALSE) + 253 | facet_wrap(~CAT) 254 | ``` 255 | 256 | 257 | Here are some of the other storms from the data set. We can see that TD, TS and L have large geographical spreads. The E, SS, and SD storms are concentrated further North toward New England. 
258 | 259 | Digging into this type of data and building probabilistic models is a fascinating field. The actuarial sciences are extremely difficult and insurance companies really need good models. Having mapped this data, it's pretty clear you could dig in and find out what parts of the country should expect what types of storms (and you've also known this just from being alive for 10+ years). More hypotheses could be formed about location at this stage and could be tested! 260 | 261 | 262 | #### Step 7: Look for a relationship 263 | 264 | 265 | ```{r} 266 | df = data %>% 267 | filter(PRESSURE > 0) %>% 268 | filter(grepl("H", CAT)) %>% 269 | group_by(CAT,YEAR,MONTH,DAY,LAT,LONG) %>% 270 | summarise(MEAN_WIND_KTS = mean(WIND_KTS), MEAN_PRESSURE = mean(PRESSURE)) %>% 271 | arrange(MEAN_WIND_KTS) 272 | df$CAT = factor(df$CAT) 273 | 274 | p = ggplot(df,aes(x=MEAN_WIND_KTS, y = MEAN_PRESSURE, fill = CAT)) + theme_economist() 275 | p + 276 | geom_hex(alpha = 0.8) + 277 | scale_fill_brewer(direction = -1, palette = "Spectral") + 278 | scale_y_continuous(labels = scales::comma)+ 279 | theme(legend.position = 'right') + 280 | ggtitle("Wind KTS vs. Pressure by Category (H)") 281 | ``` 282 | 283 | 284 | What is the relationship between WIND_KTS and PRESSURE? This chart helps us to see that PRESSURE and WIND_KTS are likely negatively correlated: lower pressure goes with higher wind speed. We can also see that the WIND_KTS is essentially the predictor in the data set which can perfectly predict how a storm is classified. Well, it turns out, that's basically the distinguishing feature when scientists are determining how to categorize these storms! 285 | 286 | 287 | #### Step ........ 288 | 289 | The rest is up to you! This is a great data set and there are a lot more pieces of information lurking within it. I want people to do their own EDA and send me anything interesting! 290 | 291 | Some food for thought: 292 | 293 | * What was the most common name for a hurricane?
294 | * Do the names actually follow an alphabetical pattern through time? (This one is tricky) 295 | * Can we merge this data with FEMA, charitable donations, or other aid data? 296 | 297 | 298 | To get you started on the first one, here's the Top 10 most common names for tropical storms. Why do you think it's Florence? 299 | 300 | 301 | ```{r} 302 | top_names = data %>% 303 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 304 | group_by(NAME) %>% 305 | summarise(Years_Used = n_distinct(YEAR)) %>% 306 | arrange(-Years_Used) 307 | p = ggplot(top_names %>% top_n(10), aes(x = reorder(NAME, Years_Used), y = Years_Used)) + theme_economist() 308 | p + geom_bar(stat='identity') + coord_flip() + xlab('') + ggtitle('Most Used Tropical Storm Names') 309 | ``` 310 | 311 | 312 | Thank you for reading, I hope this helps you with your own data. The code is all written in R and is located on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson). You can also find other data visualization posts and usages of ggplot2 on my blog [Stoltzmaniac](https://www.stoltzmaniac.com?utm_campaign=bottom_of_tropical_storm_post) 313 | -------------------------------------------------------------------------------- /Data Visualization Lesson.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Visualization - Introduction" 3 | subtitle: "The Good, The Bad and The Ugly" 4 | author: "Scott Stoltzman" 5 | date: "March 10, 2017" 6 | output: html_document 7 | --- 8 | 9 | ```{r setup, results='hide', warning=FALSE, message=FALSE,echo=FALSE} 10 | library(png) 11 | library(grid) 12 | ``` 13 | ---- 14 | 15 | # A Lesson on Data Visualization - Introduction 16 | 17 | ```{r, fig.align='center',echo=FALSE} 18 | img = readPNG("images/title_photo.png") 19 | grid.raster(img) 20 | ``` 21 | 22 | The topic of data visualization is still very popular in the data science community.
The market size for visualization products is valued at $4 Billion and is projected to reach 23 | $7 Billion by the end of 2022 according to [Mordor Intelligence.](https://www.mordorintelligence.com/industry-reports/data-visualization-applications-market-future-of-decision-making-industry) While we have seen amazing advances in the technology to display information, the understanding of how, why, and when to use visualization techniques has not kept up. Unfortunately, people are often taught how to make a chart before even thinking about whether or not it's appropriate. 24 | 25 | This is the first post of many on how to utilize data visualization techniques effectively. I will focus primarily on use cases in R but visualizations go well beyond charts and graphs. 26 | 27 | ---- 28 | 29 | ## Questions to ask before starting your project: 30 | 31 | 1. Is the information complex enough to require any visual aid? 32 | 2. Do you understand your audience well enough for this to provide value? 33 | 3. Can you tailor the material to your audience? 34 | 4. Will anyone learn anything by looking at it? 35 | 5. Will it accurately reflect the data? 36 | 37 | In short, are you adding value to your work or are you simply adding this to make it seem ***less boring?*** 38 | 39 | This introduction should whet your appetite if you're interested in learning more. This is a broad view of some data visualization techniques in R using ggplot2. 40 | 41 | ---- 42 | 43 | ## Let's take a look at some examples 44 | 45 | I have to give credit to [Junk Charts](http://junkcharts.typepad.com/) - it inspired a lot of this post. 
46 | 47 | ### One author wanted to show the cause of death in all of Shakespeare 48 | **Could you imagine a worse way to show this??** 49 | 50 | ```{r, fig.align='center',echo=FALSE} 51 | img = readPNG("images/bad-pie1.png") 52 | grid.raster(img) 53 | ``` 54 | 55 | 56 | **Is this not insane!?!?!** 57 | 58 | Using a legend instead of data callouts is the only thing that could have made this worse. The author could easily have used a number of other tools to get the point across. I hate wordles but due to the fact that the article wasn't trying to show the exact proportions, a wordle easily illustrates the point (a point made by [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html)). 59 | 60 | 61 | ```{r, fig.align='center',echo=FALSE} 62 | img = readPNG("images/bad-pie1-fix.png") 63 | grid.raster(img) 64 | ``` 65 | ---- 66 | 67 | To be clear, I'm not close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, we'll look at the theory and not worry too much about aesthetics (save that for a time when you're getting paid). 68 | 69 | 70 | Bad visualizations can be: 71 | 72 | - Difficult or impossible to interpret 73 | - Filled with completely worthless information 74 | - Misleading (intentionally or unintentionally) 75 | - Redundant and boring 76 | - Inaccurate 77 | 78 | Let's load up some libraries and get started. 79 | 80 | ```{r libraryPrep, results='hide', warning=FALSE, message=FALSE} 81 | library(ggplot2) 82 | library(dplyr) 83 | library(tidyr) 84 | library(lubridate) 85 | library(scales) 86 | ``` 87 | 88 | ## Decide on what you're trying to accomplish first. 89 | 90 | Ask yourself the following questions to help drive your decision: 91 | 92 | - Are you making a comparison? 
93 | - Are you finding a relationship? 94 | - Are you showing a distribution? 95 | - Are you finding a trend over time? 96 | - Are you showing composition? 97 | 98 | Once you know which question you are asking, it will keep your mind focused on the outcome and will quickly narrow down your charting options. 99 | 100 | #### Rule of Thumb 101 | 102 | - **Trend:** Column, Line 103 | - **Comparison:** Area, Bar, Bullet, Column, Line, Scatter 104 | - **Relationship:** Line, Scatter 105 | - **Distribution:** Bar, Boxplot, Column 106 | - **Composition:** Donut, Pie, Stacked Bar, Stacked Column 107 | 108 | Obviously, there are choices beyond these and you need to think through your choice wisely. 109 | 110 | Side Note: I ***hate*** donut and pie charts! When used properly, they're terrific! However, I'm very used to gagging every time one appears on a projector screen due to how frequently they're used inappropriately. 111 | 112 | For this project, I'll use some oil production data that I found while digging through http://data.world (pretty great site). The data can be found [here](http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm) 113 | 114 | 115 | ```{r dataLoading, results='hide', warning=FALSE, message=FALSE} 116 | #Custom data preparation 117 | #GitHub (linked to at bottom of this post) 118 | source('data_preparation.R') 119 | data = getData() 120 | ``` 121 | 122 | ```{r} 123 | head(data) 124 | ``` 125 | 126 | ---- 127 | 128 | ## Trend - Line Chart 129 | 130 | **Objective:** See what the oil production in the US looked like from 1981 - 2016 by year. I want to illustrate the changes over the time period. This is a very high-level view and only shows us a decline followed by a ramp up at the end of the period. 131 | 132 | I decided to use a line chart to show the trend over time. When using discrete data, you should normally use a column chart so that readers don't assume the values moved linearly between the years.
However, it paints a much clearer picture this way and is not misleading. 133 | 134 | ### Which of these views would you rather see? 135 | 136 | #### Poor Version 137 | The x-axis is a disaster and the y-axis isn't formatted well. While it gets the point across, it's still almost worthless. 138 | 139 | 140 | ```{r,fig.align='center', fig.width=8} 141 | df = data %>% 142 | group_by(Year) %>% 143 | summarise(ThousandBarrel = sum(ThousandBarrel)) 144 | 145 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) 146 | p + geom_line(stat='identity') + 147 | ggtitle('Oil Production Over Time') + 148 | theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + 149 | xlab('') + ylab('') 150 | ``` 151 | 152 | #### Better Version 153 | The title gives us a much better understanding of what we're looking at. The chart is slightly wider and the axes are formatted to be legible. 154 | 155 | ```{r,fig.align='center', fig.width=12} 156 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) 157 | p + geom_line(stat='identity') + 158 | ggtitle('Thousand Barrel Oil Production By Year in the U.S.') + 159 | theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + 160 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 161 | scale_y_continuous(labels = comma) 162 | ``` 163 | 164 | 165 | ## Comparison - Line Chart 166 | 167 | **Objective**: Identify which states affected the trend the most. Evaluate them simultaneously in order to paint the picture and compare them. 168 | 169 | From this visual you can see the top states are Alaska, California, Louisiana, Oklahoma, Texas and Wyoming. Texas seems to break the mold quite drastically and drove the spike which occurred after 2010. 170 | 171 | ### Which of these views would you rather see? 172 | 173 | #### Poor Version 174 | There are far too many colors going on here. 
Everything at the bottom of the chart is relatively useless and takes our focus away from the big players. 175 | 176 | ```{r,warning=FALSE,fig.width=10,message=FALSE} 177 | df = data %>% 178 | group_by(Location, Year) %>% 179 | summarise(ThousandBarrel = sum(ThousandBarrel)) 180 | 181 | df$Year = as.numeric(df$Year) 182 | 183 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location)) 184 | p + geom_line(stat='identity') + 185 | ggtitle(paste('Oil Production By Year By State in the U.S.')) + 186 | theme(plot.title = element_text(hjust = 0.5)) + 187 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) 188 | ``` 189 | 190 | #### Better Version 191 | This focuses attention on the top producing states. It compares them to each other and shows the trend per state as well. 192 | 193 | ```{r,warning=FALSE,fig.width=10,message=FALSE} 194 | n=6 #Arbitrary at first, after trying a few, this made the most sense 195 | topN = data %>% 196 | group_by(Location) %>% 197 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 198 | arrange(-ThousandBarrel) %>% 199 | top_n(n) 200 | 201 | df = data %>% 202 | filter(Location %in% topN$Location) %>% 203 | group_by(Year,Location) %>% 204 | summarise(ThousandBarrel = sum(ThousandBarrel)) 205 | 206 | df$Year = as.numeric(df$Year) 207 | df$Location = as.factor(df$Location) 208 | 209 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) 210 | p + geom_line(stat='identity') + 211 | ggtitle(paste('Top',as.character(n),'States - Oil Production By Year in the U.S.')) + 212 | theme(plot.title = element_text(hjust = 0.5)) + 213 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 214 | facet_wrap(~Location) + 215 | scale_y_continuous(labels = comma) 216 | 217 | ``` 218 | 219 | 220 | 221 | ## Relationship - Scatter Plot 222 | 223 | **Objective**: See if Alaska and California data is correlated (This probably isn't important but it allows us to use the same data). 224 | 225 | ### Which of these views would you rather see? 
226 | 227 | #### Poor Version 228 | Lots of completely irrelevant data! Size of the point should have nothing to do with the year. 229 | 230 | ```{r,warning=FALSE,fig.width=10,message=FALSE} 231 | statesList = c('Alaska','California') 232 | df = data %>% 233 | filter(Location %in% statesList) %>% 234 | spread(Location,ThousandBarrel) %>% 235 | select(Alaska,California,Month,Year) 236 | 237 | p = ggplot(df,aes(x=Alaska,y=California,col=Month,size=Year)) 238 | p + geom_point() + 239 | scale_y_continuous(labels = comma) + 240 | scale_x_continuous(labels = comma) + 241 | ggtitle('Oil Production - CA vs. AK') + 242 | theme(plot.title = element_text(hjust = 0.5)) 243 | 244 | ``` 245 | 246 | #### Better Version 247 | The trend line is nice because it helps to visualize the relationship even more. While it can sometimes be misleading, it makes sense with our current data. 248 | 249 | ```{r,warning=FALSE,fig.width=10,message=FALSE} 250 | df = data %>% 251 | filter(Location %in% statesList) %>% 252 | spread(Location,ThousandBarrel) %>% 253 | select(Alaska,California,Year) 254 | 255 | p = ggplot(df,aes(x=Alaska,y=California)) 256 | p + geom_point() + 257 | scale_y_continuous(labels = comma) + 258 | scale_x_continuous(labels = comma) + 259 | ggtitle('Monthly Thousand Barrel Oil Production 1981-2016 CA vs. AK') + 260 | theme(plot.title = element_text(hjust = 0.5)) + 261 | geom_smooth(method='lm') 262 | 263 | ``` 264 | 265 | 266 | 267 | 268 | ## Distribution - Boxplot 269 | 270 | **Objective**: Examine the range of production by state and year over the time period to give us an idea of the variance. 271 | 272 | ### Which of these views would you rather see? 
273 | 274 | #### Poor Version 275 | 276 | 277 | ```{r,warning=FALSE,fig.width=10,message=FALSE} 278 | df = data %>% 279 | group_by(Year,Location) %>% 280 | summarise(ThousandBarrel = sum(ThousandBarrel)) 281 | 282 | p = ggplot(df,aes(x=Location,y=ThousandBarrel)) 283 | p + geom_boxplot() + 284 | ggtitle('Distribution of Oil Production by State') 285 | 286 | ``` 287 | 288 | 289 | #### Better Version 290 | This gives a nice ranking to the plot while still showing their distributions. While it was semi-apparent in the line charts, the variance of Texas is huge compared to the others! We could take this a step further and separate out the big players from the smaller players. 291 | 292 | ```{r,warning=FALSE,fig.width=10,message=FALSE} 293 | p = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel)) 294 | p + geom_boxplot() + 295 | scale_y_continuous(labels = comma) + 296 | ggtitle('Distribution of Annual Oil Production By State (1981 - 2016)') + 297 | coord_flip() 298 | ``` 299 | 300 | 301 | ## Composition - Stacked Bar 302 | 303 | **Objective**: Check out the composition of total production by state. 304 | 305 | ### Which of these views would you rather see? 306 | 307 | #### Poor Version 308 | My favorite, the beautiful pie chart! There's nothing better than this... 309 | 310 | ```{r,warning=FALSE,fig.width=10,message=FALSE} 311 | df = data %>% 312 | group_by(Location) %>% 313 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 314 | mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel)) 315 | 316 | df$ThousandBarrel = round(100*df$ThousandBarrel,0) 317 | 318 | library(plotrix) 319 | pie(x=df$ThousandBarrel,labels=df$Location,explode=0.1,col=rainbow(nrow(df)),main='Percentage of Oil Production by State') 320 | 321 | ``` 322 | 323 | 324 | #### Better Version 325 | The 1980's and 2010's will be missing years in terms of a "decade" due to the data provided (and it's only 2017). 
While the percentage labels are slightly off center, it's certainly much better than the pie chart. It's not quite "apples-to-apples" for a comparison because I created different decades, but you get the idea. 326 | 327 | I also created an "Other" category in order to simplify the output. When you are doing comparisons, it's typically a good idea to find a way to reduce the number of variables in the output while not removing data by dropping it completely. 328 | 329 | ```{r,warning=FALSE,fig.width=10,message=FALSE} 330 | data$Decade = '1980s' 331 | data$Decade[data$Year >= 1990] = '1990s' 332 | data$Decade[data$Year >= 2000] = '2000s' 333 | data$Decade[data$Year >= 2010] = '2010s' 334 | data$Decade = as.factor(data$Decade) 335 | 336 | top5 = data %>% 337 | group_by(Location) %>% 338 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 339 | arrange(-ThousandBarrel) %>% 340 | top_n(5) %>% 341 | select(Location) 342 | 343 | top5List = top5$Location 344 | 345 | data$State = "Other" 346 | 347 | for(i in 1:length(top5List)){ 348 | data$State[data$Location == top5List[i]] = top5List[i] 349 | } 350 | 351 | df = data %>% 352 | group_by(Decade,State) %>% 353 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 354 | mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel)) 355 | 356 | df$ThousandBarrel = round(df$ThousandBarrel,3) 357 | df$text = paste(round(100*df$ThousandBarrel,0),'%', sep='') 358 | 359 | p = ggplot(df,aes(x=Decade,y=ThousandBarrel,col=reorder(State,ThousandBarrel),fill=reorder(State,ThousandBarrel))) 360 | p + geom_bar(stat='identity') + 361 | geom_text(aes(label=text),col='Black',size = 4, hjust = 0.5, vjust = 3, position = "stack") + 362 | scale_y_continuous(labels = percent) + 363 | ggtitle('Percentage of Top Oil Producing States by Decade') + 364 | guides(fill=guide_legend(title='State'),col=guide_legend(title='State')) + 365 | theme(plot.title = element_text(hjust = 0.5)) 366 | 367 | ``` 368 | 369 | 370 | 371 | 372 | ### Some other fun concepts 
are below! 373 | Some of them are nice, others are terrible! I won't comment on any of them, but I felt it was necessary to include some other ideas I toyed around with. 374 | 375 | Have fun with your data visualizations. The charts I showed here are extremely simple. Being creative by using things other than R winds up making visuals people can remember. There are plenty of examples around, but they all tend to follow basic principles of design. There are ***A TON*** of good books out there on this topic. 376 | 377 | Now it's your turn! 378 | 379 | 380 | ```{r} 381 | df = data %>% 382 | group_by(Location) %>% 383 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 384 | arrange(-ThousandBarrel) 385 | p = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel)) 386 | p + geom_bar(stat='identity') + 387 | ggtitle('Oil Production 1981 - 2016 By Location') + 388 | theme(plot.title = element_text(hjust = 0.5)) + 389 | coord_flip() 390 | ``` 391 | 392 | 393 | 394 | 395 | 396 | ```{r} 397 | top10 = data %>% 398 | group_by(Location) %>% 399 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 400 | arrange(-ThousandBarrel) %>% 401 | top_n(10) 402 | print(top10) 403 | 404 | df = data %>% 405 | group_by(Location,Year) %>% 406 | filter(Location %in% top10$Location) %>% 407 | summarise(ThousandBarrel = sum(ThousandBarrel)) 408 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location,fill=Location)) 409 | p + geom_bar(stat='identity') + 410 | ggtitle('Oil Production - Top 10 States') + 411 | theme(plot.title = element_text(hjust = 0.5)) + 412 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) 413 | ``` 414 | 415 | 416 | 417 | ```{r, fig.width=10} 418 | df = data %>% 419 | filter(Year == 1990)%>% 420 | group_by(Location) %>% 421 | summarise(ThousandBarrel = sum(ThousandBarrel)) 422 | df$Location = tolower(df$Location) 423 | 424 | #Add States without data 425 | States = data.frame(Location = tolower(as.character(state.name))) 426 | missingStates = 
States$Location[!(States$Location %in% df$Location)] 427 | appendData = data.frame(Location=missingStates,ThousandBarrel=0) 428 | df = rbind(df,appendData) 429 | 430 | states_map <- map_data("state") 431 | 432 | ggplot(df, aes(map_id = Location)) + 433 | geom_map(aes(fill=ThousandBarrel), map = states_map) + 434 | expand_limits(x = states_map$long, y = states_map$lat) 435 | 436 | ``` 437 | 438 | 439 | ```{r} 440 | df = data %>% 441 | filter(Location == 'Texas') %>% 442 | group_by(Year,Month) %>% 443 | summarise(ThousandBarrel = sum(ThousandBarrel)) 444 | 445 | p = ggplot(df,aes(x=Month,y=ThousandBarrel)) 446 | p + geom_line(stat='identity',aes(group=Year,col=Year)) + 447 | ggtitle('Oil Production By Year in the U.S.') + 448 | theme(plot.title = element_text(hjust = 0.5)) + 449 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) 450 | ``` 451 | 452 | 453 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson) -------------------------------------------------------------------------------- /Data Visualization Lesson.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Visualization" 3 | subtitle: "The Good, The Bad and The Ugly" 4 | author: "Scott Stoltzman" 5 | date: "March 10, 2017" 6 | output: html_document 7 | --- 8 | 9 | 10 | 11 | ---- 12 | 13 | # Data Viz Ain't Easy 14 | 15 | In almost every business meeting you'll attend, you will see a poorly designed chart, graph, or other visual representation of data. Most people simply lack the education required to emphasize a point. 16 | 17 | Bad visualizations can be: 18 | 19 | - Difficult or impossible to interpret 20 | - Filled with completely worthless information 21 | - Misleading (intentionally or unintentionally) 22 | - Redundant and boring 23 | - Inaccurate 24 | 25 | 26 | I have to give credit to [Junk Charts](http://junkcharts.typepad.com/j) - it inspired a lot of this post. 
27 | 28 | ---- 29 | 30 | ## Let's take a look at some examples 31 | 32 | ### Every Death in Shakespeare 33 | **Could you imagine a worse way to show this??** 34 | 35 | plot of chunk unnamed-chunk-1 36 | 37 | 38 | **Is this not insane!?!?!** 39 | 40 | No one could ever glance at that and possibly want to read it. The only thing that would have made it worse would be if there had been a legend instead of data callouts. The author could easily have used a number of other tools to get the point across. I hate wordles but due to the fact that the article wasn't trying to show the exact proportions of type of deaths, a wordle easily illustrated the point. 41 | 42 | 43 | plot of chunk unnamed-chunk-2 44 | This example came from this [Junk Charts Article](http://junkcharts.typepad.com/junk_charts/2016/03/which-way-to-die-the-bard-asked-onelesspie.html) 45 | 46 | ---- 47 | 48 | To be clear, I'm not even close to being perfect when it comes to visualizations in my blog. The sizes, shapes, font colors, etc. tend to get out of control and I don't want to take the time in R to tinker with all of the details. However, when it comes to displaying things professionally, it has to be spot on! So, we'll look at the theory and not worry too much about aesthetics (save that for a time when you're getting paid). 49 | 50 | Let's load up some libraries and get started. 51 | 52 | 53 | ```r 54 | library(ggplot2) 55 | library(dplyr) 56 | library(tidyr) 57 | library(lubridate) 58 | library(scales) 59 | ``` 60 | 61 | ## Decide on what you're trying to accomplish first. 62 | 63 | Ask yourself the following questions to help drive your decision: 64 | 65 | - Are you making a comparison? 66 | - Are you finding a relationship? 67 | - Are you showing a distribution? 68 | - Are you finding a trend over time? 69 | - Are you showing composition? 70 | 71 | Once you know which question you are asking, it will keep your mind focused on the outcome and will quickly narrow down your charting options. 
72 | 73 | #### Rule of Thumb 74 | 75 | - **Trend:** Column, Line 76 | - **Comparison:** Area, Bar, Bullet, Column, Line, Scatter 77 | - **Relationship:** Line, Scatter 78 | - **Distribution:** Bar, Boxplot, Column 79 | - **Composition:** Donut, Pie, Stacked Bar, Stacked Column 80 | 81 | Obviously, there are choices beyond these and you need to think through your choice wisely. 82 | 83 | Side Note: I ***hate*** donut and pie charts! When used properly, they're terrific! However, I'm very used to gagging every time one appears on a projector screen due to how frequently they're used inappropriately. 84 | 85 | For this project, I'll use some oil production data that I found while digging through http://data.world (pretty great site). The data can be found [here](http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm) 86 | 87 | 88 | 89 | ```r 90 | #Custom data preparation 91 | #GitHub (linked to at bottom of this post) 92 | source('data_preparation.R') 93 | data = getData() 94 | ``` 95 | 96 | 97 | ```r 98 | head(data) 99 | ``` 100 | 101 | ``` 102 | ## Location Month Year ThousandBarrel Date 103 | ## 1 Alabama Mar 2013 883 2013-03-01 104 | ## 2 Alabama Apr 2013 844 2013-04-01 105 | ## 3 Alabama May 2013 878 2013-05-01 106 | ## 4 Alabama Feb 2013 809 2013-02-01 107 | ## 5 Alabama Mar 1982 1687 1982-03-01 108 | ## 6 Alabama Apr 1982 1567 1982-04-01 109 | ``` 110 | 111 | ---- 112 | 113 | ## Trend - Line Chart 114 | 115 | **Objective:** See what the oil production in the US looked like from 1981 - 2016 by year. I want to illustrate the changes over the time period. This is a very high-level view and only shows us a decline followed by a ramp up at the end of the period. 116 | 117 | I decided to use a line chart to show the trend over time. When using discrete data you should normally use a column chart, to avoid implying that the data in between these years was simply linear. However, a line chart paints a much clearer picture here and is not misleading.
118 | 119 | ### Which of these views would you rather see? 120 | 121 | #### Poor Version 122 | The x-axis is a disaster and the y-axis isn't formatted well. While it gets the point across, it's still almost worthless. 123 | 124 | 125 | 126 | ```r 127 | df = data %>% 128 | group_by(Year) %>% 129 | summarise(ThousandBarrel = sum(ThousandBarrel)) 130 | 131 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) 132 | p + geom_line(stat='identity') + 133 | ggtitle('Oil Production Over Time') + 134 | theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + 135 | xlab('') + ylab('') 136 | ``` 137 | 138 | plot of chunk unnamed-chunk-4 139 | 140 | #### Better Version 141 | The title gives us a much better understanding of what we're looking at. The chart is slightly wider and the axes are formatted to be legible. 142 | 143 | 144 | ```r 145 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) 146 | p + geom_line(stat='identity') + 147 | ggtitle('Thousand Barrel Oil Production By Year in the U.S.') + 148 | theme(plot.title = element_text(hjust = 0.5),plot.subtitle = element_text(hjust = 0.5)) + 149 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 150 | scale_y_continuous(labels = comma) 151 | ``` 152 | 153 | plot of chunk unnamed-chunk-5 154 | 155 | 156 | ## Comparison - Line Chart 157 | 158 | **Objective**: Identify which states affected the trend the most. Evaluate them simultaneously in order to paint the picture and compare them. 159 | 160 | From this visual you can see the top states are Alaska, California, Louisiana, Oklahoma, Texas and Wyoming. Texas seems to break the mold quite drastically and drove the spike which occurred after 2010. 161 | 162 | ### Which of these views would you rather see? 163 | 164 | #### Poor Version 165 | There are far too many colors going on here. Everything at the bottom of the chart is relatively useless and takes our focus away from the big players. 
166 | 167 | 168 | ```r 169 | df = data %>% 170 | group_by(Location, Year) %>% 171 | summarise(ThousandBarrel = sum(ThousandBarrel)) 172 | 173 | df$Year = as.numeric(df$Year) 174 | 175 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location)) 176 | p + geom_line(stat='identity') + 177 | ggtitle(paste('Oil Production By Year By State in the U.S.')) + 178 | theme(plot.title = element_text(hjust = 0.5)) + 179 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) 180 | ``` 181 | 182 | ![plot of chunk unnamed-chunk-6](http://i.imgur.com/LdhZsPF.png) 183 | 184 | #### Better Version 185 | This focuses attention on the top producing states. It compares them to each other and shows the trend per state as well. 186 | 187 | 188 | ```r 189 | n=6 #Arbitrary at first, after trying a few, this made the most sense 190 | topN = data %>% 191 | group_by(Location) %>% 192 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 193 | arrange(-ThousandBarrel) %>% 194 | top_n(n) 195 | 196 | df = data %>% 197 | filter(Location %in% topN$Location) %>% 198 | group_by(Year,Location) %>% 199 | summarise(ThousandBarrel = sum(ThousandBarrel)) 200 | 201 | df$Year = as.numeric(df$Year) 202 | df$Location = as.factor(df$Location) 203 | 204 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,group=1)) 205 | p + geom_line(stat='identity') + 206 | ggtitle(paste('Top',as.character(n),'States - Oil Production By Year in the U.S.')) + 207 | theme(plot.title = element_text(hjust = 0.5)) + 208 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 209 | facet_wrap(~Location) + 210 | scale_y_continuous(labels = comma) 211 | ``` 212 | 213 | ![plot of chunk unnamed-chunk-7](http://i.imgur.com/Vkxymeb.png) 214 | 215 | 216 | 217 | ## Relationship - Scatter Plot 218 | 219 | **Objective**: See if Alaska and California data is correlated (This probably isn't important but it allows us to use the same data). 220 | 221 | ### Which of these views would you rather see? 
222 | 223 | #### Poor Version 224 | Lots of completely irrelevant data! Size of the point should have nothing to do with the year. 225 | 226 | 227 | ```r 228 | statesList = c('Alaska','California') 229 | df = data %>% 230 | filter(Location %in% statesList) %>% 231 | spread(Location,ThousandBarrel) %>% 232 | select(Alaska,California,Month,Year) 233 | 234 | p = ggplot(df,aes(x=Alaska,y=California,col=Month,size=Year)) 235 | p + geom_point() + 236 | scale_y_continuous(labels = comma) + 237 | scale_x_continuous(labels = comma) + 238 | ggtitle('Oil Production - CA vs. AK') + 239 | theme(plot.title = element_text(hjust = 0.5)) 240 | ``` 241 | 242 | ![plot of chunk unnamed-chunk-8](http://i.imgur.com/ta0Uv6a.png) 243 | 244 | #### Better Version 245 | The trend line is nice because it helps to visualize the relationship even more. While it can sometimes be misleading, it makes sense with our current data. 246 | 247 | 248 | ```r 249 | df = data %>% 250 | filter(Location %in% statesList) %>% 251 | spread(Location,ThousandBarrel) %>% 252 | select(Alaska,California,Year) 253 | 254 | p = ggplot(df,aes(x=Alaska,y=California)) 255 | p + geom_point() + 256 | scale_y_continuous(labels = comma) + 257 | scale_x_continuous(labels = comma) + 258 | ggtitle('Monthly Thousand Barrel Oil Production 1981-2016 CA vs. AK') + 259 | theme(plot.title = element_text(hjust = 0.5)) + 260 | geom_smooth(method='lm') 261 | ``` 262 | 263 | ![plot of chunk unnamed-chunk-9](http://i.imgur.com/Kxtvyrc.png) 264 | 265 | 266 | 267 | 268 | ## Distribution - Boxplot 269 | 270 | **Objective**: Examine the range of production by state and year over the time period to give us an idea of the variance. 271 | 272 | ### Which of these views would you rather see? 
273 | 274 | #### Poor Version 275 | 276 | 277 | 278 | ```r 279 | df = data %>% 280 | group_by(Year,Location) %>% 281 | summarise(ThousandBarrel = sum(ThousandBarrel)) 282 | 283 | p = ggplot(df,aes(x=Location,y=ThousandBarrel)) 284 | p + geom_boxplot() + 285 | ggtitle('Distribution of Oil Production by State') 286 | ``` 287 | 288 | ![plot of chunk unnamed-chunk-10](http://i.imgur.com/QYyBcNn.png) 289 | 290 | 291 | #### Better Version 292 | This gives a nice ranking to the plot while still showing their distributions. While it was semi-apparent in the line charts, the variance of Texas is huge compared to the others! We could take this a step further and separate out the big players from the smaller players. 293 | 294 | 295 | ```r 296 | p = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel)) 297 | p + geom_boxplot() + 298 | scale_y_continuous(labels = comma) + 299 | ggtitle('Distribution of Annual Oil Production By State (1981 - 2016)') + 300 | coord_flip() 301 | ``` 302 | 303 | ![plot of chunk unnamed-chunk-11](http://i.imgur.com/AunupK4.png) 304 | 305 | 306 | ## Composition - Stacked Bar 307 | 308 | **Objective**: Check out the composition of total production by state. 309 | 310 | ### Which of these views would you rather see? 311 | 312 | #### Poor Version 313 | My favorite, the beautiful pie chart! There's nothing better than this... 
314 | 315 | 316 | ```r 317 | df = data %>% 318 | group_by(Location) %>% 319 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 320 | mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel)) 321 | 322 | df$ThousandBarrel = round(100*df$ThousandBarrel,0) 323 | 324 | library(plotrix) 325 | pie(x=df$ThousandBarrel,labels=df$Location,explode=0.1,col=rainbow(nrow(df)),main='Percentage of Oil Production by State') 326 | ``` 327 | 328 | ![plot of chunk unnamed-chunk-12](http://i.imgur.com/vv1q2FX.png) 329 | 330 | 331 | #### Better Version 332 | The 1980's and 2010's will be missing years in terms of a "decade" due to the data provided (and it's only 2017). While the percentage labels are slightly off center, it's certainly much better than the pie chart. It's not quite "apples-to-apples" for a comparison because I created different decades, but you get the idea. 333 | 334 | I also created an "Other" category in order to simplify the output. When you are doing comparisons, it's typically a good idea to find a way to reduce the number of variables in the output while not removing data by dropping it completely. 
335 | 336 | 337 | ```r 338 | data$Decade = '1980s' 339 | data$Decade[data$Year >= 1990] = '1990s' 340 | data$Decade[data$Year >= 2000] = '2000s' 341 | data$Decade[data$Year >= 2010] = '2010s' 342 | data$Decade = as.factor(data$Decade) 343 | 344 | top5 = data %>% 345 | group_by(Location) %>% 346 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 347 | arrange(-ThousandBarrel) %>% 348 | top_n(5) %>% 349 | select(Location) 350 | 351 | top5List = top5$Location 352 | 353 | data$State = "Other" 354 | 355 | for(i in 1:length(top5List)){ 356 | data$State[data$Location == top5List[i]] = top5List[i] 357 | } 358 | 359 | df = data %>% 360 | group_by(Decade,State) %>% 361 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 362 | mutate(ThousandBarrel = ThousandBarrel/sum(ThousandBarrel)) 363 | 364 | df$ThousandBarrel = round(df$ThousandBarrel,3) 365 | df$text = paste(round(100*df$ThousandBarrel,0),'%', sep='') 366 | 367 | p = ggplot(df,aes(x=Decade,y=ThousandBarrel,col=reorder(State,ThousandBarrel),fill=reorder(State,ThousandBarrel))) 368 | p + geom_bar(stat='identity') + 369 | geom_text(aes(label=text),col='Black',size = 4, hjust = 0.5, vjust = 3, position = "stack") + 370 | scale_y_continuous(labels = percent) + 371 | ggtitle('Percentage of Top Oil Producing States by Decade') + 372 | guides(fill=guide_legend(title='State'),col=guide_legend(title='State')) + 373 | theme(plot.title = element_text(hjust = 0.5)) 374 | ``` 375 | 376 | ![plot of chunk unnamed-chunk-13](http://i.imgur.com/I7Cyf6Z.png) 377 | 378 | 379 | 380 | 381 | ### Some other fun concepts are below! 382 | Some of them are nice, others are terrible! I won't comment on any of them, but I felt it was necessary to include some other ideas I toyed around with. 383 | 384 | Have fun with your data visualizations. The charts I showed here are extremely simple. Being creative by using things other than R wind up making visuals people can remember. 
There are plenty of examples around, but they all tend to follow basic principles of design. There are ***A TON*** of good books out there on this topic. 385 | 386 | Now it's your turn! 387 | 388 | 389 | 390 | ```r 391 | df = data %>% 392 | group_by(Location) %>% 393 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 394 | arrange(-ThousandBarrel) 395 | p = ggplot(df,aes(x=reorder(Location,ThousandBarrel),y=ThousandBarrel)) 396 | p + geom_bar(stat='identity') + 397 | ggtitle('Oil Production 1981 - 2016 By Location') + 398 | theme(plot.title = element_text(hjust = 0.5)) + 399 | coord_flip() 400 | ``` 401 | 402 | ![plot of chunk unnamed-chunk-14](http://i.imgur.com/DwJ52D9.png) 403 | 404 | 405 | 406 | 407 | 408 | 409 | ```r 410 | top10 = data %>% 411 | group_by(Location) %>% 412 | summarise(ThousandBarrel = sum(ThousandBarrel)) %>% 413 | arrange(-ThousandBarrel) %>% 414 | top_n(10) 415 | ``` 416 | 417 | ``` 418 | ## Selecting by ThousandBarrel 419 | ``` 420 | 421 | ```r 422 | print(top10) 423 | ``` 424 | 425 | ``` 426 | ## # A tibble: 10 × 2 427 | ## Location ThousandBarrel 428 | ## 429 | ## 1 Texas 23447172 430 | ## 2 Alaska 15775279 431 | ## 3 California 9988225 432 | ## 4 Louisiana 4267246 433 | ## 5 Oklahoma 3701224 434 | ## 6 Wyoming 2894624 435 | ## 7 Kansas 1708873 436 | ## 8 Colorado 1288643 437 | ## 9 Utah 894657 438 | ## 10 Mississippi 861999 439 | ``` 440 | 441 | ```r 442 | df = data %>% 443 | group_by(Location,Year) %>% 444 | filter(Location %in% top10$Location) %>% 445 | summarise(ThousandBarrel = sum(ThousandBarrel)) 446 | p = ggplot(df,aes(x=Year,y=ThousandBarrel,col=Location,fill=Location)) 447 | p + geom_bar(stat='identity') + 448 | ggtitle('Oil Production - Top 10 States') + 449 | theme(plot.title = element_text(hjust = 0.5)) + 450 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) 451 | ``` 452 | 453 | ![plot of chunk unnamed-chunk-15](http://i.imgur.com/Y21hxkY.png) 454 | 455 | 456 | 457 | 458 | ```r 459 | df = data %>% 460 | 
filter(Year == 1990)%>% 461 | group_by(Location) %>% 462 | summarise(ThousandBarrel = sum(ThousandBarrel)) 463 | df$Location = tolower(df$Location) 464 | 465 | #Add States without data 466 | States = data.frame(Location = tolower(as.character(state.name))) 467 | missingStates = States$Location[!(States$Location %in% df$Location)] 468 | appendData = data.frame(Location=missingStates,ThousandBarrel=0) 469 | df = rbind(df,appendData) 470 | 471 | states_map <- map_data("state") 472 | 473 | ggplot(df, aes(map_id = Location)) + 474 | geom_map(aes(fill=ThousandBarrel), map = states_map) + 475 | expand_limits(x = states_map$long, y = states_map$lat) 476 | ``` 477 | 478 | ![plot of chunk unnamed-chunk-16](http://i.imgur.com/jT1XTmF.png) 479 | 480 | 481 | 482 | ```r 483 | df = data %>% 484 | filter(Location == 'Texas') %>% 485 | group_by(Year,Month) %>% 486 | summarise(ThousandBarrel = sum(ThousandBarrel)) 487 | 488 | p = ggplot(df,aes(x=Month,y=ThousandBarrel)) 489 | p + geom_line(stat='identity',aes(group=Year,col=Year)) + 490 | ggtitle('Oil Production By Year in the U.S.') + 491 | theme(plot.title = element_text(hjust = 0.5)) + 492 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) 493 | ``` 494 | 495 | ![plot of chunk unnamed-chunk-17](http://i.imgur.com/4OwEmjV.png) 496 | 497 | 498 | As always, the code used in this post is on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson) 499 | -------------------------------------------------------------------------------- /Data-Visualization-Lesson.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- 
/Data_Visualization_-_Part_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_1.pdf -------------------------------------------------------------------------------- /Data_Visualization_-_Part_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_2.pdf -------------------------------------------------------------------------------- /Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-5-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Part_3_files/figure-html/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /Data_Visualization_-_Tropical_Storms.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tropical Storm Data" 3 | author: "Scott Stoltzman" 4 | date: "9/12/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE, fig.align = 'center') 10 | ``` 11 | ```{r load_libraries} 12 | library(tidyverse) 13 | library(ggthemes) 14 | library(ggmap) 15 | library(htmlwidgets) 16 | ``` 17 | 18 | # Exploratory Data Analysis of Tropical Storms in R 19 | 20 | The disastrous impact of recent hurricanes, Harvey and Irma, generated a large influx of data within the online community. I was curious as to what the history of hurricanes and tropical storms looked like so I found a data set on [data.world](https://data.world/dhs/historical-tropical-storm) and started some basic Exploratory data analysis (EDA). 21 | 22 | 23 | EDA is crucial to starting any project. Through EDA you can start to identify errors & inconsistencies in your data, find interesting patterns, see correlations and start to develop hypotheses to test. For most people, basic spreadsheets and charts are pretty handy and provide a great place to start. They are an easy-to-use method to manipulate and visualize your data quickly. Data scientists may cringe at the idea of using a graphical user interface (GUI) to kick-off the EDA process but the reality is clear, those tools are very effective and efficient when used properly. However, if you're reading this, you're probably trying to take EDA to the next level. 
The best way to learn is to get your hands dirty, so let's get started. 24 | 25 | 26 | The original source of the data can be found at [DHS.gov](https://hifld-dhs-gii.opendata.arcgis.com/datasets/3ea21accbfab4ed8b14ede2e802cc2ec_0). 27 | 28 | 29 | ---- 30 | 31 | 32 | #### Step 1: Take a look at your data set and see how it is laid out 33 | 34 | 35 | ```{r read_data} 36 | # data source https://data.world/dhs/historical-tropical-storm 37 | data = read_csv('data/Historical_Tropical_Storm_Tracks.csv') 38 | knitr::kable(head(data)) 39 | ``` 40 | 41 | 42 | Fortunately, this is a tidy data set which will make life easier and appears to be cleaned up substantially. The column names are relatively straightforward with the exception of "ID" columns. 43 | 44 | The description as given by [DHS.gov](https://hifld-dhs-gii.opendata.arcgis.com/datasets/3ea21accbfab4ed8b14ede2e802cc2ec_0): 45 | 46 | >This dataset represents Historical North Atlantic and Eastern North Pacific Tropical Cyclone Tracks with 6-hourly (0000, 0600, 1200, 1800 UTC) center locations and intensities for all subtropical depressions and storms, extratropical storms, tropical lows, waves, disturbances, depressions and storms, and all hurricanes, from 1851 through 2008. These data are intended for geographic display and analysis at the national level, and for large regional areas. The data should be displayed and analyzed at scales appropriate for 1:2,000,000-scale data. 47 | 48 | 49 | #### Step 2: View some descriptive statistics 50 | 51 | ```{r} 52 | knitr::kable(summary(data %>% select(YEAR, 53 | MONTH, 54 | DAY, 55 | WIND_KTS, 56 | PRESSURE))) 57 | ``` 58 | 59 | 60 | We can confirm that this particular data had storms from 1851 - 2010, which means the data goes back roughly 100 years before naming storms started! We can also see that the minimum pressure values are 0, which likely means it could not be measured (since zero pressure is not possible in this case).
We can see that there are recorded months from January to December along with days extending from 1 to 31. Whenever you see all of the dates laid out that way, you can smile and think to yourself, "if I need to, I can put dates in an easy to use format such as YYYY-mm-dd (2017-09-12)!" 61 | 62 | 63 | #### Step 3: Make a basic plot 64 | 65 | 66 | ```{r} 67 | df = data %>% 68 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 69 | group_by(YEAR) %>% 70 | summarise(Distinct_Storms = n_distinct(NAME)) 71 | 72 | p = ggplot(df, aes(x = YEAR, y = Distinct_Storms)) + theme_economist() 73 | p + geom_line(size = 1.1) + 74 | ggtitle("Number of Storms Per Year") + 75 | geom_smooth(method='lm', se = FALSE) + 76 | ylab("Storms") 77 | ``` 78 | 79 | 80 | This is a great illustration of our data set and we can easily notice an upward trend in the number of storms over time. Before we go running to tell the world that the number of storms per year is growing, we need to drill down a bit deeper. This could simply be because more types of storms (we know there are hurricanes, tropical storms, waves, etc.) were recorded and added to the data set over time. However, we should keep it in mind when we start to develop hypotheses. 81 | 82 | 83 | **You will notice the data starts at 1950 rather than 1851.** I made this choice because storms were not named until this point so it would be difficult to try and count the unique storms per year. It could likely be done by finding a way to utilize the "ID" columns. However, this is a preliminary analysis so I didn't want to dig too deep.
84 | 85 | 86 | #### Step 4: Make some calculations 87 | 88 | ```{r} 89 | pct.diff = function(x){round((x-lag(x))/lag(x),2)} 90 | act.diff = function(x){round((x-lag(x)),2)} 91 | df = data %>% 92 | arrange(YEAR) %>% 93 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 94 | group_by(YEAR) %>% 95 | summarise(Distinct_Storms = n_distinct(NAME)) %>% 96 | mutate(Distinct_Storms_Change = act.diff(Distinct_Storms), 97 | Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) %>% 98 | na.omit() %>% 99 | arrange(YEAR) 100 | df$YEAR = factor(df$YEAR) 101 | knitr::kable(head(df,10)) 102 | ``` 103 | 104 | 105 | In this case, we can see the number of storms, nominal change and percentage change per year. These calculations help to shed light on what the growth rate looks like each year. So we can use another summary table: 106 | 107 | 108 | ```{r} 109 | knitr::kable(summary(df %>% select(-YEAR))) 110 | ``` 111 | 112 | From the table we can state the following for the given time period: 113 | 114 | * The mean number of storms is 23 per year (with a minimum of 6 and maximum of 43) 115 | * The mean change in the number of storms per year is 0.34 (with a minimum of -15 and maximum of 16) 116 | * The mean percent change in the number of storms per year is 6% (with a minimum of -42% and maximum of 114%) 117 | 118 | Again, we have to be careful because these numbers are in aggregate and may not tell the whole story. Dividing these into groups of storms is likely much more meaningful. 
119 | 120 | 121 | 122 | 123 | #### Step 5: Make a more interesting plot 124 | 125 | 126 | ```{r} 127 | df = data %>% 128 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 129 | filter(grepl("H", CAT)) %>% 130 | group_by(YEAR,CAT) %>% 131 | summarise(Distinct_Storms = n_distinct(NAME)) 132 | df$CAT = factor(df$CAT) 133 | 134 | p = ggplot(df, aes(x = YEAR, y = Distinct_Storms, col = CAT)) + theme_economist() 135 | p + geom_line(size = 1.1) + 136 | scale_color_brewer(direction = -1, palette = "Spectral") + 137 | ggtitle("Number of Storms Per Year By Category (H)") + 138 | facet_wrap(~CAT, scales = "free_x") + 139 | geom_smooth(method = 'lm', se = FALSE, col = 'black') + 140 | theme(axis.text.x = element_text(angle=90), legend.position = 'none') + 141 | ylab('Storms') 142 | ``` 143 | 144 | 145 | Because I was most interested in hurricanes, I filtered out only the data which was classified as "H (1-5)." By utilizing a data visualization technique called "small multiples" I was able to pull out the different types and view them within the same graph. While this is possible to do in tables and spreadsheets, it's much easier to visualize this way. By holding the axes constant, we can see the majority of the storms are classified as H1 and then it appears to consistently drop down toward H5 (with very few actually being classified as H5). We can also see that most have an upward trend from 1950 - 2010. The steepest appears to be H1 (but it also flattens out over the last decade). 
146 | 147 | 148 | #### Step 5: Make a filtered calculation 149 | 150 | ```{r} 151 | df = data %>% 152 | arrange(YEAR) %>% 153 | filter(grepl("H", CAT)) %>% 154 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 155 | group_by(YEAR) %>% 156 | summarise(Distinct_Storms = n_distinct(NAME)) %>% 157 | mutate(Distinct_Storms_Change = act.diff(Distinct_Storms), 158 | Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) %>% 159 | na.omit() %>% 160 | arrange(YEAR) 161 | knitr::kable(summary(df %>% select(-YEAR))) 162 | ``` 163 | 164 | 165 | Now we are looking strictly at hurricane data (classified as H1-H5): 166 | 167 | * The mean number of hurricanes is 13 per year (with a minimum of 4 and maximum of 24) 168 | * The mean change in the number of hurricanes per year is 0.05 (with a minimum of -11 and maximum of 10) 169 | * The mean percent change in the number of hurricanes per year is 8% (with a minimum of -56% and maximum of 180%) 170 | 171 | While it doesn't really make sense to say "we got an average growth of 0.05 hurricanes per year between 1950 and 2010" ... it may make sense to say "we saw an average growth of 8% per year in the number of hurricanes between 1950 and 2010." 172 | 173 | That's a great thing to put in quotes! 174 | 175 | > During EDA we discovered an average growth of 8% per year in the number of hurricanes between 1950 and 2010. 176 | 177 | Be ready, as soon as you make a statement like that, you will likely have to explain how you arrived at that conclusion. That's where having an RMarkdown notebook and data online in a repository will help you out! Reproducible research is all the hype right now. 
178 | 179 | 180 | #### Step 5: Try visualizing your statements 181 | 182 | ```{r} 183 | 184 | df = data %>% 185 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 186 | filter(grepl("H", CAT)) %>% 187 | group_by(YEAR) %>% 188 | summarise(Distinct_Storms = n_distinct(NAME)) %>% 189 | mutate(Distinct_Storms_Pct_Change = pct.diff(Distinct_Storms)) 190 | 191 | p = ggplot(df,aes(x = Distinct_Storms_Pct_Change)) + theme_economist() 192 | 193 | p1 = p + geom_histogram(bins = 20) + 194 | ggtitle("YoY % Change Density") + 195 | scale_x_continuous(labels = scales::percent) + 196 | ylab('') + xlab('YoY % Change in Hurricanes') 197 | 198 | p2 = p + geom_density(fill='darkgrey',alpha=0.5) + 199 | ggtitle("YoY % Change Density") + 200 | scale_x_continuous(labels = scales::percent) + 201 | ylab('') + xlab('YoY % Change in Hurricanes') 202 | 203 | gridExtra::grid.arrange(p1,p2,ncol=2) 204 | ``` 205 | 206 | 207 | A histogram and/or density plot is a great way to visualize the distribution of the data you are making statements about. This plot helps to show that we are looking at a right-skewed distribution with substantial variance. Knowing that we have n = 58 (meaning 58 years after being aggregated), it's not surprising that our histogram looks sparse and our density plot has an unusual shape. At this point, you can make a decision to jot this down, research it in depth and then attack it with full force. 208 | 209 | 210 | However, that's not what we're covering in this post. 211 | 212 | 213 | #### Step 6: Plot another aspect of your data 214 | 215 | 216 | ```{r} 217 | big_map <- get_googlemap(c(lon=-95, lat=30), zoom = 4, maptype = "terrain") 218 | ggmap(big_map, extent='panel') + 219 | geom_point(data = data, mapping = aes(x = LONG, y = LAT),col='red',alpha=0.1) 220 | ``` 221 | 222 | 223 | 60K pieces of data can get out of hand quickly, we need to back this down into manageable chunks. 
Building on the knowledge from our last exploration, we should be able to think of a way to cut this down to get some better information. The concept of small multiples could come in handy again! Splitting the data up by type of storm could prove to be invaluable. We can also tell that we are missing 224 | 225 | ----- 226 | 227 | ```{r} 228 | df = data %>% filter(grepl("H", CAT)) 229 | ggmap(big_map) + 230 | geom_density_2d(data = df, mapping = aes(x = LONG, y = LAT), size = 0.5) + 231 | stat_density2d(data = df, 232 | aes(x = LONG, y = LAT, fill = ..level.., alpha = ..level..), size = 0.1, 233 | bins = 20, geom = "polygon") + scale_fill_gradient(low = "green", high = "red", 234 | guide = FALSE) + scale_alpha(range = c(0.1, 0.5), guide = FALSE) + 235 | facet_wrap(~CAT) 236 | ``` 237 | 238 | 239 | After filtering the data down to hurricanes and utilizing a heatmap rather than plotting individual points we can get a better handle on what is happening where. The H4 and H5 sections are probably the most interesting. It appears as if H4 storms are more frequent on the West coast of Mexico whereas the H5 are most frequent in the Gulf of Mexico. 240 | 241 | 242 | Because we're still in EDA mode, we'll continue with another plot. 243 | 244 | 245 | ```{r} 246 | df = data %>% filter(!grepl("H", CAT) & !grepl("W", CAT)) 247 | ggmap(big_map) + 248 | geom_density_2d(data = df, mapping = aes(x = LONG, y = LAT), size = 0.5) + 249 | stat_density2d(data = df, 250 | aes(x = LONG, y = LAT, fill = ..level.., alpha = ..level..), size = 0.1, 251 | bins = 20, geom = "polygon") + scale_fill_gradient(low = "green", high = "red", 252 | guide = FALSE) + scale_alpha(range = c(0.1, 0.5), guide = FALSE) + 253 | facet_wrap(~CAT) 254 | ``` 255 | 256 | 257 | Here are some of the other storms from the data set. We can see that TD, TS and L have large geographical spreads. The E, SS, and SD storms are concentrated further North toward New England. 
258 | 259 | Digging into this type of data and building probabilistic models is a fascinating field. The actuarial sciences are extremely difficult and insurance companies really need good models. Having mapped this data, it's pretty clear you could dig in and find out what parts of the country should expect what types of storms (and you've also known this just from being alive for 10+ years). More hypotheses could be formed about location at this stage and could be tested! 260 | 261 | 262 | #### Step 7: Look for a relationship 263 | 264 | 265 | ```{r} 266 | df = data %>% 267 | filter(PRESSURE > 0) %>% 268 | filter(grepl("H", CAT)) %>% 269 | group_by(CAT,YEAR,MONTH,DAY,LAT,LONG) %>% 270 | summarise(MEAN_WIND_KTS = mean(WIND_KTS), MEAN_PRESSURE = mean(PRESSURE)) %>% 271 | arrange(MEAN_WIND_KTS) 272 | df$CAT = factor(df$CAT) 273 | 274 | p = ggplot(df,aes(x=MEAN_WIND_KTS, y = MEAN_PRESSURE, fill = CAT)) + theme_economist() 275 | p + 276 | geom_hex(alpha = 0.8) + 277 | scale_fill_brewer(direction = -1, palette = "Spectral") + 278 | scale_y_continuous(labels = scales::comma)+ 279 | theme(legend.position = 'right') + 280 | ggtitle("Wind KTS vs. Pressure by Category (H)") 281 | ``` 282 | 283 | 284 | What is the relationship between WIND_KTS and PRESSURE? This chart helps us to see that PRESSURE and WIND_KTS are likely negatively correlated. We can also see that the WIND_KTS is essentially the predictor in the data set which can perfectly predict how a storm is classified. Well, it turns out, that's basically the distinguishing feature when scientists are determining how to categorize these storms! 285 | 286 | 287 | #### Step ........ 288 | 289 | The rest is up to you! This is a great data set and there are a lot more pieces of information lurking within it. I want people to do their own EDA and send me anything interesting! 290 | 291 | Some food for thought: 292 | 293 | * What was the most common name for a hurricane? 
294 | * Do the names actually follow an alphabetical pattern through time? (This one is tricky) 295 | * Can we merge this data with FEMA, charitable donations, or other aid data? 296 | 297 | 298 | To get you started on the first one, here's the Top 10 most common names for tropical storms. Why do you think it's Florence? 299 | 300 | 301 | ```{r} 302 | top_names = data %>% 303 | filter(NAME != 'NOTNAMED' & NAME != 'SUBTROP1') %>% 304 | group_by(NAME) %>% 305 | summarise(Years_Used = n_distinct(YEAR)) %>% 306 | arrange(-Years_Used) 307 | p = ggplot(top_names %>% top_n(10), aes(x = reorder(NAME, Years_Used), y = Years_Used)) + theme_economist() 308 | p + geom_bar(stat='identity') + coord_flip() + xlab('') + ggtitle('Most Used Tropical Storm Names') 309 | ``` 310 | 311 | 312 | Thank you for reading, I hope this helps you with your own data. The code is all written in R and is located on my [GitHub](https://github.com/stoltzmaniac/Data-Visualization-Lesson). You can also find other data visualization posts and usages of ggplot2 on my blog [Stoltzmaniac](https://www.stoltzmaniac.com?utm_campaign=bottom_of_tropical_storm_post) 313 | -------------------------------------------------------------------------------- /Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /Data_Visualization_-_Tropical_Storms_files/figure-html/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- 
# This file will prepare the data
# Data was originally found on data.world
# http://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_m.htm

library(dplyr)
library(tidyr)
library(stringr)
library(zoo)

# Load the monthly crude-oil production CSV and reshape it from one column per
# location into one row per (location, month).
#
# Args:
#   path: CSV file to read. Defaults to the file name that used to be
#         hard-coded, so existing getData() calls behave as before.
#
# Returns:
#   A data.frame with columns Location, Month (factor with levels month.abb),
#   Year, ThousandBarrel (numeric) and Date (Date built from Month and Year).
#   Only rows whose Location matches an entry of state.name are kept.
getData = function(path = 'Monthly Crude Oil Production by State 1981 - Nov 2016.csv'){

  # Read text as character: on R < 4.0 the default is factors, and calling
  # as.numeric() on a factor returns level codes instead of the values.
  data = read.csv(path, stringsAsFactors = FALSE)

  # Drop the non-data rows of the export (navigation link, source keys,
  # blank lines). The is.na() guard avoids the all-NA rows that `!=` produces
  # when Date is missing.
  skip_rows = c('Back to Contents', 'Sourcekey', '')
  data = data[!is.na(data$Date) & !(data$Date %in% skip_rows), ]

  # Convert data: wide -> long, then split "Mon-Year" into its two parts
  df = data %>%
    gather(Location, ThousandBarrel, -Date) %>%
    separate(Date, c('Month', 'Year'), "-")
  df = data.frame(df, stringsAsFactors = FALSE)
  df$Date = as.Date(as.yearmon(paste(df$Month, df$Year)))

  # Strip the common header suffix so only the location name remains.
  location = sub('.Field.Production.of.Crude.Oil..Thousand.Barrels.', '',
                 as.character(df$Location))
  # read.csv (check.names = TRUE) replaces spaces in headers with dots, so
  # "North Dakota" arrives as "North.Dakota". Restore the spaces, otherwise
  # every multi-word state fails to match state.name and is silently dropped
  # by the merge below.
  df$Location = trimws(gsub('.', ' ', location, fixed = TRUE))

  # as.character() first so a factor column can never be turned into codes.
  df$ThousandBarrel = as.numeric(as.character(df$ThousandBarrel))

  # Inner join against the built-in state list keeps states only (drops
  # national / regional aggregate columns).
  States = data.frame(Location = as.character(state.name),
                      stringsAsFactors = FALSE)

  df = merge(df, States, by = 'Location')
  df$Month = factor(df$Month, levels = month.abb)

  return(df)

}
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-13-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-13-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- 
/figure/unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-20-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-20-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-4-1.png 
-------------------------------------------------------------------------------- /figure/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /figure/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/figure/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /ggmapTemp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/ggmapTemp.png 
-------------------------------------------------------------------------------- /images/bad-pie1-fix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/bad-pie1-fix.png -------------------------------------------------------------------------------- /images/bad-pie1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/bad-pie1.png -------------------------------------------------------------------------------- /images/chart_vs_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/chart_vs_text.png -------------------------------------------------------------------------------- /images/lie_chart_bad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/lie_chart_bad.png -------------------------------------------------------------------------------- /images/lie_chart_fixed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/lie_chart_fixed.png -------------------------------------------------------------------------------- /images/tg_tb_tu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/tg_tb_tu.jpg 
-------------------------------------------------------------------------------- /images/tg_tb_tu.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/tg_tb_tu.xcf -------------------------------------------------------------------------------- /images/title_photo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/title_photo.png -------------------------------------------------------------------------------- /images/title_photo_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/title_photo_2.png -------------------------------------------------------------------------------- /images/title_photo_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stoltzmaniac/Data-Visualization-Lesson/3431eaf40bd116330a9ab89d29fef49180611e3a/images/title_photo_3.png --------------------------------------------------------------------------------