├── SUB-IP-EST2019-ANNRES-06.xlsx ├── Project_Notes.md ├── README.md ├── .gitignore ├── ProjectProposal_Group099_WI24.ipynb └── DataCheckpoint_Group099_WI24.ipynb /SUB-IP-EST2019-ANNRES-06.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/COGS108/Group099_WI24/master/SUB-IP-EST2019-ANNRES-06.xlsx -------------------------------------------------------------------------------- /Project_Notes.md: -------------------------------------------------------------------------------- 1 | # COGS 108 Group 99 Project 2 | 3 | ## Project Ideas: 4 | - animals return to shelter during the holidays (?) and the income of the county/city 5 | - gender composition on campus and the number of cafes (?) within a number mile radius 6 | 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is your group repo for your final project for COGS108. 2 | 3 | This repository is private, and is only visible to the course instructors and your group mates; it is not visible to anyone else. 4 | 5 | Template notebooks for each component are provided. Only work on the notebook prior to its due date. After each submission is due, move onto the next notebook (For example, after the proposal is due, start working in the Data Checkpoint notebook). 6 | 7 | This repository will be frozen on the final project due date. No further changes can be made after that time. 8 | 9 | Your project proposal and final project will be graded based solely on the corresponding project notebooks in this repository. 10 | 11 | Template Jupyter notebooks have been included, with your group number replacing the XXX in the following file names. 
For each due date, make sure you have a notebook present in this repository by each due date with the following name (where XXX is replaced by your group number): 12 | 13 | - `ProjectProposal_groupXXX.ipynb` 14 | - `DataCheckpoint_groupXXX.ipynb` 15 | - `EDACheckpoint_groupXXX.ipynb` 16 | - `FinalProject_groupXXX.ipynb` 17 | 18 | This is *your* repo. You are free to manage the repo as you see fit, edit this README, add data files, add scripts, etc. So long as there are the four files above on due dates with the required information, the rest is up to you all. 19 | 20 | Also, you are free and encouraged to share this project after the course and to add it to your portfolio. Just be sure to fork it to your GitHub at the end of the quarter! 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | -------------------------------------------------------------------------------- /ProjectProposal_Group099_WI24.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# COGS 108 - Project Proposal" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Names\n", 15 | "\n", 16 | "- Nicole Kim\n", 17 | "- Rikako Ono\n", 18 | "- Geena Limfat\n", 19 | "- MyungJoo Kim\n", 20 | "- Elizaveta Beltyukova" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# Research Question" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "How can we predict when car accidents are most likely to happen in the United States? What factors can we depend on to make an assumption?\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Background and Prior Work" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "\n", 49 | "Over the years, cars have become safer whether it be through improvements in structural design or the development of new technologies.\n", 50 | "Despite such progressions, however, the frequency of car accidents has risen significantly over the last several years.\n", 51 | "These include less researched technology, regenerative braking, increased total mass, and faster acceleration.\n", 52 | "Specifically, there were 6,102,936 police-reported vehicle accidents in the United States in just the year 2021 itself[3](#cite_note-3).\n", 53 | "\n", 54 | "Interestingly, the factors influencing car crashes in the United States are quite diverse. 
Accidents can be the result of negative behaviors such as distracted driving, speeding, drunk driving, reckless driving, and tailgating[1](#cite_note-1).\n", 55 | "In fact, more than a third (36%) of all fatal crashes involve alcohol[3](#cite_note-3).\n", 56 | "All of these factors contribute to the high rate of car accidents in the US, making it crucial for individuals and authorities to address these issues through awareness, preventive measures, and effective policies to improve road safety. \n", 57 | "\n", 58 | "Accidents can even correlate to variables as general as the state one is in, to those as specific as the month, day, and even hour. For instance, there is previous work that has been done to analyze the frequency of car crashes based on days and times of the week.\n", 59 | "This analysis was conducted by NSC, which used data from 2021 and analyzed the frequency of fatal and non-fatal car crashes based on the time of the day and week.\n", 60 | "They concluded that for warmer months (spring and summer), fatal crashes peaked in the late evening and night (8 p.m. and midnight), meanwhile, for winter and early spring, the fatal crashes peaked between 4 and 8 p.m.[2](#cite_note-2)\n", 61 | "Another observation that could be extracted from their analysis is that more crashes tended to happen closer to the end of the week (Friday to Sunday). This will be interesting to contrast with our analysis later on to see if it’s consistent with our findings for the years 2016-2020.\n", 62 | "1. [^](#cite_ref-1) [*Top 25 causes of car accidents: Exploring the major factors.* GJEL Accident Attorneys. (2023, November 24).](https://www.gjel.com/car-accident-lawyers/top-causes-car-accidents/) \n", 63 | "2. [^](#cite_ref-2) [*Car crashes by time of day and day of week.* Injury Facts. (2023, April 18). ](https://injuryfacts.nsc.org/motor-vehicle/overview/crashes-by-time-of-day-and-day-of-week/) \n", 64 | "3. [^](#cite_ref-3) [Moore, T. 
*Fatal car crash statistics 2024.* USA Today. (2024, January 16).](https://www.usatoday.com/money/blueprint/auto-insurance/fatal-car-crash-statistics/#:~:text=There%20are%20nearly%2043%2C000%20fatal,accidents%20in%20the%20United%20States.&text=Of%20those%2C%2039%2C508%20were%20fatal) \n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Hypothesis\n" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "\n", 79 | "We hypothesize that car accidents will happen most frequently around the holiday season because of various factors such as the increase of vacations which leads to an increase in traffic, and the influence of alcohol.\n", 80 | "Additionally, we feel that there will be more accidents on weekday mornings, due to a high volume of commuters to work and school.\n", 81 | "In particular, we feel that innovative features such as autopilot and regenerative braking can negatively affect electric vehicle drivers.\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Data" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "Variables: Time of Day, Days of the week, Month, Alcohol involvement, Number of pedestrians, Severity of the accident \n", 96 | "\n", 97 | "The ideal data set would contain the specific variables that we are observing along with any additional information that may help reduce biases and the location would be clearly stated to help reduce biases. The dataset would span multiple years in the United States as it would allow us to compare the data collected and identify any possible trends. The data should compare traffic statistics on various timeline bases, including daily, weekly, monthly, yearly, etc. 
We should be able to compare correlations between variables such as alcohol involvement, number of pedestrians, and severity of the accident (measured by the number of people injured). \n", 98 | "\n", 99 | "Ideally, the data collected would be numeric so the data will be cleaner and easier to represent. There would also be a descriptive key/scale that tells us how to interpret the data. The data should be collected by the National Highway Traffic Safety Administration (NHTSA) or any other official government department specializing in this area. Roughly 259,000 observations are required as NHTSA collected 259,077 US Traffic Accident data sets from 2016-2020. The data should be organized in CSV file format, with no null or empty cells and with consistent variable names, so that it is tidy data. \n", 100 | "\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "# Ethics & Privacy" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "There should not be an issue with any biases from the source of collection because the agency that collected the data was the NHTSA which is the National Highway Traffic Safety Administration. Since this is a reputable government organization in charge of collecting and organizing the data, we can assume that there are no demographic/ethnic biases when collecting the data. However, there may be biases in which accidents were reported to the government as there may be racial/ethnic biases when it came to reporting the accidents to the government. This may exclude certain demographics of people or target others, but as we are not focusing on race/demographics, it should not affect our data analysis as much. Additionally, since the data is vastly spread across the country and over four years, the data should even itself out. 
The large dataset will also aid in protecting the privacy of the people involved in the accidents as they fall under police jurisdiction regions but do not specify exactly where the accident occurred. \n", 115 | "\n", 116 | "One factor we may fail to fully consider is the location and its impact on each accident.\n", 117 | "For example, there may be weather conditions or other variables that we are unable to record that may be the main reason for the accident, yet we may reach a different conclusion from the information we analyzed. However, this should still be acceptable as the main goal of our hypothesis and research is to find out how certain factors consistently result in an accident. In other words, our focus is more on concluding if there will be an accident if a certain variable is present, rather than which variable is the most crucial out of all variables to lead to an accident.\n", 118 | "\n", 119 | "In the dataset that we are using, we must also take into account the COVID-19 Pandemic as the dataset states that between the months of March and May the data will be more sparse because there were fewer people on the road. We will take into account this bias in our analysis as we understand that trends may be skewed because of this event.\n", 120 | "\n", 121 | "We will detect these specific biases before analysis by thoroughly discussing our research question before finding datasets and identifying any additional variables that would skew our data and affect our question. During data analysis, we will identify any points in the data that do not match up with the rest of the data and conduct additional research to identify whether those specific data points are abnormalities or whether they are biases that we did not take into account before. Then, we will explain the abnormalities/biases in our analysis and discuss how this affects our data. 
After we have already written our analysis of the data, we will proofread our conclusion for any other biases that may stand out and if they do occur, we will revise and address them again. Additionally, we may conduct a peer review (if allowed) with another group so we can have unbiased feedback. \n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "# Team Expectations " 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "\n", 136 | "* *Keep in touch with group members via iMessage or Discord group chat.*\n", 137 | "* *Fill out When2Meets on time in order to coordinate meeting times.*\n", 138 | "* *Complete individually assigned tasks on time.*\n", 139 | "* *Communicate promptly if any change of plans arises.*\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "# Project Timeline Proposal" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "| Meeting Date | Meeting Time| Completed Before Meeting | Discuss at Meeting |\n", 154 | "|---|---|---|---|\n", 155 | "| 2/3 | 12 PM | Determine best form of communication | Complete Previous Project Review Assignment | \n", 156 | "| 2/7 | 6:30 PM | Brainstorm ideas for project | Work on Project Proposal Assignment | \n", 157 | "| 2/12 | 8:30 PM | Brainstorm any new ideas for project + potential datasets | Finish Project Proposal Assignment|\n", 158 | "| 2/17 | 12 PM | Find Data and work on Research | Work on Checkpoint #1: Data |\n", 159 | "| 2/25 | 6:30 PM | Import & Wrangle Data | Complete Checkpoint #1: Data |\n", 160 | "| 3/2 | 12 PM | Work on Assigned Tasks | Work on Checkpoint #2: EDA |\n", 161 | "| 3/10 | 6:30 PM | Complete analysis; Draft results/conclusion/discussion | Complete Checkpoint #2: EDA |\n", 162 | "| 3/? 
| 12 PM | Work on assigned Tasks | Work together to discuss and wrap up the Final Project |\n", 163 | "| 3/20 | Before 11:59 PM | Make final edits and prepare to turn in Final Project | Turn in Final Project & Group Project Surveys |" 164 | ] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3 (ipykernel)", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.9.7" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 2 188 | } 189 | -------------------------------------------------------------------------------- /DataCheckpoint_Group099_WI24.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**If you lost points on the last checkpoint you can get them back by responding to TA/IA feedback** \n", 8 | "\n", 9 | "Update/change the relevant sections where you lost those points, make sure you respond on GitHub Issues to your TA/IA to call their attention to the changes you made here.\n", 10 | "\n", 11 | "Please update your Timeline... no battle plan survives contact with the enemy, so make sure we understand how your plans have changed." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# COGS 108 - Data Checkpoint" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Names\n", 26 | "\n", 27 | "- Nicole Kim\n", 28 | "- Rikako Ono\n", 29 | "- Geena Limfat\n", 30 | "- MyungJoo Kim\n", 31 | "- Elizaveta Beltyukova" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# Research Question" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Does higher usage of public transportation in a county affect the number of car crashes in that county? \n", 46 | "\n", 47 | "Additionally, does higher use of public transportation lessen the severity of the car accident?\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Background and Prior Work" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "\n", 62 | "Over the years, advancements in car safety, including structural design improvements and technological developments, have enhanced vehicle safety.\n", 63 | "Despite these progressions, the frequency of car accidents has increased significantly in recent years.\n", 64 | "In the United States alone, there were 6,102,936 police-reported vehicle accidents in 2021[2](#cite_note-2).\n", 65 | "Various factors contribute to car crashes in the US, such as distracted driving, speeding, drunk driving, reckless driving, and tailgating.\n", 66 | "Notably, more than a third (36%) of all fatal crashes involve alcohol[2](#cite_note-2).\n", 67 | "\n", 68 | "These behaviors collectively contribute to the high rate of car accidents in the US, underlining the importance of addressing these issues through awareness campaigns, preventive measures, and effective policies to enhance road safety.\n", 69 | "\n", 70 | "Public transportation serves as a vital component 
of urban areas by offering consistent transportation services through modes like buses, streetcars, light rail, ferries, and subways.\n", 71 | "This mode of transportation is essential for diverse groups such as older adults, individuals with disabilities, and commuters, particularly in major cities where public transit is prevalent. \n", 72 | "Public transportation not only provides a means of travel but also contributes to health and equity in various ways.\n", 73 | "Research indicates that public transportation can reduce traffic accidents and air pollution while increasing physical activity and improving access to essential services like medical care, healthy food, employment opportunities, and social connections[1](#cite_note-1). \n", 74 | "Despite these benefits, it remains unexplored whether the utilization of public transportation correlates with the severity of car crashes in an area.\n", 75 | "\n", 76 | "To investigate the relationship between public transport usage and the number and severity of car crashes in an area, one could explore how increased public transportation utilization might lead to reduced traffic congestion and fewer private vehicles on the road. \n", 77 | "However, it’s also possible that less traffic congestion could encourage drivers to travel at higher speeds, thus being more likely to cause a more severe/fatal accident in the area.\n", 78 | "Another factor to consider is that with increased public transport use, there could be more pedestrians, and therefore potentially more car-caused pedestrian injuries or deaths. \n", 79 | "\n", 80 | "1. [^](#cite_ref-1) [Evaluating Public Transportation Health Benefits. (n.d.).](https://www.vtpi.org/tran_health.pdf) \n", 81 | "2. [^](#cite_ref-2) [Moore, T. (2024, January 16). Fatal car crash statistics 2024. 
USA Today.](https://www.usatoday.com/money/blueprint/auto-insurance/fatal-car-crash-statistics/#:~:text=There%20are%20nearly%2043%2C000%20fatal,accidents%20in%20the%20United%20States.&text=Of%20those%2C%2039%2C508%20were%20fatal) \n", 82 | "3. [^](#cite_ref-3) [Top 25 causes of car accidents: Exploring the major factors. GJEL Accident Attorneys. (2023, November 24).](https://www.gjel.com/car-accident-lawyers/top-causes-car-accidents) \n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "# Hypothesis\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "\n", 97 | "Alternative Hypothesis: We hypothesize that higher use of public transportation lessens the number of accidents due to several factors. One factor is that public transportation involves professional drivers who go through rigorous training to operate the mode of transportation that they are in charge of and may adhere more strictly to traffic regulations and safety protocols which can reduce the likelihood of accidents. Therefore, they will be better experienced and equipped to handle situations that may cause accidents. Additionally, a higher usage of public transportation will result in fewer vehicles on the road which can reduce congestion and lower the probability of accidents. Thus, we hypothesize that a higher utilization of public transportation could contribute to a safer transportation environment and reduce the number of accidents on the road. In addition to this, we believe that higher use of public transportation will also lessen the severity of car accidents that occur. A higher use of public transportation indicates that there will be fewer vehicles on the road which can potentially reduce the number of reckless drivers on the road as well. Fewer reckless drivers on the road can lessen the severity of accidents. 
Since there are also fewer cars on the road, there would be fewer cars to crash into each other and create a 'domino effect' and involve other vehicles, thus involving fewer people and potentially lessening the overall severity of the entire crash. Additionally, public transportation drivers are better equipped to handle accidents as they have more training so they can better react to the situation and potentially lessen the severity of the crash.\n", 98 | "\n", 99 | "Null Hypothesis: The use of public transportation does not affect the number of car crashes. It also does not affect the severity of the accident." 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# Data" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## Data overview\n", 114 | "\n", 115 | "- Dataset #1\n", 116 | " - Dataset Name: US Accidents (2016 - 2023)\n", 117 | " - Link to the dataset: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents?select=US_Accidents_March23.csv\n", 118 | " - Number of observations: 7,728,394\n", 119 | " - Number of variables: 46\n", 120 | "- Dataset #2\n", 121 | " - Dataset Name: Highest Public Transit Usage Cities in California\n", 122 | " - Link to the dataset: https://www.homearea.com/rankings/place-in-ca/percent_using_public_transportation/#:~:text=The%20California%20percent%20using%20public,year%20saw%20several%20big%20changes\n", 123 | " - Number of observations: 62\n", 124 | " - Number of variables: 3\n", 125 | "\n", 126 | "The first dataset covers US traffic accidents, encompassing 7,728,394 observations recorded between 2016 and 2023.\n", 127 | "The main features in this dataset include the name of City, start time, description of the accident, and severity.\n", 128 | "The second dataset focuses on public transportation usage in California, specifically in 62 cities with an assumed population of around 60,000 in CA.\n", 129 | "The number of 
observations could be estimated with the population of CA and the probability of using public transportation on average, 10%.\n", 130 | "\n", 131 | "Estimated Observation = np = (population) X ((average percentage of public transportation use)/100).\n", 132 | "This yields approximately 6,000 observations. Relevant features in the second dataset include City Name, and the percentage of public transportation use in 2017.\n", 133 | "\n", 134 | "We propose combining the subset of the first dataset, focusing only on the region of California in 2017, with the second dataset to address the impact of public transportation on the car accident rate in California and other potential factors.\n", 135 | "This combined dataset allows a comprehensive analysis of the relationship between public transportation use, and various factors influencing car accidents in California.\n", 136 | "\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Data Cleaning\n", 144 | "\n", 145 | "### Dataset #1\n", 146 | "\n", 147 | "This dataset is somewhat clean. The dataset is consistent and uniform throughout. There is a small issue with completeness, as there is some missing data. \n", 148 | "The dataset's description states, “There is missing data for certain days, which could be due to network connectivity issues during data collection.” \n", 149 | "However, for the values that do remain, we are confident that they are accurate and reliable. \n", 150 | "In the description, it states “The data was collected from APIs broadcast traffic data captured by various entities, including the US and state departments of transportation, law enforcement agencies, traffic cameras, and traffic sensors within the road networks.” \n", 151 | "Because APIs have low latency, real-time data processing, and quick response times, we can conclude that the data is precise. 
\n", 152 | "We can also conclude from this excerpt that the sources are credible and trustworthy.\n", 153 | "\n", 154 | "Getting the data into a usable format was very difficult. The downloaded file was very large so at first we were unable to upload it to GitHub properly due to storage limitations. \n", 155 | "Even when loading the file into Excel or Visual Studio Code, the file was very heavy and laggy. We had to condense and clean the file many times to be able to finally upload and wrangle the data. \n", 156 | "\n", 157 | "In terms of pre-processing, we will clean the data by removing any missing inputs. \n", 158 | "We will attempt to make the file smaller so that it can be cleaned and uploaded to GitHub. \n", 159 | "We have confirmed that the dataset included at least a year’s worth of data to observe trends for appropriate time frames. \n", 160 | "We also checked that the source was reliable. \n", 161 | "\n", 162 | "\n", 163 | "### Dataset #2\n", 164 | "\n", 165 | "This dataset is very clean. This is mainly because we created our own data set. Thus, we had full control over what variables, observations, and units of measurement were included.\n", 166 | "We had full control of the quality of the data set. We made sure all the data was consistent and complete.\n", 167 | "We ensured the credibility of the data, by making sure it was taken from a reputable source like the U.S. 
Census Bureau American Community Survey.\n", 168 | "Unfortunately, the data may have small holes in precision, as survey participation is often voluntary.\n", 169 | "However, the source states that certain cities “were ranked the previous year but did not have sufficient data or population for the most recent rankings.”\n", 170 | "From this we can conclude that the source has only published results that had solid data to support it, thus making the research conducted accurate.\n", 171 | "\n", 172 | "To get the data into a usable format we had to take research and statistics from various websites related to our study.\n", 173 | "We then had to transform the file type twice. First, we inputted data into an Excel spreadsheet for ease of viewing and editing.\n", 174 | "Second, we converted the file into a CSV to be read by pandas.\n", 175 | "\n", 176 | "In terms of pre-processing, we cleaned the data as we entered it into our dataset. We checked that it included at least a year’s worth of data to observe trends for appropriate time frames.\n", 177 | "We checked that it was consistent with our other data set. 
We checked that it was a trustworthy source.\n", 178 | "\n" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "### Set up" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 1, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "import pandas as pd\n", 195 | "import numpy as np" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## Dataset #1 - US Accidents (2016 - 2023)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Because the data was incredibly large, we have filtered the data before uploading to GitHub" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 2, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/html": [ 220 | "
| \n", 238 | " | Start_Time | \n", 239 | "Severity | \n", 240 | "City | \n", 241 | "Description | \n", 242 | "
|---|---|---|---|---|
| 0 | \n", 247 | "2017-01-01 00:03:31 | \n", 248 | "3 | \n", 249 | "Norwalk | \n", 250 | "Accident on I-5 Northbound at Exits 120 120A B... | \n", 251 | "
| 1 | \n", 254 | "2017-01-01 00:09:26 | \n", 255 | "3 | \n", 256 | "Lynwood | \n", 257 | "Accident on I-710 Southbound at Exits 12 12A 1... | \n", 258 | "
| 2 | \n", 261 | "2017-01-01 00:09:52 | \n", 262 | "3 | \n", 263 | "Hesperia | \n", 264 | "Accident on I-15 Northbound at Exit 138 Oak Hi... | \n", 265 | "
| 3 | \n", 268 | "2017-01-01 00:10:14 | \n", 269 | "2 | \n", 270 | "Pasadena | \n", 271 | "Accident on CA-110 Southbound at Glenarm St. | \n", 272 | "
| 4 | \n", 275 | "2017-01-01 00:11:14 | \n", 276 | "3 | \n", 277 | "Colton | \n", 278 | "Accident on I-10 Eastbound at Exit 72 I-215. | \n", 279 | "
| \n", 343 | " | City | \n", 344 | "Percent Using Transit | \n", 345 | "
|---|---|---|
| 0 | \n", 350 | "San Francisco | \n", 351 | "0.347 | \n", 352 | "
| 1 | \n", 355 | "Oakland | \n", 356 | "0.227 | \n", 357 | "
| 2 | \n", 360 | "Berkeley | \n", 361 | "0.221 | \n", 362 | "
| 3 | \n", 365 | "Daly City | \n", 366 | "0.207 | \n", 367 | "
| 4 | \n", 370 | "Alameda | \n", 371 | "0.185 | \n", 372 | "
| ... | \n", 375 | "... | \n", 376 | "... | \n", 377 | "
| 57 | \n", 380 | "Lakewood | \n", 381 | "0.008 | \n", 382 | "
| 58 | \n", 385 | "San Buenaventura (Ventura) | \n", 386 | "0.008 | \n", 387 | "
| 59 | \n", 390 | "Irvine | \n", 391 | "0.008 | \n", 392 | "
| 60 | \n", 395 | "Visalia | \n", 396 | "0.006 | \n", 397 | "
| 61 | \n", 400 | "Napa | \n", 401 | "0.005 | \n", 402 | "
62 rows × 2 columns
\n", 406 | "