├── MAPS
│   ├── usa_shapefile
│   │   ├── cb_2017_us_nation_5m.cpg
│   │   ├── cb_2017_us_nation_5m.dbf
│   │   ├── cb_2017_us_nation_5m.shp
│   │   ├── cb_2017_us_nation_5m.shx
│   │   ├── cb_2017_us_nation_5m.prj
│   │   ├── cb_2017_us_nation_5m.shp.ea.iso.xml
│   │   ├── cb_2017_us_nation_5m.shp.xml
│   │   └── cb_2017_us_nation_5m.shp.iso.xml
│   ├── cb_2017_us_county_20m
│   │   ├── cb_2017_us_county_20m.cpg
│   │   ├── cb_2017_us_county_20m.dbf
│   │   ├── cb_2017_us_county_20m.shp
│   │   ├── cb_2017_us_county_20m.shx
│   │   ├── cb_2017_us_county_20m.prj
│   │   ├── cb_2017_us_county_20m.shp.ea.iso.xml
│   │   ├── cb_2017_us_county_20m.shp.xml
│   │   └── cb_2017_us_county_20m.shp.iso.xml
│   └── .DS_Store
├── .DS_Store
├── Project Report.pdf
├── datasets
│   └── socialmedia-disaster-tweets-DFE.csv
├── README.md
├── 1 - Twitter_API_to_DataFrame.ipynb
└── 4 - Model_hurricane_michael_tweets.ipynb

--------------------------------------------------------------------------------
/MAPS/usa_shapefile/cb_2017_us_nation_5m.cpg:
--------------------------------------------------------------------------------
1 | UTF-8

--------------------------------------------------------------------------------
/MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.cpg:
--------------------------------------------------------------------------------
1 | UTF-8

--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/.DS_Store

--------------------------------------------------------------------------------
/MAPS/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/MAPS/.DS_Store

--------------------------------------------------------------------------------
/Project Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/Project Report.pdf

--------------------------------------------------------------------------------
/MAPS/usa_shapefile/cb_2017_us_nation_5m.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/MAPS/usa_shapefile/cb_2017_us_nation_5m.dbf

--------------------------------------------------------------------------------
/MAPS/usa_shapefile/cb_2017_us_nation_5m.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/MAPS/usa_shapefile/cb_2017_us_nation_5m.shp

--------------------------------------------------------------------------------
/MAPS/usa_shapefile/cb_2017_us_nation_5m.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/MAPS/usa_shapefile/cb_2017_us_nation_5m.shx

--------------------------------------------------------------------------------
/datasets/socialmedia-disaster-tweets-DFE.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/datasets/socialmedia-disaster-tweets-DFE.csv
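The shapefiles under MAPS appear to be the basemaps for plotting the geolocated tweets collected in "1 - Twitter_API_to_DataFrame.ipynb". A minimal sketch of how the two could be combined, assuming geopandas and matplotlib are installed; the JSON file name and the 'long_lat' column follow the schema that notebook writes out:

import geopandas as gpd
import pandas as pd

# County boundaries; the .prj files below say these use NAD83 geographic coordinates.
counties = gpd.read_file("MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.shp")

# Tweets saved by notebook 1; 'long_lat' holds [longitude, latitude] pairs.
tweets = pd.read_json("tweets_df_panama_city_25mi_oct.json", orient="records")
tweets[["lon", "lat"]] = pd.DataFrame(tweets["long_lat"].tolist(), index=tweets.index)

points = gpd.GeoDataFrame(
    tweets,
    geometry=gpd.points_from_xy(tweets["lon"], tweets["lat"]),
    crs="EPSG:4326",  # Twitter coordinates are WGS84 longitude/latitude
)

# Reproject the tweet points to the counties' CRS so the two layers line up.
ax = counties.plot(figsize=(12, 8), color="lightgrey", edgecolor="white")
points.to_crs(counties.crs).plot(ax=ax, markersize=4, color="red")

NAD83 and WGS84 are nearly identical at this display scale, so the reprojection is almost a no-op, but keeping it explicit avoids CRS-mismatch warnings.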
--------------------------------------------------------------------------------
/MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.dbf

--------------------------------------------------------------------------------
/MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.shp

--------------------------------------------------------------------------------
/MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cameronbronstein/Project-4-New-Light-Technologies-Client-Project/HEAD/MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.shx

--------------------------------------------------------------------------------
/MAPS/usa_shapefile/cb_2017_us_nation_5m.prj:
--------------------------------------------------------------------------------
1 | GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137,298.257222101]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]

--------------------------------------------------------------------------------
/MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.prj:
--------------------------------------------------------------------------------
1 | GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137,298.257222101]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Executive Summary
2 | 
3 | ### Problem Statement
4 | **How can we leverage natural language processing of social media posts to map natural disasters?**
5 | 
6 | Goal: Flag specific social media messages that have "Urgent Help" content and map their location.
7 | - Identify people who need help from the urgency of their social media posts.
8 | - Pinpoint their specific location and alert emergency response agencies.
9 | 
10 | ### Data & Modeling
11 | **Steps:**
12 | - We collected data from Twitter's API and from third-party databases (Figure Eight and Kaggle).
13 | - Twitter API: Data accessed within 20 miles of Mexico Beach and 25 miles of Panama City, Florida, during the first two weeks of October 2018, the weeks surrounding the landfall of Hurricane Michael (the API caps the search radius at 25 miles).
14 | - Only geolocated tweets were used for this portion of the project.
15 | - We examined the data and manufactured our own labels for classification, based on the prevalence of urgent, traumatic language. These labels were proxies for "Urgent" and "Non-Urgent"; the original data was not labeled in this manner. Non-urgent language in this dataset typically came from transcribed and translated phone calls from impoverished, developing areas after natural disasters.
16 | - We preprocessed the text (tokenizing, lemmatizing/stemming) to prepare it for classification with Bag-of-Words Natural Language Processing and Logistic Regression models.
17 | 
18 | **Modeling**
19 | - First model: Identify social media messages from people in urgent need of help after a natural disaster. The model could be applied to a specific location based on current maps or information regarding the location of natural disaster impact. Geolocation must be enabled for flagged posts to be mapped.
20 | - Second model: Given a time frame and radius, predict whether a disaster occurred based on the prevalence of "urgent help" language. This model could provide an initial scan of broader geographic ranges for more selective deployment of a robust natural language processing model, like model 1.
21 | 
22 | ### Findings
23 | - Social media is full of reposted, news-style or informational messages regarding natural disasters.
24 | - Urgent messages often do not include descriptive details about the natural disaster in question.
25 | - Models do well at learning text data associated with labeled groups, and they can be applied to unrelated data and mirror similar trends. However, effectively creating proxy labels is very difficult; this results in extensive noise and decreases model effectiveness and generalizability.
26 | 
27 | 
28 | ### Recommendations & Next Steps
29 | - Disaster response organizations should partner with social media companies to access geolocation data during natural disasters. Without access to large geolocated datasets, models will lack the geospatial data necessary for improved performance.
30 | - Clustering and sentiment analyses could be used to train models to learn sentiments and language context.
31 | - More complex NLP tools, such as Neural Networks or Word2Vec, could better learn the words associated with survivors in urgent need of help.
32 | 

--------------------------------------------------------------------------------
/MAPS/usa_shapefile/cb_2017_us_nation_5m.shp.ea.iso.xml:
--------------------------------------------------------------------------------
1 | 2 | 10 | 11 | Feature Catalog for the 2017 United States 1:5,000,000 Cartographic Boundary File 12 | 13 | 14 | The United States at a scale of 1:5,000,000 15 | 16 | 17 | cb_2017_nation_5m 18 | 19 | 20 | 2018-03 21 | 22 | 23 | eng 24 | 25 | 26 | utf8 29 | 30 | 31 | 33 | 34 | 35 | 36 | cb_2017_us_nation_5m.shp 37 | 38 | 39 | Current Nation (national) 40 | 41 | 42 | false 43 | 44 | 45 | 46 | 47 | 48 | AFFGEOID 49 | 50 | 51 | American FactFinder summary level code + geovariant code + '00US' 52 | 53 | 54 | 56 | 57 | 58 | 59 | American FactFinder geographic identifier 60 | 61 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | GEOID 71 | 72 | 73 | Nation identifier 74 | 75 | 76 | 78 | 79 | 80 | 81 | US 82 | 83 | 84 | United States 85 | 86 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | NAME 96 | 97 | 98 | Nation name 99 | 100 | 101 | 103 | 104 | 105 | 106 | United States 107 | 108 | 109 | Nation 110 | 111 | 113 | 114 | 115 | 116 | 117 | 118 | 119 |

--------------------------------------------------------------------------------
/1 - Twitter_API_to_DataFrame.ipynb:
--------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Project 4 - Client Problem #1: \n", 8 | "# Leveraging Social Media to Map Disasters\n", 9 | "# Acquiring the historical tweet data - accessing the Twitter API, extracting the text, location and time stamp from the JSON, and saving it as a DataFrame."
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Here we retrieved all tweets with a location within 25 miles of Panama City for the first two weeks of October 2018 including the week before and the week after Hurricane Michael." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "#!pip install TwitterAPI" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 42, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "from TwitterAPI import TwitterAPI\n", 35 | "import json\n", 36 | "\n", 37 | "import pandas as pd\n", 38 | "import time" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 21, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "api = TwitterAPI(consumer_key=CONSUMER_API_KEY,\n", 48 | " consumer_secret=CONSUMER_API_SECRET_KEY,\n", 49 | " access_token_key=ACCESS_TOKEN,\n", 50 | " access_token_secret=ACCESS_TOKEN_SECRET)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 22, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "PRODUCT = 'fullarchive'\n", 60 | "LABEL = 'Development' # This is specific to your application\n", 61 | "# i.e. whatever label you set for your Dev environment, and is case sensitive" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 23, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# list_tweets = []\n", 71 | "# next=None" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 44, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "4" 83 | ] 84 | }, 85 | "execution_count": 44, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "#web_request_count = 3\n", 92 | "web_request_count" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### The following code calls the Twitter API in a loop to extract all the tweets for the first two weeks in October 2018 for the two places near the Hurricane Michael landfall: Mexico Beach and Panama City, Florida, using a search radius of 20 miles and 25 miles respectively." 
100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 45, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "r.status_code: 200\n", 112 | "web_request_count: 4\n", 113 | "r.status_code: 200\n", 114 | "web_request_count: 5\n", 115 | "r.status_code: 200\n", 116 | "web_request_count: 6\n", 117 | "r.status_code: 200\n", 118 | "web_request_count: 7\n", 119 | "r.status_code: 200\n", 120 | "web_request_count: 8\n", 121 | "r.status_code: 200\n", 122 | "web_request_count: 9\n", 123 | "r.status_code: 200\n", 124 | "web_request_count: 10\n", 125 | "r.status_code: 200\n", 126 | "web_request_count: 11\n", 127 | "r.status_code: 200\n", 128 | "web_request_count: 12\n", 129 | "r.status_code: 200\n", 130 | "web_request_count: 13\n", 131 | "r.status_code: 200\n" 132 | ] 133 | }, 134 | { 135 | "ename": "KeyError", 136 | "evalue": "'next'", 137 | "output_type": "error", 138 | "traceback": [ 139 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 140 | "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", 141 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'r.status_code: '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus_code\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mnext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'next'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;31m#print('next: ', next)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'web_request_count: '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweb_request_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 142 | "\u001b[0;31mKeyError\u001b[0m: 'next'" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "# r = api.request('tweets/search/%s/:%s' % (PRODUCT, LABEL), \n", 148 | "# {'query':'point_radius:[-85.4180 29.9480 50mi]','toDate':'201810180000',\n", 149 | "# 'fromDate':'201810010000'#,\n", 150 | "# # 'next': next\n", 151 | "# }) This was for Mexico Beach\n", 152 | "\n", 153 | "while next is not None:\n", 154 | " r = api.request('tweets/search/%s/:%s' % (PRODUCT, LABEL), \n", 155 | " {'query':'point_radius:[-85.6602 30.1588 25mi]','toDate':'201810180000',\n", 156 | " 'fromDate':'201810010000',\n", 157 | " 'next': next\n", 158 | " }) # This is for Panama City, Florida\n", 159 | "\n", 160 | " print('r.status_code: ', r.status_code)\n", 161 | " next = r.json()['next']\n", 162 | " #print('next: ', next)\n", 163 | " print('web_request_count: ', web_request_count)\n", 164 | " web_request_count += 1\n", 165 | "\n", 166 | " results = r.json()['results']\n", 167 | "\n", 168 | " for tweet in results:\n", 169 | "\n", 170 | " coordinates = tweet['coordinates']['coordinates']\n", 171 | " tweet_date = tweet['created_at']\n", 172 | " tweet_text = tweet['text'] \n", 173 | "\n", 174 | " if 'extended_tweet' in tweet.keys():\n", 175 | " tweet_text = tweet['extended_tweet']['full_text'] \n", 176 | "\n", 177 | " tweet_row = {'long_lat':coordinates,\n", 178 | " 
'date_utc':tweet_date,\n", 179 | " 'full_text':tweet_text}\n", 180 | "\n", 181 | " list_tweets.append(tweet_row) \n", 182 | "\n", 183 | " df = pd.DataFrame(list_tweets)\n", 184 | "\n", 185 | " df.to_json('tweets_df_panama_city_25mi_oct.json', orient='records')\n", 186 | " \n", 187 | " time.sleep(2.1) # wait i.e. only 30 requests per minute allowed" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 52, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "1352" 199 | ] 200 | }, 201 | "execution_count": 52, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "results = r.json()['results']\n", 208 | "results\n", 209 | "len(results) # 52\n", 210 | "len(list_tweets) # 1300\n", 211 | "# 1352" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### The final total count of tweets retrieved for the 25 mile radius search of Panama City was 1352" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 40, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "4" 230 | ] 231 | }, 232 | "execution_count": 40, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "web_request_count" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 39, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "'eyJhdXRoZW50aWNpdHkiOiIwZDIyY2U4ZDQzZTVjYTI3MGQ4NDVjOGE0YTJkYjU1MzI2OTViODAyOWI5ZmY2N2NlZjhjMmM2NjMzYmE2NDFmIiwiZnJvbURhdGUiOiIyMDE4MTAwMTAwMDAiLCJ0b0RhdGUiOiIyMDE4MTAxODAwMDAiLCJuZXh0IjoiMjAxODEwMTMyMTEzMTktMTA1MTIxOTM1MzUxMTgzMzU5OS0wIn0='" 250 | ] 251 | }, 252 | "execution_count": 39, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "next" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 35, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "200" 270 | ] 271 | }, 272 | "execution_count": 35, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "len(list_tweets)\n", 279 | "# 100\n", 280 | "# 200" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 27, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "{'error': {'message': \"There were errors processing your request: Invalid 'point_radius':'[-85.4180 29.9480 50mi]'. Radius must be less than 25 miles (at position 1)\",\n", 292 | " 'sent': '2019-01-16T18:37:36+00:00',\n", 293 | " 'transactionId': '00f238f9006c1a28'}}" 294 | ] 295 | }, 296 | "execution_count": 27, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "r.json()\n", 303 | "# {'error': {'message': \"There were errors processing your request: \n", 304 | "# Invalid 'point_radius':'[-85.4180 29.9480 50mi]'. 
\n", 305 | "# Radius must be less than 25 miles (at position 1)\",\n", 306 | "# 'sent': '2019-01-16T18:37:36+00:00',\n", 307 | "# 'transactionId': '00f238f9006c1a28'}}\n", 308 | "# So change from Mexico Beach as center to Panama City (approx 25 miles away)\n", 309 | "# using the max search radius = 25 miles" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 13, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "r.status_code: 403\n", 322 | "r.response.reason: Forbidden\n", 323 | "r.response: \n", 324 | "r.get_quota(): {'remaining': None, 'limit': None, 'reset': None}\n", 325 | "r.headers: {'content-length': '184', 'content-type': 'application/json; charset=utf-8', 'date': 'Wed, 16 Jan 2019 02:38:26 GMT', 'server': 'tsa_b', 'set-cookie': 'personalization_id=\"v1_IH6Sczd+qho/W/0mYvCYQA==\"; Max-Age=63072000; Expires=Fri, 15 Jan 2021 02:38:26 GMT; Path=/; Domain=.twitter.com, guest_id=v1%3A154760630636354601; Max-Age=63072000; Expires=Fri, 15 Jan 2021 02:38:26 GMT; Path=/; Domain=.twitter.com', 'strict-transport-security': 'max-age=631138519', 'x-connection-hash': '1649ff52983c9a1452653f7a2b0e8115', 'x-rate-limit-limit': '1800', 'x-rate-limit-remaining': '1798', 'x-rate-limit-reset': '1547607042', 'x-response-time': '39'}\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "print('r.status_code: ', r.status_code) # r.status_code: \n", 331 | "print('r.response.reason: ', r.response.reason)\n", 332 | "print ('r.response: ', r.response)\n", 333 | "#print('r.json(): ',r.json())\n", 334 | "#print('r.text: ', r.text)\n", 335 | "print('r.get_quota(): ', r.get_quota())\n", 336 | "print('r.headers: ', r.headers)" 337 | ] 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 3", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 3 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython3", 356 | "version": "3.7.0" 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 2 361 | } 362 | -------------------------------------------------------------------------------- /MAPS/usa_shapefile/cb_2017_us_nation_5m.shp.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Cartographic Products and Services Branch 7 | 201803 8 | 2017 Cartographic Boundary File, United States, 1:5,000,000 9 | vector digital data 10 | 11 | Cartographic Boundary Files 12 | 2017 13 | 14 | https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_nation_5m.zip 15 | 16 | 17 | 18 | The 2017 cartographic boundary shapefiles are simplified representations of selected geographic areas from the U.S. Census Bureau's Master Address File / Topologically Integrated Geographic Encoding and Referencing (MAF/TIGER) Database (MTDB). These boundary files are specifically designed for small-scale thematic mapping. When possible, generalization is performed with the intent to maintain the hierarchical relationships among geographies and to maintain the alignment of geographies within a file set for a given year. Geographic areas may not align with the same areas from another year. 
Some geographies are available as nation-based files while others are available only as state-based files. 19 | 20 | This file depicts the shape of the United States clipped back to a generalized coastline. This nation layer covers the extent of the fifty states, the District of Columbia, Puerto Rico, and each of the Island Areas (American Samoa, the Commonwealth of the Northern Mariana Islands, Guam, and the U.S. Virgin Islands) when scale appropriate. 21 | These files were specifically created to support small-scale thematic mapping. To improve the appearance of shapes at small scales, areas are represented with fewer vertices than detailed TIGER/Line Shapefiles. Cartographic boundary files take up less disk space than their ungeneralized counterparts. Cartographic boundary files take less time to render on screen than TIGER/Line Shapefiles. You can join this file with table data downloaded from American FactFinder by using the AFFGEOID field in the cartographic boundary file. If detailed boundaries are required, please use the TIGER/Line Shapefiles instead of the generalized cartographic boundary files. 22 | 23 | 24 | 25 | 26 | 201803 27 | 201803 28 | 29 | 30 | publication date 31 | 32 | 33 | Complete 34 | None planned. No changes or updates will be made to this version of the cartographic boundary files. New versions of the cartographic boundary files will be produced on an annual release schedule. Types of geography released may vary from year to year. 35 | 36 | 37 | 38 | -179.14734 39 | 179.77847 40 | 71.352561 41 | -14.552549 42 | 43 | 44 | 45 | 46 | ISO 19115 Topic Categories 47 | Boundaries 48 | 49 | 50 | None 51 | 2017 52 | SHP 53 | Cartographic Boundary 54 | Generalized 55 | United States 56 | 57 | 58 | ISO 3166 Codes for the representation of names of countries and their subdivisions 59 | United States 60 | US 61 | 62 | 63 | None 64 | The intended display scale for this file is 1:5,000,000. This file should not be displayed at scales larger than 1:5,000,000. 65 | 66 | These products are free to use in a product or publication, however acknowledgement must be given to the U.S. Census Bureau as the source. The boundary information is for visual display at appropriate small scales only. Cartographic boundary files should not be used for geographic analysis including area or perimeter calculation. Files should not be used for geocoding addresses. Files should not be used for determining precise geographic area relationships. 67 | 68 | 69 | 70 | 71 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Geographic Customer Services Branch 72 | 73 | 74 | mailing 75 |
4600 Silver Hill Road
76 | Washington 77 | DC 78 | 20233-7400 79 | United States 80 |
81 | 301.763.1128 82 | 301.763.4710 83 | geo.geography@census.gov 84 |
85 |
86 |
87 | 88 | 89 | Accurate against American National Standards Institute (ANSI) Publication INCITS 446-2008 (Geographic Names Information System (GNIS)) at the 100% level for the codes and base names present in the file. The remaining attribute information has been examined but has not been fully tested for accuracy. 90 | 91 | The Census Bureau performed automated tests to ensure logical consistency of the source database. Segments making up the outer and inner boundaries of a polygon tie end-to-end to completely enclose the area. All polygons were tested for closure. The Census Bureau uses its internally developed geographic update system to enhance and modify spatial and attribute data in the Census MAF/TIGER database. Standard geographic codes, such as INCITS (formerly FIPS) codes for states, counties, municipalities, county subdivisions, places, American Indian/Alaska Native/Native Hawaiian areas, and congressional districts are used when encoding spatial entities. The Census Bureau performed spatial data tests for logical consistency of the codes during the compilation of the original Census MAF/TIGER database files. Feature attribute information has been examined but has not been fully tested for consistency. 92 | 93 | For the cartographic boundary files, the Point and Vector Object Count for the G-polygon SDTS Point and Vector Object Type reflects the number of records in the file's data table. For multi-polygon features, only one attribute record exists for each multi-polygon rather than one attribute record per individual G-polygon component of the multi-polygon feature. Cartographic Boundary File multi-polygons are an exception to the G-polygon object type classification. Therefore, when multi-polygons exist in a file, the object count will be less than the actual number of G-polygons. 94 | The cartographic boundary files are generalized representations of extracts taken from the MAF/TIGER Database. Generalized boundary files are clipped to a simplified version of the U.S. outline. As a result, some off-shore areas may be excluded from the generalized files. Some small geographic areas, holes, or discontiguous parts of areas may not be included in generalized files if they are not visible at the target scale. 95 | 96 | 97 | Data are not accurate. Data are generalized representations of geographic boundaries at 1:5,000,000. 98 | 99 | 100 | 101 | 102 | 103 | 104 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Geographic Customer Services Branch 105 | Unpublished material 106 | Census MAF/TIGER database 107 | 108 | 109 | Geo-spatial Relational Database 110 | 111 | 112 | 113 | 201706 114 | 201705 115 | 116 | 117 | The dates describe the effective date of 2017 cartographic boundaries. 118 | 119 | MAF/TIGER 120 | All spatial and feature data 121 | 122 | 123 | Spatial data were extracted from the MAF/TIGER database and processed through a U.S. Census Bureau batch generalization system. 124 | MAF/TIGER 125 | 201803 126 | 127 | 128 | 129 | 130 | INCITS (formerly FIPS) codes 131 | Vector 132 | 133 | 134 | G-polygon 135 | 1 136 | 137 | 138 | 139 | 140 | 141 | 142 | 0.000458 143 | 0.000458 144 | Decimal degrees 145 | 146 | 147 | North American Datum of 1983 148 | Geodetic Reference System 80 149 | 6378137.000000 150 | 298.257222 151 | 152 | 153 | 154 | 155 | 156 | 157 | cb_2017_us_nation_5m.shp 158 | Current Nation (national) 159 | U.S. Census Bureau 160 | 161 | 162 | AFFGEOID 163 | American FactFinder summary level code + geovariant code + '00US' 164 | U.S. 
Census Bureau 165 | 166 | 167 | American FactFinder geographic identifier 168 | U.S. Census Bureau 169 | 170 | 171 | 172 | 173 | GEOID 174 | Nation identifier 175 | U.S. Census Bureau 176 | 177 | 178 | US 179 | United States 180 | U.S. Census Bureau 181 | 182 | 183 | 184 | 185 | NAME 186 | Nation name 187 | U.S. Census Bureau 188 | 189 | 190 | United States 191 | Nation 192 | U.S. Census Bureau 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Geographic Customer Services Branch 203 | 204 | 205 | mailing 206 |
4600 Silver Hill Road
207 | Washington 208 | DC 209 | 20233-7400 210 | United States 211 |
212 | 301.763.1128 213 | 301.763.4710 214 | geo.geography@census.gov 215 |
216 |
217 | No warranty, expressed or implied is made with regard to the accuracy of these data, and no liability is assumed by the U.S. Government in general or the U.S. Census Bureau in specific as to the spatial or attribute accuracy of the data. The act of distribution shall not constitute any such warranty and no responsibility is assumed by the U.S. government in the use of these files. The boundary information is for small-scale mapping purposes only; boundary depiction and designation for small-scale mapping purposes do not constitute a determination of jurisdictional authority or rights of ownership or entitlement and they are not legal land descriptions. 218 | 219 | 220 | 221 | SHP 222 | PK-ZIP, version 1.93A or higher 223 | 224 | 225 | 226 | 227 | 228 | https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_nation_5m.zip 229 | 230 | 231 | 232 | 233 | 234 | The online cartographic boundary files may be downloaded without charge. 235 | To obtain more information about ordering Cartographic Boundary Files visit https://www.census.gov/geo/www/tiger. 236 | 237 | The cartographic boundary files contain geographic data only and do not include display mapping software or statistical data. For information on how to use cartographic boundary file data with specific software package users shall contact the company that produced the software. 238 |
239 | 240 | 201803 241 | 242 | 243 | 244 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Geographic Customer Services Branch 245 | 246 | 247 | mailing 248 |
4600 Silver Hill Road
249 | Washington 250 | DC 251 | 20233-7400 252 | United States 253 |
254 | 301.763.1128 255 | 301.763.4710 256 | geo.geography@census.gov 257 |
258 |
259 | Content Standard for Digital Geospatial Metadata 260 | FGDC-STD-001-1998 261 |
262 |
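The nation metadata above documents three attributes (AFFGEOID, GEOID, NAME), the NAD83/GRS80 datum, and a use constraint limiting the file to small-scale display. A hedged sketch of inspecting the layer with geopandas (assumed installed):

import geopandas as gpd

usa = gpd.read_file("MAPS/usa_shapefile/cb_2017_us_nation_5m.shp")
print(usa.crs)                             # NAD83 geographic coordinates, matching the .prj
print(usa[["AFFGEOID", "GEOID", "NAME"]])  # the three attributes documented above
usa.plot(figsize=(12, 6))                  # display at 1:5,000,000 or smaller, per the use constraints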
-------------------------------------------------------------------------------- /MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.shp.ea.iso.xml: -------------------------------------------------------------------------------- 1 | 2 | 10 | 11 | Feature Catalog for the 2017 United States 1:20,000,000 Cartographic Boundary File 12 | 13 | 14 | The Current County and Equivalent at a scale of 1:20,000,000 15 | 16 | 17 | cb_2017_county_20m 18 | 19 | 20 | 2018-03 21 | 22 | 23 | eng 24 | 25 | 26 | utf8 29 | 30 | 31 | 33 | 34 | 35 | 36 | cb_2017_us_county_20m.shp 37 | 38 | 39 | Current County and Equivalent (national) 40 | 41 | 42 | false 43 | 44 | 45 | 46 | 47 | 48 | STATEFP 49 | 50 | 51 | Current state Federal Information Processing Series (FIPS) code 52 | 53 | 54 | 56 | 57 | 58 | 59 | National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States/State Equivalents 60 | 61 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | COUNTYFP 71 | 72 | 73 | Current county Federal Information Processing Series (FIPS) code 74 | 75 | 76 | 78 | 79 | 80 | 81 | National Standard Codes (ANSI INCITS 31-2009), Federal Information Processing Series (FIPS) - Counties/County Equivalents 82 | 83 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | COUNTYNS 93 | 94 | 95 | Current county Geographic Names Information System (GNIS) code 96 | 97 | 98 | 100 | 101 | 102 | 103 | INCITS 446:2008 (Geographic Names Information System (GNIS)), Identifying Attributes for Named Physical and Cultural Geographic Features (Except Roads and Highways) of the United States, Its Territories, Outlying Areas, and Freely Associated Areas, and the Waters of the Same to the Limit of the Twelve-Mile Statutory Zone 104 | 105 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | AFFGEOID 115 | 116 | 117 | American FactFinder summary level code + geovariant code + '00US' + GEOID 118 | 119 | 120 | 122 | 123 | 124 | 125 | American FactFinder geographic identifier 126 | 127 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | GEOID 137 | 138 | 139 | County identifier; a concatenation of current state Federal Information Processing Series (FIPS) code and county FIPS code 140 | 141 | 142 | 144 | 145 | 146 | 147 | 148 | The GEOID attribute is a concatenation of the state FIPS code followed by the county FIPS code. No spaces are allowed between the two codes. The state FIPS code is taken from "National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States". The county FIPS code is taken from "National Standard Codes (ANSI INCITS 31-2009), Federal Information Processing Series (FIPS) - Counties/County Equivalents". 
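A small illustration of the GEOID rule just quoted (state FIPS code followed by county FIPS code, with no separator); the Bay County, Florida codes below are examples and should be checked against the FIPS tables:

state_fp = "12"    # Florida state FIPS (ANSI INCITS 38-2009); example value
county_fp = "005"  # Bay County county FIPS (ANSI INCITS 31-2009); example value
geoid = state_fp + county_fp  # -> "12005"; leading zeros matter, so keep codes as strings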
149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | NAME 158 | 159 | 160 | Current county name 161 | 162 | 163 | 165 | 166 | 167 | 168 | National Standard Codes (ANSI INCITS 31-2009), Federal Information Processing Series (FIPS) - Counties/County Equivalents 169 | 170 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | LSAD 180 | 181 | 182 | Current legal/statistical area description code for county 183 | 184 | 185 | 187 | 188 | 189 | 190 | 00 191 | 192 | 193 | Blank 194 | 195 | 197 | 198 | 199 | 200 | 201 | 202 | 03 203 | 204 | 205 | City and Borough (suffix) 206 | 207 | 209 | 210 | 211 | 212 | 213 | 214 | 04 215 | 216 | 217 | Borough (suffix) 218 | 219 | 221 | 222 | 223 | 224 | 225 | 226 | 05 227 | 228 | 229 | Census Area (suffix) 230 | 231 | 233 | 234 | 235 | 236 | 237 | 238 | 06 239 | 240 | 241 | County (suffix) 242 | 243 | 245 | 246 | 247 | 248 | 249 | 250 | 07 251 | 252 | 253 | District (suffix) 254 | 255 | 257 | 258 | 259 | 260 | 261 | 262 | 10 263 | 264 | 265 | Island (suffix) 266 | 267 | 269 | 270 | 271 | 272 | 273 | 274 | 12 275 | 276 | 277 | Municipality (suffix) 278 | 279 | 281 | 282 | 283 | 284 | 285 | 286 | 13 287 | 288 | 289 | Municipio (suffix) 290 | 291 | 293 | 294 | 295 | 296 | 297 | 298 | 15 299 | 300 | 301 | Parish (suffix) 302 | 303 | 305 | 306 | 307 | 308 | 309 | 310 | 25 311 | 312 | 313 | city (suffix) 314 | 315 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | ALAND 325 | 326 | 327 | Current land area (square meters) 328 | 329 | 330 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | Range Domain Minimum: 0 343 | Range Domain Maximum: 9,999,999,999,999 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | AWATER 353 | 354 | 355 | Current water area (square meters) 356 | 357 | 358 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | Range Domain Minimum: 0 371 | Range Domain Maximum: 9,999,999,999,999 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | -------------------------------------------------------------------------------- /MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.shp.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Cartographic Products and Services Branch 7 | 201803 8 | 2017 Cartographic Boundary File, Current County and Equivalent for United States, 1:20,000,000 9 | vector digital data 10 | 11 | Cartographic Boundary Files 12 | 2017 13 | 14 | https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_county_20m.zip 15 | 16 | 17 | 18 | The 2017 cartographic boundary shapefiles are simplified representations of selected geographic areas from the U.S. Census Bureau's Master Address File / Topologically Integrated Geographic Encoding and Referencing (MAF/TIGER) Database (MTDB). These boundary files are specifically designed for small-scale thematic mapping. When possible, generalization is performed with the intent to maintain the hierarchical relationships among geographies and to maintain the alignment of geographies within a file set for a given year. Geographic areas may not align with the same areas from another year. Some geographies are available as nation-based files while others are available only as state-based files. 19 | 20 | The primary legal divisions of most states are termed counties. In Louisiana, these divisions are known as parishes. 
In Alaska, which has no counties, the equivalent entities are the organized boroughs, city and boroughs, municipalities, and for the unorganized area, census areas. The latter are delineated cooperatively for statistical purposes by the State of Alaska and the Census Bureau. In four states (Maryland, Missouri, Nevada, and Virginia), there are one or more incorporated places that are independent of any county organization and thus constitute primary divisions of their states. These incorporated places are known as independent cities and are treated as equivalent entities for purposes of data presentation. The District of Columbia and Guam have no primary divisions, and each area is considered an equivalent entity for purposes of data presentation. The Census Bureau treats the following entities as equivalents of counties for purposes of data presentation: Municipios in Puerto Rico, Districts and Islands in American Samoa, Municipalities in the Commonwealth of the Northern Mariana Islands, and Islands in the U.S. Virgin Islands. The entire area of the United States, Puerto Rico, and the Island Areas is covered by counties or equivalent entities. 21 | 22 | The generalized boundaries for counties and equivalent entities are based on those as of January 1, 2017, primarily as reported through the Census Bureau's Boundary and Annexation Survey (BAS). 23 | These files were specifically created to support small-scale thematic mapping. To improve the appearance of shapes at small scales, areas are represented with fewer vertices than detailed TIGER/Line Shapefiles. Cartographic boundary files take up less disk space than their ungeneralized counterparts. Cartographic boundary files take less time to render on screen than TIGER/Line Shapefiles. You can join this file with table data downloaded from American FactFinder by using the AFFGEOID field in the cartographic boundary file. If detailed boundaries are required, please use the TIGER/Line Shapefiles instead of the generalized cartographic boundary files. 24 | 25 | 26 | 27 | 28 | 201803 29 | 201803 30 | 31 | 32 | publication date 33 | 34 | 35 | Complete 36 | None planned. No changes or updates will be made to this version of the cartographic boundary files. New versions of the cartographic boundary files will be produced on an annual release schedule. Types of geography released may vary from year to year. 37 | 38 | 39 | 40 | -179.174265 41 | 179.773922 42 | 71.352561 43 | 17.913769 44 | 45 | 46 | 47 | 48 | ISO 19115 Topic Categories 49 | Boundaries 50 | 51 | 52 | None 53 | 2017 54 | SHP 55 | Borough 56 | Cartographic Boundary 57 | Census Area 58 | City 59 | City and Borough 60 | County 61 | County equivalent 62 | District 63 | Generalized 64 | Independent City 65 | Island 66 | Municipality 67 | Municipio 68 | Parish 69 | State 70 | 71 | 72 | ISO 3166 Codes for the representation of names of countries and their subdivisions 73 | United States 74 | US 75 | 76 | 77 | None 78 | The intended display scale for this file is 1:20,000,000. This file should not be displayed at scales larger than 1:20,000,000. 79 | 80 | These products are free to use in a product or publication, however acknowledgement must be given to the U.S. Census Bureau as the source. The boundary information is for visual display at appropriate small scales only. Cartographic boundary files should not be used for geographic analysis including area or perimeter calculation. Files should not be used for geocoding addresses. 
Files should not be used for determining precise geographic area relationships. 81 | 82 | 83 | 84 | 85 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Geographic Customer Services Branch 86 | 87 | 88 | mailing 89 |
4600 Silver Hill Road
90 | Washington 91 | DC 92 | 20233-7400 93 | United States 94 |
95 | 301.763.1128 96 | 301.763.4710 97 | geo.geography@census.gov 98 |
99 |
100 |
101 | 102 | 103 | Accurate against American National Standards Institute (ANSI) Publication INCITS 446-2008 (Geographic Names Information System (GNIS)) at the 100% level for the codes and base names present in the file. The remaining attribute information has been examined but has not been fully tested for accuracy. 104 | 105 | The Census Bureau performed automated tests to ensure logical consistency of the source database. Segments making up the outer and inner boundaries of a polygon tie end-to-end to completely enclose the area. All polygons were tested for closure. The Census Bureau uses its internally developed geographic update system to enhance and modify spatial and attribute data in the Census MAF/TIGER database. Standard geographic codes, such as INCITS (formerly FIPS) codes for states, counties, municipalities, county subdivisions, places, American Indian/Alaska Native/Native Hawaiian areas, and congressional districts are used when encoding spatial entities. The Census Bureau performed spatial data tests for logical consistency of the codes during the compilation of the original Census MAF/TIGER database files. Feature attribute information has been examined but has not been fully tested for consistency. 106 | 107 | For the cartographic boundary files, the Point and Vector Object Count for the G-polygon SDTS Point and Vector Object Type reflects the number of records in the file's data table. For multi-polygon features, only one attribute record exists for each multi-polygon rather than one attribute record per individual G-polygon component of the multi-polygon feature. Cartographic Boundary File multi-polygons are an exception to the G-polygon object type classification. Therefore, when multi-polygons exist in a file, the object count will be less than the actual number of G-polygons. 108 | The cartographic boundary files are generalized representations of extracts taken from the MAF/TIGER Database. Generalized boundary files are clipped to a simplified version of the U.S. outline. As a result, some off-shore areas may be excluded from the generalized files. Some small geographic areas, holes, or discontiguous parts of areas may not be included in generalized files if they are not visible at the target scale. 109 | 110 | 111 | Data are not accurate. Data are generalized representations of geographic boundaries at 1:20,000,000. 112 | 113 | 114 | 115 | 116 | 117 | 118 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Geographic Customer Services Branch 119 | Unpublished material 120 | Census MAF/TIGER database 121 | 122 | 123 | Geo-spatial Relational Database 124 | 125 | 126 | 127 | 201706 128 | 201705 129 | 130 | 131 | The dates describe the effective date of 2017 cartographic boundaries. 132 | 133 | MAF/TIGER 134 | All spatial and feature data 135 | 136 | 137 | Spatial data were extracted from the MAF/TIGER database and processed through a U.S. Census Bureau batch generalization system. 138 | MAF/TIGER 139 | 201803 140 | 141 | 142 | 143 | 144 | INCITS (formerly FIPS) codes 145 | Vector 146 | 147 | 148 | G-polygon 149 | 3220 150 | 151 | 152 | 153 | 154 | 155 | 156 | 0.000458 157 | 0.000458 158 | Decimal degrees 159 | 160 | 161 | North American Datum of 1983 162 | Geodetic Reference System 80 163 | 6378137.000000 164 | 298.257222 165 | 166 | 167 | 168 | 169 | 170 | 171 | cb_2017_us_county_20m.shp 172 | Current County and Equivalent (national) 173 | U.S. 
Census Bureau 174 | 175 | 176 | STATEFP 177 | Current state Federal Information Processing Series (FIPS) code 178 | U.S. Census Bureau 179 | 180 | 181 | National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States/State Equivalents 182 | U.S. Census Bureau 183 | 184 | 185 | 186 | 187 | COUNTYFP 188 | Current county Federal Information Processing Series (FIPS) code 189 | U.S. Census Bureau 190 | 191 | 192 | National Standard Codes (ANSI INCITS 31-2009), Federal Information Processing Series (FIPS) - Counties/County Equivalents 193 | U.S. Census Bureau 194 | 195 | 196 | 197 | 198 | COUNTYNS 199 | Current county Geographic Names Information System (GNIS) code 200 | U.S. Census Bureau 201 | 202 | 203 | INCITS 446:2008 (Geographic Names Information System (GNIS)), Identifying Attributes for Named Physical and Cultural Geographic Features (Except Roads and Highways) of the United States, Its Territories, Outlying Areas, and Freely Associated Areas, and the Waters of the Same to the Limit of the Twelve-Mile Statutory Zone 204 | U.S. Geological Survey (USGS) 205 | 206 | 207 | 208 | 209 | AFFGEOID 210 | American FactFinder summary level code + geovariant code + '00US' + GEOID 211 | U.S. Census Bureau 212 | 213 | 214 | American FactFinder geographic identifier 215 | U.S. Census Bureau 216 | 217 | 218 | 219 | 220 | GEOID 221 | County identifier; a concatenation of current state Federal Information Processing Series (FIPS) code and county FIPS code 222 | U.S. Census Bureau 223 | 224 | The GEOID attribute is a concatenation of the state FIPS code followed by the county FIPS code. No spaces are allowed between the two codes. The state FIPS code is taken from "National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States". The county FIPS code is taken from "National Standard Codes (ANSI INCITS 31-2009), Federal Information Processing Series (FIPS) - Counties/County Equivalents". 225 | 226 | 227 | 228 | NAME 229 | Current county name 230 | U.S. Census Bureau 231 | 232 | 233 | National Standard Codes (ANSI INCITS 31-2009), Federal Information Processing Series (FIPS) - Counties/County Equivalents 234 | U.S. Census Bureau 235 | 236 | 237 | 238 | 239 | LSAD 240 | Current legal/statistical area description code for county 241 | U.S. Census Bureau 242 | 243 | 244 | 00 245 | Blank 246 | U.S. Census Bureau 247 | 248 | 249 | 250 | 251 | 03 252 | City and Borough (suffix) 253 | U.S. Census Bureau 254 | 255 | 256 | 257 | 258 | 04 259 | Borough (suffix) 260 | U.S. Census Bureau 261 | 262 | 263 | 264 | 265 | 05 266 | Census Area (suffix) 267 | U.S. Census Bureau 268 | 269 | 270 | 271 | 272 | 06 273 | County (suffix) 274 | U.S. Census Bureau 275 | 276 | 277 | 278 | 279 | 07 280 | District (suffix) 281 | U.S. Census Bureau 282 | 283 | 284 | 285 | 286 | 10 287 | Island (suffix) 288 | U.S. Census Bureau 289 | 290 | 291 | 292 | 293 | 12 294 | Municipality (suffix) 295 | U.S. Census Bureau 296 | 297 | 298 | 299 | 300 | 13 301 | Municipio (suffix) 302 | U.S. Census Bureau 303 | 304 | 305 | 306 | 307 | 15 308 | Parish (suffix) 309 | U.S. Census Bureau 310 | 311 | 312 | 313 | 314 | 25 315 | city (suffix) 316 | U.S. Census Bureau 317 | 318 | 319 | 320 | 321 | ALAND 322 | Current land area (square meters) 323 | U.S. Census Bureau 324 | 325 | 326 | 0 327 | 9,999,999,999,999 328 | square meters 329 | 330 | 331 | 332 | 333 | AWATER 334 | Current water area (square meters) 335 | U.S. 
Census Bureau 336 | 337 | 338 | 0 339 | 9,999,999,999,999 340 | square meters 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Geographic Customer Services Branch 351 | 352 | 353 | mailing 354 |
4600 Silver Hill Road
355 | Washington 356 | DC 357 | 20233-7400 358 | United States 359 |
360 | 301.763.1128 361 | 301.763.4710 362 | geo.geography@census.gov 363 |
364 |
365 | No warranty, expressed or implied is made with regard to the accuracy of these data, and no liability is assumed by the U.S. Government in general or the U.S. Census Bureau in specific as to the spatial or attribute accuracy of the data. The act of distribution shall not constitute any such warranty and no responsibility is assumed by the U.S. government in the use of these files. The boundary information is for small-scale mapping purposes only; boundary depiction and designation for small-scale mapping purposes do not constitute a determination of jurisdictional authority or rights of ownership or entitlement and they are not legal land descriptions. 366 | 367 | 368 | 369 | SHP 370 | PK-ZIP, version 1.93A or higher 371 | 372 | 373 | 374 | 375 | 376 | https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_county_20m.zip 377 | 378 | 379 | 380 | 381 | 382 | The online cartographic boundary files may be downloaded without charge. 383 | To obtain more information about ordering Cartographic Boundary Files visit https://www.census.gov/geo/www/tiger. 384 | 385 | The cartographic boundary files contain geographic data only and do not include display mapping software or statistical data. For information on how to use cartographic boundary file data with specific software package users shall contact the company that produced the software. 386 |
387 | 388 | 201803 389 | 390 | 391 | 392 | U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Geographic Customer Services Branch 393 | 394 | 395 | mailing 396 |
4600 Silver Hill Road
397 | Washington 398 | DC 399 | 20233-7400 400 | United States 401 |
402 | 301.763.1128 403 | 301.763.4710 404 | geo.geography@census.gov 405 |
406 |
407 | Content Standard for Digital Geospatial Metadata 408 | FGDC-STD-001-1998 409 |
410 |
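The county metadata above states that the layer can be joined to tabular data through the AFFGEOID field; the same join works through GEOID for any table keyed on state+county FIPS. A hedged sketch of that join for a simple choropleth, where "county_stats.csv" and its "tweet_count" column are hypothetical stand-ins:

import geopandas as gpd
import pandas as pd

counties = gpd.read_file("MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.shp")
stats = pd.read_csv("county_stats.csv", dtype={"GEOID": str})  # read as strings to keep leading zeros

joined = counties.merge(stats, on="GEOID", how="left")
joined.plot(column="tweet_count", legend=True, figsize=(12, 8))  # one shaded value per county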
-------------------------------------------------------------------------------- /MAPS/usa_shapefile/cb_2017_us_nation_5m.shp.iso.xml: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | cb_2017_us_nation_5m.shp.iso.xml 11 | 12 | 13 | eng 14 | 15 | 16 | UTF-8 19 | 20 | 21 | 23 | dataset 24 | 25 | 26 | 28 | 29 | 2018-03 30 | 31 | 32 | ISO 19115 Geographic Information - Metadata 33 | 34 | 35 | 2009-02-15 36 | 37 | 38 | https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_nation_5m.zip 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 49 | complex 50 | 51 | 52 | 1 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | INCITS (formerly FIPS) codes 66 | 67 | 68 | 69 | 70 | 71 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 2017 Cartographic Boundary File, United States, 1:5,000,000 80 | 81 | 82 | 83 | 84 | 85 | 2018-03 86 | 87 | 88 | publication 91 | 92 | 93 | 94 | 95 | 97 | 98 | 100 | 101 | 102 | 103 | 104 | The 2017 cartographic boundary shapefiles are simplified representations of selected geographic areas from the U.S. Census Bureau's Master Address File / Topologically Integrated Geographic Encoding and Referencing (MAF/TIGER) Database (MTDB). These boundary files are specifically designed for small-scale thematic mapping. When possible, generalization is performed with the intent to maintain the hierarchical relationships among geographies and to maintain the alignment of geographies within a file set for a given year. Geographic areas may not align with the same areas from another year. Some geographies are available as nation-based files while others are available only as state-based files. 105 | 106 | This file depicts the shape of the United States clipped back to a generalized coastline. This nation layer covers the extent of the fifty states, the District of Columbia, Puerto Rico, and each of the Island Areas (American Samoa, the Commonwealth of the Northern Mariana Islands, Guam, and the U.S. Virgin Islands) when scale appropriate. 107 | 108 | 109 | These files were specifically created to support small-scale thematic mapping. To improve the appearance of shapes at small scales, areas are represented with fewer vertices than detailed TIGER/Line Shapefiles. Cartographic boundary files take up less disk space than their ungeneralized counterparts. Cartographic boundary files take less time to render on screen than TIGER/Line Shapefiles. You can join this file with table data downloaded from American FactFinder by using the AFFGEOID field in the cartographic boundary file. If detailed boundaries are required, please use the TIGER/Line Shapefiles instead of the generalized cartographic boundary files. 
[The remainder of this ISO 19115 metadata file survives only as text nodes; its XML tags were lost in extraction. Recoverable content: status "completed", maintenance "notPlanned"; theme keywords Boundaries (ISO 19115 Topic Categories) plus 2017, SHP, Cartographic Boundary, Generalized, United States; place keywords United States / US (ISO 3166). Access constraints: none. Use constraints: intended display scale 1:5,000,000, not to be displayed larger; free to use with acknowledgement of the U.S. Census Bureau; for visual display at small scales only, not for geographic analysis, geocoding, or determining precise geographic area relationships. Vector representation, scale denominator 5000000, language eng, topic category "boundaries"; the files contain geographic data only, with no mapping software or statistical data. Bounding box: -179.14734 to 179.77847 longitude, -14.552549 to 71.352561 latitude; publication date 2018-03. Feature catalog: "Feature Catalog for the 2017 1:5,000,000 Cartographic Boundary File", https://meta.geo.census.gov/data/existing/decennial/GEO/CPMB/boundary/2016cb/nation_5m/2016_nation_5m.ea.iso.xml. Distribution: SHP archived with PK-ZIP 1.93A or higher, downloadable without charge from https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_nation_5m.zip; ordering information at https://www.census.gov/geo/www/tiger and https://www.census.gov/geo/maps-data/data/tiger-cart-boundary.html. Logical consistency: automated tests for polygon closure and INCITS (formerly FIPS) code consistency; multi-polygon features carry one attribute record each, so object counts can be lower than the number of G-polygons. Lineage: spatial data extracted from the Census MAF/TIGER database (source period 201705-201706) and processed through a Census Bureau batch generalization system, 2018-03-01; generalized files are clipped to a simplified U.S. outline, so some off-shore or small areas may be omitted; originator U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Geographic Customer Services Branch; record transformed from the Census Metadata Import Format.]
--------------------------------------------------------------------------------
/MAPS/cb_2017_us_county_20m/cb_2017_us_county_20m.shp.iso.xml:
--------------------------------------------------------------------------------
[ISO 19115 metadata, likewise stripped of its XML tags in extraction. Recoverable content: title "2017 Cartographic Boundary File, Current County and Equivalent for United States, 1:20,000,000", published 2018-03; language eng, charset UTF-8, scope "dataset", 3220 complex features keyed by INCITS (formerly FIPS) codes. Abstract: the 2017 cartographic boundary shapefiles are simplified representations of selected geographic areas from the Census Bureau's MAF/TIGER database (MTDB), designed for small-scale thematic mapping. The primary legal divisions of most states are counties; parishes (Louisiana), organized boroughs, city and boroughs, municipalities, and census areas (Alaska), independent cities (Maryland, Missouri, Nevada, and Virginia), the District of Columbia, Guam, municipios (Puerto Rico), districts and islands (American Samoa), municipalities (Northern Mariana Islands), and islands (U.S. Virgin Islands) are treated as county equivalents. The generalized boundaries are those in effect on January 1, 2017, primarily as reported through the Census Bureau's Boundary and Annexation Survey (BAS). Purpose: small-scale thematic mapping with fewer vertices and smaller files than TIGER/Line Shapefiles; joinable to American FactFinder tables via the AFFGEOID field; use TIGER/Line Shapefiles when detailed boundaries are required. Keywords: Boundaries; 2017, SHP, Borough, Cartographic Boundary, Census Area, City, City and Borough, County, County equivalent, District, Generalized, Independent City, Island, Municipality, Municipio, Parish, State; place United States / US. Use constraints: intended display scale 1:20,000,000, not to be displayed larger; otherwise identical to the nation_5m file above. Bounding box: -179.174265 to 179.773922 longitude, 17.913769 to 71.352561 latitude; publication date 2018-03. Feature catalog: https://meta.geo.census.gov/data/existing/decennial/GEO/CPMB/boundary/2016cb/county_20m/2016_county_20m.ea.iso.xml; download: https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_county_20m.zip. The distribution, logical-consistency, lineage, and maintenance sections mirror the nation_5m file above.]
--------------------------------------------------------------------------------
/4 - Model_hurricane_michael_tweets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Project 4 - Client Problem #1: \n",
8 |     "# Leveraging Social Media to Map Disasters\n",
9 |     "# Modeling the hurricane tweets"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "markdown",
14 |    "metadata": {},
15 |    "source": [
16 |     "### This model identifies hurricane impact without classifying individual tweets. \n",
17 |     "\n",
18 |     "### Each observation is a period of time (one hour here) for a given geographic location. Historical data is used to label each observation according to whether there was hurricane impact at that time. \n",
19 |     "\n",
20 |     "### The model then identifies the words in the text that most strongly indicate natural-disaster impact; this is done as part of the training process."
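,
    "\n",
    "A minimal sketch of how such hourly observations can be built, assuming a raw-tweet DataFrame named `raw_tweets` with a UTC datetime index and a `text` column (`raw_tweets` is an illustrative name; the actual aggregation was done in the earlier EDA notebook, which is not shown here). The `' ~|~ '` separator is the one visible in the `hourly_text` samples below:\n",
    "\n",
    "```python\n",
    "import pandas as pd\n",
    "\n",
    "grouped = raw_tweets['text'].resample('H')        # one group per clock hour\n",
    "hourly = pd.DataFrame({\n",
    "    'tweet_count': grouped.size(),                # tweets observed in the hour\n",
    "    'hourly_text': grouped.apply(' ~|~ '.join),   # hour's tweets joined into one string\n",
    "})\n",
    "hourly = hourly[hourly['tweet_count'] > 0]        # keep only hours that had tweets\n",
    "```"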
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 43, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import json\n", 30 | "import pandas as pd\n", 31 | "from pprint import pprint\n", 32 | "import datetime\n", 33 | "\n", 34 | "import numpy as np\n", 35 | "\n", 36 | "from sklearn.feature_extraction.text import CountVectorizer\n", 37 | "from sklearn.linear_model import LogisticRegressionCV\n", 38 | "from sklearn.model_selection import train_test_split\n", 39 | "\n", 40 | "import matplotlib.pyplot as plt" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Read in the file that was saved after the Exploratory Data Analysis was complete" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "df = pd.read_json('tweets_df_michael_time_series_oct.json')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/html": [ 67 | "
\n", 68 | "\n", 81 | "\n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | "
tweet_counthourly_text
2018-10-01 00:00:003Sorry fellas, she’s unaVEILable! #HechtYeaIDo ...
2018-10-01 01:00:002Happy birthday to my little princess!!! Love ...
2018-10-01 02:00:004At 4:24 PM EDT, 2 S Panama City [Bay Co, FL] O...
2018-10-01 03:00:004#old pic, and yeah I think I forgot how to whe...
2018-10-01 09:00:001We saved carbs all week for this, and it was w...
\n", 117 | "
" 118 | ], 119 | "text/plain": [ 120 | " tweet_count \\\n", 121 | "2018-10-01 00:00:00 3 \n", 122 | "2018-10-01 01:00:00 2 \n", 123 | "2018-10-01 02:00:00 4 \n", 124 | "2018-10-01 03:00:00 4 \n", 125 | "2018-10-01 09:00:00 1 \n", 126 | "\n", 127 | " hourly_text \n", 128 | "2018-10-01 00:00:00 Sorry fellas, she’s unaVEILable! #HechtYeaIDo ... \n", 129 | "2018-10-01 01:00:00 Happy birthday to my little princess!!! Love ... \n", 130 | "2018-10-01 02:00:00 At 4:24 PM EDT, 2 S Panama City [Bay Co, FL] O... \n", 131 | "2018-10-01 03:00:00 #old pic, and yeah I think I forgot how to whe... \n", 132 | "2018-10-01 09:00:00 We saved carbs all week for this, and it was w... " 133 | ] 134 | }, 135 | "execution_count": 3, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "df.head()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 4, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "(341, 2)" 153 | ] 154 | }, 155 | "execution_count": 4, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "df.shape # (341, 2)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 7, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "tweet_count 3\n", 173 | "hourly_text Sorry fellas, she’s unaVEILable! #HechtYeaIDo ...\n", 174 | "Name: 2018-10-01 00:00:00, dtype: object" 175 | ] 176 | }, 177 | "execution_count": 7, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "df.iloc[0]" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 6, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "\"Sorry fellas, she’s unaVEILable! #HechtYeaIDo #joiningjordan\\n\\n👰🏽\\nShop my look by clicking this link —> https://t.co/JKFHxk2xOQ \\nOr screenshot to shop on the https://t.co/lgaMntMElS app! #liketkit… https://t.co/MLeLYTc1lT ~|~ These two did it ALL this weekend at the beach. #Brothers @ Shell Island White Sand Beach https://t.co/FtMa1krCAo ~|~ 🍍🍍🍍🍍🍍🍍 @ Pineapple Willy's Restaurant https://t.co/pZILxSXKLj\"" 195 | ] 196 | }, 197 | "execution_count": 6, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "df.iloc[0].hourly_text" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Get the count of all the tweets on the day Hurricane Michael made landfall" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 18, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "(23, 2)" 222 | ] 223 | }, 224 | "execution_count": 18, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "np.sum(df['2018-10-10']) # tweet_count: 149\n", 231 | "df['2018-10-10'].shape # (23, 2)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Create a target column as the label for prediction training. Initially default it to zero." 
239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 24, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "df['target'] = 0" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 25, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "data": { 257 | "text/html": [ 258 | "
\n", 259 | "\n", 272 | "\n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | "
tweet_counthourly_texttarget
2018-10-01 00:00:003Sorry fellas, she’s unaVEILable! #HechtYeaIDo ...0
2018-10-01 01:00:002Happy birthday to my little princess!!! Love ...0
2018-10-01 02:00:004At 4:24 PM EDT, 2 S Panama City [Bay Co, FL] O...0
2018-10-01 03:00:004#old pic, and yeah I think I forgot how to whe...0
2018-10-01 09:00:001We saved carbs all week for this, and it was w...0
\n", 314 | "
" 315 | ], 316 | "text/plain": [ 317 | " tweet_count \\\n", 318 | "2018-10-01 00:00:00 3 \n", 319 | "2018-10-01 01:00:00 2 \n", 320 | "2018-10-01 02:00:00 4 \n", 321 | "2018-10-01 03:00:00 4 \n", 322 | "2018-10-01 09:00:00 1 \n", 323 | "\n", 324 | " hourly_text target \n", 325 | "2018-10-01 00:00:00 Sorry fellas, she’s unaVEILable! #HechtYeaIDo ... 0 \n", 326 | "2018-10-01 01:00:00 Happy birthday to my little princess!!! Love ... 0 \n", 327 | "2018-10-01 02:00:00 At 4:24 PM EDT, 2 S Panama City [Bay Co, FL] O... 0 \n", 328 | "2018-10-01 03:00:00 #old pic, and yeah I think I forgot how to whe... 0 \n", 329 | "2018-10-01 09:00:00 We saved carbs all week for this, and it was w... 0 " 330 | ] 331 | }, 332 | "execution_count": 25, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "df.head()" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "## Hurricane Michael made landfall at Mexico Beach on Oct 10, 2018 at 2pm EDT. That was 6pm UTC which is what the Twitter timestamps use.\n", 346 | "\n", 347 | "#### We are going to arbitrarily set the target time (when we are trying to detect the presence of the hurricane) to 24 hours before and 72 hours after. This would be from Oct 9, 6pm UTC to Oct 13, 6pm UTC" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 31, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "name": "stderr", 357 | "output_type": "stream", 358 | "text": [ 359 | "/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py:4405: SettingWithCopyWarning: \n", 360 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 361 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 362 | "\n", 363 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 364 | " self[name] = value\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "df['2018-10-09 18:00:00':'2018-10-13 18:00:00']\n", 370 | "df['2018-10-09 18:00:00':'2018-10-13 18:00:00'].target = 1\n", 371 | "df.loc['2018-10-09 18:00:00':'2018-10-13 18:00:00'].target = 1" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 34, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "data": { 381 | "text/html": [ 382 | "
\n", 383 | "\n", 396 | "\n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | "
tweet_counthourly_texttarget
2018-10-01 00:00:003Sorry fellas, she’s unaVEILable! #HechtYeaIDo ...0
2018-10-01 01:00:002Happy birthday to my little princess!!! Love ...0
2018-10-01 02:00:004At 4:24 PM EDT, 2 S Panama City [Bay Co, FL] O...0
2018-10-01 03:00:004#old pic, and yeah I think I forgot how to whe...0
2018-10-01 09:00:001We saved carbs all week for this, and it was w...0
\n", 438 | "
" 439 | ], 440 | "text/plain": [ 441 | " tweet_count \\\n", 442 | "2018-10-01 00:00:00 3 \n", 443 | "2018-10-01 01:00:00 2 \n", 444 | "2018-10-01 02:00:00 4 \n", 445 | "2018-10-01 03:00:00 4 \n", 446 | "2018-10-01 09:00:00 1 \n", 447 | "\n", 448 | " hourly_text target \n", 449 | "2018-10-01 00:00:00 Sorry fellas, she’s unaVEILable! #HechtYeaIDo ... 0 \n", 450 | "2018-10-01 01:00:00 Happy birthday to my little princess!!! Love ... 0 \n", 451 | "2018-10-01 02:00:00 At 4:24 PM EDT, 2 S Panama City [Bay Co, FL] O... 0 \n", 452 | "2018-10-01 03:00:00 #old pic, and yeah I think I forgot how to whe... 0 \n", 453 | "2018-10-01 09:00:00 We saved carbs all week for this, and it was w... 0 " 454 | ] 455 | }, 456 | "execution_count": 34, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "df[df.target==1]\n", 463 | "df.head()" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 35, 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "data": { 473 | "text/plain": [ 474 | "(341, 3)" 475 | ] 476 | }, 477 | "execution_count": 35, 478 | "metadata": {}, 479 | "output_type": "execute_result" 480 | } 481 | ], 482 | "source": [ 483 | "df.shape # (341, 3)" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "### Set the columns of X to be the tweet count, and the hourly text i.e. the combined text of the tweets in that one hour period." 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 41, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/html": [ 501 | "
\n", 502 | "\n", 515 | "\n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | "
tweet_counthourly_text
2018-10-01 00:00:003Sorry fellas, she’s unaVEILable! #HechtYeaIDo ...
2018-10-01 01:00:002Happy birthday to my little princess!!! Love ...
2018-10-01 02:00:004At 4:24 PM EDT, 2 S Panama City [Bay Co, FL] O...
2018-10-01 03:00:004#old pic, and yeah I think I forgot how to whe...
2018-10-01 09:00:001We saved carbs all week for this, and it was w...
\n", 551 | "
" 552 | ], 553 | "text/plain": [ 554 | " tweet_count \\\n", 555 | "2018-10-01 00:00:00 3 \n", 556 | "2018-10-01 01:00:00 2 \n", 557 | "2018-10-01 02:00:00 4 \n", 558 | "2018-10-01 03:00:00 4 \n", 559 | "2018-10-01 09:00:00 1 \n", 560 | "\n", 561 | " hourly_text \n", 562 | "2018-10-01 00:00:00 Sorry fellas, she’s unaVEILable! #HechtYeaIDo ... \n", 563 | "2018-10-01 01:00:00 Happy birthday to my little princess!!! Love ... \n", 564 | "2018-10-01 02:00:00 At 4:24 PM EDT, 2 S Panama City [Bay Co, FL] O... \n", 565 | "2018-10-01 03:00:00 #old pic, and yeah I think I forgot how to whe... \n", 566 | "2018-10-01 09:00:00 We saved carbs all week for this, and it was w... " 567 | ] 568 | }, 569 | "execution_count": 41, 570 | "metadata": {}, 571 | "output_type": "execute_result" 572 | } 573 | ], 574 | "source": [ 575 | "X = df[['tweet_count', 'hourly_text']]\n", 576 | "X.head()" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 42, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "y = df.target" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 44, 591 | "metadata": {}, 592 | "outputs": [], 593 | "source": [ 594 | "X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 48, 600 | "metadata": {}, 601 | "outputs": [ 602 | { 603 | "data": { 604 | "text/plain": [ 605 | "(86,)" 606 | ] 607 | }, 608 | "execution_count": 48, 609 | "metadata": {}, 610 | "output_type": "execute_result" 611 | } 612 | ], 613 | "source": [ 614 | "X_train.shape # (255, 2)\n", 615 | "X_test.shape # (86, 2)\n", 616 | "y_train.shape # (255,)\n", 617 | "y_test.shape # (86,)" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 51, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "# cvect = CountVectorizer(lowercase=True, stop_words='english',max_df=1.0, min_df=1, \n", 627 | "# max_features=None)\n", 628 | "cvect = CountVectorizer()" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 63, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", 640 | " dtype=, encoding='utf-8', input='content',\n", 641 | " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", 642 | " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", 643 | " strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", 644 | " tokenizer=None, vocabulary=None)" 645 | ] 646 | }, 647 | "execution_count": 63, 648 | "metadata": {}, 649 | "output_type": "execute_result" 650 | } 651 | ], 652 | "source": [ 653 | "cvect.fit(X_train.hourly_text)" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 69, 659 | "metadata": {}, 660 | "outputs": [ 661 | { 662 | "data": { 663 | "text/plain": [ 664 | "['00', '000', '00pm', '03j9p2auhc', '04']" 665 | ] 666 | }, 667 | "execution_count": 69, 668 | "metadata": {}, 669 | "output_type": "execute_result" 670 | } 671 | ], 672 | "source": [ 673 | "len(cvect.get_feature_names()) # 5085 = X_train\n", 674 | "# 6187 = X\n", 675 | "cvect.get_feature_names()[:5]" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": 77, 681 | "metadata": {}, 682 | "outputs": [], 683 | "source": [ 684 | "X_train_vect = cvect.transform(X_train.hourly_text)" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 88, 690 | "metadata": {}, 691 
| "outputs": [ 692 | { 693 | "data": { 694 | "text/plain": [ 695 | "array([[0, 0, 0, ..., 0, 0, 0],\n", 696 | " [0, 0, 0, ..., 0, 0, 0],\n", 697 | " [0, 0, 0, ..., 0, 0, 0],\n", 698 | " ...,\n", 699 | " [0, 0, 0, ..., 0, 0, 0],\n", 700 | " [0, 0, 0, ..., 0, 0, 0],\n", 701 | " [0, 0, 0, ..., 0, 0, 0]])" 702 | ] 703 | }, 704 | "execution_count": 88, 705 | "metadata": {}, 706 | "output_type": "execute_result" 707 | } 708 | ], 709 | "source": [ 710 | "X_train_vect\n", 711 | "# <255x5085 sparse matrix of type ''\n", 712 | "# \twith 16165 stored elements in Compressed Sparse Row format>\n", 713 | "X_train_vect.toarray()" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 74, 719 | "metadata": {}, 720 | "outputs": [ 721 | { 722 | "data": { 723 | "text/plain": [ 724 | "(86, 2)" 725 | ] 726 | }, 727 | "execution_count": 74, 728 | "metadata": {}, 729 | "output_type": "execute_result" 730 | } 731 | ], 732 | "source": [ 733 | "X_train.shape # (255, 2)\n", 734 | "X.shape # (341, 2)\n", 735 | "X_test.shape # (86, 2)" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": 80, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [ 744 | "lr = LogisticRegressionCV()" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 81, 750 | "metadata": {}, 751 | "outputs": [ 752 | { 753 | "name": "stderr", 754 | "output_type": "stream", 755 | "text": [ 756 | "/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.\n", 757 | " warnings.warn(CV_WARNING, FutureWarning)\n" 758 | ] 759 | }, 760 | { 761 | "data": { 762 | "text/plain": [ 763 | "LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,\n", 764 | " fit_intercept=True, intercept_scaling=1.0, max_iter=100,\n", 765 | " multi_class='warn', n_jobs=None, penalty='l2',\n", 766 | " random_state=None, refit=True, scoring=None, solver='lbfgs',\n", 767 | " tol=0.0001, verbose=0)" 768 | ] 769 | }, 770 | "execution_count": 81, 771 | "metadata": {}, 772 | "output_type": "execute_result" 773 | } 774 | ], 775 | "source": [ 776 | "lr.fit(X_train_vect, y_train)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 82, 782 | "metadata": {}, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "0.9882352941176471" 788 | ] 789 | }, 790 | "execution_count": 82, 791 | "metadata": {}, 792 | "output_type": "execute_result" 793 | } 794 | ], 795 | "source": [ 796 | "lr.score(X_train_vect, y_train)\n", 797 | "# 0.9882352941176471" 798 | ] 799 | }, 800 | { 801 | "cell_type": "markdown", 802 | "metadata": {}, 803 | "source": [ 804 | "### So the training score was 0.988" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": 83, 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [ 813 | "X_test_vect = cvect.transform(X_test.hourly_text)" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": 84, 819 | "metadata": {}, 820 | "outputs": [ 821 | { 822 | "data": { 823 | "text/plain": [ 824 | "0.8255813953488372" 825 | ] 826 | }, 827 | "execution_count": 84, 828 | "metadata": {}, 829 | "output_type": "execute_result" 830 | } 831 | ], 832 | "source": [ 833 | "lr.score(X_test_vect, y_test)\n", 834 | "# 0.8255813953488372" 835 | ] 836 | }, 837 | { 838 | "cell_type": "markdown", 839 | "metadata": {}, 840 | "source": [ 841 | "### The test score was 
0.8256, which is significantly lower than the training score (0.9882) and indicates the model is overfit."
842 |    ]
843 |   },
844 |   {
845 |    "cell_type": "markdown",
846 |    "metadata": {},
847 |    "source": [
848 |     "### Now we will identify the words that carried the most weight in identifying the target variable."
849 |    ]
850 |   },
851 |   {
852 |    "cell_type": "code",
853 |    "execution_count": 85,
854 |    "metadata": {},
855 |    "outputs": [
856 |     {
857 |      "data": {
858 |       "text/plain": [
859 |        "array([[ 0.1396091 ,  0.02024425, -0.0139385 , ..., -0.00707096,\n",
860 |        "        -0.00707096, -0.00707096]])"
861 |       ]
862 |      },
863 |      "execution_count": 85,
864 |      "metadata": {},
865 |      "output_type": "execute_result"
866 |     }
867 |    ],
868 |    "source": [
869 |     "lr.coef_"
870 |    ]
871 |   },
872 |   {
873 |    "cell_type": "code",
874 |    "execution_count": 117,
875 |    "metadata": {},
876 |    "outputs": [
877 |     {
878 |      "data": {
879 |       "text/plain": [
880 |        "'100'"
881 |       ]
882 |      },
883 |      "execution_count": 117,
884 |      "metadata": {},
885 |      "output_type": "execute_result"
886 |     }
887 |    ],
888 |    "source": [
889 |     "coefs = lr.coef_[0]\n",
890 |     "features_array = np.array(cvect.get_feature_names())  # maps column index -> token\n",
891 |     "top_ten = np.argpartition(coefs, -10)[-10:]  # indices of the ten largest coefficients\n",
892 |     "top_ten[np.argsort(coefs[top_ten])]  # sorted ascending by coefficient\n",
893 |     "# array([ 393,   41, 4579, 2900, 3288,   74,   28, 3654,  337, 2267]) \n",
894 |     "# last one is most important\n",
895 |     "\n",
896 |     "# In order of importance - most important first\n",
897 |     "#features_array[2267] # 'hurricanemichael'\n",
898 |     "# features_array[337] # '80'\n",
899 |     "# features_array[3654] # 'rain'\n",
900 |     "#features_array[28] # '100'\n",
901 |     "#features_array[74] # '13mph'\n",
902 |     "#features_array[3288] # 'panhandle'\n",
903 |     "#features_array[2900] # 'michael'\n",
904 |     "#features_array[4579] # 'tyndall'\n",
905 |     "#features_array[41] # '1027mb'\n",
906 |     "#features_array[393] # '9mph'"
907 |    ]
908 |   },
909 |   {
910 |    "cell_type": "markdown",
911 |    "metadata": {},
912 |    "source": [
913 |     "### The ten most significant words were\n",
914 |     "\n",
915 |     "### hurricanemichael, 80, rain, 100, 13mph, panhandle, michael, tyndall, 1027mb, 9mph"
916 |    ]
917 |   },
918 |   {
919 |    "cell_type": "code",
920 |    "execution_count": 91,
921 |    "metadata": {},
922 |    "outputs": [
923 |     {
924 |      "data": {
925 |       "text/plain": [
926 |        "array([ 0.1396091 ,  0.02024425, -0.0139385 , ..., -0.00707096,\n",
927 |        "       -0.00707096, -0.00707096])"
928 |       ]
929 |      },
930 |      "execution_count": 91,
931 |      "metadata": {},
932 |      "output_type": "execute_result"
933 |     }
934 |    ],
935 |    "source": [
936 |     "lr.coef_[0]"
937 |    ]
938 |   },
939 |   {
940 |    "cell_type": "markdown",
941 |    "metadata": {},
942 |    "source": [
943 |     "### Now we will look at how many false positives and false negatives we had."
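,
    "\n",
    "A sketch of tabulating these counts directly with scikit-learn, using the `y_test`, `lr`, and `X_test_vect` objects defined above:\n",
    "\n",
    "```python\n",
    "from sklearn.metrics import confusion_matrix\n",
    "\n",
    "preds = lr.predict(X_test_vect)\n",
    "# For binary labels, ravel() unpacks [[tn, fp], [fn, tp]] over the 86 test hours\n",
    "tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()\n",
    "```"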
944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 122, 949 | "metadata": {}, 950 | "outputs": [ 951 | { 952 | "data": { 953 | "text/plain": [ 954 | "pandas.core.series.Series" 955 | ] 956 | }, 957 | "execution_count": 122, 958 | "metadata": {}, 959 | "output_type": "execute_result" 960 | } 961 | ], 962 | "source": [ 963 | "preds_array = lr.predict(X_test_vect)\n", 964 | "#y_test\n", 965 | "#y_test_preds = y_test.assign(target = preds_array)\n", 966 | "type(y_test) # pandas.core.series.Series" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": 139, 972 | "metadata": {}, 973 | "outputs": [ 974 | { 975 | "data": { 976 | "text/plain": [ 977 | "numpy.ndarray" 978 | ] 979 | }, 980 | "execution_count": 139, 981 | "metadata": {}, 982 | "output_type": "execute_result" 983 | } 984 | ], 985 | "source": [ 986 | "#cvect.get_feature_names()\n", 987 | "type(preds_array) # numpy.ndarray" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": 142, 993 | "metadata": {}, 994 | "outputs": [ 995 | { 996 | "data": { 997 | "text/html": [ 998 | "
\n", 999 | "\n", 1012 | "\n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | "
tweet_counthourly_text
2018-10-17 01:00:005#hurricanemichael photos @rickcyoung @ Bay Cou...
2018-10-07 01:00:004Kisses for the couple #itskloiberingtime #rosi...
2018-10-06 22:00:007Panama City FL Sat Oct 6th PM Forecast: TONIGH...
2018-10-03 04:00:001light intensity drizzle -&gt; clear sky\\ntempe...
2018-10-07 16:00:005I'm at Grayton Beach State Park in Santa Rosa ...
\n", 1048 | "
" 1049 | ], 1050 | "text/plain": [ 1051 | " tweet_count \\\n", 1052 | "2018-10-17 01:00:00 5 \n", 1053 | "2018-10-07 01:00:00 4 \n", 1054 | "2018-10-06 22:00:00 7 \n", 1055 | "2018-10-03 04:00:00 1 \n", 1056 | "2018-10-07 16:00:00 5 \n", 1057 | "\n", 1058 | " hourly_text \n", 1059 | "2018-10-17 01:00:00 #hurricanemichael photos @rickcyoung @ Bay Cou... \n", 1060 | "2018-10-07 01:00:00 Kisses for the couple #itskloiberingtime #rosi... \n", 1061 | "2018-10-06 22:00:00 Panama City FL Sat Oct 6th PM Forecast: TONIGH... \n", 1062 | "2018-10-03 04:00:00 light intensity drizzle -> clear sky\\ntempe... \n", 1063 | "2018-10-07 16:00:00 I'm at Grayton Beach State Park in Santa Rosa ... " 1064 | ] 1065 | }, 1066 | "execution_count": 142, 1067 | "metadata": {}, 1068 | "output_type": "execute_result" 1069 | } 1070 | ], 1071 | "source": [ 1072 | "X_test.shape # (86, 2)\n", 1073 | "X_test.head()" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": 147, 1079 | "metadata": {}, 1080 | "outputs": [], 1081 | "source": [ 1082 | "X_test_actual = X_test.assign(actual=y_test)" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "execution_count": 149, 1088 | "metadata": {}, 1089 | "outputs": [ 1090 | { 1091 | "data": { 1092 | "text/html": [ 1093 | "
\n", 1094 | "\n", 1107 | "\n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | "
tweet_counthourly_textactual
2018-10-17 01:00:005#hurricanemichael photos @rickcyoung @ Bay Cou...0
2018-10-07 01:00:004Kisses for the couple #itskloiberingtime #rosi...0
2018-10-06 22:00:007Panama City FL Sat Oct 6th PM Forecast: TONIGH...0
2018-10-03 04:00:001light intensity drizzle -&gt; clear sky\\ntempe...0
2018-10-07 16:00:005I'm at Grayton Beach State Park in Santa Rosa ...0
\n", 1149 | "
" 1150 | ], 1151 | "text/plain": [ 1152 | " tweet_count \\\n", 1153 | "2018-10-17 01:00:00 5 \n", 1154 | "2018-10-07 01:00:00 4 \n", 1155 | "2018-10-06 22:00:00 7 \n", 1156 | "2018-10-03 04:00:00 1 \n", 1157 | "2018-10-07 16:00:00 5 \n", 1158 | "\n", 1159 | " hourly_text actual \n", 1160 | "2018-10-17 01:00:00 #hurricanemichael photos @rickcyoung @ Bay Cou... 0 \n", 1161 | "2018-10-07 01:00:00 Kisses for the couple #itskloiberingtime #rosi... 0 \n", 1162 | "2018-10-06 22:00:00 Panama City FL Sat Oct 6th PM Forecast: TONIGH... 0 \n", 1163 | "2018-10-03 04:00:00 light intensity drizzle -> clear sky\\ntempe... 0 \n", 1164 | "2018-10-07 16:00:00 I'm at Grayton Beach State Park in Santa Rosa ... 0 " 1165 | ] 1166 | }, 1167 | "execution_count": 149, 1168 | "metadata": {}, 1169 | "output_type": "execute_result" 1170 | } 1171 | ], 1172 | "source": [ 1173 | "X_test_actual.head()" 1174 | ] 1175 | }, 1176 | { 1177 | "cell_type": "code", 1178 | "execution_count": 154, 1179 | "metadata": {}, 1180 | "outputs": [], 1181 | "source": [ 1182 | "X_test_actual_preds = X_test_actual.assign(predicted=preds_array)" 1183 | ] 1184 | }, 1185 | { 1186 | "cell_type": "markdown", 1187 | "metadata": {}, 1188 | "source": [ 1189 | "### Results:\n", 1190 | "### 10 are correctly predicted as hurricane out of 14 total that are predicted as hurricane.\n", 1191 | "### 11 are incorrectly marked as not hurricane out of 21 actual hurricane flags" 1192 | ] 1193 | }, 1194 | { 1195 | "cell_type": "code", 1196 | "execution_count": 164, 1197 | "metadata": {}, 1198 | "outputs": [ 1199 | { 1200 | "data": { 1201 | "text/html": [ 1202 | "
\n", 1203 | "\n", 1216 | "\n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | "
tweet_counthourly_textactualpredicted
2018-10-12 05:00:002🌀 #TeamTrimew #prayforflorida #panamabeach #pa...10
2018-10-10 10:00:002WALL TO WALL COVERAGE!!!!!!!! TUNE INTO ALL OF...10
2018-10-10 06:00:002🌤 @ Panama City, Florida https://t.co/SRo9kDNU...10
2018-10-12 04:00:009St. John the Evangelist middle school @ St. An...11
2018-10-11 06:00:001moderate rain -&gt; light rain\\ntemperature up...11
2018-10-09 23:00:005Where did everyone go? Beautiful day out here....11
2018-10-10 04:00:004Livvie loving the waves! #goldendoodle #waves ...10
2018-10-11 14:00:003Disgruntled tourists. @ Rosemary Beach, Florid...10
2018-10-11 03:00:002Our house! But we are safe and our house has m...10
2018-10-12 14:00:004I’ve seen pictures and videos of the damage in...10
2018-10-10 21:00:004This is #history being made 🌀#hurricanemichael...11
2018-10-13 15:00:009Ground Zero: #MexicoBeach #hurricanemichael @ ...10
2018-10-10 11:00:003moderate rain -&gt; heavy intensity rain\\ntemp...11
2018-10-12 23:00:005Panama City: 6:13pm: sunset ~|~ current weathe...11
2018-10-11 02:00:005I took this from someone’s post. Interesting! ...10
2018-10-12 18:00:005#Repost tallerrally with get_repost\\n・・・\\n📲 PR...10
2018-10-10 00:00:005This is pretty much how my day went...minus me...11
2018-10-11 17:00:007The road to Mexico, FL #hurricanemichael @weat...11
2018-10-11 15:00:007Prayers for all of those affected by #Hurrican...11
2018-10-12 22:00:008Panama City FL Fri Oct 12th PM Forecast: TONIG...11
2018-10-13 18:00:002Just some of the devistation Ashley's family s...10
\n", 1376 | "
" 1377 | ], 1378 | "text/plain": [ 1379 | " tweet_count \\\n", 1380 | "2018-10-12 05:00:00 2 \n", 1381 | "2018-10-10 10:00:00 2 \n", 1382 | "2018-10-10 06:00:00 2 \n", 1383 | "2018-10-12 04:00:00 9 \n", 1384 | "2018-10-11 06:00:00 1 \n", 1385 | "2018-10-09 23:00:00 5 \n", 1386 | "2018-10-10 04:00:00 4 \n", 1387 | "2018-10-11 14:00:00 3 \n", 1388 | "2018-10-11 03:00:00 2 \n", 1389 | "2018-10-12 14:00:00 4 \n", 1390 | "2018-10-10 21:00:00 4 \n", 1391 | "2018-10-13 15:00:00 9 \n", 1392 | "2018-10-10 11:00:00 3 \n", 1393 | "2018-10-12 23:00:00 5 \n", 1394 | "2018-10-11 02:00:00 5 \n", 1395 | "2018-10-12 18:00:00 5 \n", 1396 | "2018-10-10 00:00:00 5 \n", 1397 | "2018-10-11 17:00:00 7 \n", 1398 | "2018-10-11 15:00:00 7 \n", 1399 | "2018-10-12 22:00:00 8 \n", 1400 | "2018-10-13 18:00:00 2 \n", 1401 | "\n", 1402 | " hourly_text \\\n", 1403 | "2018-10-12 05:00:00 🌀 #TeamTrimew #prayforflorida #panamabeach #pa... \n", 1404 | "2018-10-10 10:00:00 WALL TO WALL COVERAGE!!!!!!!! TUNE INTO ALL OF... \n", 1405 | "2018-10-10 06:00:00 🌤 @ Panama City, Florida https://t.co/SRo9kDNU... \n", 1406 | "2018-10-12 04:00:00 St. John the Evangelist middle school @ St. An... \n", 1407 | "2018-10-11 06:00:00 moderate rain -> light rain\\ntemperature up... \n", 1408 | "2018-10-09 23:00:00 Where did everyone go? Beautiful day out here.... \n", 1409 | "2018-10-10 04:00:00 Livvie loving the waves! #goldendoodle #waves ... \n", 1410 | "2018-10-11 14:00:00 Disgruntled tourists. @ Rosemary Beach, Florid... \n", 1411 | "2018-10-11 03:00:00 Our house! But we are safe and our house has m... \n", 1412 | "2018-10-12 14:00:00 I’ve seen pictures and videos of the damage in... \n", 1413 | "2018-10-10 21:00:00 This is #history being made 🌀#hurricanemichael... \n", 1414 | "2018-10-13 15:00:00 Ground Zero: #MexicoBeach #hurricanemichael @ ... \n", 1415 | "2018-10-10 11:00:00 moderate rain -> heavy intensity rain\\ntemp... \n", 1416 | "2018-10-12 23:00:00 Panama City: 6:13pm: sunset ~|~ current weathe... \n", 1417 | "2018-10-11 02:00:00 I took this from someone’s post. Interesting! ... \n", 1418 | "2018-10-12 18:00:00 #Repost tallerrally with get_repost\\n・・・\\n📲 PR... \n", 1419 | "2018-10-10 00:00:00 This is pretty much how my day went...minus me... \n", 1420 | "2018-10-11 17:00:00 The road to Mexico, FL #hurricanemichael @weat... \n", 1421 | "2018-10-11 15:00:00 Prayers for all of those affected by #Hurrican... \n", 1422 | "2018-10-12 22:00:00 Panama City FL Fri Oct 12th PM Forecast: TONIG... \n", 1423 | "2018-10-13 18:00:00 Just some of the devistation Ashley's family s... 
\n", 1424 | "\n", 1425 | " actual predicted \n", 1426 | "2018-10-12 05:00:00 1 0 \n", 1427 | "2018-10-10 10:00:00 1 0 \n", 1428 | "2018-10-10 06:00:00 1 0 \n", 1429 | "2018-10-12 04:00:00 1 1 \n", 1430 | "2018-10-11 06:00:00 1 1 \n", 1431 | "2018-10-09 23:00:00 1 1 \n", 1432 | "2018-10-10 04:00:00 1 0 \n", 1433 | "2018-10-11 14:00:00 1 0 \n", 1434 | "2018-10-11 03:00:00 1 0 \n", 1435 | "2018-10-12 14:00:00 1 0 \n", 1436 | "2018-10-10 21:00:00 1 1 \n", 1437 | "2018-10-13 15:00:00 1 0 \n", 1438 | "2018-10-10 11:00:00 1 1 \n", 1439 | "2018-10-12 23:00:00 1 1 \n", 1440 | "2018-10-11 02:00:00 1 0 \n", 1441 | "2018-10-12 18:00:00 1 0 \n", 1442 | "2018-10-10 00:00:00 1 1 \n", 1443 | "2018-10-11 17:00:00 1 1 \n", 1444 | "2018-10-11 15:00:00 1 1 \n", 1445 | "2018-10-12 22:00:00 1 1 \n", 1446 | "2018-10-13 18:00:00 1 0 " 1447 | ] 1448 | }, 1449 | "execution_count": 164, 1450 | "metadata": {}, 1451 | "output_type": "execute_result" 1452 | } 1453 | ], 1454 | "source": [ 1455 | "X_test_actual_preds[X_test_actual_preds.actual==1].shape # (21, 4)\n", 1456 | "X_test_actual_preds[X_test_actual_preds.predicted==1] \n", 1457 | "# 10 are correctly predicted as hurricane out of 14 total that are predicted as hurricane\n", 1458 | "\n", 1459 | "X_test_actual_preds[X_test_actual_preds.actual==1] \n", 1460 | "# 11 are incorrectly marked as not hurricane out of 21 actual hurricane flags\n", 1461 | "\n", 1462 | "#X_test.shape # (86, 2)" 1463 | ] 1464 | } 1465 | ], 1466 | "metadata": { 1467 | "kernelspec": { 1468 | "display_name": "Python 3", 1469 | "language": "python", 1470 | "name": "python3" 1471 | }, 1472 | "language_info": { 1473 | "codemirror_mode": { 1474 | "name": "ipython", 1475 | "version": 3 1476 | }, 1477 | "file_extension": ".py", 1478 | "mimetype": "text/x-python", 1479 | "name": "python", 1480 | "nbconvert_exporter": "python", 1481 | "pygments_lexer": "ipython3", 1482 | "version": "3.7.0" 1483 | } 1484 | }, 1485 | "nbformat": 4, 1486 | "nbformat_minor": 2 1487 | } 1488 | --------------------------------------------------------------------------------