├── Cleaning Data with Python.ipynb ├── Cleaning_Data_with_SQL_Part1.sql ├── Exploring Data with Python (EDA).ipynb ├── Exploring_Data_SQL_Part1.sql ├── LEGO Data Analysis - Answered.ipynb ├── LEGO Data Analysis - Rebrickable.ipynb ├── LEGO_Data_Analysis.sql ├── RFM_Segmentation_Sales_Analysis_Main.sql ├── Visualizing_Data_with_Tableau.sql └── cohort_rentention_analysis.sql /Cleaning Data with Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f78f26ee", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "#importing libraries\n", 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "0ed3cdb1", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#Getting Data\n", 23 | "data = pd.read_excel(\"SBA Table of Size Standards_Effective Aug 19, 2019.xlsx\", sheet_name = 'table_of_size_standards-all')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "id": "28101559", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/html": [ 35 | "
\n", 36 | "\n", 49 | "\n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | "
NAICS CodesNAICS Industry DescriptionSize Standards \\nin millions of dollarsSize standards in number of employeesFootnotes
0NaNSector 11 – Agriculture, Forestry, Fishing and...NaNNaNNaN
1Subsector 111 – Crop ProductionNaNNaNNaNNaN
2111110Soybean Farming1NaNNaN
3111120Oilseed (except Soybean) Farming1NaNNaN
4111130Dry Pea and Bean Farming1NaNNaN
..................
1144813930Labor Unions and Similar Labor Organizations8NaNNaN
1145813940Political Organizations8NaNNaN
1146813990Other Similar Organizations (except Business, ...8NaNNaN
1147NaNSector 92 – Public AdministrationNaNNaNSee footnote 19
1148NaN(Small business size standards are not establi...NaNNaNNaN
\n", 151 | "

1149 rows × 5 columns

\n", 152 | "
" 153 | ], 154 | "text/plain": [ 155 | " NAICS Codes \\\n", 156 | "0 NaN \n", 157 | "1 Subsector 111 – Crop Production \n", 158 | "2 111110 \n", 159 | "3 111120 \n", 160 | "4 111130 \n", 161 | "... ... \n", 162 | "1144 813930 \n", 163 | "1145 813940 \n", 164 | "1146 813990 \n", 165 | "1147 NaN \n", 166 | "1148 NaN \n", 167 | "\n", 168 | " NAICS Industry Description \\\n", 169 | "0 Sector 11 – Agriculture, Forestry, Fishing and... \n", 170 | "1 NaN \n", 171 | "2 Soybean Farming \n", 172 | "3 Oilseed (except Soybean) Farming \n", 173 | "4 Dry Pea and Bean Farming \n", 174 | "... ... \n", 175 | "1144 Labor Unions and Similar Labor Organizations \n", 176 | "1145 Political Organizations \n", 177 | "1146 Other Similar Organizations (except Business, ... \n", 178 | "1147 Sector 92 – Public Administration \n", 179 | "1148 (Small business size standards are not establi... \n", 180 | "\n", 181 | " Size Standards \\nin millions of dollars \\\n", 182 | "0 NaN \n", 183 | "1 NaN \n", 184 | "2 1 \n", 185 | "3 1 \n", 186 | "4 1 \n", 187 | "... ... \n", 188 | "1144 8 \n", 189 | "1145 8 \n", 190 | "1146 8 \n", 191 | "1147 NaN \n", 192 | "1148 NaN \n", 193 | "\n", 194 | " Size standards in number of employees Footnotes \n", 195 | "0 NaN NaN \n", 196 | "1 NaN NaN \n", 197 | "2 NaN NaN \n", 198 | "3 NaN NaN \n", 199 | "4 NaN NaN \n", 200 | "... ... ... \n", 201 | "1144 NaN NaN \n", 202 | "1145 NaN NaN \n", 203 | "1146 NaN NaN \n", 204 | "1147 NaN See footnote 19 \n", 205 | "1148 NaN NaN \n", 206 | "\n", 207 | "[1149 rows x 5 columns]" 208 | ] 209 | }, 210 | "execution_count": 3, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "data" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 4, 222 | "id": "9b5e1e9d", 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/html": [ 228 | "
\n", 229 | "\n", 242 | "\n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | "
NAICS CodesNAICS Industry DescriptionSize Standards \\nin millions of dollarsSize standards in number of employeesFootnotes
0NaNSector 11 – Agriculture, Forestry, Fishing and...NaNNaNNaN
1Subsector 111 – Crop ProductionNaNNaNNaNNaN
2111110Soybean Farming1NaNNaN
3111120Oilseed (except Soybean) Farming1NaNNaN
4111130Dry Pea and Bean Farming1NaNNaN
\n", 296 | "
" 297 | ], 298 | "text/plain": [ 299 | " NAICS Codes \\\n", 300 | "0 NaN \n", 301 | "1 Subsector 111 – Crop Production \n", 302 | "2 111110 \n", 303 | "3 111120 \n", 304 | "4 111130 \n", 305 | "\n", 306 | " NAICS Industry Description \\\n", 307 | "0 Sector 11 – Agriculture, Forestry, Fishing and... \n", 308 | "1 NaN \n", 309 | "2 Soybean Farming \n", 310 | "3 Oilseed (except Soybean) Farming \n", 311 | "4 Dry Pea and Bean Farming \n", 312 | "\n", 313 | " Size Standards \\nin millions of dollars \\\n", 314 | "0 NaN \n", 315 | "1 NaN \n", 316 | "2 1 \n", 317 | "3 1 \n", 318 | "4 1 \n", 319 | "\n", 320 | " Size standards in number of employees Footnotes \n", 321 | "0 NaN NaN \n", 322 | "1 NaN NaN \n", 323 | "2 NaN NaN \n", 324 | "3 NaN NaN \n", 325 | "4 NaN NaN " 326 | ] 327 | }, 328 | "execution_count": 4, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "data.head()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 5, 340 | "id": "d4cd6801", 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/html": [ 346 | "
\n", 347 | "\n", 360 | "\n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | "
NAICS CodesNAICS Industry DescriptionSize Standards \\nin millions of dollarsSize standards in number of employeesFootnotes
1144813930Labor Unions and Similar Labor Organizations8NaNNaN
1145813940Political Organizations8NaNNaN
1146813990Other Similar Organizations (except Business, ...8NaNNaN
1147NaNSector 92 – Public AdministrationNaNNaNSee footnote 19
1148NaN(Small business size standards are not establi...NaNNaNNaN
\n", 414 | "
" 415 | ], 416 | "text/plain": [ 417 | " NAICS Codes NAICS Industry Description \\\n", 418 | "1144 813930 Labor Unions and Similar Labor Organizations \n", 419 | "1145 813940 Political Organizations \n", 420 | "1146 813990 Other Similar Organizations (except Business, ... \n", 421 | "1147 NaN Sector 92 – Public Administration \n", 422 | "1148 NaN (Small business size standards are not establi... \n", 423 | "\n", 424 | " Size Standards \\nin millions of dollars \\\n", 425 | "1144 8 \n", 426 | "1145 8 \n", 427 | "1146 8 \n", 428 | "1147 NaN \n", 429 | "1148 NaN \n", 430 | "\n", 431 | " Size standards in number of employees Footnotes \n", 432 | "1144 NaN NaN \n", 433 | "1145 NaN NaN \n", 434 | "1146 NaN NaN \n", 435 | "1147 NaN See footnote 19 \n", 436 | "1148 NaN NaN " 437 | ] 438 | }, 439 | "execution_count": 5, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "data.tail()" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 6, 451 | "id": "347079e1", 452 | "metadata": {}, 453 | "outputs": [ 454 | { 455 | "name": "stdout", 456 | "output_type": "stream", 457 | "text": [ 458 | "\n", 459 | "RangeIndex: 1149 entries, 0 to 1148\n", 460 | "Data columns (total 5 columns):\n", 461 | " # Column Non-Null Count Dtype \n", 462 | "--- ------ -------------- ----- \n", 463 | " 0 NAICS Codes 1126 non-null object \n", 464 | " 1 NAICS Industry Description 1060 non-null object \n", 465 | " 2 Size Standards \n", 466 | "in millions of dollars 532 non-null object \n", 467 | " 3 Size standards in number of employees 505 non-null float64\n", 468 | " 4 Footnotes 39 non-null object \n", 469 | "dtypes: float64(1), object(4)\n", 470 | "memory usage: 45.0+ KB\n" 471 | ] 472 | } 473 | ], 474 | "source": [ 475 | "data.info()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 7, 481 | "id": "1bae2125", 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/plain": [ 487 | 
"NAICS Codes 23\n", 488 | "NAICS Industry Description 89\n", 489 | "Size Standards \\nin millions of dollars 617\n", 490 | "Size standards in number of employees 644\n", 491 | "Footnotes 1110\n", 492 | "dtype: int64" 493 | ] 494 | }, 495 | "execution_count": 7, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "#checking for null\n", 502 | "data.isnull().sum()" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 9, 508 | "id": "559ad113", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "#selecting only the relevant records\n", 513 | "data = data [data['NAICS Codes'].isnull()]" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 10, 519 | "id": "946bd789", 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "name": "stdout", 524 | "output_type": "stream", 525 | "text": [ 526 | "\n", 527 | "Int64Index: 23 entries, 0 to 1148\n", 528 | "Data columns (total 5 columns):\n", 529 | " # Column Non-Null Count Dtype \n", 530 | "--- ------ -------------- ----- \n", 531 | " 0 NAICS Codes 0 non-null object \n", 532 | " 1 NAICS Industry Description 23 non-null object \n", 533 | " 2 Size Standards \n", 534 | "in millions of dollars 0 non-null object \n", 535 | " 3 Size standards in number of employees 0 non-null float64\n", 536 | " 4 Footnotes 1 non-null object \n", 537 | "dtypes: float64(1), object(4)\n", 538 | "memory usage: 1.1+ KB\n" 539 | ] 540 | } 541 | ], 542 | "source": [ 543 | "data.info()" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 11, 549 | "id": "c30a02d7", 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "data": { 554 | "text/html": [ 555 | "
\n", 556 | "\n", 569 | "\n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | "
NAICS CodesNAICS Industry DescriptionSize Standards \\nin millions of dollarsSize standards in number of employeesFootnotes
0NaNSector 11 – Agriculture, Forestry, Fishing and...NaNNaNNaN
71NaNSector 21 – Mining, Quarrying, and Oil and Gas...NaNNaNNaN
103NaNSector 22 – UtilitiesNaNNaNNaN
119NaNSector 23 – ConstructionNaNNaNNaN
156NaNSector 31 – 33 – ManufacturingNaNNaNNaN
\n", 623 | "
" 624 | ], 625 | "text/plain": [ 626 | " NAICS Codes NAICS Industry Description \\\n", 627 | "0 NaN Sector 11 – Agriculture, Forestry, Fishing and... \n", 628 | "71 NaN Sector 21 – Mining, Quarrying, and Oil and Gas... \n", 629 | "103 NaN Sector 22 – Utilities \n", 630 | "119 NaN Sector 23 – Construction \n", 631 | "156 NaN Sector 31 – 33 – Manufacturing \n", 632 | "\n", 633 | " Size Standards \\nin millions of dollars \\\n", 634 | "0 NaN \n", 635 | "71 NaN \n", 636 | "103 NaN \n", 637 | "119 NaN \n", 638 | "156 NaN \n", 639 | "\n", 640 | " Size standards in number of employees Footnotes \n", 641 | "0 NaN NaN \n", 642 | "71 NaN NaN \n", 643 | "103 NaN NaN \n", 644 | "119 NaN NaN \n", 645 | "156 NaN NaN " 646 | ] 647 | }, 648 | "execution_count": 11, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "data.head()" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 12, 660 | "id": "5e56fe75", 661 | "metadata": {}, 662 | "outputs": [ 663 | { 664 | "name": "stderr", 665 | "output_type": "stream", 666 | "text": [ 667 | "C:\\Users\\AFRIMP~1\\AppData\\Local\\Temp/ipykernel_37288/359482711.py:1: SettingWithCopyWarning: \n", 668 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 669 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 670 | "\n", 671 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 672 | " data['LookupCodes'] = data['NAICS Industry Description'].str.slice(start = 7, stop = 9)\n" 673 | ] 674 | } 675 | ], 676 | "source": [ 677 | "data['LookupCodes'] = data['NAICS Industry Description'].str.slice(start = 7, stop = 9)" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 13, 683 | "id": "91bb5074", 684 | "metadata": {}, 685 | "outputs": [ 686 | { 687 | "data": { 688 | "text/html": [ 689 | "
\n", 690 | "\n", 703 | "\n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | "
NAICS CodesNAICS Industry DescriptionSize Standards \\nin millions of dollarsSize standards in number of employeesFootnotesLookupCodes
0NaNSector 11 – Agriculture, Forestry, Fishing and...NaNNaNNaN11
71NaNSector 21 – Mining, Quarrying, and Oil and Gas...NaNNaNNaN21
103NaNSector 22 – UtilitiesNaNNaNNaN22
119NaNSector 23 – ConstructionNaNNaNNaN23
156NaNSector 31 – 33 – ManufacturingNaNNaNNaN31
\n", 763 | "
" 764 | ], 765 | "text/plain": [ 766 | " NAICS Codes NAICS Industry Description \\\n", 767 | "0 NaN Sector 11 – Agriculture, Forestry, Fishing and... \n", 768 | "71 NaN Sector 21 – Mining, Quarrying, and Oil and Gas... \n", 769 | "103 NaN Sector 22 – Utilities \n", 770 | "119 NaN Sector 23 – Construction \n", 771 | "156 NaN Sector 31 – 33 – Manufacturing \n", 772 | "\n", 773 | " Size Standards \\nin millions of dollars \\\n", 774 | "0 NaN \n", 775 | "71 NaN \n", 776 | "103 NaN \n", 777 | "119 NaN \n", 778 | "156 NaN \n", 779 | "\n", 780 | " Size standards in number of employees Footnotes LookupCodes \n", 781 | "0 NaN NaN 11 \n", 782 | "71 NaN NaN 21 \n", 783 | "103 NaN NaN 22 \n", 784 | "119 NaN NaN 23 \n", 785 | "156 NaN NaN 31 " 786 | ] 787 | }, 788 | "execution_count": 13, 789 | "metadata": {}, 790 | "output_type": "execute_result" 791 | } 792 | ], 793 | "source": [ 794 | "data.head()" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 14, 800 | "id": "bf7c114c", 801 | "metadata": {}, 802 | "outputs": [ 803 | { 804 | "data": { 805 | "text/plain": [ 806 | "0 54\n", 807 | "71 57\n", 808 | "103 21\n", 809 | "119 24\n", 810 | "156 30\n", 811 | "538 27\n", 812 | "539 537\n", 813 | "614 29\n", 814 | "615 537\n", 815 | "694 47\n", 816 | "764 23\n", 817 | "802 33\n", 818 | "847 46\n", 819 | "875 59\n", 820 | "932 51\n", 821 | "936 81\n", 822 | "984 32\n", 823 | "1004 45\n", 824 | "1048 46\n", 825 | "1077 43\n", 826 | "1095 26\n", 827 | "1147 33\n", 828 | "1148 284\n", 829 | "Name: NAICS Industry Description, dtype: int64" 830 | ] 831 | }, 832 | "execution_count": 14, 833 | "metadata": {}, 834 | "output_type": "execute_result" 835 | } 836 | ], 837 | "source": [ 838 | "data['NAICS Industry Description'].str.len()" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 17, 844 | "id": "c663ca5f", 845 | "metadata": {}, 846 | "outputs": [ 847 | { 848 | "name": "stderr", 849 | "output_type": "stream", 850 | "text": [ 851 | 
"C:\\Users\\AFRIMP~1\\AppData\\Local\\Temp/ipykernel_37288/4027759782.py:1: SettingWithCopyWarning: \n", 852 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 853 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 854 | "\n", 855 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 856 | " data['Sector'] = data['NAICS Industry Description'].str.slice(start = 12, stop = 538)\n" 857 | ] 858 | } 859 | ], 860 | "source": [ 861 | "data['Sector'] = data['NAICS Industry Description'].str.slice(start = 12, stop = 538)" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": 18, 867 | "id": "5906f4be", 868 | "metadata": {}, 869 | "outputs": [ 870 | { 871 | "data": { 872 | "text/html": [ 873 | "
\n", 874 | "\n", 887 | "\n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | "
NAICS CodesNAICS Industry DescriptionSize Standards \\nin millions of dollarsSize standards in number of employeesFootnotesLookupCodesSector
0NaNSector 11 – Agriculture, Forestry, Fishing and...NaNNaNNaN11Agriculture, Forestry, Fishing and Hunting
71NaNSector 21 – Mining, Quarrying, and Oil and Gas...NaNNaNNaN21Mining, Quarrying, and Oil and Gas Extraction
103NaNSector 22 – UtilitiesNaNNaNNaN22Utilities
119NaNSector 23 – ConstructionNaNNaNNaN23Construction
156NaNSector 31 – 33 – ManufacturingNaNNaNNaN3133 – Manufacturing
\n", 953 | "
" 954 | ], 955 | "text/plain": [ 956 | " NAICS Codes NAICS Industry Description \\\n", 957 | "0 NaN Sector 11 – Agriculture, Forestry, Fishing and... \n", 958 | "71 NaN Sector 21 – Mining, Quarrying, and Oil and Gas... \n", 959 | "103 NaN Sector 22 – Utilities \n", 960 | "119 NaN Sector 23 – Construction \n", 961 | "156 NaN Sector 31 – 33 – Manufacturing \n", 962 | "\n", 963 | " Size Standards \\nin millions of dollars \\\n", 964 | "0 NaN \n", 965 | "71 NaN \n", 966 | "103 NaN \n", 967 | "119 NaN \n", 968 | "156 NaN \n", 969 | "\n", 970 | " Size standards in number of employees Footnotes LookupCodes \\\n", 971 | "0 NaN NaN 11 \n", 972 | "71 NaN NaN 21 \n", 973 | "103 NaN NaN 22 \n", 974 | "119 NaN NaN 23 \n", 975 | "156 NaN NaN 31 \n", 976 | "\n", 977 | " Sector \n", 978 | "0 Agriculture, Forestry, Fishing and Hunting \n", 979 | "71 Mining, Quarrying, and Oil and Gas Extraction \n", 980 | "103 Utilities \n", 981 | "119 Construction \n", 982 | "156 33 – Manufacturing " 983 | ] 984 | }, 985 | "execution_count": 18, 986 | "metadata": {}, 987 | "output_type": "execute_result" 988 | } 989 | ], 990 | "source": [ 991 | "data.head()" 992 | ] 993 | }, 994 | { 995 | "cell_type": "code", 996 | "execution_count": 22, 997 | "id": "920b9475", 998 | "metadata": {}, 999 | "outputs": [], 1000 | "source": [ 1001 | "data = data [['NAICS Industry Description', 'LookupCodes', 'Sector']]" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "code", 1006 | "execution_count": 23, 1007 | "id": "477b52c6", 1008 | "metadata": {}, 1009 | "outputs": [ 1010 | { 1011 | "name": "stdout", 1012 | "output_type": "stream", 1013 | "text": [ 1014 | "\n", 1015 | "Int64Index: 23 entries, 0 to 1148\n", 1016 | "Data columns (total 3 columns):\n", 1017 | " # Column Non-Null Count Dtype \n", 1018 | "--- ------ -------------- ----- \n", 1019 | " 0 NAICS Industry Description 23 non-null object\n", 1020 | " 1 LookupCodes 23 non-null object\n", 1021 | " 2 Sector 23 non-null object\n", 1022 | "dtypes: object(3)\n", 
1023 | "memory usage: 736.0+ bytes\n" 1024 | ] 1025 | } 1026 | ], 1027 | "source": [ 1028 | "data.info()" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | "execution_count": 25, 1034 | "id": "42e8a010", 1035 | "metadata": {}, 1036 | "outputs": [ 1037 | { 1038 | "data": { 1039 | "text/html": [ 1040 | "
\n", 1041 | "\n", 1054 | "\n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " 
\n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | "
NAICS Industry DescriptionLookupCodesSector
0Sector 11 – Agriculture, Forestry, Fishing and...11Agriculture, Forestry, Fishing and Hunting
71Sector 21 – Mining, Quarrying, and Oil and Gas...21Mining, Quarrying, and Oil and Gas Extraction
103Sector 22 – Utilities22Utilities
119Sector 23 – Construction23Construction
156Sector 31 – 33 – Manufacturing3133 – Manufacturing
538Sector 42 – Wholesale Trade42Wholesale Trade
539(These NAICS codes shall not be used to classi...NAcodes shall not be used to classify Governmen...
614Sector 44 - 45 – Retail Trade4445 – Retail Trade
615(These NAICS codes shall not be used to classi...NAcodes shall not be used to classify Governmen...
694Sector 48 - 49 – Transportation and Warehousing4849 – Transportation and Warehousing
764Sector 51 – Information51Information
802Sector 52 – Finance and Insurance52Finance and Insurance
847Sector 53 – Real Estate and Rental and Leasing53Real Estate and Rental and Leasing
875Sector 54 – Professional, Scientific and Techn...54Professional, Scientific and Technical Services
932Sector 55 – Management of Companies and Enterp...55Management of Companies and Enterprises
936Sector 56 – Administrative and Support, Waste ...56Administrative and Support, Waste Management a...
984Sector 61 – Educational Services61Educational Services
1004Sector 62 – Health Care and Social Assistance62Health Care and Social Assistance
1048Sector 71 – Arts, Entertainment and Recreation71Arts, Entertainment and Recreation
1077Sector 72 – Accommodation and Food Services72Accommodation and Food Services
1095Sector 81 – Other Services81Other Services
1147Sector 92 – Public Administration92Public Administration
1148(Small business size standards are not establi...buess size standards are not established for thi...
\n", 1204 | "
" 1205 | ], 1206 | "text/plain": [ 1207 | " NAICS Industry Description LookupCodes \\\n", 1208 | "0 Sector 11 – Agriculture, Forestry, Fishing and... 11 \n", 1209 | "71 Sector 21 – Mining, Quarrying, and Oil and Gas... 21 \n", 1210 | "103 Sector 22 – Utilities 22 \n", 1211 | "119 Sector 23 – Construction 23 \n", 1212 | "156 Sector 31 – 33 – Manufacturing 31 \n", 1213 | "538 Sector 42 – Wholesale Trade 42 \n", 1214 | "539 (These NAICS codes shall not be used to classi... NA \n", 1215 | "614 Sector 44 - 45 – Retail Trade 44 \n", 1216 | "615 (These NAICS codes shall not be used to classi... NA \n", 1217 | "694 Sector 48 - 49 – Transportation and Warehousing 48 \n", 1218 | "764 Sector 51 – Information 51 \n", 1219 | "802 Sector 52 – Finance and Insurance 52 \n", 1220 | "847 Sector 53 – Real Estate and Rental and Leasing 53 \n", 1221 | "875 Sector 54 – Professional, Scientific and Techn... 54 \n", 1222 | "932 Sector 55 – Management of Companies and Enterp... 55 \n", 1223 | "936 Sector 56 – Administrative and Support, Waste ... 56 \n", 1224 | "984 Sector 61 – Educational Services 61 \n", 1225 | "1004 Sector 62 – Health Care and Social Assistance 62 \n", 1226 | "1048 Sector 71 – Arts, Entertainment and Recreation 71 \n", 1227 | "1077 Sector 72 – Accommodation and Food Services 72 \n", 1228 | "1095 Sector 81 – Other Services 81 \n", 1229 | "1147 Sector 92 – Public Administration 92 \n", 1230 | "1148 (Small business size standards are not establi... bu \n", 1231 | "\n", 1232 | " Sector \n", 1233 | "0 Agriculture, Forestry, Fishing and Hunting \n", 1234 | "71 Mining, Quarrying, and Oil and Gas Extraction \n", 1235 | "103 Utilities \n", 1236 | "119 Construction \n", 1237 | "156 33 – Manufacturing \n", 1238 | "538 Wholesale Trade \n", 1239 | "539 codes shall not be used to classify Governmen... \n", 1240 | "614 45 – Retail Trade \n", 1241 | "615 codes shall not be used to classify Governmen... 
\n", 1242 | "694 49 – Transportation and Warehousing \n", 1243 | "764 Information \n", 1244 | "802 Finance and Insurance \n", 1245 | "847 Real Estate and Rental and Leasing \n", 1246 | "875 Professional, Scientific and Technical Services \n", 1247 | "932 Management of Companies and Enterprises \n", 1248 | "936 Administrative and Support, Waste Management a... \n", 1249 | "984 Educational Services \n", 1250 | "1004 Health Care and Social Assistance \n", 1251 | "1048 Arts, Entertainment and Recreation \n", 1252 | "1077 Accommodation and Food Services \n", 1253 | "1095 Other Services \n", 1254 | "1147 Public Administration \n", 1255 | "1148 ess size standards are not established for thi... " 1256 | ] 1257 | }, 1258 | "execution_count": 25, 1259 | "metadata": {}, 1260 | "output_type": "execute_result" 1261 | } 1262 | ], 1263 | "source": [ 1264 | "data" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 | "execution_count": 27, 1270 | "id": "ffb3d068", 1271 | "metadata": {}, 1272 | "outputs": [], 1273 | "source": [ 1274 | "data = data[data['LookupCodes'].str.isnumeric()]" 1275 | ] 1276 | }, 1277 | { 1278 | "cell_type": "code", 1279 | "execution_count": 28, 1280 | "id": "ac50fdd3", 1281 | "metadata": {}, 1282 | "outputs": [ 1283 | { 1284 | "name": "stderr", 1285 | "output_type": "stream", 1286 | "text": [ 1287 | "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:1732: SettingWithCopyWarning: \n", 1288 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 1289 | "\n", 1290 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1291 | " self._setitem_single_block(indexer, value, name)\n", 1292 | "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:723: SettingWithCopyWarning: \n", 1293 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 1294 | "\n", 1295 | "See the caveats in the 
documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1296 | " iloc._setitem_with_indexer(indexer, value, self.name)\n" 1297 | ] 1298 | } 1299 | ], 1300 | "source": [ 1301 | "# replacing some values in sector\n", 1302 | "data.loc[ data['Sector'] == '33 – Manufacturing', 'Sector'] = 'Manufacturing'" 1303 | ] 1304 | }, 1305 | { 1306 | "cell_type": "code", 1307 | "execution_count": 29, 1308 | "id": "6e94eeba", 1309 | "metadata": {}, 1310 | "outputs": [ 1311 | { 1312 | "data": { 1313 | "text/html": [ 1314 | "
\n", 1315 | "\n", 1328 | "\n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | "
NAICS Industry DescriptionLookupCodesSector
0Sector 11 – Agriculture, Forestry, Fishing and...11Agriculture, Forestry, Fishing and Hunting
71Sector 21 – Mining, Quarrying, and Oil and Gas...21Mining, Quarrying, and Oil and Gas Extraction
103Sector 22 – Utilities22Utilities
119Sector 23 – Construction23Construction
156Sector 31 – 33 – Manufacturing31Manufacturing
538Sector 42 – Wholesale Trade42Wholesale Trade
614Sector 44 - 45 – Retail Trade4445 – Retail Trade
694Sector 48 - 49 – Transportation and Warehousing4849 – Transportation and Warehousing
764Sector 51 – Information51Information
802Sector 52 – Finance and Insurance52Finance and Insurance
847Sector 53 – Real Estate and Rental and Leasing53Real Estate and Rental and Leasing
875Sector 54 – Professional, Scientific and Techn...54Professional, Scientific and Technical Services
932Sector 55 – Management of Companies and Enterp...55Management of Companies and Enterprises
936Sector 56 – Administrative and Support, Waste ...56Administrative and Support, Waste Management a...
984Sector 61 – Educational Services61Educational Services
1004Sector 62 – Health Care and Social Assistance62Health Care and Social Assistance
1048Sector 71 – Arts, Entertainment and Recreation71Arts, Entertainment and Recreation
1077Sector 72 – Accommodation and Food Services72Accommodation and Food Services
1095Sector 81 – Other Services81Other Services
1147Sector 92 – Public Administration92Public Administration
\n", 1460 | "
" 1461 | ], 1462 | "text/plain": [ 1463 | " NAICS Industry Description LookupCodes \\\n", 1464 | "0 Sector 11 – Agriculture, Forestry, Fishing and... 11 \n", 1465 | "71 Sector 21 – Mining, Quarrying, and Oil and Gas... 21 \n", 1466 | "103 Sector 22 – Utilities 22 \n", 1467 | "119 Sector 23 – Construction 23 \n", 1468 | "156 Sector 31 – 33 – Manufacturing 31 \n", 1469 | "538 Sector 42 – Wholesale Trade 42 \n", 1470 | "614 Sector 44 - 45 – Retail Trade 44 \n", 1471 | "694 Sector 48 - 49 – Transportation and Warehousing 48 \n", 1472 | "764 Sector 51 – Information 51 \n", 1473 | "802 Sector 52 – Finance and Insurance 52 \n", 1474 | "847 Sector 53 – Real Estate and Rental and Leasing 53 \n", 1475 | "875 Sector 54 – Professional, Scientific and Techn... 54 \n", 1476 | "932 Sector 55 – Management of Companies and Enterp... 55 \n", 1477 | "936 Sector 56 – Administrative and Support, Waste ... 56 \n", 1478 | "984 Sector 61 – Educational Services 61 \n", 1479 | "1004 Sector 62 – Health Care and Social Assistance 62 \n", 1480 | "1048 Sector 71 – Arts, Entertainment and Recreation 71 \n", 1481 | "1077 Sector 72 – Accommodation and Food Services 72 \n", 1482 | "1095 Sector 81 – Other Services 81 \n", 1483 | "1147 Sector 92 – Public Administration 92 \n", 1484 | "\n", 1485 | " Sector \n", 1486 | "0 Agriculture, Forestry, Fishing and Hunting \n", 1487 | "71 Mining, Quarrying, and Oil and Gas Extraction \n", 1488 | "103 Utilities \n", 1489 | "119 Construction \n", 1490 | "156 Manufacturing \n", 1491 | "538 Wholesale Trade \n", 1492 | "614 45 – Retail Trade \n", 1493 | "694 49 – Transportation and Warehousing \n", 1494 | "764 Information \n", 1495 | "802 Finance and Insurance \n", 1496 | "847 Real Estate and Rental and Leasing \n", 1497 | "875 Professional, Scientific and Technical Services \n", 1498 | "932 Management of Companies and Enterprises \n", 1499 | "936 Administrative and Support, Waste Management a... 
\n", 1500 | "984 Educational Services \n", 1501 | "1004 Health Care and Social Assistance \n", 1502 | "1048 Arts, Entertainment and Recreation \n", 1503 | "1077 Accommodation and Food Services \n", 1504 | "1095 Other Services \n", 1505 | "1147 Public Administration " 1506 | ] 1507 | }, 1508 | "execution_count": 29, 1509 | "metadata": {}, 1510 | "output_type": "execute_result" 1511 | } 1512 | ], 1513 | "source": [ 1514 | "data" 1515 | ] 1516 | }, 1517 | { 1518 | "cell_type": "code", 1519 | "execution_count": 30, 1520 | "id": "a2b7c1fc", 1521 | "metadata": {}, 1522 | "outputs": [ 1523 | { 1524 | "name": "stderr", 1525 | "output_type": "stream", 1526 | "text": [ 1527 | "C:\\Users\\AFRIMP~1\\AppData\\Local\\Temp/ipykernel_37288/1237734635.py:1: SettingWithCopyWarning: \n", 1528 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1529 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1530 | "\n", 1531 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1532 | " data['Sector'] = np.where(data['Sector'] == '45 – Retail Trade', 'Retail Trade', data['Sector'])\n", 1533 | "C:\\Users\\AFRIMP~1\\AppData\\Local\\Temp/ipykernel_37288/1237734635.py:2: SettingWithCopyWarning: \n", 1534 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1535 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1536 | "\n", 1537 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1538 | " data['Sector'] = np.where(data['Sector'] == '49 – Transportation and Warehousing', 'Transportation and Warehousing', data['Sector'])\n" 1539 | ] 1540 | } 1541 | ], 1542 | "source": [ 1543 | " data['Sector'] = np.where(data['Sector'] == '45 – Retail Trade', 'Retail Trade', data['Sector'])\n", 1544 | "data['Sector'] = np.where(data['Sector'] == '49 – Transportation 
and Warehousing', 'Transportation and Warehousing', data['Sector'])" 1545 | ] 1546 | }, 1547 | { 1548 | "cell_type": "code", 1549 | "execution_count": 31, 1550 | "id": "1b5c30ad", 1551 | "metadata": {}, 1552 | "outputs": [ 1553 | { 1554 | "data": { 1555 | "text/html": [ 1556 | "
\n", 1557 | "\n", 1570 | "\n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | "
NAICS Industry DescriptionLookupCodesSector
0Sector 11 – Agriculture, Forestry, Fishing and...11Agriculture, Forestry, Fishing and Hunting
71Sector 21 – Mining, Quarrying, and Oil and Gas...21Mining, Quarrying, and Oil and Gas Extraction
103Sector 22 – Utilities22Utilities
119Sector 23 – Construction23Construction
156Sector 31 – 33 – Manufacturing31Manufacturing
538Sector 42 – Wholesale Trade42Wholesale Trade
614Sector 44 - 45 – Retail Trade44Retail Trade
694Sector 48 - 49 – Transportation and Warehousing48Transportation and Warehousing
764Sector 51 – Information51Information
802Sector 52 – Finance and Insurance52Finance and Insurance
847Sector 53 – Real Estate and Rental and Leasing53Real Estate and Rental and Leasing
875Sector 54 – Professional, Scientific and Techn...54Professional, Scientific and Technical Services
932Sector 55 – Management of Companies and Enterp...55Management of Companies and Enterprises
936Sector 56 – Administrative and Support, Waste ...56Administrative and Support, Waste Management a...
984Sector 61 – Educational Services61Educational Services
1004Sector 62 – Health Care and Social Assistance62Health Care and Social Assistance
1048Sector 71 – Arts, Entertainment and Recreation71Arts, Entertainment and Recreation
1077Sector 72 – Accommodation and Food Services72Accommodation and Food Services
1095Sector 81 – Other Services81Other Services
1147Sector 92 – Public Administration92Public Administration
\n", 1702 | "
" 1703 | ], 1704 | "text/plain": [ 1705 | " NAICS Industry Description LookupCodes \\\n", 1706 | "0 Sector 11 – Agriculture, Forestry, Fishing and... 11 \n", 1707 | "71 Sector 21 – Mining, Quarrying, and Oil and Gas... 21 \n", 1708 | "103 Sector 22 – Utilities 22 \n", 1709 | "119 Sector 23 – Construction 23 \n", 1710 | "156 Sector 31 – 33 – Manufacturing 31 \n", 1711 | "538 Sector 42 – Wholesale Trade 42 \n", 1712 | "614 Sector 44 - 45 – Retail Trade 44 \n", 1713 | "694 Sector 48 - 49 – Transportation and Warehousing 48 \n", 1714 | "764 Sector 51 – Information 51 \n", 1715 | "802 Sector 52 – Finance and Insurance 52 \n", 1716 | "847 Sector 53 – Real Estate and Rental and Leasing 53 \n", 1717 | "875 Sector 54 – Professional, Scientific and Techn... 54 \n", 1718 | "932 Sector 55 – Management of Companies and Enterp... 55 \n", 1719 | "936 Sector 56 – Administrative and Support, Waste ... 56 \n", 1720 | "984 Sector 61 – Educational Services 61 \n", 1721 | "1004 Sector 62 – Health Care and Social Assistance 62 \n", 1722 | "1048 Sector 71 – Arts, Entertainment and Recreation 71 \n", 1723 | "1077 Sector 72 – Accommodation and Food Services 72 \n", 1724 | "1095 Sector 81 – Other Services 81 \n", 1725 | "1147 Sector 92 – Public Administration 92 \n", 1726 | "\n", 1727 | " Sector \n", 1728 | "0 Agriculture, Forestry, Fishing and Hunting \n", 1729 | "71 Mining, Quarrying, and Oil and Gas Extraction \n", 1730 | "103 Utilities \n", 1731 | "119 Construction \n", 1732 | "156 Manufacturing \n", 1733 | "538 Wholesale Trade \n", 1734 | "614 Retail Trade \n", 1735 | "694 Transportation and Warehousing \n", 1736 | "764 Information \n", 1737 | "802 Finance and Insurance \n", 1738 | "847 Real Estate and Rental and Leasing \n", 1739 | "875 Professional, Scientific and Technical Services \n", 1740 | "932 Management of Companies and Enterprises \n", 1741 | "936 Administrative and Support, Waste Management a... 
\n", 1742 | "984 Educational Services \n", 1743 | "1004 Health Care and Social Assistance \n", 1744 | "1048 Arts, Entertainment and Recreation \n", 1745 | "1077 Accommodation and Food Services \n", 1746 | "1095 Other Services \n", 1747 | "1147 Public Administration " 1748 | ] 1749 | }, 1750 | "execution_count": 31, 1751 | "metadata": {}, 1752 | "output_type": "execute_result" 1753 | } 1754 | ], 1755 | "source": [ 1756 | "data" 1757 | ] 1758 | }, 1759 | { 1760 | "cell_type": "code", 1761 | "execution_count": 32, 1762 | "id": "d1bff76f", 1763 | "metadata": {}, 1764 | "outputs": [ 1765 | { 1766 | "name": "stderr", 1767 | "output_type": "stream", 1768 | "text": [ 1769 | "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:723: SettingWithCopyWarning: \n", 1770 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 1771 | "\n", 1772 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1773 | " iloc._setitem_with_indexer(indexer, value, self.name)\n" 1774 | ] 1775 | } 1776 | ], 1777 | "source": [ 1778 | "data.loc [len(data.index)] = ['Sector 31 – 33 – Manufacturing', 32, 'Manufacturing']\n", 1779 | "data.loc [len(data.index)] = ['Sector 31 – 33 – Manufacturing', 33, 'Manufacturing']\n", 1780 | "\n", 1781 | "df2 = {'NAICS Industry Description': 'Sector 44 - 45 – Retail Trade', 'LookupCodes': 45, 'Sector': 'Retail Trade'}\n", 1782 | "data = data.append(df2, ignore_index = True)" 1783 | ] 1784 | }, 1785 | { 1786 | "cell_type": "code", 1787 | "execution_count": 33, 1788 | "id": "b9ffb995", 1789 | "metadata": {}, 1790 | "outputs": [], 1791 | "source": [ 1792 | "data.loc [len(data.index)] = ['Sector 48 - 49 – Transportation and Warehousing', 49, 'Transportation and Warehousing']" 1793 | ] 1794 | }, 1795 | { 1796 | "cell_type": "code", 1797 | "execution_count": 34, 1798 | "id": "5463d597", 1799 | "metadata": {}, 1800 | "outputs": [ 1801 | { 1802 | 
"data": { 1803 | "text/html": [ 1804 | "
\n", 1805 | "\n", 1818 | "\n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | " \n", 1840 | " \n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | " \n", 1870 | " \n", 1871 | " \n", 1872 | " \n", 1873 | " \n", 1874 | " \n", 1875 | " \n", 1876 | " \n", 1877 | " \n", 1878 | " \n", 1879 | " \n", 1880 | " \n", 1881 | " \n", 1882 | " \n", 1883 | " \n", 1884 | " \n", 1885 | " \n", 1886 | " \n", 1887 | " \n", 1888 | " \n", 1889 | " \n", 1890 | " \n", 1891 | " \n", 1892 | " \n", 1893 | " \n", 1894 | " \n", 1895 | " \n", 1896 | " \n", 1897 | " \n", 1898 | " \n", 1899 | " \n", 1900 | " \n", 1901 | " \n", 1902 | " \n", 1903 | " \n", 1904 | " \n", 1905 | " \n", 1906 | " \n", 1907 | " \n", 1908 | " \n", 1909 | " \n", 1910 | " \n", 1911 | " \n", 1912 | " \n", 1913 | " \n", 1914 | " \n", 1915 | " \n", 1916 | " \n", 1917 | " \n", 1918 | " \n", 1919 | " \n", 1920 | " \n", 1921 | " \n", 1922 | " \n", 1923 | " \n", 1924 | " \n", 1925 | " \n", 1926 | " \n", 1927 | " \n", 1928 | " \n", 1929 | " \n", 1930 | " \n", 1931 | " \n", 1932 | " \n", 1933 | " \n", 1934 | " \n", 1935 | " \n", 1936 | " \n", 1937 | " \n", 1938 | " \n", 1939 | " \n", 1940 | " \n", 1941 | " \n", 1942 | " \n", 1943 | " \n", 1944 | " \n", 1945 | " \n", 1946 | " \n", 1947 | " \n", 1948 | " \n", 1949 | " \n", 1950 | " \n", 1951 | " \n", 1952 | " \n", 1953 | " \n", 1954 | " \n", 1955 | " \n", 1956 | " \n", 1957 | " \n", 1958 | " \n", 1959 | " 
\n", 1960 | " \n", 1961 | " \n", 1962 | " \n", 1963 | " \n", 1964 | " \n", 1965 | " \n", 1966 | " \n", 1967 | " \n", 1968 | " \n", 1969 | " \n", 1970 | " \n", 1971 | " \n", 1972 | " \n", 1973 | "
NAICS Industry DescriptionLookupCodesSector
0Sector 11 – Agriculture, Forestry, Fishing and...11Agriculture, Forestry, Fishing and Hunting
1Sector 21 – Mining, Quarrying, and Oil and Gas...21Mining, Quarrying, and Oil and Gas Extraction
2Sector 22 – Utilities22Utilities
3Sector 23 – Construction23Construction
4Sector 31 – 33 – Manufacturing31Manufacturing
5Sector 42 – Wholesale Trade42Wholesale Trade
6Sector 44 - 45 – Retail Trade44Retail Trade
7Sector 48 - 49 – Transportation and Warehousing48Transportation and Warehousing
8Sector 51 – Information51Information
9Sector 52 – Finance and Insurance52Finance and Insurance
10Sector 53 – Real Estate and Rental and Leasing53Real Estate and Rental and Leasing
11Sector 54 – Professional, Scientific and Techn...54Professional, Scientific and Technical Services
12Sector 55 – Management of Companies and Enterp...55Management of Companies and Enterprises
13Sector 56 – Administrative and Support, Waste ...56Administrative and Support, Waste Management a...
14Sector 61 – Educational Services61Educational Services
15Sector 62 – Health Care and Social Assistance62Health Care and Social Assistance
16Sector 71 – Arts, Entertainment and Recreation71Arts, Entertainment and Recreation
17Sector 72 – Accommodation and Food Services72Accommodation and Food Services
18Sector 81 – Other Services81Other Services
19Sector 92 – Public Administration92Public Administration
20Sector 31 – 33 – Manufacturing32Manufacturing
21Sector 31 – 33 – Manufacturing33Manufacturing
22Sector 44 - 45 – Retail Trade45Retail Trade
23Sector 48 - 49 – Transportation and Warehousing49Transportation and Warehousing
\n", 1974 | "
" 1975 | ], 1976 | "text/plain": [ 1977 | " NAICS Industry Description LookupCodes \\\n", 1978 | "0 Sector 11 – Agriculture, Forestry, Fishing and... 11 \n", 1979 | "1 Sector 21 – Mining, Quarrying, and Oil and Gas... 21 \n", 1980 | "2 Sector 22 – Utilities 22 \n", 1981 | "3 Sector 23 – Construction 23 \n", 1982 | "4 Sector 31 – 33 – Manufacturing 31 \n", 1983 | "5 Sector 42 – Wholesale Trade 42 \n", 1984 | "6 Sector 44 - 45 – Retail Trade 44 \n", 1985 | "7 Sector 48 - 49 – Transportation and Warehousing 48 \n", 1986 | "8 Sector 51 – Information 51 \n", 1987 | "9 Sector 52 – Finance and Insurance 52 \n", 1988 | "10 Sector 53 – Real Estate and Rental and Leasing 53 \n", 1989 | "11 Sector 54 – Professional, Scientific and Techn... 54 \n", 1990 | "12 Sector 55 – Management of Companies and Enterp... 55 \n", 1991 | "13 Sector 56 – Administrative and Support, Waste ... 56 \n", 1992 | "14 Sector 61 – Educational Services 61 \n", 1993 | "15 Sector 62 – Health Care and Social Assistance 62 \n", 1994 | "16 Sector 71 – Arts, Entertainment and Recreation 71 \n", 1995 | "17 Sector 72 – Accommodation and Food Services 72 \n", 1996 | "18 Sector 81 – Other Services 81 \n", 1997 | "19 Sector 92 – Public Administration 92 \n", 1998 | "20 Sector 31 – 33 – Manufacturing 32 \n", 1999 | "21 Sector 31 – 33 – Manufacturing 33 \n", 2000 | "22 Sector 44 - 45 – Retail Trade 45 \n", 2001 | "23 Sector 48 - 49 – Transportation and Warehousing 49 \n", 2002 | "\n", 2003 | " Sector \n", 2004 | "0 Agriculture, Forestry, Fishing and Hunting \n", 2005 | "1 Mining, Quarrying, and Oil and Gas Extraction \n", 2006 | "2 Utilities \n", 2007 | "3 Construction \n", 2008 | "4 Manufacturing \n", 2009 | "5 Wholesale Trade \n", 2010 | "6 Retail Trade \n", 2011 | "7 Transportation and Warehousing \n", 2012 | "8 Information \n", 2013 | "9 Finance and Insurance \n", 2014 | "10 Real Estate and Rental and Leasing \n", 2015 | "11 Professional, Scientific and Technical Services \n", 2016 | "12 Management of Companies 
and Enterprises \n", 2017 | "13 Administrative and Support, Waste Management a... \n", 2018 | "14 Educational Services \n", 2019 | "15 Health Care and Social Assistance \n", 2020 | "16 Arts, Entertainment and Recreation \n", 2021 | "17 Accommodation and Food Services \n", 2022 | "18 Other Services \n", 2023 | "19 Public Administration \n", 2024 | "20 Manufacturing \n", 2025 | "21 Manufacturing \n", 2026 | "22 Retail Trade \n", 2027 | "23 Transportation and Warehousing " 2028 | ] 2029 | }, 2030 | "execution_count": 34, 2031 | "metadata": {}, 2032 | "output_type": "execute_result" 2033 | } 2034 | ], 2035 | "source": [ 2036 | "data" 2037 | ] 2038 | }, 2039 | { 2040 | "cell_type": "code", 2041 | "execution_count": 35, 2042 | "id": "ccac0cbd", 2043 | "metadata": {}, 2044 | "outputs": [], 2045 | "source": [ 2046 | "data.to_csv('clean_data.csv', encoding = 'utf-8')" 2047 | ] 2048 | }, 2049 | { 2050 | "cell_type": "code", 2051 | "execution_count": 36, 2052 | "id": "ce6a2a38", 2053 | "metadata": {}, 2054 | "outputs": [], 2055 | "source": [ 2056 | "data.to_csv('clean_data_without_index.csv', encoding = 'utf-8', index = False)" 2057 | ] 2058 | }, 2059 | { 2060 | "cell_type": "code", 2061 | "execution_count": null, 2062 | "id": "074a3885", 2063 | "metadata": {}, 2064 | "outputs": [], 2065 | "source": [] 2066 | } 2067 | ], 2068 | "metadata": { 2069 | "kernelspec": { 2070 | "display_name": "Python 3 (ipykernel)", 2071 | "language": "python", 2072 | "name": "python3" 2073 | }, 2074 | "language_info": { 2075 | "codemirror_mode": { 2076 | "name": "ipython", 2077 | "version": 3 2078 | }, 2079 | "file_extension": ".py", 2080 | "mimetype": "text/x-python", 2081 | "name": "python", 2082 | "nbconvert_exporter": "python", 2083 | "pygments_lexer": "ipython3", 2084 | "version": "3.9.7" 2085 | } 2086 | }, 2087 | "nbformat": 4, 2088 | "nbformat_minor": 5 2089 | } 2090 | -------------------------------------------------------------------------------- /Cleaning_Data_with_SQL_Part1.sql: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AllThingsDataWithAngelina/Data-Analyst-Projects/a585ff08df89dbe2fa05c0f9cc3850067a6f319a/Cleaning_Data_with_SQL_Part1.sql -------------------------------------------------------------------------------- /Exploring_Data_SQL_Part1.sql: --------------------------------------------------------------------------------
-- Exploring_Data_SQL_Part1.sql
-- Exploratory summaries over the PPP loan table [dbo].[sba_public_data] (T-SQL).
-- Every statement ends with ";" so the whole file can be executed as one batch:
-- a WITH (common table expression) is only accepted after a terminated statement.
-- Result column names are part of the contract for downstream consumers and are
-- kept exactly as they were.

---1---
-- Summary of all PPP approved lending.
-- Note: there is also a servicing lender; only originating lenders are counted here.
SELECT
    COUNT(LoanNumber)                 AS Loans_Approved,
    SUM(InitialApprovalAmount)        AS Total_Net_Dollars,
    AVG(InitialApprovalAmount)        AS Average_Loan_Size,
    COUNT(DISTINCT OriginatingLender) AS Total_Originating_Lender_Count
FROM [dbo].[sba_public_data];

-- Summary of 2021 PPP approved lending.
SELECT
    COUNT(LoanNumber)                 AS Loans_Approved,
    SUM(InitialApprovalAmount)        AS Total_Net_Dollars,
    AVG(InitialApprovalAmount)        AS Average_Loan_Size,
    COUNT(DISTINCT OriginatingLender) AS Total_Originating_Lender_Count
FROM [dbo].[sba_public_data]
WHERE YEAR(DateApproved) = 2021;

-- Summary of 2020 PPP approved lending.
SELECT
    COUNT(LoanNumber)                 AS Loans_Approved,
    SUM(InitialApprovalAmount)        AS Total_Net_Dollars,
    AVG(InitialApprovalAmount)        AS Average_Loan_Size,
    COUNT(DISTINCT OriginatingLender) AS Total_Originating_Lender_Count
FROM [dbo].[sba_public_data]
WHERE YEAR(DateApproved) = 2020;


---2---
-- Loan count, total amount and average loan size per originating lender.
-- Top 15 originating lenders for 2021 PPP loans, ordered by Net_Dollars.
SELECT TOP 15
    OriginatingLender,
    COUNT(LoanNumber)          AS Loans_Approved,
    SUM(InitialApprovalAmount) AS Net_Dollars,
    AVG(InitialApprovalAmount) AS Average_Loan_Size
FROM [dbo].[sba_public_data]
WHERE YEAR(DateApproved) = 2021
GROUP BY OriginatingLender
ORDER BY Net_Dollars DESC;

-- Top 15 originating lenders for 2020 PPP loans, ordered by Net_Dollars.
SELECT TOP 15
    OriginatingLender,
    COUNT(LoanNumber)          AS Loans_Approved,
    SUM(InitialApprovalAmount) AS Net_Dollars,
    AVG(InitialApprovalAmount) AS Average_Loan_Size
FROM [dbo].[sba_public_data]
WHERE YEAR(DateApproved) = 2020
GROUP BY OriginatingLender
ORDER BY Net_Dollars DESC;


---3---
-- Industries (NAICS sectors) that received PPP loans in 2021, largest first.
-- TODO: add the NAICS sector codes table to the GitHub repo (extracted from SQL).
WITH sector_totals AS (
    -- One row per sector: the first two characters of the loan's NAICS code
    -- are matched against the sector lookup table.
    SELECT
        ncd.Sector,
        COUNT(main.LoanNumber)           AS Loans_Approved,
        SUM(main.CurrentApprovalAmount)  AS Net_Dollars
    FROM [dbo].[sba_public_data] AS main
    INNER JOIN [dbo].[sba_naics_sector_codes_description] AS ncd
        ON LEFT(CAST(main.NAICSCode AS varchar), 2) = ncd.LookupCode
    WHERE YEAR(main.DateApproved) = 2021
    GROUP BY ncd.Sector
)
SELECT
    sector,
    Loans_Approved,
    Net_Dollars,
    -- Scale to a percentage BEFORE rounding to two decimals; rounding the raw
    -- fraction first would leave only whole percents (0.13 * 100 = 13.00).
    -- NULLIF avoids a divide-by-zero error when the grand total is 0.
    CAST(100.0 * Net_Dollars / NULLIF(SUM(Net_Dollars) OVER (), 0) AS DECIMAL(5, 2)) AS [Percent by Amount]
FROM sector_totals
ORDER BY Net_Dollars DESC;


---4---
-- Loan count and total amount per state / territory.
-- Optional cutoff used during exploration:
--   WHERE CAST(DateApproved AS date) < '2021-06-01'
SELECT
    BorrowerState              AS state,
    COUNT(LoanNumber)          AS Loan_Count,
    SUM(CurrentApprovalAmount) AS Net_Dollars
FROM [dbo].[sba_public_data]
GROUP BY BorrowerState
ORDER BY BorrowerState;


---5---
-- Demographics for PPP: loan count and total amount by race, gender,
-- ethnicity and veteran status, smallest total first.
SELECT
    race,
    COUNT(LoanNumber)          AS Loan_Count,
    SUM(CurrentApprovalAmount) AS Net_Dollars
FROM [dbo].[sba_public_data]
GROUP BY race
ORDER BY Net_Dollars;

SELECT
    gender,
    COUNT(LoanNumber)          AS Loan_Count,
    SUM(CurrentApprovalAmount) AS Net_Dollars
FROM [dbo].[sba_public_data]
GROUP BY gender
ORDER BY Net_Dollars;

SELECT
    Ethnicity,
    COUNT(LoanNumber)          AS Loan_Count,
    SUM(CurrentApprovalAmount) AS Net_Dollars
FROM [dbo].[sba_public_data]
GROUP BY Ethnicity
ORDER BY Net_Dollars;

SELECT
    Veteran,
    COUNT(LoanNumber)          AS Loan_Count,
    SUM(CurrentApprovalAmount) AS Net_Dollars
FROM [dbo].[sba_public_data]
GROUP BY Veteran
ORDER BY Net_Dollars;


---6---
-- Forgiveness on loans approved in 2020: number of loans with a non-zero
-- forgiveness amount and the total forgiven.
-- NOTE(review): this section was previously labelled "2021" while filtering
-- on 2020; the filters are kept and the labels aligned - confirm the year.
-- NOTE(review): ForgivenessAmount <> 0 selects any forgiveness, not
-- necessarily full forgiveness.
SELECT
    COUNT(LoanNumber)      AS Count_of_Payments,
    SUM(ForgivenessAmount) AS Forgiveness_amount_paid
FROM [dbo].[sba_public_data]
WHERE YEAR(DateApproved) = 2020
  AND ForgivenessAmount <> 0;

-- Summary of 2020 PPP approved lending, including the amount forgiven.
-- The lender count uses the same 2020 filter as the rest of the row.
SELECT
    COUNT(LoanNumber)                 AS Loans_Approved,
    SUM(InitialApprovalAmount)        AS Total_Net_Dollars,
    SUM(ForgivenessAmount)            AS Forgiveness_amount_paid,
    COUNT(DISTINCT OriginatingLender) AS Total_Originating_Lender_Count
FROM [dbo].[sba_public_data]
WHERE YEAR(DateApproved) = 2020;


---7---
-- In which month was the highest amount given out by the SBA to borrowers?
-- Totals per approval year, month and processing method, largest first.
SELECT
    YEAR(DateApproved)         AS Year_Approved,
    MONTH(DateApproved)        AS Month_Approved,
    ProcessingMethod,
    SUM(CurrentApprovalAmount) AS Net_Dollars
FROM [dbo].[sba_public_data]
GROUP BY
    YEAR(DateApproved),
    MONTH(DateApproved),
    ProcessingMethod
ORDER BY Net_Dollars DESC;
-------------------------------------------------------------------------------- /LEGO Data Analysis - Rebrickable.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3adc098c", 6 | "metadata": {}, 7 | "source": [ 8 | "# LEGO Data Analysis - Rebrickable" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "da2515b3", 14 | "metadata": {}, 15 | "source": [ 16 | "The dataset is from https://rebrickable.com/downloads/. \n", 17 | "\n", 18 | "From the Rebrickable website - \"The LEGO Parts/Sets/Colors and Inventories of every official LEGO set in the Rebrickable database is available for download as csv files here. These files are automatically updated daily. If you need more details, you can use the API which provides real-time data, but has rate limits that prevent bulk downloading of data.\"\n", 19 | "\n", 20 | "Rebrickable provides this database to be used for any purpose."
21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "6264928c", 26 | "metadata": {}, 27 | "source": [ 28 | "## LEGO Data Model" 29 | ] 30 | }, 31 | { 32 | "attachments": { 33 | "downloads_schema_v3.png": { 34 | "image/png": "" 35 | } 36 | }, 37 | "cell_type": "markdown", 38 | "id": "32ec6d94", 39 | "metadata": {}, 40 | "source": [ 41 | "![downloads_schema_v3.png](attachment:downloads_schema_v3.png)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "65c8d7a4", 47 | "metadata": {}, 48 | "source": [ 49 | "#### Importing Libraries" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "3ee02051", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import pandas as pd\n", 60 | "import numpy as np" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "24ae49a0", 66 | "metadata": {}, 67 | "source": [ 68 | "#### Getting Data" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "f2dac057", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "e8576500", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "cc69c8a5", 90 | "metadata": {}, 91 | "source": [ 92 | "#### Display Top 10 Rows of the sets dataset" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "7e95329e", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "b9914c48", 106 | "metadata": {}, 107 | "source": [ 108 | "#### Display Last 10 Rows of the sets dataset" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "460a7729", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "id": "117a51fc", 122 | "metadata": {}, 123 | "source": [ 124 | 
"#### Check Datatype of Each Column" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "af47af0c", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "4b087e21", 138 | "metadata": {}, 139 | "source": [ 140 | "#### Check NULL Values in Sets" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "ffe7a416", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "50274329", 154 | "metadata": {}, 155 | "source": [ 156 | "#### How many Rows and Columns are there in our dataset" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "e5f3672a", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "id": "6b4a53e1", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "id": "7e0e5dbd", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "b9cd9c5a", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "88d8debe", 194 | "metadata": {}, 195 | "source": [ 196 | "## MERGE DATA\n", 197 | "#### Joining the two datasets to answer questions" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "id": "84152152", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "9eac02ea", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "c3880b41", 220 | "metadata": {}, 
221 | "outputs": [], 222 | "source": [] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "3cdbc54f", 227 | "metadata": {}, 228 | "source": [ 229 | "### Now Let's Answer some Questions" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "id": "22a11bfc", 235 | "metadata": {}, 236 | "source": [ 237 | "#### 1.) What is the total number of parts per theme" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "id": "d6c42dd7", 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "id": "9bbe2fbe", 251 | "metadata": {}, 252 | "source": [ 253 | "#### 2.) What is the total number of parts per year" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "id": "47817343", 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "id": "25dc250c", 267 | "metadata": {}, 268 | "source": [ 269 | "#### 3.) How many sets were created in each Century in the dataset" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "38e33e6a", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "id": "8f55095f", 283 | "metadata": {}, 284 | "source": [ 285 | "#### 4.) What percentage of sets ever released in the 21st Century were Trains Themed " 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "f007901b", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "id": "9f452beb", 299 | "metadata": {}, 300 | "source": [ 301 | "#### 5.) 
What percentage of sets ever released in the 21st Century were Disney Themed" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "2eaea3c7", 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "id": "f4de3747", 315 | "metadata": {}, 316 | "source": [ 317 | "#### 6.) What is the popular theme by year in terms of sets released in the 21st Century" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "id": "4046508a", 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "id": "5e73daea", 331 | "metadata": {}, 332 | "source": [ 333 | "#### 7.) What is the most produced color of lego ever in terms of quantity of parts?" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "id": "014d7b45", 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "id": "410d43d3", 347 | "metadata": {}, 348 | "source": [ 349 | "#### Note - Join two datasets to answer question" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "id": "1dce0367", 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [] 359 | } 360 | ], 361 | "metadata": { 362 | "kernelspec": { 363 | "display_name": "Python 3 (ipykernel)", 364 | "language": "python", 365 | "name": "python3" 366 | }, 367 | "language_info": { 368 | "codemirror_mode": { 369 | "name": "ipython", 370 | "version": 3 371 | }, 372 | "file_extension": ".py", 373 | "mimetype": "text/x-python", 374 | "name": "python", 375 | "nbconvert_exporter": "python", 376 | "pygments_lexer": "ipython3", 377 | "version": "3.9.7" 378 | } 379 | }, 380 | "nbformat": 4, 381 | "nbformat_minor": 5 382 | } 383 | -------------------------------------------------------------------------------- /LEGO_Data_Analysis.sql: 
--------------------------------------------------------------------------------
-- LEGO data analysis (T-SQL).
-- Builds the dbo.analytics_main view over dbo.sets and dbo.themes, then answers
-- each numbered question below with a single query against that view.

--- CREATE VIEW ---
-- One row per set, enriched with its theme name, the name of the theme's parent
-- (NULL when the theme has no parent row) and a Century label derived from year.
-- Years outside 1901-2100 (or NULL years) get a NULL Century.
CREATE VIEW [dbo].[analytics_main] AS
SELECT
    s.set_num,
    s.name AS set_name,
    s.year,
    s.theme_id,
    CAST(s.num_parts AS NUMERIC) AS num_parts,
    t.name AS theme_name,
    t.parent_id,
    p.name AS parent_theme_name,
    CASE
        WHEN s.year BETWEEN 1901 AND 2000 THEN '20th_Century'
        WHEN s.year BETWEEN 2001 AND 2100 THEN '21st_Century'
    END AS Century
FROM dbo.sets AS s
LEFT JOIN [dbo].[themes] AS t
    ON s.theme_id = t.id
LEFT JOIN [dbo].[themes] AS p
    ON t.parent_id = p.id
GO


--- 1 ---
--- What is the total number of parts per theme
SELECT
    theme_name,
    SUM(num_parts) AS total_num_parts
FROM dbo.analytics_main
GROUP BY theme_name
ORDER BY total_num_parts DESC;


--- 2 ---
--- What is the total number of parts per year
-- No parent_theme_name filter here: the question asks for every part released
-- in a year, and that filter would drop all sets whose theme has no parent theme.
SELECT
    year,
    SUM(num_parts) AS total_num_parts
FROM dbo.analytics_main
GROUP BY year
ORDER BY total_num_parts DESC;


--- 3 ---
--- How many sets were created in each Century in the dataset
-- Sets whose year falls outside the view's CASE ranges are reported under a
-- NULL Century rather than being dropped.
SELECT
    Century,
    COUNT(set_num) AS total_set_num
FROM dbo.analytics_main
GROUP BY Century
ORDER BY Century;


--- 4 ---
--- What percentage of sets ever released in the 21st Century were Trains Themed
-- The share is computed from raw counts in one pass so no per-theme rounding
-- error is accumulated; NULLIF guards the division when the view has no
-- 21st-century rows.
SELECT
    SUM(CASE WHEN theme_name LIKE '%Trains%' THEN 1 ELSE 0 END) AS trains_set_num,
    COUNT(set_num) AS total_set_num,
    CAST(
        100.0 * SUM(CASE WHEN theme_name LIKE '%Trains%' THEN 1 ELSE 0 END)
        / NULLIF(COUNT(set_num), 0)
        AS DECIMAL(7, 4)
    ) AS percentage
FROM dbo.analytics_main
WHERE Century = '21st_Century';


--- 5 ---
--- What was the most popular theme per year in terms of sets released in the 21st Century
-- theme_name is the tie-breaker so that years where several themes share the
-- top count always return the same row.
WITH sets_per_theme_year AS (
    SELECT
        year,
        theme_name,
        COUNT(set_num) AS total_set_num,
        ROW_NUMBER() OVER (
            PARTITION BY year
            ORDER BY COUNT(set_num) DESC, theme_name
        ) AS rn
    FROM dbo.analytics_main
    WHERE Century = '21st_Century'
    GROUP BY
        year,
        theme_name
)
SELECT
    year,
    theme_name,
    total_set_num
FROM sets_per_theme_year
WHERE rn = 1
ORDER BY year DESC;


--- 6 ---
--- What is the most produced color of lego ever in terms of quantity of parts?
-- The inner joins to parts and part_categories are kept so the set of counted
-- inventory rows is the same as before (rows without a matching part or
-- category are excluded).
-- NOTE(review): part_cat_id is qualified with p on the assumption that it is a
-- column of parts (as in the Rebrickable schema) - confirm against the database.
SELECT
    c.name AS color_name,
    SUM(CAST(inv.quantity AS NUMERIC)) AS quantity_of_parts
FROM inventory_parts AS inv
INNER JOIN colors AS c
    ON inv.color_id = c.id
INNER JOIN parts AS p
    ON inv.part_num = p.part_num
INNER JOIN part_categories AS pc
    ON p.part_cat_id = pc.id
GROUP BY c.name
ORDER BY quantity_of_parts DESC;
--------------------------------------------------------------------------------
/RFM_Segmentation_Sales_Analysis_Main.sql:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AllThingsDataWithAngelina/Data-Analyst-Projects/a585ff08df89dbe2fa05c0f9cc3850067a6f319a/RFM_Segmentation_Sales_Analysis_Main.sql
--------------------------------------------------------------------------------
/Visualizing_Data_with_Tableau.sql:
--------------------------------------------------------------------------------
-- ppp_main: loan counts and amounts from sba_public_data, aggregated per
-- sector, approval year/month, lender, borrower state, race, gender and
-- ethnicity. The sector is looked up by matching the first two characters of
-- NAICSCode against LookupCodes; loans without a matching sector row are
-- excluded by the inner join.
CREATE VIEW ppp_main AS
SELECT
    d.Sector,
    YEAR(DateApproved) AS year_approved,
    MONTH(DateApproved) AS month_Approved,
    OriginatingLender,
    BorrowerState,
    Race,
    Gender,
    Ethnicity,

    COUNT(LoanNumber) AS Number_of_Approved,

    SUM(CurrentApprovalAmount) AS Current_Approved_Amount,
    AVG(CurrentApprovalAmount) AS Current_Average_loan_size,
    SUM(ForgivenessAmount) AS Amount_Forgiven,

    SUM(InitialApprovalAmount) AS Approved_Amount,
    AVG(InitialApprovalAmount) AS Average_loan_size
FROM [PortfolioDB].[dbo].[sba_public_data] AS p
INNER JOIN [dbo].[sba_naics_sector_codes_description] AS d
    ON LEFT(p.NAICSCode, 2) = d.LookupCodes
GROUP BY
    d.Sector,
    YEAR(DateApproved),
    MONTH(DateApproved),
    OriginatingLender,
    BorrowerState,
    Race,
    Gender,
    Ethnicity
--------------------------------------------------------------------------------
/cohort_rentention_analysis.sql:
--------------------------------------------------------------------------------
--- Cleaning Data

--- Total Records = 541909
--- 135080 Records have no customerID
--- 406829 Records have customerID

-- Every SELECT ... INTO below is preceded by a guarded DROP so the script can
-- be re-run in the same session without "object already exists" errors.
IF OBJECT_ID('tempdb..#online_retail_main') IS NOT NULL
    DROP TABLE #online_retail_main;

WITH online_retail AS (
    -- Keep rows whose CustomerID is not 0; rows with a NULL CustomerID are
    -- dropped as well because the comparison is not true for them.
    SELECT
        [InvoiceNo],
        [StockCode],
        [Description],
        [Quantity],
        [InvoiceDate],
        [UnitPrice],
        [CustomerID],
        [Country]
    FROM [PortfolioDB].[dbo].[online_retail]
    WHERE CustomerID <> 0
),
quantity_unit_price AS (
    --- 397882 records with quantity and Unit price
    SELECT
        InvoiceNo,
        StockCode,
        Description,
        Quantity,
        InvoiceDate,
        UnitPrice,
        CustomerID,
        Country
    FROM online_retail
    WHERE Quantity > 0
        AND UnitPrice > 0
),
dup_check AS (
    --- duplicate check
    -- Rows sharing InvoiceNo, StockCode and Quantity are treated as duplicates;
    -- dup_flag = 1 marks the one with the earliest InvoiceDate.
    SELECT
        InvoiceNo,
        StockCode,
        Description,
        Quantity,
        InvoiceDate,
        UnitPrice,
        CustomerID,
        Country,
        ROW_NUMBER() OVER (
            PARTITION BY InvoiceNo, StockCode, Quantity
            ORDER BY InvoiceDate
        ) AS dup_flag
    FROM quantity_unit_price
)
--- 397667 clean data
--- 5215 duplicate records
-- NOTE(review): these two figures do not add up to the 397882 above; re-check
-- the counts against the data.
SELECT
    InvoiceNo,
    StockCode,
    Description,
    Quantity,
    InvoiceDate,
    UnitPrice,
    CustomerID,
    Country,
    dup_flag
INTO #online_retail_main
FROM dup_check
WHERE dup_flag = 1;

---- Clean Data
---- BEGIN COHORT ANALYSIS
SELECT *
FROM #online_retail_main;

-- Unique Identifier (CustomerID)
-- Initial Start Date (First Invoice Date)
-- Revenue Data

IF OBJECT_ID('tempdb..#cohort') IS NOT NULL
    DROP TABLE #cohort;

-- One row per customer: first purchase timestamp and the first day of that
-- month, which identifies the customer's cohort.
SELECT
    CustomerID,
    MIN(InvoiceDate) AS first_purchase_date,
    DATEFROMPARTS(YEAR(MIN(InvoiceDate)), MONTH(MIN(InvoiceDate)), 1) AS Cohort_Date
INTO #cohort
FROM #online_retail_main
GROUP BY CustomerID;

SELECT *
FROM #cohort;

--- Create Cohort Index
-- cohort_index is the number of calendar months between the invoice month and
-- the customer's cohort month, plus 1 (so the cohort month itself is index 1).
IF OBJECT_ID('tempdb..#cohort_retention') IS NOT NULL
    DROP TABLE #cohort_retention;

WITH invoice_parts AS (
    SELECT
        m.*,
        c.Cohort_Date,
        YEAR(m.InvoiceDate) AS invoice_year,
        MONTH(m.InvoiceDate) AS invoice_month,
        YEAR(c.Cohort_Date) AS cohort_year,
        MONTH(c.Cohort_Date) AS cohort_month
    FROM #online_retail_main AS m
    LEFT JOIN #cohort AS c
        ON m.CustomerID = c.CustomerID
),
invoice_diffs AS (
    SELECT
        ip.*,
        ip.invoice_year - ip.cohort_year AS year_diff,
        ip.invoice_month - ip.cohort_month AS month_diff
    FROM invoice_parts AS ip
)
SELECT
    d.*,
    d.year_diff * 12 + d.month_diff + 1 AS cohort_index
INTO #cohort_retention
FROM invoice_diffs AS d;


--- Pivot Data to see the cohort table
-- DISTINCT collapses a customer's many invoice lines in a month to one row, so
-- each pivot cell counts customers active in that cohort month, not invoices.
IF OBJECT_ID('tempdb..#cohort_pivot') IS NOT NULL
    DROP TABLE #cohort_pivot;

SELECT *
INTO #cohort_pivot
FROM (
    SELECT DISTINCT
        CustomerID,
        Cohort_Date,
        cohort_index
    FROM #cohort_retention
) AS tbl
PIVOT (
    COUNT(CustomerID)
    FOR cohort_index IN (
        [1],
        [2],
        [3],
        [4],
        [5],
        [6],
        [7],
        [8],
        [9],
        [10],
        [11],
        [12],
        [13]
    )
) AS pivot_table;

SELECT *
FROM #cohort_pivot
ORDER BY Cohort_Date;

-- Retention rate: customers active in month N as a percentage of the cohort's
-- month-1 customers. NULLIF turns a zero denominator into NULL instead of a
-- divide-by-zero error.
SELECT
    Cohort_Date,
    1.0 * [1] / NULLIF([1], 0) * 100 AS [1],
    1.0 * [2] / NULLIF([1], 0) * 100 AS [2],
    1.0 * [3] / NULLIF([1], 0) * 100 AS [3],
    1.0 * [4] / NULLIF([1], 0) * 100 AS [4],
    1.0 * [5] / NULLIF([1], 0) * 100 AS [5],
    1.0 * [6] / NULLIF([1], 0) * 100 AS [6],
    1.0 * [7] / NULLIF([1], 0) * 100 AS [7],
    1.0 * [8] / NULLIF([1], 0) * 100 AS [8],
    1.0 * [9] / NULLIF([1], 0) * 100 AS [9],
    1.0 * [10] / NULLIF([1], 0) * 100 AS [10],
    1.0 * [11] / NULLIF([1], 0) * 100 AS [11],
    1.0 * [12] / NULLIF([1], 0) * 100 AS [12],
    1.0 * [13] / NULLIF([1], 0) * 100 AS [13]
FROM #cohort_pivot
ORDER BY Cohort_Date;


--- DYNAMIC SQL TO CREATE PIVOT TABLE

DECLARE
    @columns NVARCHAR(MAX),
    @sql NVARCHAR(MAX);

-- Build the bracketed, comma-separated pivot column list ([1],[2],...).
-- FOR XML PATH is used instead of "SELECT @var += ... ORDER BY", whose result
-- is not guaranteed when the SELECT returns several rows. STUFF removes the
-- leading comma; the result is NULL when #cohort_retention is empty.
SELECT @columns = STUFF(
    (
        SELECT ',' + QUOTENAME(m.cohort_index)
        FROM (
            SELECT DISTINCT cohort_index
            FROM #cohort_retention
        ) AS m
        ORDER BY m.cohort_index
        FOR XML PATH(''), TYPE
    ).value('.', 'nvarchar(max)'),
    1,
    1,
    ''
);

PRINT @columns;

IF @columns IS NULL
BEGIN
    -- Nothing to pivot; running the statement with an empty IN () list would fail.
    PRINT 'No cohort_index values found in #cohort_retention; pivot skipped.';
END
ELSE
BEGIN
    -- construct dynamic SQL
    SET @sql = N'

    ---# Return number of unique elements in the object
    SELECT *
    FROM
    (
        select distinct
            Cohort_Date,
            cohort_index,
            CustomerID
        from #cohort_retention
    ) t
    PIVOT(
        COUNT(CustomerID)
        FOR cohort_index IN (' + @columns + N')
    ) AS pivot_table
    order by Cohort_Date


    ';

    -- execute the dynamic SQL
    EXECUTE sp_executesql @sql;
END;

--------------------------------------------------------------------------------