├── README.md └── mynotebook.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # my-first-notebook -------------------------------------------------------------------------------- /mynotebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "id": "OOC-egEWD0_J" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import pandas as pd\n", 26 | "import numpy as np\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import seaborn as sns" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "source": [ 34 | "from google.colab import drive\n", 35 | "drive.mount('/content/drive')" 36 | ], 37 | "metadata": { 38 | "id": "KEJv3nICISR4" 39 | }, 40 | "execution_count": null, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "source": [ 46 | "dikw = pd.read_csv('/content/drive/MyDrive/Classroom/Advanced Data Analytics/Nasratullah Shafiq - DIKW_HW_Activity.csv')" 47 | ], 48 | "metadata": { 49 | "id": "gyy4feKQOlVN" 50 | }, 51 | "execution_count": null, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "source": [ 57 | "dikw" 58 | ], 59 | "metadata": { 60 | "colab": { 61 | "base_uri": "https://localhost:8080/", 62 | "height": 243 63 | }, 64 | "id": "SD8aP7qe4QQH", 65 | "outputId": "836da8c0-d7f9-408c-db28-374f2f0c8562" 66 | }, 67 | "execution_count": null, 68 | "outputs": [ 69 | { 70 | "output_type": "execute_result", 71 | "data": { 72 | "text/plain": [ 73 | " EmployeeID EmployeeName Department JoiningDate Salary Gender \\\n", 74 | "0 E001 James Wilson Sales 2022-01-15 50000 Male \n", 75 | "1 E002 Maria Garcia Engineering 2021-03-12 75000 Female \n", 76 | "2 E003 Robert Brown HR 2020-06-23 60000 Male \n", 77 | "3 E004 Emily Davis Engineering 2021-11-05 72000 Female \n", 78 | "4 E005 Michael Johnson Marketing 2022-05-20 65000 Male \n", 79 | "\n", 80 | " Country PerformanceScore JobSatisfaction \n", 81 | "0 USA 85 4 \n", 82 | "1 UK 92 5 \n", 83 | "2 Canada 88 4 \n", 84 | "3 USA 79 3 \n", 85 | "4 Australia 83 4 " 86 | ], 87 | "text/html": [ 88 | "\n", 89 | "
\n", 90 | "
\n", 91 | "\n", 104 | "\n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | "
EmployeeIDEmployeeNameDepartmentJoiningDateSalaryGenderCountryPerformanceScoreJobSatisfaction
0E001James WilsonSales2022-01-1550000MaleUSA854
1E002Maria GarciaEngineering2021-03-1275000FemaleUK925
2E003Robert BrownHR2020-06-2360000MaleCanada884
3E004Emily DavisEngineering2021-11-0572000FemaleUSA793
4E005Michael JohnsonMarketing2022-05-2065000MaleAustralia834
\n", 182 | "
\n", 183 | "
\n", 184 | "\n", 185 | "
\n", 186 | " \n", 194 | "\n", 195 | " \n", 235 | "\n", 236 | " \n", 260 | "
\n", 261 | "\n", 262 | "\n", 263 | "
\n", 264 | " \n", 275 | "\n", 276 | "\n", 365 | "\n", 366 | " \n", 388 | "
\n", 389 | "
\n", 390 | "
\n" 391 | ], 392 | "application/vnd.google.colaboratory.intrinsic+json": { 393 | "type": "dataframe", 394 | "variable_name": "dikw", 395 | "summary": "{\n \"name\": \"dikw\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"EmployeeID\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"E002\",\n \"E005\",\n \"E003\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"EmployeeName\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Maria Garcia\",\n \"Michael Johnson\",\n \"Robert Brown\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Department\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Engineering\",\n \"Marketing\",\n \"Sales\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"JoiningDate\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"2021-03-12\",\n \"2022-05-20\",\n \"2020-06-23\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Salary\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 9964,\n \"min\": 50000,\n \"max\": 75000,\n \"num_unique_values\": 5,\n \"samples\": [\n 75000,\n 65000,\n 60000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Female\",\n \"Male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Country\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"UK\",\n \"Australia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PerformanceScore\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4,\n \"min\": 79,\n \"max\": 92,\n \"num_unique_values\": 5,\n \"samples\": [\n 92,\n 83\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"JobSatisfaction\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 3,\n \"max\": 5,\n \"num_unique_values\": 3,\n \"samples\": [\n 4,\n 5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" 396 | } 397 | }, 398 | "metadata": {}, 399 | "execution_count": 3 400 | } 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "source": [ 406 | "dikw.shape" 407 | ], 408 | "metadata": { 409 | "colab": { 410 | "base_uri": "https://localhost:8080/" 411 | }, 412 | "id": "k_RCBRdd4vAV", 413 | "outputId": "1ea071c7-8269-4feb-ee0e-94b28310950d" 414 | }, 415 | "execution_count": null, 416 | "outputs": [ 417 | { 418 | "output_type": "execute_result", 419 | "data": { 420 | "text/plain": [ 421 | "(5, 9)" 422 | ] 423 | }, 424 | "metadata": {}, 425 | "execution_count": 4 426 | } 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "source": [ 432 | "dikw.info" 433 | ], 434 | "metadata": { 435 | "id": "y-JWBLa25YQH", 436 | "outputId": "6bcc9b21-89a8-4173-9738-02991d57b54d", 437 | "colab": { 438 | "base_uri": "https://localhost:8080/", 439 | "height": 220 440 | } 441 | }, 442 | "execution_count": null, 443 | "outputs": [ 444 | { 445 | "output_type": "execute_result", 446 | "data": { 447 | "text/plain": [ 448 | "" 461 | ], 462 | "text/html": [ 463 | "
\n", 475 | "
pandas.core.frame.DataFrame.info
def info(verbose: bool | None=None, buf: WriteBuffer[str] | None=None, max_cols: int | None=None, memory_usage: bool | str | None=None, show_counts: bool | None=None) -> None
/usr/local/lib/python3.10/dist-packages/pandas/core/frame.pyPrint a concise summary of a DataFrame.\n",
478 |               "\n",
479 |               "This method prints information about a DataFrame including\n",
480 |               "the index dtype and columns, non-null values and memory usage.\n",
481 |               "\n",
482 |               "Parameters\n",
483 |               "----------\n",
484 |               "verbose : bool, optional\n",
485 |               "    Whether to print the full summary. By default, the setting in\n",
486 |               "    ``pandas.options.display.max_info_columns`` is followed.\n",
487 |               "buf : writable buffer, defaults to sys.stdout\n",
488 |               "    Where to send the output. By default, the output is printed to\n",
489 |               "    sys.stdout. Pass a writable buffer if you need to further process\n",
490 |               "    the output.\n",
491 |               "max_cols : int, optional\n",
492 |               "    When to switch from the verbose to the truncated output. If the\n",
493 |               "    DataFrame has more than `max_cols` columns, the truncated output\n",
494 |               "    is used. By default, the setting in\n",
495 |               "    ``pandas.options.display.max_info_columns`` is used.\n",
496 |               "memory_usage : bool, str, optional\n",
497 |               "    Specifies whether total memory usage of the DataFrame\n",
498 |               "    elements (including the index) should be displayed. By default,\n",
499 |               "    this follows the ``pandas.options.display.memory_usage`` setting.\n",
500 |               "\n",
501 |               "    True always show memory usage. False never shows memory usage.\n",
502 |               "    A value of 'deep' is equivalent to "True with deep introspection".\n",
503 |               "    Memory usage is shown in human-readable units (base-2\n",
504 |               "    representation). Without deep introspection a memory estimation is\n",
505 |               "    made based in column dtype and number of rows assuming values\n",
506 |               "    consume the same memory amount for corresponding dtypes. With deep\n",
507 |               "    memory introspection, a real memory usage calculation is performed\n",
508 |               "    at the cost of computational resources. See the\n",
509 |               "    :ref:`Frequently Asked Questions <df-memory-usage>` for more\n",
510 |               "    details.\n",
511 |               "show_counts : bool, optional\n",
512 |               "    Whether to show the non-null counts. By default, this is shown\n",
513 |               "    only if the DataFrame is smaller than\n",
514 |               "    ``pandas.options.display.max_info_rows`` and\n",
515 |               "    ``pandas.options.display.max_info_columns``. A value of True always\n",
516 |               "    shows the counts, and False never shows the counts.\n",
517 |               "\n",
518 |               "Returns\n",
519 |               "-------\n",
520 |               "None\n",
521 |               "    This method prints a summary of a DataFrame and returns None.\n",
522 |               "\n",
523 |               "See Also\n",
524 |               "--------\n",
525 |               "DataFrame.describe: Generate descriptive statistics of DataFrame\n",
526 |               "    columns.\n",
527 |               "DataFrame.memory_usage: Memory usage of DataFrame columns.\n",
528 |               "\n",
529 |               "Examples\n",
530 |               "--------\n",
531 |               ">>> int_values = [1, 2, 3, 4, 5]\n",
532 |               ">>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']\n",
533 |               ">>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]\n",
534 |               ">>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,\n",
535 |               "...                   "float_col": float_values})\n",
536 |               ">>> df\n",
537 |               "    int_col text_col  float_col\n",
538 |               "0        1    alpha       0.00\n",
539 |               "1        2     beta       0.25\n",
540 |               "2        3    gamma       0.50\n",
541 |               "3        4    delta       0.75\n",
542 |               "4        5  epsilon       1.00\n",
543 |               "\n",
544 |               "Prints information of all columns:\n",
545 |               "\n",
546 |               ">>> df.info(verbose=True)\n",
547 |               "<class 'pandas.core.frame.DataFrame'>\n",
548 |               "RangeIndex: 5 entries, 0 to 4\n",
549 |               "Data columns (total 3 columns):\n",
550 |               " #   Column     Non-Null Count  Dtype\n",
551 |               "---  ------     --------------  -----\n",
552 |               " 0   int_col    5 non-null      int64\n",
553 |               " 1   text_col   5 non-null      object\n",
554 |               " 2   float_col  5 non-null      float64\n",
555 |               "dtypes: float64(1), int64(1), object(1)\n",
556 |               "memory usage: 248.0+ bytes\n",
557 |               "\n",
558 |               "Prints a summary of columns count and its dtypes but not per column\n",
559 |               "information:\n",
560 |               "\n",
561 |               ">>> df.info(verbose=False)\n",
562 |               "<class 'pandas.core.frame.DataFrame'>\n",
563 |               "RangeIndex: 5 entries, 0 to 4\n",
564 |               "Columns: 3 entries, int_col to float_col\n",
565 |               "dtypes: float64(1), int64(1), object(1)\n",
566 |               "memory usage: 248.0+ bytes\n",
567 |               "\n",
568 |               "Pipe output of DataFrame.info to buffer instead of sys.stdout, get\n",
569 |               "buffer content and writes to a text file:\n",
570 |               "\n",
571 |               ">>> import io\n",
572 |               ">>> buffer = io.StringIO()\n",
573 |               ">>> df.info(buf=buffer)\n",
574 |               ">>> s = buffer.getvalue()\n",
575 |               ">>> with open("df_info.txt", "w",\n",
576 |               "...           encoding="utf-8") as f:  # doctest: +SKIP\n",
577 |               "...     f.write(s)\n",
578 |               "260\n",
579 |               "\n",
580 |               "The `memory_usage` parameter allows deep introspection mode, specially\n",
581 |               "useful for big DataFrames and fine-tune memory optimization:\n",
582 |               "\n",
583 |               ">>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)\n",
584 |               ">>> df = pd.DataFrame({\n",
585 |               "...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n",
586 |               "...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n",
587 |               "...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)\n",
588 |               "... })\n",
589 |               ">>> df.info()\n",
590 |               "<class 'pandas.core.frame.DataFrame'>\n",
591 |               "RangeIndex: 1000000 entries, 0 to 999999\n",
592 |               "Data columns (total 3 columns):\n",
593 |               " #   Column    Non-Null Count    Dtype\n",
594 |               "---  ------    --------------    -----\n",
595 |               " 0   column_1  1000000 non-null  object\n",
596 |               " 1   column_2  1000000 non-null  object\n",
597 |               " 2   column_3  1000000 non-null  object\n",
598 |               "dtypes: object(3)\n",
599 |               "memory usage: 22.9+ MB\n",
600 |               "\n",
601 |               ">>> df.info(memory_usage='deep')\n",
602 |               "<class 'pandas.core.frame.DataFrame'>\n",
603 |               "RangeIndex: 1000000 entries, 0 to 999999\n",
604 |               "Data columns (total 3 columns):\n",
605 |               " #   Column    Non-Null Count    Dtype\n",
606 |               "---  ------    --------------    -----\n",
607 |               " 0   column_1  1000000 non-null  object\n",
608 |               " 1   column_2  1000000 non-null  object\n",
609 |               " 2   column_3  1000000 non-null  object\n",
610 |               "dtypes: object(3)\n",
611 |               "memory usage: 165.9 MB
\n", 612 | " \n", 631 | "
" 632 | ] 633 | }, 634 | "metadata": {}, 635 | "execution_count": 5 636 | } 637 | ] 638 | } 639 | ] 640 | } --------------------------------------------------------------------------------