├── plot.png ├── Data Files ├── my_fields.csv ├── data.h5 ├── pickle_data ├── my_fields.xlsx ├── excel_data.xlsx ├── none_fields.csv ├── no_cols.csv ├── sep_fields.txt ├── daydates.csv ├── fields.csv ├── fields2.csv ├── new_fields.csv └── interupt_fields.csv ├── some_data.npy ├── some_data_archieve.npz ├── array_ex.txt ├── README.md ├── Replacing Values.ipynb ├── Methods for Boolean Arrays.ipynb ├── File Operations with Numpy Arrays.ipynb ├── Using JSON Module.ipynb ├── Set Logic Methods.ipynb ├── Handling DataTypes for ndarrays.ipynb ├── Handling Pandas Index Objects.ipynb ├── Combining Data with Overlap.ipynb ├── numpy.where.ipynb ├── Axis Indexes with Duplicate Values.ipynb ├── Reading Excel Files using Pandas.ipynb ├── Pickle Serialization.ipynb ├── Data Transformation using Functions or Mapping.ipynb ├── Reading CSV File in Pieces.ipynb ├── Using the CSV Module.ipynb ├── Using HDF5 Formats.ipynb ├── Numpy Random Number Generation.ipynb ├── Basic Array Statistical Methods.ipynb ├── Dropping Entries from Axis.ipynb ├── Index Object Methods.ipynb ├── Unique Values, Value Counts and Membership.ipynb ├── Handling Data from Databases.ipynb ├── Removing Duplicates.ipynb ├── Apply Methods for DataFrames.ipynb ├── Operations of Linear Algebra.ipynb └── Renaming Axis Indexes.ipynb /plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hassan-Farid/Python-for-Data-Analysis/HEAD/plot.png -------------------------------------------------------------------------------- /Data Files/my_fields.csv: -------------------------------------------------------------------------------- 1 | Day;Date;Year 2 | Monday;02;2020 3 | Tuesday;03;2020 4 | Friday;05;2020 5 | -------------------------------------------------------------------------------- /some_data.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Hassan-Farid/Python-for-Data-Analysis/HEAD/some_data.npy -------------------------------------------------------------------------------- /Data Files/data.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hassan-Farid/Python-for-Data-Analysis/HEAD/Data Files/data.h5 -------------------------------------------------------------------------------- /Data Files/pickle_data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hassan-Farid/Python-for-Data-Analysis/HEAD/Data Files/pickle_data -------------------------------------------------------------------------------- /some_data_archieve.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hassan-Farid/Python-for-Data-Analysis/HEAD/some_data_archieve.npz -------------------------------------------------------------------------------- /Data Files/my_fields.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hassan-Farid/Python-for-Data-Analysis/HEAD/Data Files/my_fields.xlsx -------------------------------------------------------------------------------- /Data Files/excel_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hassan-Farid/Python-for-Data-Analysis/HEAD/Data Files/excel_data.xlsx -------------------------------------------------------------------------------- /Data Files/none_fields.csv: -------------------------------------------------------------------------------- 1 | ID,Name,Field 2 | 1,Adam Jones,Electrical 3 | 2,Edward Elrich,- 4 | 3,Ken Adams, 5 | 4,Ross Taylor,Sportsman 6 | ,Stain Steve, -------------------------------------------------------------------------------- /Data Files/no_cols.csv: 
-------------------------------------------------------------------------------- 1 | 1,Adam Jones,Electrical 2 | 2,Edward Elrich,Mechanical 3 | 3,Stain Steve,Computer Science 4 | 4,Ken Adams,Media Science 5 | 5,Ross Taylor,Sportsman -------------------------------------------------------------------------------- /Data Files/sep_fields.txt: -------------------------------------------------------------------------------- 1 | Name|Field 2 | 1|Adam Jones|Electrical 3 | 2|Edward Elrich|Mechanical 4 | 3|Stain Steve|Computer Science 5 | 4|Ken Adams|Media Science 6 | 5|Ross Taylor|Sportsman -------------------------------------------------------------------------------- /Data Files/daydates.csv: -------------------------------------------------------------------------------- 1 | ,0 2 | 2020-01-01,Monday 3 | 2020-01-02,Tuesday 4 | 2020-01-03,Wednesday 5 | 2020-01-04,Thursday 6 | 2020-01-05,Friday 7 | 2020-01-06,Saturday 8 | 2020-01-07,Sunday 9 | -------------------------------------------------------------------------------- /Data Files/fields.csv: -------------------------------------------------------------------------------- 1 | ID,Name,Field 2 | 1,Adam Jones,Electrical 3 | 2,Edward Elrich,Mechanical 4 | 3,Stain Steve,Computer Science 5 | 4,Ken Adams,Media Science 6 | 5,Ross Taylor,Sportsman 7 | -------------------------------------------------------------------------------- /Data Files/fields2.csv: -------------------------------------------------------------------------------- 1 | Field|ID|Name 2 | Electrical|1|Adam Jones 3 | Mechanical|2|Edward Elrich 4 | Computer Science|3|Stain Steve 5 | Media Science|4|Ken Adams 6 | Sportsman|5|Ross Taylor 7 | -------------------------------------------------------------------------------- /Data Files/new_fields.csv: -------------------------------------------------------------------------------- 1 | ID,Field,Name 2 | 1,Electrical,Adam 3 | 2,Mechanical,Jones 4 | 3,Computer Science,Edward 5 | 4,Mechanical,Elrich 6 | 
5,Electrical,Stain 7 | 6,Computer Science,Steve -------------------------------------------------------------------------------- /array_ex.txt: -------------------------------------------------------------------------------- 1 | 1.032518385200751898e-01 1.462772003512045726e+00 3.997018720913923517e-01 -1.882154894423050118e-01 2 | -8.295909744816059028e-01 -1.569539155638059125e+00 8.959982769036259898e-01 -3.877866584234971326e-01 3 | -6.653994555347828577e-01 6.800770078903267679e-02 -6.443251304063275509e-01 -6.580105130695919347e-02 4 | -1.157403436222848025e+00 -9.428450708993425522e-01 -3.516229879549682136e-01 1.617873786276278647e+00 5 | -------------------------------------------------------------------------------- /Data Files/interupt_fields.csv: -------------------------------------------------------------------------------- 1 | Hello there! 2 | This dataset contains the names and fields of various students 3 | ID,Name,Field 4 | Ooops! I mean not only students but professionals as well 5 | 1,Adam Jones,Electrical 6 | 2,Edward Elrich,Mechanical 7 | Ohh!! And i forgot to mention... oops i forgot about it 8 | 3,Stain Steve,Computer Science 9 | Ahh yes,i remember now, but first seek the last two entries info 10 | 4,Ken Adams,Media Science 11 | 5,Ross Taylor,Sportsman 12 | Well, how are you gonna get your data nowwwww -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python-for-Data-Analysis 2 | A number of scripts describing use of various data analysis tools in python and their implementation on different case scenarios 3 | 4 | 5 | The sequence for following the scripts instruction wise is as follows: 6 | 7 | 1. Getting started with ndarrays 8 | 2. Handling DataTypes for ndarrays 9 | 3. Basic Operations on Arrays 10 | 4. Universal Functions 11 | 5. Simple example using Numpy vectorization 12 | 6. numpy.where 13 | 7. 
Basic Array Statistical Methods 14 | 8. Methods for Boolean Arrays 15 | 9. Numpy Sorting 16 | 10. Set Logic Methods 17 | 11. File Operations with Numpy Arrays 18 | 12. Operations of Linear Algebra 19 | 13. Numpy Random Number Generation 20 | 14. Implementation Random Walks (Case Scenario Implementation) 21 | 15. Getting Started with Pandas Series 22 | 16. Getting Started with Pandas DataFrames 23 | 17. Handling Pandas Index Objects 24 | 18. Index Object Methods 25 | 19. Reindexing 26 | 20. Dropping Entries from Axis 27 | 21. Indexing, Selecting and Filtering Operations 28 | 22. Arithmetic and Data Alignment 29 | 23. Apply Methods for DataFrames 30 | 24. Sorting and Ranking 31 | 25. Axis Indexes with Duplicate Values 32 | 26. Computing Descriptive Statistics 33 | 27. Unique Values, Value Counts and Membership 34 | 28. Handling Missing Data Operations 35 | 29. Hierarchical Indexing 36 | 30. Reordering and Sorting Levels 37 | 31. Applying Summary Statistics to Levels 38 | 32. Using DataFrame Columns as a Hierarchical Form 39 | 33. Reading Data from CSV Files 40 | 34. Reading CSV Files in Pieces 41 | 35. Writing Data to CSV Files 42 | 36. Using the CSV Module 43 | 37. Using JSON Module 44 | 38. Pickle Serialization 45 | 39. Using HDF5 Formats 46 | 40. Reading Excel Files using Pandas 47 | 41. Handling Data from Databases 48 | 42. Database Style DataFrame Merges 49 | 43. Merging Data using Index 50 | 44. Concatenating Along an Axis 51 | 45. Combining Data with Overlap 52 | 46. Pivoting Long to Wide Format 53 | 47. Removing Duplicates 54 | 48. Data Transformation using Functions or Mapping 55 | 49. Replacing Values 56 | 50. 
Renaming Axes Indexes 57 | -------------------------------------------------------------------------------- /Replacing Values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#We now look at a more general method of replacement in pandas\n", 10 | "import pandas as pd\n", 11 | "from pandas import DataFrame, Series\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 6, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "#Consider a Series of values\n", 22 | "data = Series([1.,-234,2.,-234,1000,-124,-999,456,-999,56,234])" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 7, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "0 1.0\n", 34 | "1 -234.0\n", 35 | "2 2.0\n", 36 | "3 -234.0\n", 37 | "4 1000.0\n", 38 | "5 -124.0\n", 39 | "6 -999.0\n", 40 | "7 456.0\n", 41 | "8 -999.0\n", 42 | "9 56.0\n", 43 | "10 234.0\n", 44 | "dtype: float64" 45 | ] 46 | }, 47 | "execution_count": 7, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "data" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 8, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "0 1.0\n", 65 | "1 -234.0\n", 66 | "2 2.0\n", 67 | "3 -234.0\n", 68 | "4 1000.0\n", 69 | "5 -124.0\n", 70 | "6 NaN\n", 71 | "7 456.0\n", 72 | "8 NaN\n", 73 | "9 56.0\n", 74 | "10 234.0\n", 75 | "dtype: float64" 76 | ] 77 | }, 78 | "execution_count": 8, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "#The replace method allows you to change values easily\n", 85 | "#Suppsoe -999 be sentimental values for missing data\n", 86 | "\n", 87 | "data.replace(-999, np.nan) #Replacing all -999 values with 
NaN values" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 11, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "0 1.0\n", 99 | "1 NaN\n", 100 | "2 2.0\n", 101 | "3 NaN\n", 102 | "4 1000.0\n", 103 | "5 -124.0\n", 104 | "6 NaN\n", 105 | "7 456.0\n", 106 | "8 NaN\n", 107 | "9 56.0\n", 108 | "10 234.0\n", 109 | "dtype: float64" 110 | ] 111 | }, 112 | "execution_count": 11, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "#We can also perform it on more than one items at a time by passing the items to replace as a list with a list of their replacement values\n", 119 | "\n", 120 | "data.replace([-999,-234], np.nan)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 12, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "0 1.0\n", 132 | "1 0.0\n", 133 | "2 2.0\n", 134 | "3 0.0\n", 135 | "4 1000.0\n", 136 | "5 -124.0\n", 137 | "6 NaN\n", 138 | "7 456.0\n", 139 | "8 NaN\n", 140 | "9 56.0\n", 141 | "10 234.0\n", 142 | "dtype: float64" 143 | ] 144 | }, 145 | "execution_count": 12, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "data.replace([-999,-234],[np.nan,0])" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 15, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "0 1.0\n", 163 | "1 0.0\n", 164 | "2 2.0\n", 165 | "3 0.0\n", 166 | "4 1000.0\n", 167 | "5 -124.0\n", 168 | "6 NaN\n", 169 | "7 456.0\n", 170 | "8 NaN\n", 171 | "9 56.0\n", 172 | "10 234.0\n", 173 | "dtype: float64" 174 | ] 175 | }, 176 | "execution_count": 15, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "#We can also perform the operations using dicts instead of lists\n", 183 | "data.replace({-999: np.nan, -234:0})" 184 | ] 185 | }, 186 | { 187 | 
"cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.8.2" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 4 215 | } 216 | -------------------------------------------------------------------------------- /Methods for Boolean Arrays.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "array([False, True, True, False, True, False, False])" 12 | ] 13 | }, 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "#Numpy can be used to carry out methods for boolean arrays\n", 21 | "\n", 22 | "import numpy as np\n", 23 | "from numpy.random import randn\n", 24 | "\n", 25 | "bool_arr = np.array([False, True, True, False, True, False, False])\n", 26 | "bool_arr" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "array([-0.82970925, -0.48842782, 0.86957697, 1.37041254, -1.4876376 ,\n", 38 | " 0.92170847, -0.98813075, 0.67024641, -0.06906026, -0.62078202,\n", 39 | " -0.07863047, 0.89943169, -1.75947612, -1.28893664, 0.20412679,\n", 40 | " 0.39540743, -1.68352065, 0.82482254, 0.59171459, -0.89408004,\n", 41 | " -0.45913128, 0.25794878, -1.08751663, 0.23117771, -1.34423815,\n", 42 | " -0.02232406, 
0.88415439, 0.51197638, -0.15096856, -0.00670537,\n", 43 | " 0.85985364, -0.00472732, 0.44990037, -1.08608628, 0.71948998,\n", 44 | " -0.84585393, 1.24785555, -0.20212936, 0.68847375, 0.7670229 ,\n", 45 | " 0.95111833, -1.59459513, -1.09352728, -0.64458559, -0.3634872 ,\n", 46 | " -1.3276456 , 0.57444793, 0.76870495, -0.39937309, -0.29209341,\n", 47 | " -1.2194886 , -0.17203274, 0.75005586, -0.03389137, 1.61555002,\n", 48 | " -0.77622018, 0.65691334, 0.43114165, -1.3669775 , 0.91468553,\n", 49 | " -0.04366873, 1.81260005, 1.02728964, -0.31610149, -0.82800718,\n", 50 | " -1.35359534, -0.75335943, -0.59118949, 1.65746694, -2.02029146,\n", 51 | " 1.46574871, -1.14418908, -0.01979856, 0.08159557, -0.79708698,\n", 52 | " 0.74049367, 0.92326584, -0.01123274, -0.05593027, 1.54018937,\n", 53 | " 0.81886412, -0.62970331, -0.62979455, -0.09733021, 0.29788235,\n", 54 | " 1.26489678, -0.33919618, -0.53359832, -0.07910365, -0.34341075,\n", 55 | " -0.69232626, -0.09005864, -1.4931534 , -1.42775201, 0.4742894 ,\n", 56 | " 0.10223136, 0.5759892 , -1.15037282, 0.81923134, 1.00097885])" 57 | ] 58 | }, 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "#Calculating number of positive values for a randomly generated array\n", 66 | "#Summation value function: sum\n", 67 | "\n", 68 | "data = randn(100)\n", 69 | "data" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "43" 81 | ] 82 | }, 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "(data > 0).sum()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "True" 101 | ] 102 | }, 103 | "execution_count": 6, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 
| ], 108 | "source": [ 109 | "#Any Truth Value Function: any\n", 110 | "bool_arr.any()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 7, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "False" 122 | ] 123 | }, 124 | "execution_count": 7, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "#All Truth Values function: all\n", 131 | "bool_arr.all()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 8, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "True" 143 | ] 144 | }, 145 | "execution_count": 8, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "#These functions are applicable for non-boolean values where 0 is False while all other values are True\n", 152 | "data.any()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 9, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "True" 164 | ] 165 | }, 166 | "execution_count": 9, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "data.all()" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.7.4" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 2 197 | } 198 | -------------------------------------------------------------------------------- /File Operations with Numpy Arrays.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" 12 | ] 13 | }, 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "#We can also perform file input/output operations using the Numpy arrays\n", 21 | "#Numpy allows to save and load array data on disk in text or binary format\n", 22 | "\n", 23 | "import numpy as np\n", 24 | "\n", 25 | "data = np.arange(10)\n", 26 | "data" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "#Storing Array Data on Disk as Binary Format\n", 36 | "\n", 37 | "#Saving Data Function: save\n", 38 | "np.save('some_data',data) #Saves data in .npy extension by default" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" 50 | ] 51 | }, 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "#Loading Data Function: load\n", 59 | "np.load('some_data.npy')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "#Saving Data in .Zip File Function: savez\n", 69 | "np.savez('some_data_archieve.npz', a = data, b = data)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "#Loading Data from .Zip File Function: load (returned object is a Dict)\n", 79 | "archieve_data = np.load('some_data_archieve.npz')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 6, 85 | 
"metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "" 91 | ] 92 | }, 93 | "execution_count": 6, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "archieve_data" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 7, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" 111 | ] 112 | }, 113 | "execution_count": 7, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "#We can extract the data from this Dict by calling the key values\n", 120 | "archieve_data['b']" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 8, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" 132 | ] 133 | }, 134 | "execution_count": 8, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "archieve_data['a']" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 10, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "array([[ 0.10325184, 1.462772 , 0.39970187, -0.18821549],\n", 152 | " [-0.82959097, -1.56953916, 0.89599828, -0.38778666],\n", 153 | " [-0.66539946, 0.0680077 , -0.64432513, -0.06580105],\n", 154 | " [-1.15740344, -0.94284507, -0.35162299, 1.61787379]])" 155 | ] 156 | }, 157 | "execution_count": 10, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "#Saving and Loading Text Files\n", 164 | "from numpy.random import randn\n", 165 | "\n", 166 | "data2 = randn(4,4)\n", 167 | "data2" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 13, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "#Saving Data into a Text File: savetxt\n", 177 | 
"np.savetxt('array_ex.txt', data2)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 15, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "array([[ 0.10325184, 1.462772 , 0.39970187, -0.18821549],\n", 189 | " [-0.82959097, -1.56953916, 0.89599828, -0.38778666],\n", 190 | " [-0.66539946, 0.0680077 , -0.64432513, -0.06580105],\n", 191 | " [-1.15740344, -0.94284507, -0.35162299, 1.61787379]])" 192 | ] 193 | }, 194 | "execution_count": 15, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "#Loading Data from a Text File: loadtxt\n", 201 | "np.loadtxt('array_ex.txt')" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.7.4" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 2 226 | } 227 | -------------------------------------------------------------------------------- /Using JSON Module.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 36, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "from pandas import DataFrame" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 27, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "#JSON stands for Java Script Object Notation\n", 20 | "#JSON is the standard format for sending data by HTTP requests between web browsers and other applications\n", 21 | "#We now look at some functionalities of the built-in python json module\n", 22 | 
"\n", 23 | "#First creating a json object\n", 24 | "\n", 25 | "obj = \"\"\"\n", 26 | "{\n", 27 | " \"name\":\"Adam Jones\",\n", 28 | " \"age\": 25,\n", 29 | " \"places_lived\":[\"Pakistan\",\"UAE\",\"USA\"],\n", 30 | " \"fields_exp\": [{\"company\":\"Folio3\",\"duration(years)\":2,\"job\":\"Tester\"},\n", 31 | " {\"company\":\"Siemens\",\"duration(years)\":7,\"job\":\"Software Eng\"}]}\n", 32 | "\"\"\"" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 28, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "#The above is a json object whose format largely resembles that of python script dictionary\n", 42 | "#Since json is a text format, thus we have to create its object in a multi-string, otherwise we will get an error that json.loads accepts only str object\n", 43 | "\n", 44 | "#Now to load this object we can use the json.loads function\n", 45 | "json_reader = json.loads(obj)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 29, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "{'name': 'Adam Jones',\n", 57 | " 'age': 25,\n", 58 | " 'places_lived': ['Pakistan', 'UAE', 'USA'],\n", 59 | " 'fields_exp': [{'company': 'Folio3', 'duration(years)': 2, 'job': 'Tester'},\n", 60 | " {'company': 'Siemens', 'duration(years)': 7, 'job': 'Software Eng'}]}" 61 | ] 62 | }, 63 | "execution_count": 29, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "json_reader #To display the data in json object as a python object" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 30, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "#We can convert the python object into a json object by using the json.dumps method\n", 79 | "obj = json.dumps(json_reader)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 31, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | 
"'{\"name\": \"Adam Jones\", \"age\": 25, \"places_lived\": [\"Pakistan\", \"UAE\", \"USA\"], \"fields_exp\": [{\"company\": \"Folio3\", \"duration(years)\": 2, \"job\": \"Tester\"}, {\"company\": \"Siemens\", \"duration(years)\": 7, \"job\": \"Software Eng\"}]}'" 91 | ] 92 | }, 93 | "execution_count": 31, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "obj #Displaying the json object" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 32, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "#We can pass the json data directly to a pandas DataFrame or pass it after converting it into a python object\n", 109 | "\n", 110 | "json_df = DataFrame(json_reader['fields_exp'], columns=['company','job','duration(years)'])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 33, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/html": [ 121 | "
\n", 122 | "\n", 135 | "\n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | "
companyjobduration(years)
0Folio3Tester2
1SiemensSoftware Eng7
\n", 159 | "
" 160 | ], 161 | "text/plain": [ 162 | " company job duration(years)\n", 163 | "0 Folio3 Tester 2\n", 164 | "1 Siemens Software Eng 7" 165 | ] 166 | }, 167 | "execution_count": 33, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "json_df" 174 | ] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "Python 3", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.8.2" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 4 198 | } 199 | -------------------------------------------------------------------------------- /Set Logic Methods.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "array(['A', 'B', 'C', 'B', 'E', 'A', 'C', 'C', 'B', 'A'], dtype=' Signed/Unsigned Integer(8-bits)\n", 15 | "#int16/uint16 => Signed/Unsigned Integer(16-bits)\n", 16 | "#int32/uint32 => Signed/Unsigned Integer(32-bits)\n", 17 | "#int64/uint64 => Signed/Unsigned Integer(64-bits)\n", 18 | "#float 16/32/64/128 => Floating Point(16/32/64/128 bits)\n", 19 | "#complex 64/128/256 => Complex Number(64/128/256 bits)\n", 20 | "#bool => Boolean number(1 bit i.e. 
either True or False)\n", 21 | "#object => Python object type(Bits based on the initializing in its class)\n", 22 | "#string_ => Fixed length string type(1 byte per character)\n", 23 | "#unicode_ => Fixed length unicode type(Number of bytes depend upon platform)\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 6, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "dtype('int32')" 35 | ] 36 | }, 37 | "execution_count": 6, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "#ndarrays can be type casted from one dtype to another using astype()\n", 44 | "\n", 45 | "int_arr1 = np.array([1,2,3,4,5])\n", 46 | "int_arr1.dtype" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 11, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "array([1., 2., 3., 4., 5.])" 58 | ] 59 | }, 60 | "execution_count": 11, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "float_arr1 = int_arr1.astype(np.float64) #typecasted into float64\n", 67 | "float_arr1" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 12, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "dtype('float64')" 79 | ] 80 | }, 81 | "execution_count": 12, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "float_arr2 = np.array([2.3,4.3,5.3,1.5,6.64,6.4])\n", 88 | "float_arr2.dtype" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 13, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "array([2, 4, 5, 1, 6, 6])" 100 | ] 101 | }, 102 | "execution_count": 13, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "int_arr2 = float_arr2.astype(np.int32) #typecasted into int32\n", 109 | "int_arr2" 110 | ] 111 | }, 112 | { 
113 | "cell_type": "code", 114 | "execution_count": 14, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "array(['1.23', '-3.4', '4.234'], dtype='\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m#Index Types are immmutable i.e. cannot be altered by user, so that they can be easily shared among different data structures\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mindex\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'e'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 90 | "\u001b[1;32mc:\\users\\mr. ghori\\appdata\\local\\programs\\python\\python37-32\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36m__setitem__\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m 4258\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4259\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4260\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Index does not support mutable operations\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4261\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4262\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 91 | "\u001b[1;31mTypeError\u001b[0m: Index does not support mutable operations" 92 | ] 93 | } 94 | ], 95 | 
"source": [ 96 | "#Index Types are immmutable i.e. cannot be altered by user, so that they can be easily shared among different data structures\n", 97 | "index[2] = 'e'" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 8, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "0 0.0\n", 109 | "1 3.4\n", 110 | "2 -2.0\n", 111 | "dtype: float64" 112 | ] 113 | }, 114 | "execution_count": 8, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "index = pd.Index(np.arange(3))\n", 121 | "obj2 = Series(np.array([0,3.4,-2]), index = index)\n", 122 | "obj2" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 11, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "Int64Index([0, 1, 2], dtype='int64')" 134 | ] 135 | }, 136 | "execution_count": 11, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "#In case of numpy array data as index object is of type Int64Index\n", 143 | "obj2.index" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 12, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "#Other types of index objects that can be obtained are:\n", 153 | "\n", 154 | "#MultiIndex => In case of hierarchical objects\n", 155 | "#DatetimeIndex => Stores nanoseconds tiemstamps(datetime64 dtype)\n", 156 | "#PeriodIndex => Specialized Index for Period data(timespans)" 157 | ] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 3", 163 | "language": "python", 164 | "name": "python3" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 3 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 | "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython3", 176 | "version": "3.7.4" 
177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 2 181 | } 182 | -------------------------------------------------------------------------------- /Combining Data with Overlap.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Now we consider a special case in which the two datasets to combine have full or partial same indexes\n", 10 | "\n", 11 | "import pandas as pd\n", 12 | "from pandas import DataFrame, Series\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 5, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#Creating two Series objects with same indexes bu different values\n", 23 | "\n", 24 | "a = Series([np.nan, 2.5, np.nan, 3.4, 6.7, np.nan], index=list('fedbca'))\n", 25 | "b = Series(np.arange(len(a), dtype=np.float64), index=a.index)\n", 26 | "b[-1] = np.nan" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 6, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "f NaN\n", 38 | "e 2.5\n", 39 | "d NaN\n", 40 | "b 3.4\n", 41 | "c 6.7\n", 42 | "a NaN\n", 43 | "dtype: float64" 44 | ] 45 | }, 46 | "execution_count": 6, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "a" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 7, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "f 0.0\n", 64 | "e 1.0\n", 65 | "d 2.0\n", 66 | "b 3.0\n", 67 | "c 4.0\n", 68 | "a NaN\n", 69 | "dtype: float64" 70 | ] 71 | }, 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "b" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 9, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | 
"data": { 88 | "text/plain": [ 89 | "array([0. , 2.5, 2. , 3.4, 6.7, nan])" 90 | ] 91 | }, 92 | "execution_count": 9, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "#We can use the numpy where to pass a condition to combine the datasets as\n", 99 | "np.where(pd.isnull(a), b, a)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 11, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "a NaN\n", 111 | "b 3.0\n", 112 | "c 6.7\n", 113 | "d 2.0\n", 114 | "e 1.0\n", 115 | "f 0.0\n", 116 | "dtype: float64" 117 | ] 118 | }, 119 | "execution_count": 11, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "#The above expression would return a ndarray with all values of a and values of b where a values were NaN\n", 126 | "\n", 127 | "#A similar function for pandas Series is the combine_first\n", 128 | "b[:-2].combine_first(a[2:])" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 15, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "#The combine_first appears to perform the same task column by column, thus we can use it for dataframes as well\n", 138 | "df1 = DataFrame({'a':[1, np.nan, 23, 4.5], 'b':[2.3,1.4, np.nan, 32.4], 'c':np.arange(6,10)})\n", 139 | "df2 = DataFrame({'a':[np.nan, 2.4, np.nan, 5.6], 'b':[np.nan,1.4,3.4,np.nan], 'c':np.arange(7,11)})" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 16, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/html": [ 150 | "
\n", 151 | "\n", 164 | "\n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | "
abc
01.02.36
12.41.47
223.03.48
34.532.49
\n", 200 | "
" 201 | ], 202 | "text/plain": [ 203 | " a b c\n", 204 | "0 1.0 2.3 6\n", 205 | "1 2.4 1.4 7\n", 206 | "2 23.0 3.4 8\n", 207 | "3 4.5 32.4 9" 208 | ] 209 | }, 210 | "execution_count": 16, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "df1.combine_first(df2)" 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.8.2" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /numpy.where.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "\n", 11 | "#numpy.where is vectorized form of python ternary expresssion (if else statement)\n", 12 | "\n", 13 | "xarr = np.array([1.,2.,3.,4.,5.])\n", 14 | "yarr = np.array([1.4,2.5,3.6,4.7,6.8])\n", 15 | "conditions = np.array([True, False, False, True, False])" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 3, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "[1.0, 2.5, 3.6, 4.0, 6.8]" 27 | ] 28 | }, 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "#Suppose we want to take value from xarr if condition is True and from yarr if condition is False\n", 36 | "\n", 37 | "#In case of pure Python, the task can be performed via list comprehension as:\n", 38 | 
"result = [(x if c else y) for x,y,c in zip(xarr,yarr,conditions)]\n", 39 | "result" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "array([1. , 2.5, 3.6, 4. , 6.8])" 51 | ] 52 | }, 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "#In case of numpy, we can use where() to perform the above task as:\n", 60 | "result = np.where(conditions, xarr, yarr)\n", 61 | "result" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "array([[ 1.32969813, -1.3431948 , -1.63296197, -0.31929917],\n", 73 | " [-0.63439893, 0.088797 , 0.04996672, 0.39516985],\n", 74 | " [ 0.00767993, -0.93328636, 0.42585378, 0.03790392],\n", 75 | " [ 0.67080532, 0.7480199 , 0.26772152, 0.74426192]])" 76 | ] 77 | }, 78 | "execution_count": 6, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "#Using np.where to change positive and negative values\n", 85 | "\n", 86 | "from numpy.random import randn\n", 87 | "\n", 88 | "data = randn(4,4)\n", 89 | "data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "array([[1, 0, 0, 0],\n", 101 | " [0, 1, 1, 1],\n", 102 | " [1, 0, 1, 1],\n", 103 | " [1, 1, 1, 1]])" 104 | ] 105 | }, 106 | "execution_count": 7, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "#Suppose we want to replace all negative values to 0 and positive to 1 then:\n", 113 | "\n", 114 | "data = np.where(data < 0, 0, 1)\n", 115 | "data" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 29, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "#Nested where 
expressions\n", 125 | "#In case of two or more conditions, np.where can be nested to perform desired task\n", 126 | "\n", 127 | "#Suppose we want values 0,1,2,3 for each of the following different conditions satisfying the value of x, then:\n", 128 | "#conitions = [false, false], [false, true], [true, false], [true, true]\n", 129 | "\n", 130 | "x = [-2,-1,0,1]\n", 131 | "y = [-2,0,-1,1]" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 37, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "[0, 1, 2, 3]" 143 | ] 144 | }, 145 | "execution_count": 37, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "#Using pure Python\n", 152 | "result_1 = []\n", 153 | "for i in range(4):\n", 154 | " if (x[i] < 0) and (y[i] < 0):\n", 155 | " result_1.append(0)\n", 156 | " elif (x[i] < 0) and (y[i] >= 0):\n", 157 | " result_1.append(1)\n", 158 | " elif (x[i] >= 0) and (y[i] < 0):\n", 159 | " result_1.append(2)\n", 160 | " else:\n", 161 | " result_1.append(3)\n", 162 | " \n", 163 | "result_1" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 38, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "array([0., 1., 2., 3.])" 175 | ] 176 | }, 177 | "execution_count": 38, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "#Using np.where\n", 184 | "result_2 = np.empty(4)\n", 185 | "for j in range(4):\n", 186 | " result_2[j] = np.where((x[j] < 0) and (y[j] < 0), 0,\n", 187 | " np.where((x[j] < 0) and (y[j] >= 0), 1,\n", 188 | " np.where((x[j] >= 0) and (y[j] < 0), 2, 3)))\n", 189 | "result_2" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 41, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "array([0., 1., 2., 3.])" 201 | ] 202 | }, 203 | "execution_count": 41, 204 | "metadata": 
{}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "#In some cases we can apply np.where furthur to remove the for loop\n", 210 | "#Although for this approach, we can use the 0/1 values of conditional logic to our advantage and can evaluate the values based on simple Boolean Expression\n", 211 | "\n", 212 | "result_3 = np.empty(4)\n", 213 | "for k in range(1, 4):\n", 214 | " result_3[k] = 1 * ((x[k] < 0) and (y[k] >= 0)) + 2 * ((x[k] >= 0) and (y[k] < 0)) + 3 * ((x[k] >= 0) and (y[k] >= 0))\n", 215 | "result_3 " 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 3", 222 | "language": "python", 223 | "name": "python3" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 3 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython3", 235 | "version": "3.7.4" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 2 240 | } 241 | -------------------------------------------------------------------------------- /Axis Indexes with Duplicate Values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Now we will see how to deal with indexes in which labels are duplicate\n", 10 | "\n", 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from pandas import Series,DataFrame\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "a 0\n", 25 | "b 1\n", 26 | "a 2\n", 27 | "c 3\n", 28 | "d 4\n", 29 | "a 5\n", 30 | "c 6\n", 31 | "dtype: int32" 32 | ] 33 | }, 34 | "execution_count": 4, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 
38 | ], 39 | "source": [ 40 | "series_obj = Series(np.arange(7), index=['a','b','a','c','d','a','c'])\n", 41 | "series_obj" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 7, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "False" 53 | ] 54 | }, 55 | "execution_count": 7, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "#We can find whether a certain object's indexes are unique or not as:\n", 62 | "series_obj.index.is_unique" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 8, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "a 0\n", 74 | "a 2\n", 75 | "a 5\n", 76 | "dtype: int32" 77 | ] 78 | }, 79 | "execution_count": 8, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "#In case of Data selection, a certain index returns all the values stored at that index\n", 86 | "series_obj['a']" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 9, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/html": [ 97 | "
\n", 98 | "\n", 111 | "\n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | "
012
a-1.333040-0.410786-0.465574
b1.308799-1.6230320.257251
a1.036575-1.997923-0.711075
c0.8050561.8590863.042280
\n", 147 | "
" 148 | ], 149 | "text/plain": [ 150 | " 0 1 2\n", 151 | "a -1.333040 -0.410786 -0.465574\n", 152 | "b 1.308799 -1.623032 0.257251\n", 153 | "a 1.036575 -1.997923 -0.711075\n", 154 | "c 0.805056 1.859086 3.042280" 155 | ] 156 | }, 157 | "execution_count": 9, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "#Same goes in case of DataFrame\n", 164 | "frame_obj = DataFrame(np.random.randn(4,3), index=['a','b','a','c'])\n", 165 | "frame_obj" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 10, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/html": [ 176 | "
\n", 177 | "\n", 190 | "\n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | "
012
a-1.333040-0.410786-0.465574
a1.036575-1.997923-0.711075
\n", 214 | "
" 215 | ], 216 | "text/plain": [ 217 | " 0 1 2\n", 218 | "a -1.333040 -0.410786 -0.465574\n", 219 | "a 1.036575 -1.997923 -0.711075" 220 | ] 221 | }, 222 | "execution_count": 10, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "frame_obj.loc['a']" 229 | ] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 3", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.7.4" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 2 253 | } 254 | -------------------------------------------------------------------------------- /Reading Excel Files using Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Sometimes the data is stored in an excel file\n", 10 | "#We can extract data from excel files using pandas\n", 11 | "\n", 12 | "import pandas as pd\n", 13 | "from pandas import DataFrame" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 8, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#Like HDF5 to use this class of the pandas we should have dependencies installed\n", 23 | "#In case of excel files, pandas uses the xlrd and openpyxl packages\n", 24 | "\n", 25 | "xls_file = pd.ExcelFile('Data Files\\my_fields.xlsx')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 11, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "#We can extract the data stored in the excel file using the parse method\n", 35 | "xls_tables = 
xls_file.parse('my_fields')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 12, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/html": [ 46 | "
\n", 47 | "\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | "
DateDayYear
0Monday22020
1Tuesday32020
2Friday52020
\n", 90 | "
" 91 | ], 92 | "text/plain": [ 93 | " Date Day Year\n", 94 | "0 Monday 2 2020\n", 95 | "1 Tuesday 3 2020\n", 96 | "2 Friday 5 2020" 97 | ] 98 | }, 99 | "execution_count": 12, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "xls_tables" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 13, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "#We can also create excel files using the dataframe data\n", 115 | "data = pd.read_csv(r'Data Files/fields.csv')" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 14, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/html": [ 126 | "
\n", 127 | "\n", 140 | "\n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | "
IDNameField
01Adam JonesElectrical
12Edward ElrichMechanical
23Stain SteveComputer Science
34Ken AdamsMedia Science
45Ross TaylorSportsman
\n", 182 | "
" 183 | ], 184 | "text/plain": [ 185 | " ID Name Field\n", 186 | "0 1 Adam Jones Electrical\n", 187 | "1 2 Edward Elrich Mechanical\n", 188 | "2 3 Stain Steve Computer Science\n", 189 | "3 4 Ken Adams Media Science\n", 190 | "4 5 Ross Taylor Sportsman" 191 | ] 192 | }, 193 | "execution_count": 14, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "data" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 17, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "#Using the Excel Writer and to_excel methods\n", 209 | "writer = pd.ExcelWriter('Data Files\\excel_data.xlsx') #Creating file to save data in\n", 210 | "data.to_excel(writer, 'Sheet1') #Writing data from dataframe to excel file\n", 211 | "writer.save() #Saving data stored in the excel file" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.8.2" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 4 243 | } 244 | -------------------------------------------------------------------------------- /Pickle Serialization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Pickling or Pickle Serialization is the process of converting a python object into a byte stream or binary format\n", 10 | "#We can then use the 
inverse operation of deserialization to get ack the original object\n", 11 | "\n", 12 | "import pandas as pd\n", 13 | "from pandas import DataFrame" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#Reading a sample file\n", 23 | "data = pd.read_csv(r'Data Files\\fields.csv')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/html": [ 34 | "
\n", 35 | "\n", 48 | "\n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | "
IDNameField
01Adam JonesElectrical
12Edward ElrichMechanical
23Stain SteveComputer Science
34Ken AdamsMedia Science
45Ross TaylorSportsman
\n", 90 | "
" 91 | ], 92 | "text/plain": [ 93 | " ID Name Field\n", 94 | "0 1 Adam Jones Electrical\n", 95 | "1 2 Edward Elrich Mechanical\n", 96 | "2 3 Stain Steve Computer Science\n", 97 | "3 4 Ken Adams Media Science\n", 98 | "4 5 Ross Taylor Sportsman" 99 | ] 100 | }, 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "data" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "#We can save data in pickle format by using the pandas to_pickle method\n", 117 | "#It automatically saves the data from a dataframe into a binary pickle format using serialization\n", 118 | "data.to_pickle(r'Data Files\\pickle_data')" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 7, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/html": [ 129 | "
\n", 130 | "\n", 143 | "\n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | "
IDNameField
01Adam JonesElectrical
12Edward ElrichMechanical
23Stain SteveComputer Science
34Ken AdamsMedia Science
45Ross TaylorSportsman
\n", 185 | "
" 186 | ], 187 | "text/plain": [ 188 | " ID Name Field\n", 189 | "0 1 Adam Jones Electrical\n", 190 | "1 2 Edward Elrich Mechanical\n", 191 | "2 3 Stain Steve Computer Science\n", 192 | "3 4 Ken Adams Media Science\n", 193 | "4 5 Ross Taylor Sportsman" 194 | ] 195 | }, 196 | "execution_count": 7, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "#Using the read_pickle method we can also read the pickle file by applying deserialization\n", 203 | "pd.read_pickle(r'Data Files\\pickle_data')" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | "version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.8.2" 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 4 235 | } 236 | -------------------------------------------------------------------------------- /Data Transformation using Functions or Mapping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Suppose we want to transform our data based on the values of an array, series or dataframe\n", 10 | "import pandas as pd\n", 11 | "from pandas import DataFrame, Series\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "#Consider a dataset containing data about various types of meats\n", 22 | "data = 
DataFrame({'food':['mutton','beef','pastraml','beef','nova lox','beef'], 'ounces':[3.4,2.3,5.4,4.5,2.5,6.1]})" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | "
foodounces
0mutton3.4
1beef2.3
2pastraml5.4
3beef4.5
4nova lox2.5
5beef6.1
\n", 88 | "
" 89 | ], 90 | "text/plain": [ 91 | " food ounces\n", 92 | "0 mutton 3.4\n", 93 | "1 beef 2.3\n", 94 | "2 pastraml 5.4\n", 95 | "3 beef 4.5\n", 96 | "4 nova lox 2.5\n", 97 | "5 beef 6.1" 98 | ] 99 | }, 100 | "execution_count": 3, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "data" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "#Suppose we wanted to add a column to the dataframe idnicating the animal from which the meat came\n", 116 | "#Create a map/dictionary to indicate each item\n", 117 | "\n", 118 | "meat_to_animal = {'mutton':'goat', 'beef':'cow', 'pastraml':'cow', 'nova lox':'salmon'}" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "#The map method allows to map a dictionary like object and can create a new value according to the item in the food column\n", 128 | "data['animal'] = data['food'].map(meat_to_animal)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 6, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/html": [ 139 | "
\n", 140 | "\n", 153 | "\n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | "
foodouncesanimal
0mutton3.4goat
1beef2.3cow
2pastraml5.4cow
3beef4.5cow
4nova lox2.5salmon
5beef6.1cow
\n", 201 | "
" 202 | ], 203 | "text/plain": [ 204 | " food ounces animal\n", 205 | "0 mutton 3.4 goat\n", 206 | "1 beef 2.3 cow\n", 207 | "2 pastraml 5.4 cow\n", 208 | "3 beef 4.5 cow\n", 209 | "4 nova lox 2.5 salmon\n", 210 | "5 beef 6.1 cow" 211 | ] 212 | }, 213 | "execution_count": 6, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "data" 220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.8.2" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 4 244 | } 245 | -------------------------------------------------------------------------------- /Reading CSV File in Pieces.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import DataFrame, Series" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "data_df = pd.read_csv('Data Files/fields.csv')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 45 | "\n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | "
IDNameField
01Adam JonesElectrical
12Edward ElrichMechanical
23Stain SteveComputer Science
34Ken AdamsMedia Science
45Ross TaylorSportsman
\n", 87 | "
" 88 | ], 89 | "text/plain": [ 90 | " ID Name Field\n", 91 | "0 1 Adam Jones Electrical\n", 92 | "1 2 Edward Elrich Mechanical\n", 93 | "2 3 Stain Steve Computer Science\n", 94 | "3 4 Ken Adams Media Science\n", 95 | "4 5 Ross Taylor Sportsman" 96 | ] 97 | }, 98 | "execution_count": 3, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "data_df" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "#Suppose we have to read only 2 rows from the file, we can specify that to the nrows argument\n", 114 | "data_df = pd.read_csv('Data Files/fields.csv', nrows=2)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/html": [ 125 | "
\n", 126 | "\n", 139 | "\n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | "
IDNameField
01Adam JonesElectrical
12Edward ElrichMechanical
\n", 163 | "
" 164 | ], 165 | "text/plain": [ 166 | " ID Name Field\n", 167 | "0 1 Adam Jones Electrical\n", 168 | "1 2 Edward Elrich Mechanical" 169 | ] 170 | }, 171 | "execution_count": 5, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "data_df" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 46, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "#We can also read a csv in pieces, which is often a case for big data files.\n", 187 | "#Suppose we want to read the data in chunks of 2 i.e. the csv is seperated into chunks containing 2 rows each\n", 188 | "#We can do this by passing the value of our chunksize to the chunksize argument\n", 189 | "df_chunks = pd.read_csv('Data Files/fields.csv', chunksize=2)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 47, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "" 201 | ] 202 | }, 203 | "execution_count": 47, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "df_chunks" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 48, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "0 Adam Jones\n", 222 | "1 Edward Elrich\n", 223 | "Name: Name, dtype: object\n", 224 | "2 Stain Steve\n", 225 | "3 Ken Adams\n", 226 | "Name: Name, dtype: object\n", 227 | "4 Ross Taylor\n", 228 | "Name: Name, dtype: object\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "#Using the returned text parser object we can iterate over the given csv file according to the chunk size as shown below\n", 234 | "#If displayed it will show three different series each havingn chunks of the data passed through the csv\n", 235 | "\n", 236 | "for chunks in df_chunks:\n", 237 | " print(chunks['Name'])" 238 | ] 239 | } 240 | ], 241 | "metadata": { 242 | 
"kernelspec": { 243 | "display_name": "Python 3", 244 | "language": "python", 245 | "name": "python3" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 3 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython3", 257 | "version": "3.8.2" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 2 262 | } 263 | -------------------------------------------------------------------------------- /Using the CSV Module.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Sometimes we have to manually correct the info in the delimited files rather than using automation process\n", 10 | "#For that we will use the Python built-in csv module\n", 11 | "import csv" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 12, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "['ID', 'Name', 'Field']\n", 24 | "['1', 'Adam Jones', 'Electrical']\n", 25 | "['2', 'Edward Elrich', 'Mechanical']\n", 26 | "['3', 'Stain Steve', 'Computer Science']\n", 27 | "['4', 'Ken Adams', 'Media Science']\n", 28 | "['5', 'Ross Taylor', 'Sportsman']\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "#Reading files using the csv module\n", 34 | "with open(r'Data Files\\fields.csv', 'r') as csv_file:\n", 35 | " csv_reader = csv.reader(csv_file)\n", 36 | " \n", 37 | " #Displaying the content of the csv file as a list\n", 38 | " for line in csv_reader:\n", 39 | " print(line)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 14, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "#We can manually seperate the headers from the given file using basic list 
indexing\n", 49 | "#Opening the file for reading\n", 50 | "csv_file = open(r'Data Files\\fields.csv')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 17, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "#Reading the contents of the file to a list containing header and contents as elements\n", 60 | "reader = list(csv.reader(csv_file))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 18, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "#Seperating the header and content from list using basic assignment\n", 70 | "header, content = reader[0], reader[1:]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 19, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "['ID', 'Name', 'Field']\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "print(header)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 20, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "[['1', 'Adam Jones', 'Electrical'], ['2', 'Edward Elrich', 'Mechanical'], ['3', 'Stain Steve', 'Computer Science'], ['4', 'Ken Adams', 'Media Science'], ['5', 'Ross Taylor', 'Sportsman']]\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "print(content)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 21, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "#Using this data we can create other data structures for suppose a dictionary\n", 114 | "reader_dict = {h:c for h,c in zip(header, zip(*content))} #Creates a dictionary with key values as headers and ordered pairs of the content values" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 22, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "{'ID': ('1', '2', '3', '4', '5'),\n", 126 | " 'Name': ('Adam 
Jones',\n", 127 | " 'Edward Elrich',\n", 128 | " 'Stain Steve',\n", 129 | " 'Ken Adams',\n", 130 | " 'Ross Taylor'),\n", 131 | " 'Field': ('Electrical',\n", 132 | " 'Mechanical',\n", 133 | " 'Computer Science',\n", 134 | " 'Media Science',\n", 135 | " 'Sportsman')}" 136 | ] 137 | }, 138 | "execution_count": 22, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "reader_dict" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 26, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "#Another great use of csv module is that we can create our own dialect of a csv file by passing arguments to it\n", 154 | "#For a csv.Dialect class we have:\n", 155 | "#delimiter: value seperator (by default: ',')\n", 156 | "#lineterminator: line seperator (by default: '\\n')\n", 157 | "#quotechar: special character seperator (by default: '\"')\n", 158 | "#quoting: csv.QUOTE_ALL (quotes every character) or csv.QUOTE_MINIMAL (quotes only special characters)(by default), csv.QUOTE_NONNUMERIC (quotes only non-numeric characters), csv.QUOTE_NON (quotes no characters)\n", 159 | "#skipintialspace: Ignore white space after each delimiter (by default: False)\n", 160 | "#doublequote: Double quotes inside a field\n", 161 | "#escapechar: String to escape delimiter for QUOTE_NON (by default: disabled)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 60, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "#Lets create a simple csv.Dialect class and use it for our storage\n", 171 | "class my_dialect(csv.Dialect): #creating subclass from csv.Dialect\n", 172 | " lineterminator = '\\n'\n", 173 | " delimiter = \";\"\n", 174 | " quotechar = '\"'\n", 175 | " quoting = csv.QUOTE_MINIMAL" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 62, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "csv_file = open(r'Data 
Files\\fields.csv')\n", 185 | "csv_reader = csv.reader(csv_file, delimiter='|')" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 63, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "<_csv.reader object at 0x000002058D709B20>\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "print(csv_reader)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 65, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "#We can also write files manually by using the csv.writer function in our dialect we created\n", 212 | "with open('Data Files/my_fields.csv', 'w') as csv_file:\n", 213 | " writer = csv.writer(csv_file, dialect=my_dialect)\n", 214 | " writer.writerow(('Day','Date','Year'))\n", 215 | " writer.writerow(('Monday','02','2020'))\n", 216 | " writer.writerow(('Tuesday','03','2020'))\n", 217 | " writer.writerow(('Friday','05','2020'))\n", 218 | " " 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 66, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "csv_file = open(r'Data Files/my_fields.csv')" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 67, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "csv_reader = csv.reader(csv_file)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 68, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "['Day;Date;Year']\n", 249 | "['Monday;02;2020']\n", 250 | "['Tuesday;03;2020']\n", 251 | "['Friday;05;2020']\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "for line in csv_reader:\n", 257 | " print(line)" 258 | ] 259 | } 260 | ], 261 | "metadata": { 262 | "kernelspec": { 263 | "display_name": "Python 3", 264 | "language": "python", 265 | "name": "python3" 266 | }, 267 | "language_info": { 268 | 
"codemirror_mode": { 269 | "name": "ipython", 270 | "version": 3 271 | }, 272 | "file_extension": ".py", 273 | "mimetype": "text/x-python", 274 | "name": "python", 275 | "nbconvert_exporter": "python", 276 | "pygments_lexer": "ipython3", 277 | "version": "3.8.2" 278 | } 279 | }, 280 | "nbformat": 4, 281 | "nbformat_minor": 4 282 | } 283 | -------------------------------------------------------------------------------- /Using HDF5 Formats.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Another binary format yet for large data purposes is the Heirarchical Data Format HDF\n", 10 | "#Pandas supports the HDF5 format with the use of pytables interface\n", 11 | "\n", 12 | "import pandas as pd\n", 13 | "from pandas import DataFrame" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#Creating a hdf5 format file using pandas (Dependency tables mst be installed in order to use it: pip install tables)\n", 23 | "hdf_file = pd.HDFStore('data.h5')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 5, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "data = pd.read_csv('Data Files/fields.csv')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 6, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/html": [ 43 | "
\n", 44 | "\n", 57 | "\n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | "
IDNameField
01Adam JonesElectrical
12Edward ElrichMechanical
23Stain SteveComputer Science
34Ken AdamsMedia Science
45Ross TaylorSportsman
\n", 99 | "
" 100 | ], 101 | "text/plain": [ 102 | " ID Name Field\n", 103 | "0 1 Adam Jones Electrical\n", 104 | "1 2 Edward Elrich Mechanical\n", 105 | "2 3 Stain Steve Computer Science\n", 106 | "3 4 Ken Adams Media Science\n", 107 | "4 5 Ross Taylor Sportsman" 108 | ] 109 | }, 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "data" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "#HDF5 storage works that of a dictionary, we pass keys and store whole dataframe or big data into it\n", 126 | "hdf_file['obj1'] = data" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "hdf_file['obj2'] = data['Name'] #Storing the names as second object" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 10, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "\n", 147 | "File path: data.h5" 148 | ] 149 | }, 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "hdf_file" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 11, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/html": [ 167 | "
\n", 168 | "\n", 181 | "\n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | "
IDNameField
01Adam JonesElectrical
12Edward ElrichMechanical
23Stain SteveComputer Science
34Ken AdamsMedia Science
45Ross TaylorSportsman
\n", 223 | "
" 224 | ], 225 | "text/plain": [ 226 | " ID Name Field\n", 227 | "0 1 Adam Jones Electrical\n", 228 | "1 2 Edward Elrich Mechanical\n", 229 | "2 3 Stain Steve Computer Science\n", 230 | "3 4 Ken Adams Media Science\n", 231 | "4 5 Ross Taylor Sportsman" 232 | ] 233 | }, 234 | "execution_count": 11, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "#To retieve the data simply pass the key like we do in case of a dictionary\n", 241 | "hdf_file['obj1']" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 12, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "0 Adam Jones\n", 253 | "1 Edward Elrich\n", 254 | "2 Stain Steve\n", 255 | "3 Ken Adams\n", 256 | "4 Ross Taylor\n", 257 | "Name: Name, dtype: object" 258 | ] 259 | }, 260 | "execution_count": 12, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "hdf_file['obj2']" 267 | ] 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "display_name": "Python 3", 273 | "language": "python", 274 | "name": "python3" 275 | }, 276 | "language_info": { 277 | "codemirror_mode": { 278 | "name": "ipython", 279 | "version": 3 280 | }, 281 | "file_extension": ".py", 282 | "mimetype": "text/x-python", 283 | "name": "python", 284 | "nbconvert_exporter": "python", 285 | "pygments_lexer": "ipython3", 286 | "version": "3.8.2" 287 | } 288 | }, 289 | "nbformat": 4, 290 | "nbformat_minor": 4 291 | } 292 | -------------------------------------------------------------------------------- /Numpy Random Number Generation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Numpy can be used to generate random values for a ceratin array elements\n", 10 | "#This generates sample values for arrays 
from many kinds of probability distributions\n", 11 | "\n", 12 | "import numpy as np\n", 13 | "import numpy.random\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#Seeds random number generator(gives initial value to random generator to create all random values): seed\n", 23 | "np.random.seed(1)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 13, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "array([0.51815255, 0.86502025, 0.82914691, 0.82960336, 0.27304997,\n", 35 | " 0.0592432 , 0.67052804, 0.59306552, 0.6716541 , 0.41178788])" 36 | ] 37 | }, 38 | "execution_count": 13, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "#Random sample from Uniform Distribution: rand\n", 45 | "X = np.random.rand(10)\n", 46 | "X" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 16, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "8" 58 | ] 59 | }, 60 | "execution_count": 16, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "#Random integer sample from given low to high range: randint\n", 67 | "X2 = np.random.randint(1,11)\n", 68 | "X2" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 31, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "array([-0.93786433, -1.3856563 , -0.33888054, -0.17925544, -0.24094727,\n", 80 | " -0.24769856, -0.16851994, 0.57370922, -0.95677318, -0.65152255])" 81 | ] 82 | }, 83 | "execution_count": 31, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "#Random sample from Normal Distribution with mean = 0 and Std = 1: randn\n", 90 | "X3 = np.random.randn(10)\n", 91 | "X3" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 32, 
97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "array([3, 2, 2, 1, 0, 2, 2, 1, 1, 0, 0, 1, 0, 2, 1, 0, 2, 0, 0, 0, 0, 1,\n", 103 | " 0, 0, 3, 1, 2, 1, 0, 1, 1, 2, 0, 1, 1, 1, 2, 1, 2, 0, 0, 2, 0, 0,\n", 104 | " 0, 1, 2, 1, 1, 0, 1, 0, 1, 0, 1, 3, 0, 1, 0, 1, 2, 1, 0, 1, 2, 0,\n", 105 | " 0, 3, 1, 0, 1, 0, 3, 0, 3, 0, 0, 2, 0, 2, 1, 0, 1, 0, 0, 0, 1, 1,\n", 106 | " 2, 2, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0])" 107 | ] 108 | }, 109 | "execution_count": 32, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "#Random sample from Binomial Distribution: binomial\n", 116 | "X4 = np.random.binomial(10,0.10,100)\n", 117 | "X4" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 33, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "10.176001143769438" 129 | ] 130 | }, 131 | "execution_count": 33, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "#Random sample from Gaussian Distribution: normal\n", 138 | "X5 = np.random.normal(10)\n", 139 | "X5" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 36, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "0.35084536799172106" 151 | ] 152 | }, 153 | "execution_count": 36, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "#Random sample from Beta Distribution: beta\n", 160 | "X6 = np.random.beta(10,10)\n", 161 | "X6" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 37, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "7.616863289989834" 173 | ] 174 | }, 175 | "execution_count": 37, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "#Random sample from Chi-square Distribution: chisquare\n", 
182 | "X7 = np.random.chisquare(10)\n", 183 | "X7" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 39, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "6.390743513572102" 195 | ] 196 | }, 197 | "execution_count": 39, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "#Random sample from Gamma Distribution: gamma\n", 204 | "X8 = np.random.gamma(10)\n", 205 | "X8" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 40, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "9.910002476795643" 217 | ] 218 | }, 219 | "execution_count": 40, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "#Random sample from uniform [0,1) Distribution: uniform\n", 226 | "X9 = np.random.uniform(10)\n", 227 | "X9" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 45, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "array([[ 0.3743189 , -1.84955116, -0.27810508, 0.31784693],\n", 239 | " [-0.35876103, -0.04193944, 1.25794105, 0.62834995],\n", 240 | " [-1.00292632, 0.78797796, -1.05711954, 0.54215727],\n", 241 | " [ 0.91940949, 0.05073418, 1.20135336, -0.08601853]])" 242 | ] 243 | }, 244 | "execution_count": 45, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "#In most functions stated above, we can use the size argument to tell the size of sample we want to generate\n", 251 | "X10 = np.random.normal(size = (4,4))\n", 252 | "X10" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 52, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "array([[1.96225376, 0.77559961],\n", 264 | " [0.09694872, 2.57011088],\n", 265 | " [2.84800908, 1.04277678],\n", 266 | " [2.43705663, 
1.76068542],\n", 267 | " [6.98429186, 0.72446192]])" 268 | ] 269 | }, 270 | "execution_count": 52, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "X11 = np.random.gamma(shape = 1 , scale = 2, size = (5,2))\n", 277 | "X11" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 60, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "array([[2.84800908, 1.04277678],\n", 289 | " [0.09694872, 2.57011088],\n", 290 | " [1.96225376, 0.77559961],\n", 291 | " [2.43705663, 1.76068542],\n", 292 | " [6.98429186, 0.72446192]])" 293 | ] 294 | }, 295 | "execution_count": 60, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "#Random Permutation function: permutation\n", 302 | "X12 = np.random.permutation(X11)\n", 303 | "X12" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 61, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/plain": [ 314 | "array([[2.43705663, 1.76068542],\n", 315 | " [1.96225376, 0.77559961],\n", 316 | " [2.84800908, 1.04277678],\n", 317 | " [6.98429186, 0.72446192],\n", 318 | " [0.09694872, 2.57011088]])" 319 | ] 320 | }, 321 | "execution_count": 61, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "#Random Shuffling function: shuffle\n", 328 | "np.random.shuffle(X11)\n", 329 | "X11" 330 | ] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 3", 336 | "language": "python", 337 | "name": "python3" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 3 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython3", 349 | "version": "3.7.4" 350 | } 351 | }, 352 | "nbformat": 4, 353 | 
"nbformat_minor": 2 354 | } 355 | -------------------------------------------------------------------------------- /Basic Array Statistical Methods.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "array([[-0.97872554, -0.46356244, 1.3922046 , -0.96398794],\n", 12 | " [-0.50689651, 0.26098338, -0.65482834, 1.0093453 ],\n", 13 | " [-0.44440787, -2.3193851 , 0.7040289 , 0.08837219],\n", 14 | " [-1.22212494, 0.65296398, -0.03782775, 0.75681772],\n", 15 | " [-1.31233681, 1.2051613 , 0.01207815, 0.56413249]])" 16 | ] 17 | }, 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "output_type": "execute_result" 21 | } 22 | ], 23 | "source": [ 24 | "#We can use statstical methods in numpy on arrays for data processing purposes\n", 25 | "\n", 26 | "#Some of these are mentioned as under:\n", 27 | "\n", 28 | "import numpy as np\n", 29 | "from numpy.random import randn\n", 30 | "\n", 31 | "data = randn(5,4)\n", 32 | "data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/plain": [ 43 | "-0.11289976128875331" 44 | ] 45 | }, 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "#Mean value function: mean\n", 53 | "np.mean(data)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "array([-0.25351783, 0.02715096, -0.49284797, 0.03745725, 0.11725878])" 65 | ] 66 | }, 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "#Additional axis argument can be given to mean, which finds means along that particular axis\n", 74 | "np.mean(data, axis = 1)" 
75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "-2.2579952257750664" 86 | ] 87 | }, 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "#Summation value function: sum\n", 95 | "np.sum(data)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "array([-4.46449165, -0.66383889, 1.41565555, 1.45467976])" 107 | ] 108 | }, 109 | "execution_count": 6, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "#Additional axis argument can be given to sum, which finds sum along that particular axis\n", 116 | "np.sum(data, 0)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "0.9387680172117109" 128 | ] 129 | }, 130 | "execution_count": 7, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "#Standard Deviation function: std\n", 137 | "np.std(data)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 13, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "array([0.97252443, 0.66514769, 1.1301455 , 0.78875585, 0.92709688])" 149 | ] 150 | }, 151 | "execution_count": 13, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "#Additional argument allows to compute standard deviation along a certain dimension\n", 158 | "np.std(data, 1)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 14, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "0.8812853901396072" 170 | ] 171 | }, 172 | 
"execution_count": 14, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "#Variance function: var\n", 179 | "np.var(data)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 15, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "array([0.12836524, 1.4906378 , 0.49269024, 0.48473204])" 191 | ] 192 | }, 193 | "execution_count": 15, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "#Additional argument allows to compute variance along a particular axis\n", 200 | "np.var(data, 0)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 16, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "1.3922046018160443" 212 | ] 213 | }, 214 | "execution_count": 16, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "#Maximum value function: max\n", 221 | "np.max(data)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 17, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "-2.3193851000495096" 233 | ] 234 | }, 235 | "execution_count": 17, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "#Minimum value function: min\n", 242 | "np.min(data)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 18, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "2" 254 | ] 255 | }, 256 | "execution_count": 18, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "#Maximum Index Value Function: argmax\n", 263 | "np.argmax(data)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 19, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | 
"text/plain": [ 274 | "9" 275 | ] 276 | }, 277 | "execution_count": 19, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "#Minimum Index Value Function: argmin\n", 284 | "np.argmin(data)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 39, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "array([[-0.08561185, -0.09479014, 1.26101276],\n", 296 | " [ 0.09860458, -0.87188753, -0.7315243 ],\n", 297 | " [ 0.24212062, 1.11690374, 1.12849184],\n", 298 | " [-0.06623721, 1.71512039, -0.6974301 ]])" 299 | ] 300 | }, 301 | "execution_count": 39, 302 | "metadata": {}, 303 | "output_type": "execute_result" 304 | } 305 | ], 306 | "source": [ 307 | "#Cumulative Sum function: cumsum (Starting from 0)\n", 308 | "data2 = randn(4,3)\n", 309 | "data2" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 40, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "array([-0.08561185, -0.18040199, 1.08061077, 1.17921535, 0.30732782,\n", 321 | " -0.42419649, -0.18207587, 0.93482787, 2.06331971, 1.9970825 ,\n", 322 | " 3.71220289, 3.01477279])" 323 | ] 324 | }, 325 | "execution_count": 40, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 331 | "np.cumsum(data2)" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 41, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "array([-8.56118494e-02, 8.11515930e-03, 1.02333194e-02, 1.00905218e-03,\n", 343 | " -8.79780019e-04, 6.43580464e-04, 1.55824101e-04, 1.74040521e-04,\n", 344 | " 1.96403307e-04, -1.30092069e-05, -2.23123560e-05, 1.55613088e-05])" 345 | ] 346 | }, 347 | "execution_count": 41, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "#Cumulative product: cumprod (Starting from 1)\n", 354 | 
"np.cumprod(data2)" 355 | ] 356 | } 357 | ], 358 | "metadata": { 359 | "kernelspec": { 360 | "display_name": "Python 3", 361 | "language": "python", 362 | "name": "python3" 363 | }, 364 | "language_info": { 365 | "codemirror_mode": { 366 | "name": "ipython", 367 | "version": 3 368 | }, 369 | "file_extension": ".py", 370 | "mimetype": "text/x-python", 371 | "name": "python", 372 | "nbconvert_exporter": "python", 373 | "pygments_lexer": "ipython3", 374 | "version": "3.7.4" 375 | } 376 | }, 377 | "nbformat": 4, 378 | "nbformat_minor": 2 379 | } 380 | -------------------------------------------------------------------------------- /Dropping Entries from Axis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "a 0.0\n", 12 | "b 1.0\n", 13 | "c 2.0\n", 14 | "d 3.0\n", 15 | "e 4.0\n", 16 | "dtype: float64" 17 | ] 18 | }, 19 | "execution_count": 5, 20 | "metadata": {}, 21 | "output_type": "execute_result" 22 | } 23 | ], 24 | "source": [ 25 | "#Now we view another functionalities i.e. 
dropping an entry into an array which does not have those entries through an axis\n", 26 | "\n", 27 | "import pandas as pd\n", 28 | "import numpy as np\n", 29 | "from pandas import Series, DataFrame\n", 30 | "\n", 31 | "series_obj = Series(np.arange(5.), index = ['a','b','c','d','e'])\n", 32 | "series_obj" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 7, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/plain": [ 43 | "a 0.0\n", 44 | "b 1.0\n", 45 | "d 3.0\n", 46 | "e 4.0\n", 47 | "dtype: float64" 48 | ] 49 | }, 50 | "execution_count": 7, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "#Values can be dropped using the drop method\n", 57 | "series_obj.drop('c')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 8, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "b 1.0\n", 69 | "e 4.0\n", 70 | "dtype: float64" 71 | ] 72 | }, 73 | "execution_count": 8, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "#The values to be dropped can also be a list of values\n", 80 | "series_obj.drop(['a','c','d'])" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 13, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 105 | "\n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | "
JulyDecember
1998NaN19.0
200112.0NaN
20033.021.0
\n", 131 | "
" 132 | ], 133 | "text/plain": [ 134 | " July December\n", 135 | "1998 NaN 19.0\n", 136 | "2001 12.0 NaN\n", 137 | "2003 3.0 21.0" 138 | ] 139 | }, 140 | "execution_count": 13, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "#In case of DataFrame, values canbe dropped from any axis given the axis number\n", 147 | "frame_data = {'July': Series([12,3], index = [2001,2003]), 'December': Series([19,21], index = [1998,2003])}\n", 148 | "frame_obj = DataFrame(frame_data)\n", 149 | "frame_obj" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 16, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/html": [ 160 | "
\n", 161 | "\n", 174 | "\n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | "
JulyDecember
1998NaN19.0
20033.021.0
\n", 195 | "
" 196 | ], 197 | "text/plain": [ 198 | " July December\n", 199 | "1998 NaN 19.0\n", 200 | "2003 3.0 21.0" 201 | ] 202 | }, 203 | "execution_count": 16, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "frame_obj.drop([2001])" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 17, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/html": [ 220 | "
\n", 221 | "\n", 234 | "\n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | "
JulyDecember
200112.0NaN
\n", 250 | "
" 251 | ], 252 | "text/plain": [ 253 | " July December\n", 254 | "2001 12.0 NaN" 255 | ] 256 | }, 257 | "execution_count": 17, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "frame_obj.drop([1998,2003])" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 18, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/html": [ 274 | "
\n", 275 | "\n", 288 | "\n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | "
December
199819.0
2001NaN
200321.0
\n", 310 | "
" 311 | ], 312 | "text/plain": [ 313 | " December\n", 314 | "1998 19.0\n", 315 | "2001 NaN\n", 316 | "2003 21.0" 317 | ] 318 | }, 319 | "execution_count": 18, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | } 323 | ], 324 | "source": [ 325 | "frame_obj.drop(['July'],axis=1)" 326 | ] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "Python 3", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.7.4" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 2 350 | } 351 | -------------------------------------------------------------------------------- /Index Object Methods.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | "
values
00.472986
1-0.681426
20.242439
3-1.700736
\n", 51 | "
" 52 | ], 53 | "text/plain": [ 54 | " values\n", 55 | "0 0.472986\n", 56 | "1 -0.681426\n", 57 | "2 0.242439\n", 58 | "3 -1.700736" 59 | ] 60 | }, 61 | "execution_count": 11, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "#There are a certain methods which can be applied on the index objects listed below:\n", 68 | "\n", 69 | "import numpy as np\n", 70 | "import pandas as pd\n", 71 | "from pandas import Series, DataFrame\n", 72 | "\n", 73 | "np.random.seed(12)\n", 74 | "frame = DataFrame(np.random.randn(4), columns = ['values'], index = pd.Index(np.arange(4)))\n", 75 | "frame" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 13, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "Int64Index([0, 1, 2, 3], dtype='int64')" 87 | ] 88 | }, 89 | "execution_count": 13, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "index_obj1 = frame.index\n", 96 | "index_obj1" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 14, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "2 2\n", 108 | "3 3\n", 109 | "4 4\n", 110 | "5 5\n", 111 | "dtype: int64" 112 | ] 113 | }, 114 | "execution_count": 14, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "series = Series([2,3,4,5], index = np.array([2,3,4,5]))\n", 121 | "series" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 16, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "Int64Index([2, 3, 4, 5], dtype='int64')" 133 | ] 134 | }, 135 | "execution_count": 16, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "index_obj2 = series.index\n", 142 | "index_obj2" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 21, 148 | 
"metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "Int64Index([0, 1, 2, 3, 2, 3, 4, 5], dtype='int64')" 154 | ] 155 | }, 156 | "execution_count": 21, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "#Concatenate two indexes\n", 163 | "pd.Index.append(index_obj1, index_obj2)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 27, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "Int64Index([0, 1], dtype='int64')" 175 | ] 176 | }, 177 | "execution_count": 27, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "#Set Difference of two indexes\n", 184 | "pd.Index.difference(index_obj1, index_obj2)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 28, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "Int64Index([2, 3], dtype='int64')" 196 | ] 197 | }, 198 | "execution_count": 28, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "#Intersection of two indexes\n", 205 | "pd.Index.intersection(index_obj1, index_obj2)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 29, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')" 217 | ] 218 | }, 219 | "execution_count": 29, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "#Union of two indexes\n", 226 | "pd.Index.union(index_obj1, index_obj2)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 30, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "array([False, False, True, True])" 238 | ] 239 | }, 240 | "execution_count": 30, 241 | "metadata": {}, 242 | "output_type": 
"execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "#Comparision if both have same indexes\n", 247 | "pd.Index.isin(index_obj1, index_obj2)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 36, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "data": { 257 | "text/plain": [ 258 | "Int64Index([0, 2, 3], dtype='int64')" 259 | ] 260 | }, 261 | "execution_count": 36, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "#Delete a certain index position\n", 268 | "pd.Index.delete(index_obj1, 1)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 37, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "Int64Index([0, 3], dtype='int64')" 280 | ] 281 | }, 282 | "execution_count": 37, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "#Delete a certain index by its index value\n", 289 | "pd.Index.drop(index_obj1, [1,2])" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 39, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "Index([0, 1, 'New', 2, 3], dtype='object')" 301 | ] 302 | }, 303 | "execution_count": 39, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "#Insert a new index\n", 310 | "pd.Index.insert(index_obj1, 2, 'New')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 46, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "True" 322 | ] 323 | }, 324 | "execution_count": 46, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "#Checks for increasing monotonic sequence\n", 331 | "index_obj1.is_monotonic_increasing" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 47, 337 | "metadata": {}, 338 | 
"outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "True" 343 | ] 344 | }, 345 | "execution_count": 47, 346 | "metadata": {}, 347 | "output_type": "execute_result" 348 | } 349 | ], 350 | "source": [ 351 | "#Checks if index has no duplicate values\n", 352 | "index_obj1.is_unique" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 49, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": [ 363 | "Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')" 364 | ] 365 | }, 366 | "execution_count": 49, 367 | "metadata": {}, 368 | "output_type": "execute_result" 369 | } 370 | ], 371 | "source": [ 372 | "#Computes array of unique values in index only\n", 373 | "pd.Index.unique(pd.Index.append(index_obj1,index_obj2))" 374 | ] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.7.4" 394 | } 395 | }, 396 | "nbformat": 4, 397 | "nbformat_minor": 2 398 | } 399 | -------------------------------------------------------------------------------- /Unique Values, Value Counts and Membership.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#We now look into more operations on Sequence of values\n", 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "from pandas import Series,DataFrame" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "0 a\n", 24 
| "1 b\n", 25 | "2 d\n", 26 | "3 e\n", 27 | "4 c\n", 28 | "5 d\n", 29 | "6 a\n", 30 | "7 f\n", 31 | "8 e\n", 32 | "9 g\n", 33 | "10 a\n", 34 | "11 d\n", 35 | "12 e\n", 36 | "dtype: object" 37 | ] 38 | }, 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "series_obj = Series(list('abdecdafegade'))\n", 46 | "series_obj" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 24, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/html": [ 57 | "
\n", 58 | "\n", 71 | "\n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | "
Data1Data2Data3
0015
1136
2257
3378
\n", 107 | "
" 108 | ], 109 | "text/plain": [ 110 | " Data1 Data2 Data3\n", 111 | "0 0 1 5\n", 112 | "1 1 3 6\n", 113 | "2 2 5 7\n", 114 | "3 3 7 8" 115 | ] 116 | }, 117 | "execution_count": 24, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "frame_obj = DataFrame({'Data1':np.arange(0,4), 'Data2':np.array([1,3,5,7]), 'Data3':np.arange(5,9)})\n", 124 | "frame_obj" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "array(['a', 'b', 'd', 'e', 'c', 'f', 'g'], dtype=object)" 136 | ] 137 | }, 138 | "execution_count": 5, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "#Unique value method: unique (For Series)\n", 145 | "series_obj.unique() #Gives a list of unique values in Series object" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 20, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "d 3\n", 157 | "e 3\n", 158 | "a 3\n", 159 | "f 1\n", 160 | "b 1\n", 161 | "g 1\n", 162 | "c 1\n", 163 | "dtype: int64" 164 | ] 165 | }, 166 | "execution_count": 20, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "#Value count method: value_counts (For Series)\n", 173 | "series_obj.value_counts() #Gives a Series containing the value frequency of each item" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 21, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "a 3\n", 185 | "e 3\n", 186 | "d 3\n", 187 | "c 1\n", 188 | "g 1\n", 189 | "b 1\n", 190 | "f 1\n", 191 | "dtype: int64" 192 | ] 193 | }, 194 | "execution_count": 21, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "#This method is also available for pandas library i.e.\n", 201 
| "pd.value_counts(series_obj, sort=False) " 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 22, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "0 True\n", 213 | "1 False\n", 214 | "2 True\n", 215 | "3 False\n", 216 | "4 False\n", 217 | "5 True\n", 218 | "6 True\n", 219 | "7 False\n", 220 | "8 False\n", 221 | "9 True\n", 222 | "10 True\n", 223 | "11 True\n", 224 | "12 False\n", 225 | "dtype: bool" 226 | ] 227 | }, 228 | "execution_count": 22, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "#Comparison method: isin (Returns a Boolean value)\n", 235 | "series_obj.isin(list('agd')) #Returns True for elements in provided list, returns False for other elements" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 25, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/html": [ 246 | "
\n", 247 | "\n", 260 | "\n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | "
Data1Data2Data3
01.00.00.0
11.01.00.0
21.00.00.0
31.01.00.0
50.01.01.0
60.00.01.0
70.01.01.0
80.00.01.0
\n", 320 | "
" 321 | ], 322 | "text/plain": [ 323 | " Data1 Data2 Data3\n", 324 | "0 1.0 0.0 0.0\n", 325 | "1 1.0 1.0 0.0\n", 326 | "2 1.0 0.0 0.0\n", 327 | "3 1.0 1.0 0.0\n", 328 | "5 0.0 1.0 1.0\n", 329 | "6 0.0 0.0 1.0\n", 330 | "7 0.0 1.0 1.0\n", 331 | "8 0.0 0.0 1.0" 332 | ] 333 | }, 334 | "execution_count": 25, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "#Applying value_counts method on DataFrame\n", 341 | "#We can use the apply method to apply the value_counts method on the DataFrame object to get frequency values of items in the columns, this may help in creating various charts like Histogram, etc.\n", 342 | "\n", 343 | "hist_res = frame_obj.apply(pd.value_counts).fillna(0) #Calculates frequency value and puts 0 for all NaN values\n", 344 | "hist_res" 345 | ] 346 | } 347 | ], 348 | "metadata": { 349 | "kernelspec": { 350 | "display_name": "Python 3", 351 | "language": "python", 352 | "name": "python3" 353 | }, 354 | "language_info": { 355 | "codemirror_mode": { 356 | "name": "ipython", 357 | "version": 3 358 | }, 359 | "file_extension": ".py", 360 | "mimetype": "text/x-python", 361 | "name": "python", 362 | "nbconvert_exporter": "python", 363 | "pygments_lexer": "ipython3", 364 | "version": "3.7.4" 365 | } 366 | }, 367 | "nbformat": 4, 368 | "nbformat_minor": 2 369 | } 370 | -------------------------------------------------------------------------------- /Handling Data from Databases.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#We can also handle data from daabases using pandas\n", 10 | "#For this case, we will be using sqllite as our database\n", 11 | "\n", 12 | "import sqlite3 as sqlite\n", 13 | "import pandas as pd\n", 14 | "import numpy as np\n", 15 | "from pandas import DataFrame" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 
20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "#Creating a query \n", 25 | "\n", 26 | "query = \"\"\"\n", 27 | " CREATE TABLE test\n", 28 | " (a VARCHAR(20), b VARCHAR(20),\n", 29 | " c REAL, d INTEGER\n", 30 | " );\n", 31 | "\"\"\"" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "#Connecting to the database with memory i.e. the database will reside in RAM instead of on disk\n", 41 | "con = sqlite.connect(':memory:')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "" 53 | ] 54 | }, 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "output_type": "execute_result" 58 | } 59 | ], 60 | "source": [ 61 | "#Executing the query on the database\n", 62 | "#First it creates a cursor object which would be used throughout the database\n", 63 | "#After that the cursor executes the SQL statement passed as query to the database\n", 64 | "con.execute(query)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "#Commiting to the database connection so that the changes can be seen by other db connections\n", 74 | "con.commit()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 6, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "#Creating data to insert into the database\n", 84 | "data = [('Andrew','Garfield',12.5,21),\n", 85 | " ('John','Phillips',13.7,32),\n", 86 | " ('Dough','Don',41.6,78)]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "#Creating insert statement to be executed by the database\n", 96 | "stmnt = \"INSERT INTO test VALUES(?,?,?,?)\"" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 
8, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "" 108 | ] 109 | }, 110 | "execution_count": 8, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "#Executing the statement for all entries in data\n", 117 | "con.executemany(stmnt, data)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 9, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "con.commit()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 10, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "#Executing the SELECT query\n", 136 | "#In case of python SQLdrivers, the data is usually in a list of tuples\n", 137 | "\n", 138 | "#The cursor will execute the select all query on the test table\n", 139 | "cursor = con.execute('select * from test') " 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 11, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "rows = cursor.fetchall() #Fetches all datas in the rows of a table and returns as a list of tuples" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 12, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "[('Andrew', 'Garfield', 12.5, 21),\n", 160 | " ('John', 'Phillips', 13.7, 32),\n", 161 | " ('Dough', 'Don', 41.6, 78)]" 162 | ] 163 | }, 164 | "execution_count": 12, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "rows #Displays the data in the table" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 13, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "(('a', None, None, None, None, None, None),\n", 182 | " ('b', None, None, None, None, None, None),\n", 183 | " ('c', None, None, None, None, None, None),\n", 184 | " ('d', None, None, None, 
None, None, None))" 185 | ] 186 | }, 187 | "execution_count": 13, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "cursor.description #Describes the column values for a result" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 14, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "#Storing rows as rows in dataframe and the first element of each tuple in description list as indexes\n", 203 | "db_df = DataFrame(rows, columns = list(zip(*cursor.description))[0])" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 15, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/html": [ 214 | "
\n", 215 | "\n", 228 | "\n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | "
abcd
0AndrewGarfield12.521
1JohnPhillips13.732
2DoughDon41.678
\n", 262 | "
" 263 | ], 264 | "text/plain": [ 265 | " a b c d\n", 266 | "0 Andrew Garfield 12.5 21\n", 267 | "1 John Phillips 13.7 32\n", 268 | "2 Dough Don 41.6 78" 269 | ] 270 | }, 271 | "execution_count": 15, 272 | "metadata": {}, 273 | "output_type": "execute_result" 274 | } 275 | ], 276 | "source": [ 277 | "db_df" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 16, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/html": [ 288 | "
\n", 289 | "\n", 302 | "\n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | "
abcd
0AndrewGarfield12.521
1JohnPhillips13.732
2DoughDon41.678
\n", 336 | "
" 337 | ], 338 | "text/plain": [ 339 | " a b c d\n", 340 | "0 Andrew Garfield 12.5 21\n", 341 | "1 John Phillips 13.7 32\n", 342 | "2 Dough Don 41.6 78" 343 | ] 344 | }, 345 | "execution_count": 16, 346 | "metadata": {}, 347 | "output_type": "execute_result" 348 | } 349 | ], 350 | "source": [ 351 | "#Another method of creating dataframe using the database data is by simply using read_sql method from pandas sql module\n", 352 | "\n", 353 | "import pandas.io.sql as sql\n", 354 | "\n", 355 | "sql.read_sql('select * from test', con) #Reads SQL query into a dataframe" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "Python 3", 362 | "language": "python", 363 | "name": "python3" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.8.2" 376 | } 377 | }, 378 | "nbformat": 4, 379 | "nbformat_minor": 4 380 | } 381 | -------------------------------------------------------------------------------- /Removing Duplicates.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#We now look at a method to perform data cleaning i.e. 
removing duplicate values from a dataframe\n", 10 | "\n", 11 | "import pandas as pd\n", 12 | "from pandas import DataFrame\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#Consider a dataframe with duplicate values\n", 23 | "data = DataFrame({'A': ['one'] * 4 + ['two']*3, 'B':[1,1,2,3,3,4,4]})" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 5, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/html": [ 34 | "
\n", 35 | "\n", 48 | "\n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | "
AB
0one1
1one1
2one2
3one3
4two3
5two4
6two4
\n", 94 | "
" 95 | ], 96 | "text/plain": [ 97 | " A B\n", 98 | "0 one 1\n", 99 | "1 one 1\n", 100 | "2 one 2\n", 101 | "3 one 3\n", 102 | "4 two 3\n", 103 | "5 two 4\n", 104 | "6 two 4" 105 | ] 106 | }, 107 | "execution_count": 5, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "data" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "0 False\n", 125 | "1 True\n", 126 | "2 False\n", 127 | "3 False\n", 128 | "4 False\n", 129 | "5 False\n", 130 | "6 True\n", 131 | "dtype: bool" 132 | ] 133 | }, 134 | "execution_count": 6, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "#DataFrame has a method duplicated which returns a boolean series of whether the item is a duplicate or not\n", 141 | "data.duplicated()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/html": [ 152 | "
\n", 153 | "\n", 166 | "\n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
AB
0one1
2one2
3one3
4two3
5two4
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " A B\n", 206 | "0 one 1\n", 207 | "2 one 2\n", 208 | "3 one 3\n", 209 | "4 two 3\n", 210 | "5 two 4" 211 | ] 212 | }, 213 | "execution_count": 7, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "#Using drop_duplicates we can remove these duplicates from the Dataframe\n", 220 | "data.drop_duplicates()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 8, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/html": [ 231 | "
\n", 232 | "\n", 245 | "\n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | "
AB
0one1
4two3
\n", 266 | "
" 267 | ], 268 | "text/plain": [ 269 | " A B\n", 270 | "0 one 1\n", 271 | "4 two 3" 272 | ] 273 | }, 274 | "execution_count": 8, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "#Suppose we want to drop duplicates based on a particular column, then we can pass that column as a list to the method\n", 281 | "data.drop_duplicates(['A'])" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 12, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/html": [ 292 | "
\n", 293 | "\n", 306 | "\n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | "
AB
1one1
2one2
3one3
4two3
6two4
\n", 342 | "
" 343 | ], 344 | "text/plain": [ 345 | " A B\n", 346 | "1 one 1\n", 347 | "2 one 2\n", 348 | "3 one 3\n", 349 | "4 two 3\n", 350 | "6 two 4" 351 | ] 352 | }, 353 | "execution_count": 12, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "#In case of dropping, if we want the last combination of the duplicate values rathar than the first combination which is by default then we can give keep argument last value\n", 360 | "\n", 361 | "data.drop_duplicates(keep='last')" 362 | ] 363 | } 364 | ], 365 | "metadata": { 366 | "kernelspec": { 367 | "display_name": "Python 3", 368 | "language": "python", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.8.2" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 4 386 | } 387 | -------------------------------------------------------------------------------- /Apply Methods for DataFrames.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | "
bde
First1.8503280.000154-2.013637
Second0.318101-1.497292-0.198700
Third-0.5078360.8414052.139488
Fourth-0.6285870.9412621.083527
\n", 61 | "
" 62 | ], 63 | "text/plain": [ 64 | " b d e\n", 65 | "First 1.850328 0.000154 -2.013637\n", 66 | "Second 0.318101 -1.497292 -0.198700\n", 67 | "Third -0.507836 0.841405 2.139488\n", 68 | "Fourth -0.628587 0.941262 1.083527" 69 | ] 70 | }, 71 | "execution_count": 2, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "#Now we will deal with the apply methods for DataFrames\n", 78 | "\n", 79 | "import pandas as pd\n", 80 | "import numpy as np\n", 81 | "from pandas import Series, DataFrame\n", 82 | "\n", 83 | "frame_obj = DataFrame(np.random.randn(4,3), columns=list('bde'), index=['First','Second','Third','Fourth'])\n", 84 | "frame_obj" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "b 2.478915\n", 96 | "d 2.438554\n", 97 | "e 4.153125\n", 98 | "dtype: float64" 99 | ] 100 | }, 101 | "execution_count": 5, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "#We can perform a particular function on a DataFrame's data using the Apply method \n", 108 | "\n", 109 | "f = lambda x: x.max() - x.min() #A simple function to calculate max-min value for a set of values\n", 110 | "\n", 111 | "#Now applying the function on the DataFrame\n", 112 | "frame_obj.apply(f) #Applies function to columns" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 6, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "First 3.863965\n", 124 | "Second 1.815393\n", 125 | "Third 2.647324\n", 126 | "Fourth 1.712114\n", 127 | "dtype: float64" 128 | ] 129 | }, 130 | "execution_count": 6, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "#Now by defualt the axis value for this apply method is 0, we can alter it to apply the function to other axes\n", 137 | "frame_obj.apply(f, axis = 1) 
#Applies function to rows" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/html": [ 148 | "
\n", 149 | "\n", 162 | "\n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
bde
min-0.628587-1.497292-2.013637
max1.8503280.9412622.139488
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | " b d e\n", 190 | "min -0.628587 -1.497292 -2.013637\n", 191 | "max 1.850328 0.941262 2.139488" 192 | ] 193 | }, 194 | "execution_count": 8, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "#The function that apply uses doesnt necessarily have to be a scalar value, we can return structures through it as well\n", 201 | "\n", 202 | "def f(x):\n", 203 | " return Series([min(x),max(x)], index=['min','max']) #returns a series containing max and min elements of x\n", 204 | "\n", 205 | "#Applying the function to DataFrame object\n", 206 | "frame_obj.apply(f) #Computes the max and min values and returns it as a Series as per the function" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 9, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/html": [ 217 | "
\n", 218 | "\n", 231 | "\n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | "
minmax
First-2.0136371.850328
Second-1.4972920.318101
Third-0.5078362.139488
Fourth-0.6285871.083527
\n", 262 | "
" 263 | ], 264 | "text/plain": [ 265 | " min max\n", 266 | "First -2.013637 1.850328\n", 267 | "Second -1.497292 0.318101\n", 268 | "Third -0.507836 2.139488\n", 269 | "Fourth -0.628587 1.083527" 270 | ] 271 | }, 272 | "execution_count": 9, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "#In case of row wise\n", 279 | "frame_obj.apply(f, axis = 1)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 10, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/html": [ 290 | "
\n", 291 | "\n", 304 | "\n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | "
bde
First1.8503280.000154-2.013637
Second0.318101-1.497292-0.198700
Third-0.5078360.8414052.139488
Fourth-0.6285870.9412621.083527
\n", 340 | "
" 341 | ], 342 | "text/plain": [ 343 | " b d e\n", 344 | "First 1.850328 0.000154 -2.013637\n", 345 | "Second 0.318101 -1.497292 -0.198700\n", 346 | "Third -0.507836 0.841405 2.139488\n", 347 | "Fourth -0.628587 0.941262 1.083527" 348 | ] 349 | }, 350 | "execution_count": 10, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "#In case of applying an element wise function, we have\n", 357 | "def format(x): #Returns the formatted string of the items in x\n", 358 | " return '%2f'%x\n", 359 | " \n", 360 | "frame_obj.applymap(format)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 13, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "First -2.013637\n", 372 | "Second -0.198700\n", 373 | "Third 2.139488\n", 374 | "Fourth 1.083527\n", 375 | "Name: e, dtype: object" 376 | ] 377 | }, 378 | "execution_count": 13, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "#In case we want to apply the function to only the 'e' column of the DataFrame\n", 385 | "frame_obj['e'].map(format)" 386 | ] 387 | } 388 | ], 389 | "metadata": { 390 | "kernelspec": { 391 | "display_name": "Python 3", 392 | "language": "python", 393 | "name": "python3" 394 | }, 395 | "language_info": { 396 | "codemirror_mode": { 397 | "name": "ipython", 398 | "version": 3 399 | }, 400 | "file_extension": ".py", 401 | "mimetype": "text/x-python", 402 | "name": "python", 403 | "nbconvert_exporter": "python", 404 | "pygments_lexer": "ipython3", 405 | "version": "3.7.4" 406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 2 410 | } 411 | -------------------------------------------------------------------------------- /Operations of Linear Algebra.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | 
"outputs": [], 8 | "source": [ 9 | "#Numpy Arrays can evaluate operations of Linear Algebra\n", 10 | "import numpy as np\n", 11 | "from numpy.random import randn\n", 12 | "\n", 13 | "A = np.array([[1,2,3],[4,5,6]]) #2x2 matrix\n", 14 | "B = np.array([[1,2],[3,4],[5,6]]) #3x2 matrix" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "array([[1, 2, 3],\n", 26 | " [4, 5, 6]])" 27 | ] 28 | }, 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "A" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "array([[1, 2],\n", 47 | " [3, 4],\n", 48 | " [5, 6]])" 49 | ] 50 | }, 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "B" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "array([[22, 28],\n", 69 | " [49, 64]])" 70 | ] 71 | }, 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "#Dot product function: dot\n", 79 | "np.dot(A,B)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 7, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "array([ 6., 15.])" 91 | ] 92 | }, 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "#Using suitable matrix, we can get 1D result from dot product of a 2D and 1D array\n", 100 | "np.dot(A, np.ones(3))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 11, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | 
"array([[-0.92330513, -0.27545177, 1.01606084, 0.47656796, -0.25881727],\n", 112 | " [ 0.92064529, -1.16781398, -3.13578792, -0.40787525, 1.56836767],\n", 113 | " [ 0.16504961, 1.64581198, 0.78009535, -1.51762351, -1.34317362],\n", 114 | " [-0.75060235, -0.21383346, -0.27263813, -0.68501248, -0.85525025],\n", 115 | " [-1.69681292, 0.46144554, -2.47395717, 0.87598453, 1.66015274]])" 116 | ] 117 | }, 118 | "execution_count": 11, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "#Numpy gives the linalg module to perform linear algebra functionalities\n", 125 | "\n", 126 | "import numpy.linalg\n", 127 | "\n", 128 | "X = randn(5, 5)\n", 129 | "X" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 13, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "array([-0.92330513, -1.16781398, 0.78009535, -0.68501248, 1.66015274])" 141 | ] 142 | }, 143 | "execution_count": 13, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "#Diagonal Values Function: diag\n", 150 | "np.diag(X)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 14, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "-0.3358834923328249" 162 | ] 163 | }, 164 | "execution_count": 14, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "#Trace Value Function: trace\n", 171 | "np.trace(X)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 15, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "-7.284272432422328" 183 | ] 184 | }, 185 | "execution_count": 15, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "#Determinant Value Function: det\n", 192 | "np.linalg.det(X)" 193 | ] 194 | }, 195 | { 196 | "cell_type": 
"code", 197 | "execution_count": 16, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "(array([ 2.16223833+0.j , -1.30624168+1.75899067j,\n", 204 | " -1.30624168-1.75899067j, 0.89686451+0.j ,\n", 205 | " -0.78250298+0.j ]),\n", 206 | " array([[ 0.16685568+0.j , -0.23207051+0.18212845j,\n", 207 | " -0.23207051-0.18212845j, 0.14041108+0.j ,\n", 208 | " -0.35315952+0.j ],\n", 209 | " [-0.43898452+0.j , 0.67822071+0.j ,\n", 210 | " 0.67822071-0.j , -0.27583369+0.j ,\n", 211 | " -0.53361388+0.j ],\n", 212 | " [ 0.06005357+0.j , -0.14210975-0.4837585j ,\n", 213 | " -0.14210975+0.4837585j , -0.23787721+0.j ,\n", 214 | " 0.20106415+0.j ],\n", 215 | " [ 0.23797086+0.j , 0.11225518-0.16507914j,\n", 216 | " 0.11225518+0.16507914j, 0.44685659+0.j ,\n", 217 | " -0.6774018 +0.j ],\n", 218 | " [-0.84806557+0.j , -0.1785741 -0.3564136j ,\n", 219 | " -0.1785741 +0.3564136j , -0.80494355+0.j ,\n", 220 | " 0.30205063+0.j ]]))" 221 | ] 222 | }, 223 | "execution_count": 16, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "#Eigen Value Function: eig\n", 230 | "np.linalg.eig(X)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 17, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "array([[-1.14204732, -0.32262037, -0.37128838, -0.00564796, -0.17656773],\n", 242 | " [-0.71073733, -0.39354616, 0.11311952, -0.17575424, 0.2619634 ],\n", 243 | " [ 1.55309694, 0.52906657, 0.58561973, -0.74901444, -0.16974848],\n", 244 | " [-2.41421948, -1.29958748, -1.28384275, 0.70079412, 0.1736707 ],\n", 245 | " [ 2.61857842, 1.25378842, 1.1391838 , -1.44287746, 0.00447696]])" 246 | ] 247 | }, 248 | "execution_count": 17, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "#Inverse Value Function: inv\n", 255 | "np.linalg.inv(X)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | 
"execution_count": 18, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "array([[-1.14204732, -0.32262037, -0.37128838, -0.00564796, -0.17656773],\n", 267 | " [-0.71073733, -0.39354616, 0.11311952, -0.17575424, 0.2619634 ],\n", 268 | " [ 1.55309694, 0.52906657, 0.58561973, -0.74901444, -0.16974848],\n", 269 | " [-2.41421948, -1.29958748, -1.28384275, 0.70079412, 0.1736707 ],\n", 270 | " [ 2.61857842, 1.25378842, 1.1391838 , -1.44287746, 0.00447696]])" 271 | ] 272 | }, 273 | "execution_count": 18, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "#Pseudo Inverse Value Function (Moore-Penrose pseudo-inverse): pinv\n", 280 | "np.linalg.pinv(X)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 19, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "(array([[-0.40607308, -0.23817498, 0.41949282, -0.14740517, -0.76202023],\n", 292 | " [ 0.40490327, -0.47132063, -0.63571499, -0.27686549, -0.36485909],\n", 293 | " [ 0.07258944, 0.82710805, -0.20271468, -0.39953961, -0.33150853],\n", 294 | " [-0.33011775, -0.18866395, 0.04695207, -0.82275165, 0.419885 ],\n", 295 | " [-0.74626472, 0.03778527, -0.61367295, 0.25507851, -0.00130282]]),\n", 296 | " array([[ 2.27374129, -0.51529989, 0.3105706 , -0.89641647, -0.31394444],\n", 297 | " [ 0. , 2.03506339, 1.83914228, -1.01416869, -1.56442575],\n", 298 | " [ 0. , 0. , 3.76696051, 0.19712304, -1.89227242],\n", 299 | " [ 0. , 0. , 0. , 1.43606868, 1.26770303],\n", 300 | " [ 0. , 0. , 0. , 0. 
, -0.29100531]]))" 301 | ] 302 | }, 303 | "execution_count": 19, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "#QR Decomposition Function: qr\n", 310 | "np.linalg.qr(X)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 20, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "(array([[ 1.53279987e-01, 4.29782060e-01, 1.93259855e-01,\n", 322 | " -3.46098645e-01, -7.96654871e-01],\n", 323 | " [-6.92175955e-01, -5.91557891e-01, 7.26728861e-03,\n", 324 | " -1.75243654e-01, -3.74417624e-01],\n", 325 | " [ 3.83564000e-01, -2.89961580e-01, -7.81855475e-01,\n", 326 | " 1.83370734e-01, -3.51963251e-01],\n", 327 | " [ 4.98775688e-02, 4.76164759e-04, -3.26819100e-01,\n", 328 | " -8.89012542e-01, 3.16793274e-01],\n", 329 | " [-5.89736025e-01, 6.17469593e-01, -4.94458398e-01,\n", 330 | " 1.59803830e-01, 3.02712963e-02]]),\n", 331 | " array([5.05640143, 2.60015677, 2.23832488, 1.27342239, 0.19437907]),\n", 332 | " array([[ 0.04900083, 0.22043131, 0.80509004, -0.15376595, -0.52649313],\n", 333 | " [-0.78356076, 0.14616422, 0.20681983, 0.54870627, 0.14427594],\n", 334 | " [ 0.35004857, -0.67317684, 0.3913755 , 0.47644416, 0.21005981],\n", 335 | " [ 0.45909384, 0.67975922, 0.14759219, 0.29622587, 0.46650512],\n", 336 | " [ 0.22434416, 0.12168154, -0.36619305, 0.60044039, -0.6635038 ]]))" 337 | ] 338 | }, 339 | "execution_count": 20, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "#SVD Decomposition Function: svd\n", 346 | "np.linalg.svd(X)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 22, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "text/plain": [ 357 | "array([[-1.14204732, -0.32262037, -0.37128838, -0.00564796, -0.17656773],\n", 358 | " [-0.71073733, -0.39354616, 0.11311952, -0.17575424, 0.2619634 ],\n", 359 | " [ 1.55309694, 0.52906657, 
0.58561973, -0.74901444, -0.16974848],\n", 360 | "       [-2.41421948, -1.29958748, -1.28384275,  0.70079412,  0.1736707 ],\n", 361 | "       [ 2.61857842,  1.25378842,  1.1391838 , -1.44287746,  0.00447696]])" 362 | ] 363 | }, 364 | "execution_count": 22, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "#Linear System Solution Value Function (x for Ax = b): solve\n", 371 | "np.linalg.solve(X, np.eye(5))" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 25, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "data": { 381 | "text/plain": [ 382 | "array([-2.01817175, -0.9049548 ,  1.74902032, -4.12318489,  3.57315015])" 383 | ] 384 | }, 385 | "execution_count": 25, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "#Least Squares Solution Value Function (b for y = Xb): lstsq\n", 392 | "np.linalg.lstsq(X, np.ones(5), rcond=None)[0]" 393 | ] 394 | } 395 | ], 396 | "metadata": { 397 | "kernelspec": { 398 | "display_name": "Python 3", 399 | "language": "python", 400 | "name": "python3" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython3", 412 | "version": "3.7.4" 413 | } 414 | }, 415 | "nbformat": 4, 416 | "nbformat_minor": 2 417 | } 418 | -------------------------------------------------------------------------------- /Renaming Axis Indexes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#Like values, we can also modify the index labels using mapping\n", 10 | "import pandas as pd\n", 11 | "from pandas import DataFrame, Series\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | 
{ 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "#Consider a dataframe\n", 22 | "\n", 23 | "data = DataFrame(np.arange(12).reshape(3,4), index=list('abc'), columns=['one','two','three','four'])" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/html": [ 34 | "
\n", 35 | "\n", 48 | "\n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | "
onetwothreefour
a0123
b4567
c891011
\n", 82 | "
" 83 | ], 84 | "text/plain": [ 85 | "   one  two  three  four\n", 86 | "a    0    1      2     3\n", 87 | "b    4    5      6     7\n", 88 | "c    8    9     10    11" 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "data" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "Index(['A', 'B', 'C'], dtype='object')" 109 | ] 110 | }, 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "#Using the index map method we can assign new indexes to different values\n", 118 | "#Suppose we want to capitalize the index names\n", 119 | "\n", 120 | "data.index.map(str.upper)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "#Now assigning it to the index, we get\n", 130 | "data.index = data.index.map(str.upper)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 6, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/html": [ 141 | "<div>
\n", 142 | "\n", 155 | "\n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | "
onetwothreefour
A0123
B4567
C891011
\n", 189 | "
" 190 | ], 191 | "text/plain": [ 192 | " one two three four\n", 193 | "A 0 1 2 3\n", 194 | "B 4 5 6 7\n", 195 | "C 8 9 10 11" 196 | ] 197 | }, 198 | "execution_count": 6, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "data" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 7, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/html": [ 215 | "
\n", 216 | "\n", 229 | "\n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | "
ONETWOTHREEFOUR
A0123
B4567
C891011
\n", 263 | "
" 264 | ], 265 | "text/plain": [ 266 | " ONE TWO THREE FOUR\n", 267 | "A 0 1 2 3\n", 268 | "B 4 5 6 7\n", 269 | "C 8 9 10 11" 270 | ] 271 | }, 272 | "execution_count": 7, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "#We can use the rename method to apply changes to the dataframe without actually modifying the actual one\n", 279 | "\n", 280 | "data.rename(index=str.title, columns=str.upper) #Capitalizes the columns" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 9, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/html": [ 291 | "
\n", 292 | "\n", 305 | "\n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | "
onetwofivefour
A0123
D4567
C891011
\n", 339 | "
" 340 | ], 341 | "text/plain": [ 342 | " one two five four\n", 343 | "A 0 1 2 3\n", 344 | "D 4 5 6 7\n", 345 | "C 8 9 10 11" 346 | ] 347 | }, 348 | "execution_count": 9, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "#We can also use rename to modify the name of an existing label with some other one\n", 355 | "\n", 356 | "data.rename(index={'B':'D'}, columns={'three':'five'})" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 10, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "#We can modify our actual value using rename by passing inplace=True\n", 366 | "\n", 367 | "_ = data.rename(index={'B':'D'}, columns={'three':'five'}, inplace=True)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 11, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/html": [ 378 | "
\n", 379 | "\n", 392 | "\n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | "
onetwofivefour
A0123
D4567
C891011
\n", 426 | "
" 427 | ], 428 | "text/plain": [ 429 | " one two five four\n", 430 | "A 0 1 2 3\n", 431 | "D 4 5 6 7\n", 432 | "C 8 9 10 11" 433 | ] 434 | }, 435 | "execution_count": 11, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "data" 442 | ] 443 | } 444 | ], 445 | "metadata": { 446 | "kernelspec": { 447 | "display_name": "Python 3", 448 | "language": "python", 449 | "name": "python3" 450 | }, 451 | "language_info": { 452 | "codemirror_mode": { 453 | "name": "ipython", 454 | "version": 3 455 | }, 456 | "file_extension": ".py", 457 | "mimetype": "text/x-python", 458 | "name": "python", 459 | "nbconvert_exporter": "python", 460 | "pygments_lexer": "ipython3", 461 | "version": "3.8.2" 462 | } 463 | }, 464 | "nbformat": 4, 465 | "nbformat_minor": 4 466 | } 467 | --------------------------------------------------------------------------------