├── access_data.ipynb ├── chatgpt_ds.ipynb ├── churn_gpt3_ds.ipynb ├── customer_segmentation_chatgpt_ds.ipynb ├── demand_gpt3_ds.ipynb ├── helper_class_ml.ipynb ├── hyperparameter_tuning_tutorial.ipynb ├── imputer.ipynb ├── list_comp_generators.ipynb ├── p_values_ml.ipynb ├── portfolio_optimization.ipynb ├── predictive_modeling_attrition.ipynb ├── richest_people_eda.ipynb ├── spotify_churn_sythentic.csv ├── stable_diffusion └── hello.py ├── synthetic_best_buy_data.csv ├── tabgan_experiements.ipynb └── time_series_oop.ipynb /chatgpt_ds.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":"%pip install openai\n%pip install catboost","metadata":{"tags":[],"cell_id":"0adf8006093a451895d31f046cb00961","allow_embed":"code_output","source_hash":"8c175969","execution_start":1673363490242,"execution_millis":11359,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Collecting openai\n Downloading openai-0.26.0.tar.gz (54 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.7/54.7 KB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25h Installing build dependencies ... \u001b[?25ldone\n\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n\u001b[?25h Preparing metadata (pyproject.toml) ... 
\u001b[?25ldone\n\u001b[?25hRequirement already satisfied: requests>=2.20 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from openai) (2.28.1)\nRequirement already satisfied: aiohttp in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from openai) (3.8.3)\nRequirement already satisfied: tqdm in /shared-libs/python3.9/py/lib/python3.9/site-packages (from openai) (4.64.1)\nRequirement already satisfied: idna<4,>=2.5 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from requests>=2.20->openai) (3.4)\nRequirement already satisfied: certifi>=2017.4.17 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from requests>=2.20->openai) (2022.9.24)\nRequirement already satisfied: urllib3<1.27,>=1.21.1 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from requests>=2.20->openai) (1.26.12)\nRequirement already satisfied: charset-normalizer<3,>=2 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from requests>=2.20->openai) (2.1.1)\nRequirement already satisfied: frozenlist>=1.1.1 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from aiohttp->openai) (1.3.1)\nRequirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from aiohttp->openai) (4.0.2)\nRequirement already satisfied: aiosignal>=1.1.2 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from aiohttp->openai) (1.2.0)\nRequirement already satisfied: yarl<2.0,>=1.0 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from aiohttp->openai) (1.8.1)\nRequirement already satisfied: attrs>=17.3.0 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from aiohttp->openai) (22.1.0)\nRequirement already satisfied: multidict<7.0,>=4.5 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from aiohttp->openai) (6.0.2)\nBuilding wheels for collected packages: openai\n Building wheel for openai (pyproject.toml) ... 
\u001b[?25ldone\n\u001b[?25h Created wheel for openai: filename=openai-0.26.0-py3-none-any.whl size=66833 sha256=0521d4ca265f85c6ae8e6844f2a80bc750427b6b74cb99a5aba252e6d57d6259\n Stored in directory: /root/.cache/pip/wheels/50/85/93/3c090d89fb182ca03a781eff1f7195ec4a893dbeea5ae964dc\nSuccessfully built openai\nInstalling collected packages: openai\nSuccessfully installed openai-0.26.0\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0mNote: you may need to restart the kernel to use updated packages.\nCollecting catboost\n Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.6/76.6 MB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hRequirement already satisfied: plotly in /shared-libs/python3.9/py/lib/python3.9/site-packages (from catboost) (5.10.0)\nRequirement already satisfied: scipy in /shared-libs/python3.9/py/lib/python3.9/site-packages (from catboost) (1.9.3)\nRequirement already satisfied: matplotlib in /shared-libs/python3.9/py/lib/python3.9/site-packages (from catboost) (3.6.0)\nCollecting graphviz\n Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.0/47.0 KB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hRequirement already satisfied: pandas>=0.24.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from catboost) (1.2.5)\nRequirement already satisfied: numpy>=1.16.0 in 
/shared-libs/python3.9/py/lib/python3.9/site-packages (from catboost) (1.23.4)\nRequirement already satisfied: six in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from catboost) (1.16.0)\nRequirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from pandas>=0.24.0->catboost) (2.8.2)\nRequirement already satisfied: pytz>=2017.3 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from pandas>=0.24.0->catboost) (2022.5)\nRequirement already satisfied: cycler>=0.10 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->catboost) (0.11.0)\nRequirement already satisfied: fonttools>=4.22.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->catboost) (4.37.4)\nRequirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from matplotlib->catboost) (3.0.9)\nRequirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->catboost) (1.4.4)\nRequirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->catboost) (9.2.0)\nRequirement already satisfied: contourpy>=1.0.1 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->catboost) (1.0.5)\nRequirement already satisfied: packaging>=20.0 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from matplotlib->catboost) (21.3)\nRequirement already satisfied: tenacity>=6.2.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from plotly->catboost) (8.1.0)\nInstalling collected packages: graphviz, catboost\nSuccessfully installed catboost-1.1.1 graphviz-0.20.1\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0mNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}],"execution_count":1},{"cell_type":"code","source":"import openai\nopenai.api_key = \"\"\n","metadata":{"tags":[],"cell_id":"f8dbdb5478f749439db51e9911865499","allow_embed":"code_output","source_hash":"df671972","execution_start":1673365907749,"execution_millis":1,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":50},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"What is the pandas library?\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"a180f55a9c91496397774bef72d7973f","allow_embed":"code_output","source_hash":"76b98026","execution_start":1673365727131,"execution_millis":2539,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\nPandas is an open source software library written in Python for data manipulation and analysis. Pandas is widely used in data science, machine learning and many other fields. 
It provides high-level data structures and tools for handling and manipulating data, including data frames, series, plotting tools and more.\n","output_type":"stream"}],"execution_count":48},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"what are some common Pandas use cases?\", max_tokens=240)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"2760765f78ac4f3b90287f6512348607","allow_embed":"code_output","source_hash":"e0759cfc","execution_start":1673363502724,"execution_millis":1964,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. Data Cleaning and Transformation\n2. Data Analysis and Exploration\n3. Time Series Analysis\n4. Data Visualization\n5. Statistical Modeling\n6. Predictive Modeling\n7. Machine Learning\n8. Web Scraping\n","output_type":"stream"}],"execution_count":3},{"cell_type":"code","source":"#what are the most common deep learning libraries?\ncompletion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"what are the most common deep learning libraries?\", max_tokens=240)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"65cf1dd02cc7408bbcc2ee9fb0e64cb5","allow_embed":"code_output","source_hash":"1d073d9f","execution_start":1673363504691,"execution_millis":2997,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. TensorFlow \n2. PyTorch \n3. Keras \n4. Caffe \n5. CNTK \n6. MXNet \n7. Theano \n8. Deeplearning4j \n9. Gensim \n10. 
LUNA\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"What is a deep neural network?\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"13c87c1765004e44bcf533bad839146a","allow_embed":"code_output","source_hash":"fc1559af","execution_start":1673363507691,"execution_millis":2509,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\nA deep neural network (DNN) is a type of artificial neural network (ANN) with multiple layers of neurons between the input and output layers. DNNs are designed to learn complex non-linear relationships from data, and have been successfully applied in a wide range of areas such as image recognition, natural language processing, and financial forecasting.\n","output_type":"stream"}],"execution_count":5},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"give some ideas on feature transformations that can improve model performance\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"2413189d53874e22b3c645a0b1a09aee","allow_embed":"code_output","source_hash":"1cb10d06","execution_start":1673363510201,"execution_millis":5732,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. Standardization/Normalization: A common feature transform used to ensure features are on the same scale, standardizing or normalizing variables can help limit the severity of outliers and improve the overall model performance.\n\n2. Feature Binning: Binning is a process of transforming numerical variables into categorical ones. This can be useful when working with variables that have too many levels and can have a significant effect on the model performance.\n\n3. 
Polynomial Expansion: When a nonlinear relationship is expected between features and the output variable, a polynomial expansion feature transformation can help improve model performance.\n\n4. Feature Selection: Removing redundant or irrelevant features from the dataset can help improve the model performance as these features may lead to overfitting.\n\n5. Ensemble: Combining different types of models (or different versions of the same model) can often improve performance due to their combined capabilities.\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"Write example python code that performs data standardization\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"af12900045b2478b93ab9736b64e882b","allow_embed":"code_output","source_hash":"b15dbb8b","execution_start":1673363515937,"execution_millis":4579,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n# Import the necessary libraries\nimport numpy as np\n\n# Define the data \ndata = np.array([[-3, 9, 0, 8],\n [ 4, 6, 5, 12],\n [20, 2, 3, 15]])\n\n# Calculate mean and standard deviation\nmean = np.mean(data, axis=0)\nstd = np.std(data, axis=0)\n\n# Perform data standardization\nstandardized_data = (data - mean) / std\n\n# Print the results\nprint(standardized_data)\n","output_type":"stream"}],"execution_count":7},{"cell_type":"code","source":"# Import the necessary libraries\nimport numpy as np\n\n# Define the data \ndata = np.array([[-3, 9, 0, 8],\n [ 4, 6, 5, 12],\n [20, 2, 3, 15]])\n\n# Calculate mean and standard deviation\nmean = np.mean(data, axis=0)\nstd = np.std(data, axis=0)\n\n# Perform data standardization\nstandardized_data = (data - mean) / std\n\n# Print the 
results\nprint(standardized_data)","metadata":{"tags":[],"cell_id":"32e7ec7fc04b4a29962cb69a6e1a25af","allow_embed":"code_output","source_hash":"2f0bcad0","execution_start":1673363633656,"execution_millis":4,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"[[-1.03881504 1.16247639 -1.29777137 -1.27872403]\n [-0.31164451 0.11624764 1.13554995 0.11624764]\n [ 1.35045955 -1.27872403 0.16222142 1.16247639]]\n","output_type":"stream"}],"execution_count":12},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"Write example python code that performs data normalization on fake data\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"a924a7e8372a4189b039844383357660","allow_embed":"code_output","source_hash":"a87e1c85","execution_start":1673363909417,"execution_millis":4670,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n# Normalizing data will rescale features in the range [0,1]\n\ndata = [3, 7, 10, 13] # Sample data\n\n# Calculate the maximum and minimum of the data\nmax_data = max(data)\nmin_data = min(data)\n\n# Normalize the data\nnormalized_data = [(x-min_data)/(max_data-min_data) for x in data]\n\n# Print first value to check \nprint(normalized_data[0]) # Prints 0.2\n","output_type":"stream"}],"execution_count":19},{"cell_type":"code","source":"\n# Normalizing data will rescale features in the range [0,1]\n\ndata = [3, 7, 10, 13] # Sample data\n\n# Calculate the maximum and minimum of the data\nmax_data = max(data)\nmin_data = min(data)\n\n# Normalize the data\nnormalized_data = [(x-min_data)/(max_data-min_data) for x in data]\n\n# Print first value to check \nprint(normalized_data) 
\n","metadata":{"tags":[],"cell_id":"4dcd582d6443441b97b4652d53795756","allow_embed":"code_output","source_hash":"8d36fcfa","execution_start":1673363932199,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"[0.0, 0.4, 0.7, 1.0]\n","output_type":"stream"}],"execution_count":null},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"How do i select a time series model?\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"3906c30630854d749bfee27b410c8b19","allow_embed":"code_output","source_hash":"431e6b6e","execution_start":1673363723013,"execution_millis":4807,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\nThe selection of a time series model depends on the type of data that is being analyzed. If there are long-term trends or cycles in the data, then an autoregressive integrated moving average (ARIMA) model or a exponential smoothing (ETS) model may be the best option. If the data are seasonal or there are outliers present, then a Holt-Winters model may be a better choice. If the data do not contain any long-term trends or seasonality, a Box-Jenkins model may be suitable. 
Regardless of the model selected, it is important to consider the type of data that is being modeled when selecting a time series model.\n","output_type":"stream"}],"execution_count":17},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"When to use ARIMA vs FB prophet?\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"dd459a0b0c2042f68abbe1c77c8cd789","allow_embed":"code_output","source_hash":"aa5a6656","execution_start":1673363744423,"execution_millis":3699,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\nARIMA and FB Prophet are both used for time series forecasting. ARIMA is a linear model and is better for predicting long-term trends with stationary data. FB Prophet is a non-linear model and is better for predicting short-term trends with non-stationary data. So, you would use ARIMA when you are mainly concerned with predicting long-term trends, and you would use FB Prophet when you are mainly concerned with predicting short-term trends.\n","output_type":"stream"}],"execution_count":18},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"Write example python code that generates synthetic healthcare readmission data stored in a dataframe\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"f997e7a71c57471c81dba210622b0ffe","allow_embed":"code_output","source_hash":"7160f974","execution_start":1673364364069,"execution_millis":5794,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\nimport pandas as pd\nimport numpy as np\n\n# Create Dataframe \n\ndf = pd.DataFrame(columns=['Patient_ID', 'Age', 'Admission_Type', 'Readmitted'])\n\n# Generate Data\n\nnp.random.seed(0)\nfor i in range(10):\n admission_type = np.random.choice(['Urgent', 'Scheduled', 'Emergency'])\n patient_age 
= np.random.randint(18, 80)\n readmission = np.random.choice([0, 1])\n df.loc[i] = [i+1, patient_age, admission_type, readmission]\n\n# Print Dataframe to Console\n\nprint(df)\n","output_type":"stream"}],"execution_count":22},{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\n\n# Create Dataframe \n\ndf = pd.DataFrame(columns=['Patient_ID', 'Age', 'Admission_Type', 'Readmitted'])\n\n# Generate Data\n\nnp.random.seed(0)\nfor i in range(10):\n admission_type = np.random.choice(['Urgent', 'Scheduled', 'Emergency'])\n patient_age = np.random.randint(18, 80)\n readmission = np.random.choice([0, 1])\n df.loc[i] = [i+1, patient_age, admission_type, readmission]\n\n# Print Dataframe to Console\n\ndf\n","metadata":{"tags":[],"cell_id":"3b5d1a15336a44efbc67b9734d8ff466","allow_embed":"code_output","source_hash":"1f170973","execution_start":1673364389122,"execution_millis":22,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","execution_count":23,"data":{"application/vnd.deepnote.dataframe.v3+json":{"column_count":4,"row_count":10,"columns":[{"name":"Patient_ID","dtype":"object","stats":{"unique_count":10,"nan_count":0,"categories":[{"name":"1","count":1},{"name":"2","count":1},{"name":"8 others","count":8}]}},{"name":"Age","dtype":"object","stats":{"unique_count":9,"nan_count":0,"categories":[{"name":"42","count":2},{"name":"65","count":1},{"name":"7 
others","count":7}]}},{"name":"Admission_Type","dtype":"object","stats":{"unique_count":3,"nan_count":0,"categories":[{"name":"Urgent","count":4},{"name":"Emergency","count":4},{"name":"Scheduled","count":2}]}},{"name":"Readmitted","dtype":"object","stats":{"unique_count":2,"nan_count":0,"categories":[{"name":"1","count":8},{"name":"0","count":2}]}},{"name":"_deepnote_index_column","dtype":"int64"}],"rows":[{"Patient_ID":"1","Age":"65","Admission_Type":"Urgent","Readmitted":"1","_deepnote_index_column":"0"},{"Patient_ID":"2","Age":"21","Admission_Type":"Urgent","Readmitted":"1","_deepnote_index_column":"1"},{"Patient_ID":"3","Age":"37","Admission_Type":"Scheduled","Readmitted":"1","_deepnote_index_column":"2"},{"Patient_ID":"4","Age":"54","Admission_Type":"Emergency","Readmitted":"1","_deepnote_index_column":"3"},{"Patient_ID":"5","Age":"42","Admission_Type":"Emergency","Readmitted":"0","_deepnote_index_column":"4"},{"Patient_ID":"6","Age":"76","Admission_Type":"Urgent","Readmitted":"1","_deepnote_index_column":"5"},{"Patient_ID":"7","Age":"57","Admission_Type":"Emergency","Readmitted":"1","_deepnote_index_column":"6"},{"Patient_ID":"8","Age":"42","Admission_Type":"Emergency","Readmitted":"1","_deepnote_index_column":"7"},{"Patient_ID":"9","Age":"43","Admission_Type":"Scheduled","Readmitted":"1","_deepnote_index_column":"8"},{"Patient_ID":"10","Age":"27","Admission_Type":"Urgent","Readmitted":"0","_deepnote_index_column":"9"}]},"text/plain":" Patient_ID Age Admission_Type Readmitted\n0 1 65 Urgent 1\n1 2 21 Urgent 1\n2 3 37 Scheduled 1\n3 4 54 Emergency 1\n4 5 42 Emergency 0\n5 6 76 Urgent 1\n6 7 57 Emergency 1\n7 8 42 Emergency 1\n8 9 43 Scheduled 1\n9 10 27 Urgent 0","text/html":"
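<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Patient_ID</th>\n      <th>Age</th>\n      <th>Admission_Type</th>\n      <th>Readmitted</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>65</td>\n      <td>Urgent</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>21</td>\n      <td>Urgent</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>37</td>\n      <td>Scheduled</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>4</td>\n      <td>54</td>\n      <td>Emergency</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>5</td>\n      <td>42</td>\n      <td>Emergency</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>6</td>\n      <td>76</td>\n      <td>Urgent</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>7</td>\n      <td>57</td>\n      <td>Emergency</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>8</td>\n      <td>42</td>\n      <td>Emergency</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>9</td>\n      <td>43</td>\n      <td>Scheduled</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>10</td>\n      <td>27</td>\n      <td>Urgent</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>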
"},"metadata":{}}],"execution_count":23},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"Write example python code that generates synthetic healthcare readmission data stored in a dataframe. From this write code that builds a catboost model that predicts readmission outcomes. Also write code to calculate and print performance\", max_tokens=3000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"7b8ff97243cb47a280d6a3295141fbe2","allow_embed":"code_output","source_hash":"ad1ce1e2","execution_start":1673364529638,"execution_millis":30591,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":" metrics\n\n## Generate Synthetic Healthcare Readmission Data\n\nimport pandas as pd \nimport numpy as np \n\n# set the seed for reproducibility \nnp.random.seed(1)\n\n# create dataframe \ndf = pd.DataFrame(np.random.randint(0, 100, size=(100, 10)), columns=['age','gender','length_of_stay','diagnosis','NIV','laboratory','past_hospitalizations','medications','bmi','readmission'])\n\n# add labels to data frame \ndf['age'] = np.random.randint(20, 80, size=(100))\ndf['gender'] = np.random.randint(1, 2, size=(100))\ndf['length_of_stay'] = np.random.randint(2, 14, size=(100))\ndf['diagnosis'] = np.random.randint(1, 5, size=(100))\ndf['NIV'] = np.random.randint(0, 2, size=(100))\ndf['laboratory'] = np.random.randint(1, 6, size=(100))\ndf['past_hospitalizations'] = np.random.randint(0, 10, size=(100))\ndf['medications'] = np.random.randint(1, 6, size=(100))\ndf['bmi'] = np.random.randint(18, 35, size=(100))\ndf['readmission'] = np.random.randint(0, 2, size=(100))\n\n# print the dataframe \nprint(df)\n\n## Build a CatBoost Model\nfrom catboost import CatBoostClassifier\nfrom sklearn.metrics import confusion_matrix\n\n# separate X and y\nX = df.iloc[:, 0:9]\ny = df.iloc[:, 9]\n\n# initialize catboost classifier \ncat_clf = CatBoostClassifier(iterations=50,\n 
learning_rate=0.3,\n depth=8,\n eval_metric='Accuracy',\n random_seed=42)\n\n# fit the model \ncat_clf.fit(X, y)\n\n# predict values\ny_pred = cat_clf.predict(X)\n\n# print confusion matrix\nconf_mat = confusion_matrix(y, y_pred)\nprint(conf_mat)\n\n## Calculate and Print Performance Metrics\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score\n\n# calculate performance metrics\nacc = accuracy_score(y, y_pred)\nprecision = precision_score(y, y_pred)\nrecall = recall_score(y, y_pred)\n\n# print performance metrics \nprint(\"Accuracy: {}\".format(acc))\nprint(\"Precision: {}\".format(precision))\nprint(\"Recall: {}\".format(recall))\n","output_type":"stream"}],"execution_count":27},{"cell_type":"code","source":"## Generate Synthetic Healthcare Readmission Data\n\nimport pandas as pd \nimport numpy as np \n\n# set the seed for reproducibility \nnp.random.seed(1)\n\n# create dataframe \ndf = pd.DataFrame(np.random.randint(0, 100, size=(100, 10)), columns=['age','gender','length_of_stay','diagnosis','NIV','laboratory','past_hospitalizations','medications','bmi','readmission'])\n\n# add labels to data frame \ndf['age'] = np.random.randint(20, 80, size=(100))\ndf['gender'] = np.random.randint(1, 2, size=(100))\ndf['length_of_stay'] = np.random.randint(2, 14, size=(100))\ndf['diagnosis'] = np.random.randint(1, 5, size=(100))\ndf['NIV'] = np.random.randint(0, 2, size=(100))\ndf['laboratory'] = np.random.randint(1, 6, size=(100))\ndf['past_hospitalizations'] = np.random.randint(0, 10, size=(100))\ndf['medications'] = np.random.randint(1, 6, size=(100))\ndf['bmi'] = np.random.randint(18, 35, size=(100))\ndf['readmission'] = np.random.randint(0, 2, size=(100))\n\n# print the dataframe \nprint(df)\n\n## Build a CatBoost Model\nfrom catboost import CatBoostClassifier\nfrom sklearn.metrics import confusion_matrix\n\n# separate X and y\nX = df.iloc[:, 0:9]\ny = df.iloc[:, 9]\n\n# initialize catboost classifier \ncat_clf = CatBoostClassifier(iterations=50,\n 
learning_rate=0.3,\n depth=8,\n eval_metric='Accuracy',\n random_seed=42)\n\n# fit the model \ncat_clf.fit(X, y)\n\n# predict values\ny_pred = cat_clf.predict(X)\n\n# print confusion matrix\nconf_mat = confusion_matrix(y, y_pred)\nprint(conf_mat)\n\n## Calculate and Print Performance Metrics\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score\n\n# calculate performance metrics\nacc = accuracy_score(y, y_pred)\nprecision = precision_score(y, y_pred)\nrecall = recall_score(y, y_pred)\n\n# print performance metrics \nprint(\"Accuracy: {}\".format(acc))\nprint(\"Precision: {}\".format(precision))\nprint(\"Recall: {}\".format(recall))","metadata":{"tags":[],"cell_id":"bc718c65f54440b594932296ba2619e2","allow_embed":"code_output","source_hash":"2aa4be45","execution_start":1673364619253,"execution_millis":113,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":" age gender length_of_stay diagnosis NIV laboratory \\\n0 79 1 11 3 1 3 \n1 22 1 13 3 1 1 \n2 30 1 4 4 0 3 \n3 47 1 5 1 1 4 \n4 25 1 10 4 1 4 \n.. ... ... ... ... ... ... \n95 60 1 5 4 0 4 \n96 71 1 6 3 0 5 \n97 62 1 13 3 0 4 \n98 43 1 6 1 1 5 \n99 71 1 6 1 0 5 \n\n past_hospitalizations medications bmi readmission \n0 7 1 19 1 \n1 6 1 27 0 \n2 5 1 18 1 \n3 7 2 30 1 \n4 2 4 18 1 \n.. ... ... ... ... 
\n95 1 1 22 1 \n96 4 1 32 0 \n97 6 1 21 1 \n98 8 3 26 1 \n99 8 1 29 0 \n\n[100 rows x 10 columns]\n0:\tlearn: 0.6000000\ttotal: 171us\tremaining: 8.42ms\n1:\tlearn: 0.7800000\ttotal: 453us\tremaining: 10.9ms\n2:\tlearn: 0.8400000\ttotal: 681us\tremaining: 10.7ms\n3:\tlearn: 0.8300000\ttotal: 886us\tremaining: 10.2ms\n4:\tlearn: 0.8900000\ttotal: 1.1ms\tremaining: 9.88ms\n5:\tlearn: 0.9100000\ttotal: 1.31ms\tremaining: 9.63ms\n6:\tlearn: 0.9300000\ttotal: 1.52ms\tremaining: 9.32ms\n7:\tlearn: 0.9700000\ttotal: 1.71ms\tremaining: 8.99ms\n8:\tlearn: 0.9600000\ttotal: 1.92ms\tremaining: 8.75ms\n9:\tlearn: 0.9600000\ttotal: 2.01ms\tremaining: 8.04ms\n10:\tlearn: 0.9800000\ttotal: 2.22ms\tremaining: 7.86ms\n11:\tlearn: 0.9900000\ttotal: 2.42ms\tremaining: 7.67ms\n12:\tlearn: 0.9900000\ttotal: 2.61ms\tremaining: 7.44ms\n13:\tlearn: 0.9900000\ttotal: 2.81ms\tremaining: 7.21ms\n14:\tlearn: 0.9900000\ttotal: 3.02ms\tremaining: 7.04ms\n15:\tlearn: 1.0000000\ttotal: 3.22ms\tremaining: 6.84ms\n16:\tlearn: 1.0000000\ttotal: 3.42ms\tremaining: 6.64ms\n17:\tlearn: 1.0000000\ttotal: 3.64ms\tremaining: 6.47ms\n18:\tlearn: 1.0000000\ttotal: 3.88ms\tremaining: 6.32ms\n19:\tlearn: 1.0000000\ttotal: 4.08ms\tremaining: 6.12ms\n20:\tlearn: 1.0000000\ttotal: 4.27ms\tremaining: 5.89ms\n21:\tlearn: 1.0000000\ttotal: 4.46ms\tremaining: 5.68ms\n22:\tlearn: 1.0000000\ttotal: 4.67ms\tremaining: 5.48ms\n23:\tlearn: 1.0000000\ttotal: 4.9ms\tremaining: 5.3ms\n24:\tlearn: 1.0000000\ttotal: 5.11ms\tremaining: 5.11ms\n25:\tlearn: 1.0000000\ttotal: 5.3ms\tremaining: 4.89ms\n26:\tlearn: 1.0000000\ttotal: 5.5ms\tremaining: 4.69ms\n27:\tlearn: 1.0000000\ttotal: 5.73ms\tremaining: 4.5ms\n28:\tlearn: 1.0000000\ttotal: 5.94ms\tremaining: 4.3ms\n29:\tlearn: 1.0000000\ttotal: 6.14ms\tremaining: 4.09ms\n30:\tlearn: 1.0000000\ttotal: 6.34ms\tremaining: 3.88ms\n31:\tlearn: 1.0000000\ttotal: 6.54ms\tremaining: 3.68ms\n32:\tlearn: 1.0000000\ttotal: 6.75ms\tremaining: 3.48ms\n33:\tlearn: 1.0000000\ttotal: 
6.95ms\tremaining: 3.27ms\n34:\tlearn: 1.0000000\ttotal: 7.17ms\tremaining: 3.07ms\n35:\tlearn: 1.0000000\ttotal: 7.37ms\tremaining: 2.87ms\n36:\tlearn: 1.0000000\ttotal: 7.59ms\tremaining: 2.67ms\n37:\tlearn: 1.0000000\ttotal: 7.8ms\tremaining: 2.46ms\n38:\tlearn: 1.0000000\ttotal: 8.01ms\tremaining: 2.26ms\n39:\tlearn: 1.0000000\ttotal: 8.2ms\tremaining: 2.05ms\n40:\tlearn: 1.0000000\ttotal: 8.43ms\tremaining: 1.85ms\n41:\tlearn: 1.0000000\ttotal: 8.64ms\tremaining: 1.65ms\n42:\tlearn: 1.0000000\ttotal: 8.84ms\tremaining: 1.44ms\n43:\tlearn: 1.0000000\ttotal: 9.04ms\tremaining: 1.23ms\n44:\tlearn: 1.0000000\ttotal: 9.24ms\tremaining: 1.03ms\n45:\tlearn: 1.0000000\ttotal: 9.46ms\tremaining: 822us\n46:\tlearn: 1.0000000\ttotal: 9.66ms\tremaining: 616us\n47:\tlearn: 1.0000000\ttotal: 9.87ms\tremaining: 411us\n48:\tlearn: 1.0000000\ttotal: 10.1ms\tremaining: 205us\n49:\tlearn: 1.0000000\ttotal: 10.3ms\tremaining: 0us\n[[53 0]\n [ 0 47]]\nAccuracy: 1.0\nPrecision: 1.0\nRecall: 1.0\n","output_type":"stream"}],"execution_count":29},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"Write example python code that generates synthetic transaction data stored in a dataframe\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"19a9d0f597574d908151c234cc4c44fe","allow_embed":"code_output","source_hash":"a82ce2a4","execution_start":1673364672681,"execution_millis":8025,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\nimport pandas as pd \nimport numpy as np \n\n#create randomly generated customer data\ncustomer_id = np.arange(1,101) \ncustomer_names = [f'John Doe {x}' for x in range(1,101)] \n\n#create randomly generated transaction data\ntransaction_id = np.arange(1,101)\ndates = [f'2020-07-{x}' for x in range(1,101)]\namounts = np.random.randint(low=1, high=1000, size=(100,)) \n\n#create dataframe with randomly generated 
data\ntransaction_data = pd.DataFrame({'Customer ID': customer_id, \n 'Customer Name': customer_names,\n 'Transaction ID': transaction_id, \n 'Date': dates, \n 'Amount': amounts})\n\nprint(transaction_data)\n","output_type":"stream"}],"execution_count":30},{"cell_type":"code","source":"\nimport pandas as pd \nimport numpy as np \n\n#create randomly generated customer data\ncustomer_id = np.arange(1,101) \ncustomer_names = [f'John Doe {x}' for x in range(1,101)] \n\n#create randomly generated transaction data\ntransaction_id = np.arange(1,101)\ndates = [f'2020-07-{x}' for x in range(1,101)]\namounts = np.random.randint(low=1, high=1000, size=(100,)) \n\n#create dataframe with randomly generated data\ntransaction_data = pd.DataFrame({'Customer ID': customer_id, \n 'Customer Name': customer_names,\n 'Transaction ID': transaction_id, \n 'Date': dates, \n 'Amount': amounts})\n\ntransaction_data","metadata":{"tags":[],"cell_id":"d786bdb042b744c2bda8eb6004a0ff77","allow_embed":"code_output","source_hash":"1307dbc7","execution_start":1673364707400,"execution_millis":52,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","execution_count":31,"data":{"application/vnd.deepnote.dataframe.v3+json":{"column_count":5,"row_count":100,"columns":[{"name":"Customer ID","dtype":"int64","stats":{"unique_count":100,"nan_count":0,"min":"1","max":"100","histogram":[{"bin_start":1,"bin_end":10.9,"count":10},{"bin_start":10.9,"bin_end":20.8,"count":10},{"bin_start":20.8,"bin_end":30.700000000000003,"count":10},{"bin_start":30.700000000000003,"bin_end":40.6,"count":10},{"bin_start":40.6,"bin_end":50.5,"count":10},{"bin_start":50.5,"bin_end":60.400000000000006,"count":10},{"bin_start":60.400000000000006,"bin_end":70.3,"count":10},{"bin_start":70.3,"bin_end":80.2,"count":10},{"bin_start":80.2,"bin_end":90.10000000000001,"count":10},{"bin_start":90.10000000000001,"bin_end":100,"count":10}]}},{"name":"Customer 
Name","dtype":"object","stats":{"unique_count":100,"nan_count":0,"categories":[{"name":"John Doe 1","count":1},{"name":"John Doe 2","count":1},{"name":"98 others","count":98}]}},{"name":"Transaction ID","dtype":"int64","stats":{"unique_count":100,"nan_count":0,"min":"1","max":"100","histogram":[{"bin_start":1,"bin_end":10.9,"count":10},{"bin_start":10.9,"bin_end":20.8,"count":10},{"bin_start":20.8,"bin_end":30.700000000000003,"count":10},{"bin_start":30.700000000000003,"bin_end":40.6,"count":10},{"bin_start":40.6,"bin_end":50.5,"count":10},{"bin_start":50.5,"bin_end":60.400000000000006,"count":10},{"bin_start":60.400000000000006,"bin_end":70.3,"count":10},{"bin_start":70.3,"bin_end":80.2,"count":10},{"bin_start":80.2,"bin_end":90.10000000000001,"count":10},{"bin_start":90.10000000000001,"bin_end":100,"count":10}]}},{"name":"Date","dtype":"object","stats":{"unique_count":100,"nan_count":0,"categories":[{"name":"2020-07-1","count":1},{"name":"2020-07-2","count":1},{"name":"98 others","count":98}]}},{"name":"Amount","dtype":"int64","stats":{"unique_count":95,"nan_count":0,"min":"1","max":"999","histogram":[{"bin_start":1,"bin_end":100.8,"count":7},{"bin_start":100.8,"bin_end":200.6,"count":6},{"bin_start":200.6,"bin_end":300.4,"count":16},{"bin_start":300.4,"bin_end":400.2,"count":9},{"bin_start":400.2,"bin_end":500,"count":7},{"bin_start":500,"bin_end":599.8,"count":8},{"bin_start":599.8,"bin_end":699.6,"count":10},{"bin_start":699.6,"bin_end":799.4,"count":9},{"bin_start":799.4,"bin_end":899.1999999999999,"count":13},{"bin_start":899.1999999999999,"bin_end":999,"count":15}]}},{"name":"_deepnote_index_column","dtype":"int64"}],"rows":[{"Customer ID":"1","Customer Name":"John Doe 1","Transaction ID":"1","Date":"2020-07-1","Amount":"138","_deepnote_index_column":"0"},{"Customer ID":"2","Customer Name":"John Doe 2","Transaction ID":"2","Date":"2020-07-2","Amount":"373","_deepnote_index_column":"1"},{"Customer ID":"3","Customer Name":"John Doe 3","Transaction 
ID":"3","Date":"2020-07-3","Amount":"751","_deepnote_index_column":"2"},{"Customer ID":"4","Customer Name":"John Doe 4","Transaction ID":"4","Date":"2020-07-4","Amount":"385","_deepnote_index_column":"3"},{"Customer ID":"5","Customer Name":"John Doe 5","Transaction ID":"5","Date":"2020-07-5","Amount":"744","_deepnote_index_column":"4"},{"Customer ID":"6","Customer Name":"John Doe 6","Transaction ID":"6","Date":"2020-07-6","Amount":"56","_deepnote_index_column":"5"},{"Customer ID":"7","Customer Name":"John Doe 7","Transaction ID":"7","Date":"2020-07-7","Amount":"492","_deepnote_index_column":"6"},{"Customer ID":"8","Customer Name":"John Doe 8","Transaction ID":"8","Date":"2020-07-8","Amount":"622","_deepnote_index_column":"7"},{"Customer ID":"9","Customer Name":"John Doe 9","Transaction ID":"9","Date":"2020-07-9","Amount":"582","_deepnote_index_column":"8"},{"Customer ID":"10","Customer Name":"John Doe 10","Transaction ID":"10","Date":"2020-07-10","Amount":"267","_deepnote_index_column":"9"}]},"text/plain":" Customer ID Customer Name Transaction ID Date Amount\n0 1 John Doe 1 1 2020-07-1 138\n1 2 John Doe 2 2 2020-07-2 373\n2 3 John Doe 3 3 2020-07-3 751\n3 4 John Doe 4 4 2020-07-4 385\n4 5 John Doe 5 5 2020-07-5 744\n.. ... ... ... ... ...\n95 96 John Doe 96 96 2020-07-96 895\n96 97 John Doe 97 97 2020-07-97 984\n97 98 John Doe 98 98 2020-07-98 424\n98 99 John Doe 99 99 2020-07-99 294\n99 100 John Doe 100 100 2020-07-100 391\n\n[100 rows x 5 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Customer IDCustomer NameTransaction IDDateAmount
01John Doe 112020-07-1138
12John Doe 222020-07-2373
23John Doe 332020-07-3751
34John Doe 442020-07-4385
45John Doe 552020-07-5744
..................
9596John Doe 96962020-07-96895
9697John Doe 97972020-07-97984
9798John Doe 98982020-07-98424
9899John Doe 99992020-07-99294
99100John Doe 1001002020-07-100391
\n

100 rows × 5 columns

\n
"},"metadata":{}}],"execution_count":31},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"Write example python code that generates synthetic transaction data stored in a dataframe. Include customer ID, transaction amount, item ID, item name, age, gender, and zipcode\", max_tokens=2000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"806cdf0729bf4fff9a912247c8352cfc","allow_embed":"code_output","source_hash":"ae8986e5","execution_start":1673364810801,"execution_millis":13798,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n\nimport pandas as pd\nimport numpy as np\n\nrows = ['customer_ID', 'transaction_amnt', 'item_ID', 'item_name', 'age', 'gender', 'zipcode']\n\ndata = pd.DataFrame(columns=rows) \n\nfor i in range(1,100):\n customer_ID = int( np.random.uniform(100,600-100)) \n transaction_amnt = np.random.uniform(1.25, 10.00)\n item_ID = int( np.random.uniform(1,35))\n item_name = np.random.choice([\"phone\", \"tablet\", \"laptop\", \"smartwatch\"])\n age = int( np.random.uniform(17,75)) \n gender = np.random.choice([\"male\", \"female\"]) \n zipcode = np.random.choice([\"98101\", \"98200\", \"98469\", \"98801\"])\n data.loc[i] = [customer_ID, transaction_amnt, item_ID, item_name, age, gender, zipcode]\n\nprint (data)\n","output_type":"stream"}],"execution_count":34},{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\n\nrows = ['customer_ID', 'transaction_amnt', 'item_ID', 'item_name', 'age', 'gender', 'zipcode']\n\ndata = pd.DataFrame(columns=rows) \n\nfor i in range(1,100):\n customer_ID = int( np.random.uniform(100,600-100)) \n transaction_amnt = np.random.uniform(1.25, 10.00)\n item_ID = int( np.random.uniform(1,35))\n item_name = np.random.choice([\"phone\", \"tablet\", \"laptop\", \"smartwatch\"])\n age = int( np.random.uniform(17,75)) \n gender = np.random.choice([\"male\", \"female\"]) \n zipcode = 
np.random.choice([\"98101\", \"98200\", \"98469\", \"98801\"])\n data.loc[i] = [customer_ID, transaction_amnt, item_ID, item_name, age, gender, zipcode]\n\ndata\n","metadata":{"tags":[],"cell_id":"8baa29620421412782c494da99099734","allow_embed":"code_output","source_hash":"e22e09cb","execution_start":1673364853257,"execution_millis":248,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","execution_count":35,"data":{"application/vnd.deepnote.dataframe.v3+json":{"column_count":7,"row_count":99,"columns":[{"name":"customer_ID","dtype":"object","stats":{"unique_count":87,"nan_count":0,"categories":[{"name":"322","count":2},{"name":"146","count":2},{"name":"85 others","count":95}]}},{"name":"transaction_amnt","dtype":"float64","stats":{"unique_count":99,"nan_count":0,"min":"1.3676709942205518","max":"9.987054437035802","histogram":[{"bin_start":1.3676709942205518,"bin_end":2.229609338502077,"count":9},{"bin_start":2.229609338502077,"bin_end":3.091547682783602,"count":8},{"bin_start":3.091547682783602,"bin_end":3.953486027065127,"count":9},{"bin_start":3.953486027065127,"bin_end":4.815424371346652,"count":15},{"bin_start":4.815424371346652,"bin_end":5.677362715628177,"count":12},{"bin_start":5.677362715628177,"bin_end":6.539301059909702,"count":8},{"bin_start":6.539301059909702,"bin_end":7.401239404191227,"count":8},{"bin_start":7.401239404191227,"bin_end":8.263177748472753,"count":10},{"bin_start":8.263177748472753,"bin_end":9.125116092754277,"count":9},{"bin_start":9.125116092754277,"bin_end":9.987054437035802,"count":11}]}},{"name":"item_ID","dtype":"object","stats":{"unique_count":32,"nan_count":0,"categories":[{"name":"32","count":7},{"name":"14","count":6},{"name":"30 others","count":86}]}},{"name":"item_name","dtype":"object","stats":{"unique_count":4,"nan_count":0,"categories":[{"name":"tablet","count":28},{"name":"laptop","count":24},{"name":"2 
others","count":47}]}},{"name":"age","dtype":"object","stats":{"unique_count":47,"nan_count":0,"categories":[{"name":"53","count":5},{"name":"26","count":4},{"name":"45 others","count":90}]}},{"name":"gender","dtype":"object","stats":{"unique_count":2,"nan_count":0,"categories":[{"name":"female","count":50},{"name":"male","count":49}]}},{"name":"zipcode","dtype":"object","stats":{"unique_count":4,"nan_count":0,"categories":[{"name":"98101","count":28},{"name":"98801","count":25},{"name":"2 others","count":46}]}},{"name":"_deepnote_index_column","dtype":"int64"}],"rows":[{"customer_ID":"321","transaction_amnt":"4.713675785008061","item_ID":"25","item_name":"laptop","age":"26","gender":"female","zipcode":"98101","_deepnote_index_column":"1"},{"customer_ID":"128","transaction_amnt":"9.813387045135537","item_ID":"9","item_name":"smartwatch","age":"43","gender":"male","zipcode":"98469","_deepnote_index_column":"2"},{"customer_ID":"490","transaction_amnt":"4.214857963236753","item_ID":"31","item_name":"phone","age":"48","gender":"female","zipcode":"98200","_deepnote_index_column":"3"},{"customer_ID":"322","transaction_amnt":"5.9467228169965605","item_ID":"14","item_name":"phone","age":"53","gender":"female","zipcode":"98801","_deepnote_index_column":"4"},{"customer_ID":"162","transaction_amnt":"1.6692301678150834","item_ID":"18","item_name":"laptop","age":"71","gender":"male","zipcode":"98200","_deepnote_index_column":"5"},{"customer_ID":"491","transaction_amnt":"5.1088079285776224","item_ID":"26","item_name":"tablet","age":"26","gender":"male","zipcode":"98101","_deepnote_index_column":"6"},{"customer_ID":"170","transaction_amnt":"9.780168421011425","item_ID":"10","item_name":"tablet","age":"61","gender":"male","zipcode":"98101","_deepnote_index_column":"7"},{"customer_ID":"319","transaction_amnt":"8.258702363235615","item_ID":"23","item_name":"tablet","age":"23","gender":"female","zipcode":"98101","_deepnote_index_column":"8"},{"customer_ID":"302","transaction_amnt":"3.
994470838038586","item_ID":"32","item_name":"laptop","age":"65","gender":"female","zipcode":"98801","_deepnote_index_column":"9"},{"customer_ID":"489","transaction_amnt":"9.987054437035802","item_ID":"5","item_name":"laptop","age":"73","gender":"male","zipcode":"98801","_deepnote_index_column":"10"}]},"text/plain":" customer_ID transaction_amnt item_ID item_name age gender zipcode\n1 321 4.713676 25 laptop 26 female 98101\n2 128 9.813387 9 smartwatch 43 male 98469\n3 490 4.214858 31 phone 48 female 98200\n4 322 5.946723 14 phone 53 female 98801\n5 162 1.669230 18 laptop 71 male 98200\n.. ... ... ... ... .. ... ...\n95 195 9.636766 13 phone 47 male 98801\n96 425 8.315732 22 smartwatch 49 female 98101\n97 146 1.455586 19 smartwatch 69 female 98101\n98 438 8.426772 17 phone 26 female 98101\n99 246 4.782375 4 tablet 28 male 98101\n\n[99 rows x 7 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
customer_IDtransaction_amntitem_IDitem_nameagegenderzipcode
13214.71367625laptop26female98101
21289.8133879smartwatch43male98469
34904.21485831phone48female98200
43225.94672314phone53female98801
51621.66923018laptop71male98200
........................
951959.63676613phone47male98801
964258.31573222smartwatch49female98101
971461.45558619smartwatch69female98101
984388.42677217phone26female98101
992464.7823754tablet28male98101
\n

99 rows × 7 columns

\n
"},"metadata":{}}],"execution_count":35},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\" list some good public datasets\", max_tokens=1000)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"2aaf89e048084118b09d5dc8d5973997","allow_embed":"code_output","source_hash":"799d7981","execution_start":1673364891721,"execution_millis":2602,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. US Census Data\n2. Enron Email Dataset\n3. Global Open Data Index\n4. Air Quality Monitoring Data\n5. New York City Taxi Trip Data\n6. IMF Data\n7. World Bank Open Data\n8. Google Books Ngrams Dataset\n9. Amazon Reviews Dataset\n10. UCI Machine Learning Repository\n","output_type":"stream"}],"execution_count":36},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\" list some good public datasets under apache 2.0 license. provide links to their source\", max_tokens=1000, temperature=0)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"dc005a6ff705440382a1d7733cad7653","allow_embed":"code_output","source_hash":"763a0cd0","execution_start":1673364936215,"execution_millis":7205,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. OpenStreetMap: https://www.openstreetmap.org/\n2. US Census Data: https://www.census.gov/data.html\n3. Google Books Ngrams: https://aws.amazon.com/datasets/google-books-ngrams/\n4. Wikipedia: https://dumps.wikimedia.org/enwiki/\n5. US Government Spending Data: https://www.usaspending.gov/\n6. World Bank Open Data: https://data.worldbank.org/\n7. Common Crawl: http://commoncrawl.org/\n8. Open Images: https://storage.googleapis.com/openimages/web/index.html\n9. OpenFlights: https://openflights.org/data.html\n10. 
GDELT: http://data.gdeltproject.org/\n","output_type":"stream"}],"execution_count":38},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\" list some good public datasets under apache 2.0 license. provide links to their source and descriptions\", max_tokens=1000, temperature=0)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"8b63efe556cf4310ab44b844c761d2f5","allow_embed":"code_output","source_hash":"d9e6cfd6","execution_start":1673364954767,"execution_millis":11736,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. OpenStreetMap: OpenStreetMap is a free, editable map of the world, created and maintained by volunteers and available for use under an open license. It contains millions of data points, including roads, buildings, and points of interest. Source: https://www.openstreetmap.org/\n\n2. Google Books Ngrams: Google Books Ngrams is a dataset of over 5 million books from Google Books, spanning from 1500 to 2008. It contains word counts for each year, allowing researchers to track the usage of words over time. Source: https://aws.amazon.com/datasets/google-books-ngrams/\n\n3. Wikipedia: Wikipedia is a free, open-source encyclopedia with millions of articles in hundreds of languages. It is available for use under the Creative Commons Attribution-ShareAlike license. Source: https://www.wikipedia.org/\n\n4. Common Crawl: Common Crawl is a large-scale web crawl that collects data from over 5 billion webpages. It is available for use under the Apache 2.0 license. Source: https://commoncrawl.org/\n\n5. Open Images Dataset: The Open Images Dataset is a collection of 9 million images annotated with labels spanning over 6000 categories. It is available for use under the Apache 2.0 license. 
Source: https://storage.googleapis.com/openimages/web/index.html\n","output_type":"stream"}],"execution_count":39},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"What are some emerging machine learning use-cases in social media?\", max_tokens=1000, temperature=0)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"5eadc317599a441d886169d26d2060c1","allow_embed":"code_output","source_hash":"fa7443c9","execution_start":1673365110828,"execution_millis":4087,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. Automated Content Curation: Automatically curating content from social media platforms to create personalized content feeds for users.\n\n2. Sentiment Analysis: Analyzing user sentiment from social media posts to gain insights into customer opinions and preferences.\n\n3. Social Media Monitoring: Using machine learning algorithms to monitor social media conversations and detect potential issues or trends.\n\n4. Social Media Advertising: Leveraging machine learning to optimize social media advertising campaigns and target the right audience.\n\n5. Social Media Recommendations: Using machine learning to recommend content to users based on their interests and preferences.\n","output_type":"stream"}],"execution_count":null},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"What are some emerging machine learning use-cases in healthcare?\", max_tokens=1000, temperature=0)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"a08e5d3197504ba499f8de9b63c83090","allow_embed":"code_output","source_hash":"f3e455d","execution_start":1673365252249,"execution_millis":5894,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. 
Automated Diagnosis: Machine learning algorithms can be used to analyze patient data and medical images to detect and diagnose diseases.\n\n2. Personalized Medicine: Machine learning algorithms can be used to analyze patient data and medical images to create personalized treatment plans for each patient.\n\n3. Drug Discovery: Machine learning algorithms can be used to analyze large datasets of chemical compounds to identify potential new drugs.\n\n4. Clinical Decision Support: Machine learning algorithms can be used to analyze patient data and medical images to provide clinicians with real-time decision support.\n\n5. Predictive Analytics: Machine learning algorithms can be used to analyze patient data and medical images to predict future health outcomes.\n","output_type":"stream"}],"execution_count":44},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"What are some good research questions on using deep learning for image detection?\", max_tokens=1000, temperature=0)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"77d3e37fec824ba69d71991b9c73298e","allow_embed":"code_output","source_hash":"543c1ba7","execution_start":1673365541881,"execution_millis":7904,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. What are the most effective deep learning architectures for image detection?\n2. How can deep learning be used to improve the accuracy of image detection?\n3. What are the most effective methods for training deep learning models for image detection?\n4. How can deep learning be used to detect objects in images with varying levels of complexity?\n5. How can deep learning be used to detect objects in images with varying levels of illumination?\n6. How can deep learning be used to detect objects in images with varying levels of noise?\n7. How can deep learning be used to detect objects in images with varying levels of resolution?\n8. 
How can deep learning be used to detect objects in images with varying levels of occlusion?\n9. How can deep learning be used to detect objects in images with varying levels of background clutter?\n10. How can deep learning be used to detect objects in images with varying levels of rotation?\n","output_type":"stream"}],"execution_count":45},{"cell_type":"code","source":"completion = openai.Completion.create(engine=\"text-davinci-003\", prompt=\"What are some good research questions related to NLP transformer models?\", max_tokens=1000, temperature=0)\nprint(completion.choices[0]['text'])","metadata":{"tags":[],"cell_id":"959960a41e784d279afadf9e891c0377","allow_embed":"code_output","source_hash":"649001ea","execution_start":1673365855245,"execution_millis":8775,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"\n\n1. How can transformer models be used to improve the accuracy of natural language processing tasks?\n2. What are the most effective methods for training transformer models for natural language processing tasks?\n3. How can transformer models be used to improve the efficiency of natural language processing tasks?\n4. What are the most effective methods for optimizing transformer models for natural language processing tasks?\n5. How can transformer models be used to improve the interpretability of natural language processing tasks?\n6. What are the most effective methods for deploying transformer models for natural language processing tasks?\n7. How can transformer models be used to improve the scalability of natural language processing tasks?\n8. What are the most effective methods for combining transformer models with other natural language processing techniques?\n9. How can transformer models be used to improve the robustness of natural language processing tasks?\n10. 
What are the most effective methods for evaluating transformer models for natural language processing tasks?\n","output_type":"stream"}],"execution_count":49},{"cell_type":"markdown","source":"\nCreated in deepnote.com \nCreated in Deepnote","metadata":{"tags":[],"created_in_deepnote_cell":true,"deepnote_cell_type":"markdown"}}],"nbformat":4,"nbformat_minor":0,"metadata":{"deepnote":{},"orig_nbformat":2,"deepnote_notebook_id":"226ddc2337ad4c80a9c176feb573faf1","deepnote_persisted_session":{"createdAt":"2023-01-10T14:45:50.913Z"},"deepnote_execution_queue":[]}} -------------------------------------------------------------------------------- /imputer.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":"import pandas as pd \nimport functools\nfrom sklearn.experimental import enable_iterative_imputer\nfrom sklearn.impute import IterativeImputer\nimport numpy as np ","metadata":{"tags":[],"cell_id":"d62b0e3149f34b6da05e2daf6d0f7534","allow_embed":"code_output","source_hash":"5072b50a","execution_start":1666884400995,"execution_millis":1573,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":1},{"cell_type":"code","source":"# impute with zero\ndef simple_imputation(input_function):\n @functools.wraps(input_function)\n def simple_imputation_wrapper(*args, **kwargs):\n return_value = input_function(*args, **kwargs)\n print(\"--------------Before Imputation--------------\")\n print(return_value.isnull().sum(axis = 0))\n return_value.fillna(0, inplace = True)\n print(\"--------------After Imputation--------------\")\n print(return_value.isnull().sum(axis = 0))\n return return_value\n return 
simple_imputation_wrapper","metadata":{"tags":[],"cell_id":"0a9a725472834fbeb7b5214be47e6d29","allow_embed":"code_output","source_hash":"c0cc44af","execution_start":1666884402575,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":2},{"cell_type":"code","source":"","metadata":{"tags":[],"cell_id":"bc8e45f3df6d4383b659a0d84928a63f","source_hash":"b623e53d","execution_start":1666884402582,"execution_millis":2,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":2},{"cell_type":"code","source":"@simple_imputation\ndef read_data():\n df = pd.read_csv(\"wines_data.csv\", sep = \";\")\n return df","metadata":{"tags":[],"cell_id":"5e138fd1e3af494c82e86f7ba4731199","source_hash":"e271671b","execution_start":1666884402588,"execution_millis":2,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":3},{"cell_type":"code","source":" read_data()","metadata":{"tags":[],"cell_id":"dc9cff612f244c76b95fdb252973f1f8","allow_embed":"code_output","source_hash":"6fc175e1","execution_start":1666884402642,"execution_millis":1460,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"--------------Before Imputation--------------\ncountry 2\ndesignation 43826\npoints 5\nprice 13396\nprovince 7\nregion_1 23845\nregion_2 85659\nvariety 5\nwinery 5\nlast_year_points 0\ndtype: int64\n--------------After Imputation--------------\ncountry 0\ndesignation 0\npoints 0\nprice 0\nprovince 0\nregion_1 0\nregion_2 0\nvariety 0\nwinery 0\nlast_year_points 0\ndtype: 
int64\n","output_type":"stream"},{"output_type":"execute_result","execution_count":4,"data":{"application/vnd.deepnote.dataframe.v3+json":{"column_count":10,"row_count":144037,"columns":[{"name":"country","dtype":"object"},{"name":"designation","dtype":"object"},{"name":"points","dtype":"float64"},{"name":"price","dtype":"float64"},{"name":"province","dtype":"object"},{"name":"region_1","dtype":"object"},{"name":"region_2","dtype":"object"},{"name":"variety","dtype":"object"},{"name":"winery","dtype":"object"},{"name":"last_year_points","dtype":"int64"},{"name":"_deepnote_index_column","dtype":"int64"}],"rows":[{"country":"US","designation":"Martha's Vineyard","points":"96.0","price":"235.0","province":"California","region_1":"Napa Valley","region_2":"Napa","variety":"Cabernet Sauvignon","winery":"Heitz","last_year_points":"94","_deepnote_index_column":"0"},{"country":"Spain","designation":"Carodorum Selección Especial Reserva","points":"96.0","price":"110.0","province":"Northern Spain","region_1":"Toro","region_2":"0","variety":"Tinta de Toro","winery":"Bodega Carmen Rodríguez","last_year_points":"92","_deepnote_index_column":"1"},{"country":"US","designation":"Special Selected Late Harvest","points":"96.0","price":"90.0","province":"California","region_1":"Knights Valley","region_2":"Sonoma","variety":"Sauvignon Blanc","winery":"Macauley","last_year_points":"100","_deepnote_index_column":"2"},{"country":"US","designation":"Reserve","points":"96.0","price":"65.0","province":"Oregon","region_1":"Willamette Valley","region_2":"Willamette Valley","variety":"Pinot Noir","winery":"Ponzi","last_year_points":"94","_deepnote_index_column":"3"},{"country":"France","designation":"La Brûlade","points":"95.0","price":"66.0","province":"Provence","region_1":"Bandol","region_2":"0","variety":"Provence red blend","winery":"Domaine de la 
Bégude","last_year_points":"94","_deepnote_index_column":"4"},{"country":"Spain","designation":"Numanthia","points":"95.0","price":"73.0","province":"Northern Spain","region_1":"Toro","region_2":"0","variety":"Tinta de Toro","winery":"Numanthia","last_year_points":"94","_deepnote_index_column":"5"},{"country":"Spain","designation":"San Román","points":"95.0","price":"65.0","province":"Northern Spain","region_1":"Toro","region_2":"0","variety":"Tinta de Toro","winery":"Maurodos","last_year_points":"89","_deepnote_index_column":"6"},{"country":"Spain","designation":"Carodorum Único Crianza","points":"95.0","price":"110.0","province":"Northern Spain","region_1":"Toro","region_2":"0","variety":"Tinta de Toro","winery":"Bodega Carmen Rodríguez","last_year_points":"88","_deepnote_index_column":"7"},{"country":"US","designation":"Silice","points":"95.0","price":"65.0","province":"Oregon","region_1":"Chehalem Mountains","region_2":"Willamette Valley","variety":"Pinot Noir","winery":"Bergström","last_year_points":"83","_deepnote_index_column":"8"},{"country":"US","designation":"Gap's Crown Vineyard","points":"95.0","price":"60.0","province":"California","region_1":"Sonoma Coast","region_2":"Sonoma","variety":"Pinot Noir","winery":"Blue Farm","last_year_points":"83","_deepnote_index_column":"9"}]},"text/plain":" country designation points price \\\n0 US Martha's Vineyard 96.0 235.0 \n1 Spain Carodorum Selección Especial Reserva 96.0 110.0 \n2 US Special Selected Late Harvest 96.0 90.0 \n3 US Reserve 96.0 65.0 \n4 France La Brûlade 95.0 66.0 \n... ... ... ... ... \n144032 Italy 0 91.0 20.0 \n144033 France Cuvée Prestige 91.0 27.0 \n144034 Italy Terre di Dora 91.0 20.0 \n144035 France Grand Brut Rosé 90.0 52.0 \n144036 Italy 0 90.0 15.0 \n\n province region_1 region_2 \\\n0 California Napa Valley Napa \n1 Northern Spain Toro 0 \n2 California Knights Valley Sonoma \n3 Oregon Willamette Valley Willamette Valley \n4 Provence Bandol 0 \n... ... ... ... 
\n144032 Southern Italy Fiano di Avellino 0 \n144033 Champagne Champagne 0 \n144034 Southern Italy Fiano di Avellino 0 \n144035 Champagne Champagne 0 \n144036 Northeastern Italy Alto Adige 0 \n\n variety winery last_year_points \n0 Cabernet Sauvignon Heitz 94 \n1 Tinta de Toro Bodega Carmen Rodríguez 92 \n2 Sauvignon Blanc Macauley 100 \n3 Pinot Noir Ponzi 94 \n4 Provence red blend Domaine de la Bégude 94 \n... ... ... ... \n144032 White Blend Feudi di San Gregorio 84 \n144033 Champagne Blend H.Germain 83 \n144034 White Blend Terredora 97 \n144035 Champagne Blend Gosset 89 \n144036 Pinot Grigio Alois Lageder 82 \n\n[144037 rows x 10 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
countrydesignationpointspriceprovinceregion_1region_2varietywinerylast_year_points
0USMartha's Vineyard96.0235.0CaliforniaNapa ValleyNapaCabernet SauvignonHeitz94
1SpainCarodorum Selección Especial Reserva96.0110.0Northern SpainToro0Tinta de ToroBodega Carmen Rodríguez92
2USSpecial Selected Late Harvest96.090.0CaliforniaKnights ValleySonomaSauvignon BlancMacauley100
3USReserve96.065.0OregonWillamette ValleyWillamette ValleyPinot NoirPonzi94
4FranceLa Brûlade95.066.0ProvenceBandol0Provence red blendDomaine de la Bégude94
.................................
144032Italy091.020.0Southern ItalyFiano di Avellino0White BlendFeudi di San Gregorio84
144033FranceCuvée Prestige91.027.0ChampagneChampagne0Champagne BlendH.Germain83
144034ItalyTerre di Dora91.020.0Southern ItalyFiano di Avellino0White BlendTerredora97
144035FranceGrand Brut Rosé90.052.0ChampagneChampagne0Champagne BlendGosset89
144036Italy090.015.0Northeastern ItalyAlto Adige0Pinot GrigioAlois Lageder82
\n

144037 rows × 10 columns

\n
"},"metadata":{}}],"execution_count":4},{"cell_type":"code","source":"df = read_data()\ndf.isnull().sum(axis = 0)","metadata":{"tags":[],"cell_id":"4ff24651309843e2927764d50596ae93","source_hash":"81ba9071","execution_start":1666884403275,"execution_millis":1095,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"--------------Before Imputation--------------\ncountry 2\ndesignation 43826\npoints 5\nprice 13396\nprovince 7\nregion_1 23845\nregion_2 85659\nvariety 5\nwinery 5\nlast_year_points 0\ndtype: int64\n--------------After Imputation--------------\ncountry 0\ndesignation 0\npoints 0\nprice 0\nprovince 0\nregion_1 0\nregion_2 0\nvariety 0\nwinery 0\nlast_year_points 0\ndtype: int64\n","output_type":"stream"},{"output_type":"execute_result","execution_count":5,"data":{"text/plain":"country 0\ndesignation 0\npoints 0\nprice 0\nprovince 0\nregion_1 0\nregion_2 0\nvariety 0\nwinery 0\nlast_year_points 0\ndtype: int64"},"metadata":{}}],"execution_count":5},{"cell_type":"code","source":"def meanmode_imputation(input_function):\n @functools.wraps(input_function)\n def meanmode_imputation_wrapper(*args, **kwargs):\n return_value = input_function(*args, **kwargs)\n print(\"--------------Before Mean/Mode Imputation--------------\")\n print(return_value.isnull().sum(axis = 0))\n for col in list(return_value.columns):\n if return_value[col].dtype == float:\n return_value[col].fillna(return_value[col].mean(), inplace = True)\n elif return_value[col].dtype.name == 'category': \n return_value[col].fillna(return_value[col].mode()[0], inplace = True)\n print(\"--------------After Mean/Mode Imputation--------------\")\n print(return_value.isnull().sum(axis = 0))\n return return_value\n return 
meanmode_imputation_wrapper","metadata":{"tags":[],"cell_id":"431f7417c9c04b26939cf1a44b7bee90","allow_embed":"code_output","source_hash":"25e07f36","execution_start":1666884403888,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":6},{"cell_type":"code","source":"@meanmode_imputation\ndef read_data(data_type_dict):\n df = pd.read_csv(\"wines_data.csv\", sep = \";\")\n for col in list(df.columns):\n df[col] = df[col].astype(data_type_dict[col])\n return df","metadata":{"tags":[],"cell_id":"bb78c01d86c84c10be8e602e78190680","allow_embed":"code_output","source_hash":"642ff961","execution_start":1666884403893,"execution_millis":2,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":7},{"cell_type":"code","source":" data_type_dict = {'country':'category', 'designation':'category',\n'points':'float', 'price':'float', 'province':'category', 'region_1':'category',\n 'region_2':'category', 'variety':'category', 'winery':'category', 'last_year_points':'float'}\ndf = read_data(data_type_dict)","metadata":{"tags":[],"cell_id":"bcb072ac40b740cc93e6acd1b553090b","allow_embed":"code_output","source_hash":"5ef22636","execution_start":1666884403900,"execution_millis":666,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"--------------Before Mean/Mode Imputation--------------\ncountry 2\ndesignation 43826\npoints 5\nprice 13396\nprovince 7\nregion_1 23845\nregion_2 85659\nvariety 5\nwinery 5\nlast_year_points 0\ndtype: int64\n--------------After Mean/Mode Imputation--------------\ncountry 0\ndesignation 0\npoints 0\nprice 0\nprovince 0\nregion_1 0\nregion_2 0\nvariety 0\nwinery 0\nlast_year_points 0\ndtype: int64\n","output_type":"stream"}],"execution_count":8},{"cell_type":"code","source":"def iterative_imputation(input_function):\n @functools.wraps(input_function)\n def iterative_imputation_wrapper(*args, **kwargs):\n 
return_value = input_function(*args, **kwargs)\n print(\"--------------Before Bayesian Ridge Regression Imputation--------------\")\n print(return_value.isnull().sum(axis = 0))\n return_num = return_value[['price', 'points', 'last_year_points']]\n return_cat = return_value.drop(columns=['price', 'points', 'last_year_points'])\n\n imp_bayesian = IterativeImputer(max_iter=10, random_state=0)\n imp_bayesian.fit(np.array(return_num))\n return_num = pd.DataFrame(np.round(imp_bayesian.transform(np.array(return_num))), columns = ['price', 'points', 'last_year_points'])\n for col in list(return_cat.columns):\n return_cat[col].fillna(return_cat[col].mode()[0], inplace = True)\n return_value = pd.concat([return_cat, return_num], axis=1)\n print(\"--------------After Bayesian Ridge Regression Imputation--------------\")\n print(return_value.isnull().sum(axis = 0))\n return return_value\n return iterative_imputation_wrapper","metadata":{"tags":[],"cell_id":"8bcf361def3549a6bed09465d4f64637","allow_embed":"code_output","source_hash":"ccf70f29","execution_start":1666884404568,"execution_millis":19,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":9},{"cell_type":"code","source":"@iterative_imputation\ndef read_data(data_type_dict):\n df = pd.read_csv(\"wines_data.csv\", sep = \";\")\n for col in list(df.columns):\n df[col] = df[col].astype(data_type_dict[col])\n return df","metadata":{"tags":[],"cell_id":"3bde52f0e03e4977bb6c5976b9a88b4c","allow_embed":"code_output","source_hash":"f6485403","execution_start":1666884404590,"execution_millis":70578,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":10},{"cell_type":"code","source":" data_type_dict = {'country':'category', 'designation':'category',\n'points':'float', 'price':'float', 'province':'category', 'region_1':'category',\n 'region_2':'category', 'variety':'category', 'winery':'category', 'last_year_points':'float'}\ndf = 
read_data(data_type_dict)","metadata":{"tags":[],"cell_id":"f725b7390fdd450a98fac67e0993f450","allow_embed":"code_output","source_hash":"5ef22636","execution_start":1666884404600,"execution_millis":1697,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"--------------Before Bayesian Ridge Regression Imputation--------------\ncountry 2\ndesignation 43826\npoints 5\nprice 13396\nprovince 7\nregion_1 23845\nregion_2 85659\nvariety 5\nwinery 5\nlast_year_points 0\ndtype: int64\n--------------After Bayesian Ridge Regression Imputation--------------\ncountry 0\ndesignation 0\nprovince 0\nregion_1 0\nregion_2 0\nvariety 0\nwinery 0\nprice 0\npoints 0\nlast_year_points 0\ndtype: int64\n","output_type":"stream"}],"execution_count":11},{"cell_type":"code","source":" data_type_dict = {'country':'category', 'designation':'category',\n'points':'float', 'price':'float', 'province':'category', 'region_1':'category',\n 'region_2':'category', 'variety':'category', 'winery':'category', 'last_year_points':'float'}\ndf_original = pd.read_csv(\"wines_data.csv\", sep = \";\")\ndf_imp = read_data(data_type_dict)\n","metadata":{"tags":[],"cell_id":"0f9c67b50e6c45799beb6125b7ff597d","source_hash":"b3caafb9","execution_start":1666884406300,"execution_millis":1982,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"--------------Before Bayesian Ridge Regression Imputation--------------\ncountry 2\ndesignation 43826\npoints 5\nprice 13396\nprovince 7\nregion_1 23845\nregion_2 85659\nvariety 5\nwinery 5\nlast_year_points 0\ndtype: int64\n--------------After Bayesian Ridge Regression Imputation--------------\ncountry 0\ndesignation 0\nprovince 0\nregion_1 0\nregion_2 0\nvariety 0\nwinery 0\nprice 0\npoints 0\nlast_year_points 0\ndtype: int64\n","output_type":"stream"}],"execution_count":12},{"cell_type":"markdown","source":"\nCreated in deepnote.com \nCreated in 
Deepnote","metadata":{"tags":[],"created_in_deepnote_cell":true,"deepnote_cell_type":"markdown"}}],"nbformat":4,"nbformat_minor":0,"metadata":{"deepnote":{},"orig_nbformat":2,"deepnote_notebook_id":"d50ab12ed7b646ee9d6d24c80167f73f","deepnote_execution_queue":[]}} -------------------------------------------------------------------------------- /list_comp_generators.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":"import pandas as pd \n\ndf = pd.read_csv(\"insurance.csv\")","metadata":{"tags":[],"cell_id":"720286d65de54a80a00c3a52f25e5169","allow_embed":"code_output","source_hash":"d97cb9c5","execution_start":1671471973316,"execution_millis":1,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":1},{"cell_type":"code","source":"df.head()","metadata":{"tags":[],"cell_id":"525400f95b4a47dca0943addeb411a79","allow_embed":"code_output","source_hash":"c085b6ba","execution_start":1671471973320,"execution_millis":33,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","execution_count":2,"data":{"application/vnd.deepnote.dataframe.v3+json":{"column_count":7,"row_count":5,"columns":[{"name":"age","dtype":"int64","stats":{"unique_count":5,"nan_count":0,"min":"18","max":"33","histogram":[{"bin_start":18,"bin_end":19.5,"count":2},{"bin_start":19.5,"bin_end":21,"count":0},{"bin_start":21,"bin_end":22.5,"count":0},{"bin_start":22.5,"bin_end":24,"count":0},{"bin_start":24,"bin_end":25.5,"count":0},{"bin_start":25.5,"bin_end":27,"count":0},{"bin_start":27,"bin_end":28.5,"count":1},{"bin_start":28.5,"bin_end":30,"count":0},{"bin_start":30,"bin_end":31.5,"count":0},{"bin_start":31.5,"bin_end":33,"count":2}]}},{"name":"sex","dtype":"object","stats":{"unique_count":2,"nan_count":0,"categories":[{"name":"male","count":4},{"name":"female","count":1}]}},{"name":"bmi","dtype":"float64","stats":{"unique_count":5,"nan
_count":0,"min":"22.705","max":"33.77","histogram":[{"bin_start":22.705,"bin_end":23.8115,"count":1},{"bin_start":23.8115,"bin_end":24.918,"count":0},{"bin_start":24.918,"bin_end":26.0245,"count":0},{"bin_start":26.0245,"bin_end":27.131,"count":0},{"bin_start":27.131,"bin_end":28.2375,"count":1},{"bin_start":28.2375,"bin_end":29.344,"count":1},{"bin_start":29.344,"bin_end":30.4505,"count":0},{"bin_start":30.4505,"bin_end":31.557000000000002,"count":0},{"bin_start":31.557000000000002,"bin_end":32.6635,"count":0},{"bin_start":32.6635,"bin_end":33.77,"count":2}]}},{"name":"children","dtype":"int64","stats":{"unique_count":3,"nan_count":0,"min":"0","max":"3","histogram":[{"bin_start":0,"bin_end":0.3,"count":3},{"bin_start":0.3,"bin_end":0.6,"count":0},{"bin_start":0.6,"bin_end":0.8999999999999999,"count":0},{"bin_start":0.8999999999999999,"bin_end":1.2,"count":1},{"bin_start":1.2,"bin_end":1.5,"count":0},{"bin_start":1.5,"bin_end":1.7999999999999998,"count":0},{"bin_start":1.7999999999999998,"bin_end":2.1,"count":0},{"bin_start":2.1,"bin_end":2.4,"count":0},{"bin_start":2.4,"bin_end":2.6999999999999997,"count":0},{"bin_start":2.6999999999999997,"bin_end":3,"count":1}]}},{"name":"smoker","dtype":"object","stats":{"unique_count":2,"nan_count":0,"categories":[{"name":"no","count":4},{"name":"yes","count":1}]}},{"name":"region","dtype":"object","stats":{"unique_count":3,"nan_count":0,"categories":[{"name":"southeast","count":2},{"name":"northwest","count":2},{"name":"southwest","count":1}]}},{"name":"charges","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"1725.5523","max":"21984.47061","histogram":[{"bin_start":1725.5523,"bin_end":3751.444131,"count":1},{"bin_start":3751.444131,"bin_end":5777.335962,"count":2},{"bin_start":5777.335962,"bin_end":7803.227793000001,"count":0},{"bin_start":7803.227793000001,"bin_end":9829.119624,"count":0},{"bin_start":9829.119624,"bin_end":11855.011455,"count":0},{"bin_start":11855.011455,"bin_end":13880.903286,"count":0},{"b
in_start":13880.903286,"bin_end":15906.795117,"count":0},{"bin_start":15906.795117,"bin_end":17932.686948000002,"count":1},{"bin_start":17932.686948000002,"bin_end":19958.578779,"count":0},{"bin_start":19958.578779,"bin_end":21984.47061,"count":1}]}},{"name":"_deepnote_index_column","dtype":"int64"}],"rows":[{"age":"19","sex":"female","bmi":"27.9","children":"0","smoker":"yes","region":"southwest","charges":"16884.924","_deepnote_index_column":"0"},{"age":"18","sex":"male","bmi":"33.77","children":"1","smoker":"no","region":"southeast","charges":"1725.5523","_deepnote_index_column":"1"},{"age":"28","sex":"male","bmi":"33.0","children":"3","smoker":"no","region":"southeast","charges":"4449.462","_deepnote_index_column":"2"},{"age":"33","sex":"male","bmi":"22.705","children":"0","smoker":"no","region":"northwest","charges":"21984.47061","_deepnote_index_column":"3"},{"age":"32","sex":"male","bmi":"28.88","children":"0","smoker":"no","region":"northwest","charges":"3866.8552","_deepnote_index_column":"4"}]},"text/plain":" age sex bmi children smoker region charges\n0 19 female 27.900 0 yes southwest 16884.92400\n1 18 male 33.770 1 no southeast 1725.55230\n2 28 male 33.000 3 no southeast 4449.46200\n3 33 male 22.705 0 no northwest 21984.47061\n4 32 male 28.880 0 no northwest 3866.85520","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
agesexbmichildrensmokerregioncharges
019female27.9000yessouthwest16884.92400
118male33.7701nosoutheast1725.55230
228male33.0003nosoutheast4449.46200
333male22.7050nonorthwest21984.47061
432male28.8800nonorthwest3866.85520
\n
"},"metadata":{}}],"execution_count":2},{"cell_type":"code","source":"import numpy as np \ndef log_transform(input_list):\n return np.log(input_list)","metadata":{"tags":[],"cell_id":"ca49d07ef75d454bbac460fcc5207eb4","allow_embed":"code_output","source_hash":"10f8e841","execution_start":1671471973358,"execution_millis":2,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":3},{"cell_type":"code","source":"bmi_list = list(df['bmi'])\nbmi_lt_map = list(map(log_transform, bmi_list))\ndf['bmi_lt_map'] = bmi_lt_map","metadata":{"tags":[],"cell_id":"6b98a85dc6fa415f9c96b4a76f7ca127","allow_embed":"code_output","source_hash":"9ab65662","execution_start":1671471973402,"execution_millis":0,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":4},{"cell_type":"code","source":"df['bmi_lt_listcomp'] = [np.log(bmi) for bmi in list(df['bmi'])]","metadata":{"tags":[],"cell_id":"b533e1b8442a4d1fbc347dd7bf162e6c","allow_embed":"code_output","source_hash":"3fe3a273","execution_start":1671471973402,"execution_millis":0,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":5},{"cell_type":"code","source":"df.head()","metadata":{"tags":[],"cell_id":"08a80579a5844de0bd619fbe1edba061","source_hash":"c085b6ba","execution_start":1671471973403,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","execution_count":6,"data":{"application/vnd.deepnote.dataframe.v3+json":{"column_count":9,"row_count":5,"columns":[{"name":"age","dtype":"int64","stats":{"unique_count":5,"nan_count":0,"min":"18","max":"33","histogram":[{"bin_start":18,"bin_end":19.5,"count":2},{"bin_start":19.5,"bin_end":21,"count":0},{"bin_start":21,"bin_end":22.5,"count":0},{"bin_start":22.5,"bin_end":24,"count":0},{"bin_start":24,"bin_end":25.5,"count":0},{"bin_start":25.5,"bin_end":27,"count":0},{"bin_start":27,"bin_end":28.5,"count":1
},{"bin_start":28.5,"bin_end":30,"count":0},{"bin_start":30,"bin_end":31.5,"count":0},{"bin_start":31.5,"bin_end":33,"count":2}]}},{"name":"sex","dtype":"object","stats":{"unique_count":2,"nan_count":0,"categories":[{"name":"male","count":4},{"name":"female","count":1}]}},{"name":"bmi","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"22.705","max":"33.77","histogram":[{"bin_start":22.705,"bin_end":23.8115,"count":1},{"bin_start":23.8115,"bin_end":24.918,"count":0},{"bin_start":24.918,"bin_end":26.0245,"count":0},{"bin_start":26.0245,"bin_end":27.131,"count":0},{"bin_start":27.131,"bin_end":28.2375,"count":1},{"bin_start":28.2375,"bin_end":29.344,"count":1},{"bin_start":29.344,"bin_end":30.4505,"count":0},{"bin_start":30.4505,"bin_end":31.557000000000002,"count":0},{"bin_start":31.557000000000002,"bin_end":32.6635,"count":0},{"bin_start":32.6635,"bin_end":33.77,"count":2}]}},{"name":"children","dtype":"int64","stats":{"unique_count":3,"nan_count":0,"min":"0","max":"3","histogram":[{"bin_start":0,"bin_end":0.3,"count":3},{"bin_start":0.3,"bin_end":0.6,"count":0},{"bin_start":0.6,"bin_end":0.8999999999999999,"count":0},{"bin_start":0.8999999999999999,"bin_end":1.2,"count":1},{"bin_start":1.2,"bin_end":1.5,"count":0},{"bin_start":1.5,"bin_end":1.7999999999999998,"count":0},{"bin_start":1.7999999999999998,"bin_end":2.1,"count":0},{"bin_start":2.1,"bin_end":2.4,"count":0},{"bin_start":2.4,"bin_end":2.6999999999999997,"count":0},{"bin_start":2.6999999999999997,"bin_end":3,"count":1}]}},{"name":"smoker","dtype":"object","stats":{"unique_count":2,"nan_count":0,"categories":[{"name":"no","count":4},{"name":"yes","count":1}]}},{"name":"region","dtype":"object","stats":{"unique_count":3,"nan_count":0,"categories":[{"name":"southeast","count":2},{"name":"northwest","count":2},{"name":"southwest","count":1}]}},{"name":"charges","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"1725.5523","max":"21984.47061","histogram":[{"bin_start":1725.5523,"bin_e
nd":3751.444131,"count":1},{"bin_start":3751.444131,"bin_end":5777.335962,"count":2},{"bin_start":5777.335962,"bin_end":7803.227793000001,"count":0},{"bin_start":7803.227793000001,"bin_end":9829.119624,"count":0},{"bin_start":9829.119624,"bin_end":11855.011455,"count":0},{"bin_start":11855.011455,"bin_end":13880.903286,"count":0},{"bin_start":13880.903286,"bin_end":15906.795117,"count":0},{"bin_start":15906.795117,"bin_end":17932.686948000002,"count":1},{"bin_start":17932.686948000002,"bin_end":19958.578779,"count":0},{"bin_start":19958.578779,"bin_end":21984.47061,"count":1}]}},{"name":"bmi_lt_map","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"3.122585164549914","max":"3.519572834397476","histogram":[{"bin_start":3.122585164549914,"bin_end":3.1622839315346702,"count":1},{"bin_start":3.1622839315346702,"bin_end":3.2019826985194264,"count":0},{"bin_start":3.2019826985194264,"bin_end":3.2416814655041826,"count":0},{"bin_start":3.2416814655041826,"bin_end":3.2813802324889387,"count":0},{"bin_start":3.2813802324889387,"bin_end":3.321078999473695,"count":0},{"bin_start":3.321078999473695,"bin_end":3.3607777664584515,"count":1},{"bin_start":3.3607777664584515,"bin_end":3.4004765334432077,"count":1},{"bin_start":3.4004765334432077,"bin_end":3.440175300427964,"count":0},{"bin_start":3.440175300427964,"bin_end":3.47987406741272,"count":0},{"bin_start":3.47987406741272,"bin_end":3.519572834397476,"count":2}]}},{"name":"bmi_lt_listcomp","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"3.122585164549914","max":"3.519572834397476","histogram":[{"bin_start":3.122585164549914,"bin_end":3.1622839315346702,"count":1},{"bin_start":3.1622839315346702,"bin_end":3.2019826985194264,"count":0},{"bin_start":3.2019826985194264,"bin_end":3.2416814655041826,"count":0},{"bin_start":3.2416814655041826,"bin_end":3.2813802324889387,"count":0},{"bin_start":3.2813802324889387,"bin_end":3.321078999473695,"count":0},{"bin_start":3.321078999473695,"bin_end":3.3607777
664584515,"count":1},{"bin_start":3.3607777664584515,"bin_end":3.4004765334432077,"count":1},{"bin_start":3.4004765334432077,"bin_end":3.440175300427964,"count":0},{"bin_start":3.440175300427964,"bin_end":3.47987406741272,"count":0},{"bin_start":3.47987406741272,"bin_end":3.519572834397476,"count":2}]}},{"name":"_deepnote_index_column","dtype":"int64"}],"rows":[{"age":"19","sex":"female","bmi":"27.9","children":"0","smoker":"yes","region":"southwest","charges":"16884.924","bmi_lt_map":"3.32862668882732","bmi_lt_listcomp":"3.32862668882732","_deepnote_index_column":"0"},{"age":"18","sex":"male","bmi":"33.77","children":"1","smoker":"no","region":"southeast","charges":"1725.5523","bmi_lt_map":"3.519572834397476","bmi_lt_listcomp":"3.519572834397476","_deepnote_index_column":"1"},{"age":"28","sex":"male","bmi":"33.0","children":"3","smoker":"no","region":"southeast","charges":"4449.462","bmi_lt_map":"3.4965075614664802","bmi_lt_listcomp":"3.4965075614664802","_deepnote_index_column":"2"},{"age":"33","sex":"male","bmi":"22.705","children":"0","smoker":"no","region":"northwest","charges":"21984.47061","bmi_lt_map":"3.122585164549914","bmi_lt_listcomp":"3.122585164549914","_deepnote_index_column":"3"},{"age":"32","sex":"male","bmi":"28.88","children":"0","smoker":"no","region":"northwest","charges":"3866.8552","bmi_lt_map":"3.3631493140246254","bmi_lt_listcomp":"3.3631493140246254","_deepnote_index_column":"4"}]},"text/plain":" age sex bmi children smoker region charges bmi_lt_map \\\n0 19 female 27.900 0 yes southwest 16884.92400 3.328627 \n1 18 male 33.770 1 no southeast 1725.55230 3.519573 \n2 28 male 33.000 3 no southeast 4449.46200 3.496508 \n3 33 male 22.705 0 no northwest 21984.47061 3.122585 \n4 32 male 28.880 0 no northwest 3866.85520 3.363149 \n\n bmi_lt_listcomp \n0 3.328627 \n1 3.519573 \n2 3.496508 \n3 3.122585 \n4 3.363149 ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
agesexbmichildrensmokerregionchargesbmi_lt_mapbmi_lt_listcomp
019female27.9000yessouthwest16884.924003.3286273.328627
118male33.7701nosoutheast1725.552303.5195733.519573
228male33.0003nosoutheast4449.462003.4965083.496508
333male22.7050nonorthwest21984.470613.1225853.122585
432male28.8800nonorthwest3866.855203.3631493.363149
\n
"},"metadata":{}}],"execution_count":6},{"cell_type":"code","source":"df['bmi_lt_direct'] = np.log(df['bmi'])","metadata":{"tags":[],"cell_id":"c308658a91064fd8a1c79f5a8a2f0d43","allow_embed":"code_output","source_hash":"b203a072","execution_start":1671471973406,"execution_millis":2,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":7},{"cell_type":"code","source":"my_predictions = [[0.5, 0.2, 0.8], [0.3, 0.1, 0.9], [1.0, 0.2, 0.7]]","metadata":{"tags":[],"cell_id":"2e35b2e354f243db9cf9a71896de6186","allow_embed":"code_output","source_hash":"a9e90b01","execution_start":1671471973410,"execution_millis":1,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":8},{"cell_type":"code","source":"flattened_predictions = [prob for row in my_predictions for prob in row]\nprint(flattened_predictions)","metadata":{"tags":[],"cell_id":"5864af64293a495fbd99b231f17c378f","allow_embed":"code_output","source_hash":"e5e060d5","execution_start":1671471973413,"execution_millis":9,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"[0.5, 0.2, 0.8, 0.3, 0.1, 0.9, 1.0, 0.2, 0.7]\n","output_type":"stream"}],"execution_count":9},{"cell_type":"code","source":"flat_fl = []\nfor row in my_predictions:\n for prob in row:\n flat_fl.append(prob)\nprint(flat_fl)","metadata":{"tags":[],"cell_id":"809181cccc09408f9f490d70fa8bc603","allow_embed":"code_output","source_hash":"5fa68ca4","execution_start":1671471973458,"execution_millis":4,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"[0.5, 0.2, 0.8, 0.3, 0.1, 0.9, 1.0, 0.2, 0.7]\n","output_type":"stream"}],"execution_count":10},{"cell_type":"code","source":"ml_labels = [['Yes' if prob >= 0.8 else 'Maybe' if (prob > 0.5 and prob < 0.8) else 'No' for prob in row] for row in 
my_predictions]\nprint(my_predictions)\nprint(ml_labels)","metadata":{"tags":[],"cell_id":"2f3136c0e4df48a09b092cc4adb3692c","allow_embed":"code_output","source_hash":"f3e6c582","execution_start":1671471973459,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"[[0.5, 0.2, 0.8], [0.3, 0.1, 0.9], [1.0, 0.2, 0.7]]\n[['No', 'No', 'Yes'], ['No', 'No', 'Yes'], ['Yes', 'No', 'Maybe']]\n","output_type":"stream"}],"execution_count":11},{"cell_type":"code","source":"fl_labels = []\n\nfor row in my_predictions:\n hold_list = []\n for prob in row:\n if prob >= 0.8:\n hold_list.append('Yes')\n elif (prob > 0.5 and prob < 0.8):\n hold_list.append('Maybe')\n else:\n hold_list.append('No')\n fl_labels.append(hold_list)\n\nprint(my_predictions)\nprint(fl_labels)\n\n","metadata":{"tags":[],"cell_id":"81cc389958484fc7840fc640f49085a5","allow_embed":"code_output","source_hash":"281fe39d","execution_start":1671471973460,"execution_millis":2,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"[[0.5, 0.2, 0.8], [0.3, 0.1, 0.9], [1.0, 0.2, 0.7]]\n[['No', 'No', 'Yes'], ['No', 'No', 'Yes'], ['Yes', 'No', 'Maybe']]\n","output_type":"stream"}],"execution_count":12},{"cell_type":"code","source":"import numpy as np \nmu, sigma = 0.5, 0.1\nnp.random.seed(42)\nprobs = np.random.normal(mu, sigma, 100000000)\nprint(probs[:10])","metadata":{"tags":[],"cell_id":"1b7f647b0e47402993e5f049baa13315","allow_embed":"code_output","source_hash":"3befc574","execution_start":1671471973462,"execution_millis":3630,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"[0.54967142 0.48617357 0.56476885 0.65230299 0.47658466 0.4765863\n 0.65792128 0.57674347 0.45305256 0.554256 ]\n","output_type":"stream"}],"execution_count":13},{"cell_type":"code","source":"prob_labels = ['Yes' if prob > 0.5 else 'No' for prob in 
probs]","metadata":{"tags":[],"cell_id":"2fc6ac4f380c4e47890a26a04b2a6475","allow_embed":"code_output","source_hash":"badae281","execution_start":1671471977131,"execution_millis":7784,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":14},{"cell_type":"code","source":"print(prob_labels[:10])","metadata":{"tags":[],"cell_id":"47dfdb82f82d42cea835539bcd12d70b","allow_embed":"code_output","source_hash":"80cbc92d","execution_start":1671471984918,"execution_millis":2,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"['Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes']\n","output_type":"stream"}],"execution_count":15},{"cell_type":"code","source":"prob_generator = ('Yes' if prob > 0.5 else 'No' for prob in probs)\nprobs_sublist = []\nfor i in range(0, 11):\n probs_sublist.append(next(prob_generator))\nprint(probs_sublist)","metadata":{"tags":[],"cell_id":"0b5a059b7efa49ea8b3d8d15f1f76639","allow_embed":"code_output","source_hash":"2934b395","execution_start":1671472059158,"execution_millis":5,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"['Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No']\n","output_type":"stream"}],"execution_count":18},{"cell_type":"markdown","source":"\nCreated in deepnote.com \nCreated in Deepnote","metadata":{"tags":[],"created_in_deepnote_cell":true,"deepnote_cell_type":"markdown"}}],"nbformat":4,"nbformat_minor":0,"metadata":{"deepnote":{},"orig_nbformat":2,"deepnote_notebook_id":"329863938022481ba67e3c24b20c36dc","deepnote_persisted_session":{"createdAt":"2022-12-14T20:03:02.675Z"},"deepnote_execution_queue":[]}} -------------------------------------------------------------------------------- /p_values_ml.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":"%pip install statsmodels\n%pip 
install mlxtend","metadata":{"tags":[],"cell_id":"32206ddb84a04303a33c22833739f36b","allow_embed":"code_output","source_hash":"23dc76b8","execution_start":1672948325052,"execution_millis":6397,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Collecting statsmodels\n Using cached statsmodels-0.13.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.9 MB)\nRequirement already satisfied: scipy>=1.3 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from statsmodels) (1.9.3)\nRequirement already satisfied: packaging>=21.3 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from statsmodels) (21.3)\nRequirement already satisfied: pandas>=0.25 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from statsmodels) (1.2.5)\nRequirement already satisfied: numpy>=1.17 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from statsmodels) (1.23.4)\nCollecting patsy>=0.5.2\n Using cached patsy-0.5.3-py2.py3-none-any.whl (233 kB)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from packaging>=21.3->statsmodels) (3.0.9)\nRequirement already satisfied: pytz>=2017.3 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from pandas>=0.25->statsmodels) (2022.5)\nRequirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from pandas>=0.25->statsmodels) (2.8.2)\nRequirement already satisfied: six in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)\nInstalling collected packages: patsy, statsmodels\nSuccessfully installed patsy-0.5.3 statsmodels-0.13.5\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0mNote: you may need to restart the kernel to use updated packages.\nRequirement already satisfied: mlxtend in /usr/local/lib/python3.9/site-packages (0.21.0)\nRequirement already satisfied: matplotlib>=3.0.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from mlxtend) (3.6.0)\nRequirement already satisfied: setuptools in /root/venv/lib/python3.9/site-packages (from mlxtend) (58.1.0)\nRequirement already satisfied: pandas>=0.24.2 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from mlxtend) (1.2.5)\nRequirement already satisfied: numpy>=1.16.2 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from mlxtend) (1.23.4)\nRequirement already satisfied: joblib>=0.13.2 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from mlxtend) (1.2.0)\nRequirement already satisfied: scipy>=1.2.1 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from mlxtend) (1.9.3)\nRequirement already satisfied: scikit-learn>=1.0.2 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from mlxtend) (1.1.2)\nRequirement already satisfied: contourpy>=1.0.1 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib>=3.0.0->mlxtend) (1.0.5)\nRequirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from matplotlib>=3.0.0->mlxtend) (3.0.9)\nRequirement already satisfied: fonttools>=4.22.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib>=3.0.0->mlxtend) (4.37.4)\nRequirement already satisfied: cycler>=0.10 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib>=3.0.0->mlxtend) (0.11.0)\nRequirement already 
satisfied: packaging>=20.0 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from matplotlib>=3.0.0->mlxtend) (21.3)\nRequirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib>=3.0.0->mlxtend) (9.2.0)\nRequirement already satisfied: python-dateutil>=2.7 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.2)\nRequirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib>=3.0.0->mlxtend) (1.4.4)\nRequirement already satisfied: pytz>=2017.3 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from pandas>=0.24.2->mlxtend) (2022.5)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from scikit-learn>=1.0.2->mlxtend) (3.1.0)\nRequirement already satisfied: six>=1.5 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->mlxtend) (1.16.0)\n\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n\u001b[0mNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}],"execution_count":1},{"cell_type":"code","source":"import pandas as pd\ninsurance_df = pd.read_csv(\"insurance.csv\")\ninsurance_df.head()","metadata":{"tags":[],"cell_id":"8c2e85e92fd9459ba340be5e0f7e8f8b","allow_embed":"code_output","source_hash":"b7688c1a","execution_start":1672948331483,"execution_millis":2,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","execution_count":2,"data":{"application/vnd.deepnote.dataframe.v3+json":{"column_count":7,"row_count":5,"columns":[{"name":"age","dtype":"int64","stats":{"unique_count":5,"nan_count":0,"min":"18","max":"33","histogram":[{"bin_start":18,"bin_end":19.5,"count":2},{"bin_start":19.5,"bin_end":21,"count":0},{"bin_start":21,"bin_end":22.5,"count":0},{"bin_start":22.5,"bin_end":24,"count":0},{"bin_start":24,"bin_end":25.5,"count":0},{"bin_start":25.5,"bin_end":27,"count":0},{"bin_start":27,"bin_end":28.5,"count":1},{"bin_start":28.5,"bin_end":30,"count":0},{"bin_start":30,"bin_end":31.5,"count":0},{"bin_start":31.5,"bin_end":33,"count":2}]}},{"name":"sex","dtype":"object","stats":{"unique_count":2,"nan_count":0,"categories":[{"name":"male","count":4},{"name":"female","count":1}]}},{"name":"bmi","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"22.705","max":"33.77","histogram":[{"bin_start":22.705,"bin_end":23.8115,"count":1},{"bin_start":23.8115,"bin_end":24.918,"count":0},{"bin_start":24.918,"bin_end":26.0245,"count":0},{"bin_start":26.0245,"bin_end":27.131,"count":0},{"bin_start":27.131,"bin_end":28.2375,"count":1},{"
bin_start":28.2375,"bin_end":29.344,"count":1},{"bin_start":29.344,"bin_end":30.4505,"count":0},{"bin_start":30.4505,"bin_end":31.557000000000002,"count":0},{"bin_start":31.557000000000002,"bin_end":32.6635,"count":0},{"bin_start":32.6635,"bin_end":33.77,"count":2}]}},{"name":"children","dtype":"int64","stats":{"unique_count":3,"nan_count":0,"min":"0","max":"3","histogram":[{"bin_start":0,"bin_end":0.3,"count":3},{"bin_start":0.3,"bin_end":0.6,"count":0},{"bin_start":0.6,"bin_end":0.8999999999999999,"count":0},{"bin_start":0.8999999999999999,"bin_end":1.2,"count":1},{"bin_start":1.2,"bin_end":1.5,"count":0},{"bin_start":1.5,"bin_end":1.7999999999999998,"count":0},{"bin_start":1.7999999999999998,"bin_end":2.1,"count":0},{"bin_start":2.1,"bin_end":2.4,"count":0},{"bin_start":2.4,"bin_end":2.6999999999999997,"count":0},{"bin_start":2.6999999999999997,"bin_end":3,"count":1}]}},{"name":"smoker","dtype":"object","stats":{"unique_count":2,"nan_count":0,"categories":[{"name":"no","count":4},{"name":"yes","count":1}]}},{"name":"region","dtype":"object","stats":{"unique_count":3,"nan_count":0,"categories":[{"name":"southeast","count":2},{"name":"northwest","count":2},{"name":"southwest","count":1}]}},{"name":"charges","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"1725.5523","max":"21984.47061","histogram":[{"bin_start":1725.5523,"bin_end":3751.444131,"count":1},{"bin_start":3751.444131,"bin_end":5777.335962,"count":2},{"bin_start":5777.335962,"bin_end":7803.227793000001,"count":0},{"bin_start":7803.227793000001,"bin_end":9829.119624,"count":0},{"bin_start":9829.119624,"bin_end":11855.011455,"count":0},{"bin_start":11855.011455,"bin_end":13880.903286,"count":0},{"bin_start":13880.903286,"bin_end":15906.795117,"count":0},{"bin_start":15906.795117,"bin_end":17932.686948000002,"count":1},{"bin_start":17932.686948000002,"bin_end":19958.578779,"count":0},{"bin_start":19958.578779,"bin_end":21984.47061,"count":1}]}},{"name":"_deepnote_index_column","dtype":"int64"
}],"rows":[{"age":"19","sex":"female","bmi":"27.9","children":"0","smoker":"yes","region":"southwest","charges":"16884.924","_deepnote_index_column":"0"},{"age":"18","sex":"male","bmi":"33.77","children":"1","smoker":"no","region":"southeast","charges":"1725.5523","_deepnote_index_column":"1"},{"age":"28","sex":"male","bmi":"33.0","children":"3","smoker":"no","region":"southeast","charges":"4449.462","_deepnote_index_column":"2"},{"age":"33","sex":"male","bmi":"22.705","children":"0","smoker":"no","region":"northwest","charges":"21984.47061","_deepnote_index_column":"3"},{"age":"32","sex":"male","bmi":"28.88","children":"0","smoker":"no","region":"northwest","charges":"3866.8552","_deepnote_index_column":"4"}]},"text/plain":" age sex bmi children smoker region charges\n0 19 female 27.900 0 yes southwest 16884.92400\n1 18 male 33.770 1 no southeast 1725.55230\n2 28 male 33.000 3 no southeast 4449.46200\n3 33 male 22.705 0 no northwest 21984.47061\n4 32 male 28.880 0 no northwest 3866.85520","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
agesexbmichildrensmokerregioncharges
019female27.9000yessouthwest16884.92400
118male33.7701nosoutheast1725.55230
228male33.0003nosoutheast4449.46200
333male22.7050nonorthwest21984.47061
432male28.8800nonorthwest3866.85520
\n
"},"metadata":{}}],"execution_count":2},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\nX = insurance_df[['bmi', 'age', 'children']]\ny = insurance_df['charges']\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)","metadata":{"tags":[],"cell_id":"e347f4bec2f94310b26240d19f59fdf2","allow_embed":"code_output","source_hash":"760c53b7","execution_start":1672948331486,"execution_millis":1510,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":3},{"cell_type":"code","source":"import statsmodels.api as sm\nX_train = sm.add_constant(X_train)\nlinear_reg_model = sm.OLS(y_train, X_train)\nlinear_reg_model = linear_reg_model.fit()\nprint(linear_reg_model.summary())","metadata":{"tags":[],"cell_id":"fa1737e8e4954602a106fa885e828d76","allow_embed":"code_output","source_hash":"537cc56d","execution_start":1672948332999,"execution_millis":363,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":" OLS Regression Results \n==============================================================================\nDep. Variable: charges R-squared: 0.110\nModel: OLS Adj. R-squared: 0.107\nMethod: Least Squares F-statistic: 43.86\nDate: Thu, 05 Jan 2023 Prob (F-statistic): 9.94e-27\nTime: 19:52:13 Log-Likelihood: -11507.\nNo. 
Observations: 1070 AIC: 2.302e+04\nDf Residuals: 1066 BIC: 2.304e+04\nDf Model: 3 \nCovariance Type: nonrobust \n==============================================================================\n coef std err t P>|t| [0.025 0.975]\n------------------------------------------------------------------------------\nconst -6118.0462 1968.214 -3.108 0.002 -9980.059 -2256.033\nbmi 332.2025 57.882 5.739 0.000 218.626 445.779\nage 220.7578 24.901 8.865 0.000 171.898 269.618\nchildren 563.0194 286.186 1.967 0.049 1.467 1124.572\n==============================================================================\nOmnibus: 263.642 Durbin-Watson: 1.938\nProb(Omnibus): 0.000 Jarque-Bera (JB): 489.281\nSkew: 1.524 Prob(JB): 5.68e-107\nKurtosis: 4.297 Cond. No. 291.\n==============================================================================\n\nNotes:\n[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"import numpy as np\nchurn_df = pd.read_csv(\"telco_churn.csv\")\nchurn_df.head()\nchurn_df['Churn'] = np.where(churn_df['Churn']=='Yes', 1, 0)","metadata":{"tags":[],"cell_id":"b7a3ac2681b94e3eabab0fcc7a65ad15","allow_embed":"code_output","source_hash":"358cc104","execution_start":1672948333363,"execution_millis":61,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":5},{"cell_type":"code","source":"X = churn_df[['tenure', 'MonthlyCharges']]\nX.loc[:,'tenure_squared'] = [x**2 for x in list(churn_df['tenure'])]\ny = churn_df['Churn']\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
random_state=42)","metadata":{"tags":[],"cell_id":"fe161ab5c391412c900f503fbe9d9d8a","allow_embed":"code_output","source_hash":"11414d95","execution_start":1672948333431,"execution_millis":11,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stderr","text":"/shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/indexing.py:1597: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n self.obj[key] = value\n/shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/indexing.py:1676: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n self._setitem_single_column(ilocs[0], value, pi)\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"X_train = sm.add_constant(X_train)\nlog_reg_model = sm.Logit(y_train, X_train)\nlog_reg_model = log_reg_model.fit()\nprint(log_reg_model.summary())","metadata":{"tags":[],"cell_id":"ee152531fd5a40d68d1aef5eac9967f1","allow_embed":"code_output","source_hash":"4886aeca","execution_start":1672948333498,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Optimization terminated successfully.\n Current function value: 0.458122\n Iterations 7\n Logit Regression Results \n==============================================================================\nDep. Variable: Churn No. 
Observations: 5634\nModel: Logit Df Residuals: 5630\nMethod: MLE Df Model: 3\nDate: Thu, 05 Jan 2023 Pseudo R-squ.: 0.2084\nTime: 19:52:13 Log-Likelihood: -2581.1\nconverged: True LL-Null: -3260.7\nCovariance Type: nonrobust LLR p-value: 1.923e-294\n==================================================================================\n coef std err z P>|z| [0.025 0.975]\n----------------------------------------------------------------------------------\nconst -1.6265 0.102 -16.021 0.000 -1.826 -1.428\ntenure -0.0753 0.006 -12.991 0.000 -0.087 -0.064\nMonthlyCharges 0.0325 0.001 22.318 0.000 0.030 0.035\ntenure_squared 0.0003 8.29e-05 4.026 0.000 0.000 0.000\n==================================================================================\n","output_type":"stream"}],"execution_count":7},{"cell_type":"code","source":"from sklearn.ensemble import RandomForestRegressor\nX = insurance_df[['bmi', 'age', 'children']]\ny = insurance_df['charges']\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\nrf_model1 = RandomForestRegressor(random_state=42)\nrf_model1.fit(X_train, y_train)\ny_pred1 = rf_model1.predict(X_test)\n\nrf_model2 = RandomForestRegressor(n_estimators= 50, max_depth=50, random_state=42)\nrf_model2.fit(X_train, y_train)\ny_pred2 = rf_model2.predict(X_test)\n","metadata":{"tags":[],"cell_id":"828696db519d4885b14642d1daa0208e","allow_embed":"code_output","source_hash":"eb4138e6","execution_start":1672948333499,"execution_millis":2342,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":8},{"cell_type":"code","source":"from mlxtend.evaluate import paired_ttest_5x2cv\n_, p_value = paired_ttest_5x2cv(estimator1=rf_model1, estimator2=rf_model2, scoring='neg_mean_squared_error', X=X_train, y=y_train, random_seed=42)\nif p_value < 0.05:\n print(f\"P-value of {p_value} give evidence that model difference is significant\")\nelse:\n print(f\"P-value of {p_value} give evidence that model 
difference is not significant\")","metadata":{"tags":[],"cell_id":"cd9091365e55489497a6a6d44b4a3a2a","allow_embed":"code_output","source_hash":"96ccb6c4","execution_start":1672948396105,"execution_millis":2092,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"P-value of 0.33466026017280154 give evidence that model difference is not significant\n","output_type":"stream"}],"execution_count":11},{"cell_type":"code","source":"from sklearn.linear_model import LinearRegression\nlr_model1 = LinearRegression()\nlr_model1.fit(X_train, y_train)\n_, p_value = paired_ttest_5x2cv(estimator1=rf_model1, estimator2=lr_model1, scoring='neg_mean_squared_error', X=X_train, y=y_train, random_seed=42)\nif p_value < 0.05:\n print(f\"P-value of {p_value} give evidence that model difference is significant\")\nelse:\n print(f\"P-value of {p_value} give evidence that model difference is not significant\")","metadata":{"tags":[],"cell_id":"165426efba9c44979b208430a9c650ba","allow_embed":"code_output","source_hash":"a7695e6b","execution_start":1672948632113,"execution_millis":1398,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"P-value of 0.002834207433757008 give evidence that model difference is significant\n","output_type":"stream"}],"execution_count":19},{"cell_type":"markdown","source":"\nCreated in deepnote.com \nCreated in Deepnote","metadata":{"tags":[],"created_in_deepnote_cell":true,"deepnote_cell_type":"markdown"}}],"nbformat":4,"nbformat_minor":0,"metadata":{"deepnote":{},"orig_nbformat":2,"deepnote_notebook_id":"fafecebbb0c84a0e9feb8da62f4ef07f","deepnote_persisted_session":{"createdAt":"2023-01-05T20:16:48.418Z"},"deepnote_execution_queue":[]}} -------------------------------------------------------------------------------- /stable_diffusion/hello.py: -------------------------------------------------------------------------------- 1 | priint("Hi") 2 | 
-------------------------------------------------------------------------------- /tabgan_experiements.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":"%pip install tabgan=='1.1.0'\n%pip install Faker\n%pip install catboost","metadata":{"tags":[],"cell_id":"abbcf329e47b4403aff29663a4198fb3","allow_embed":"code_output","source_hash":"9bb28577","execution_start":1667682202597,"execution_millis":4981,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Requirement already satisfied: tabgan==1.1.0 in /usr/local/lib/python3.8/dist-packages (1.1.0)\nRequirement already satisfied: torch in /shared-libs/python3.8/py/lib/python3.8/site-packages (from tabgan==1.1.0) (1.12.1)\nRequirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from tabgan==1.1.0) (1.22.4)\nRequirement already satisfied: category-encoders in /usr/local/lib/python3.8/dist-packages (from tabgan==1.1.0) (2.5.1.post0)\nRequirement already satisfied: scikit-learn in /shared-libs/python3.8/py/lib/python3.8/site-packages (from tabgan==1.1.0) (1.1.2)\nRequirement already satisfied: lightgbm in /usr/local/lib/python3.8/dist-packages (from tabgan==1.1.0) (3.3.3)\nRequirement already satisfied: python-dateutil in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from tabgan==1.1.0) (2.8.2)\nRequirement already satisfied: torchvision in /shared-libs/python3.8/py/lib/python3.8/site-packages (from tabgan==1.1.0) (0.13.1)\nRequirement already satisfied: pandas in /shared-libs/python3.8/py/lib/python3.8/site-packages (from tabgan==1.1.0) (1.2.5)\nRequirement already satisfied: tqdm in /shared-libs/python3.8/py/lib/python3.8/site-packages (from tabgan==1.1.0) (4.64.1)\nRequirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from torch->tabgan==1.1.0) (4.2.0)\nRequirement already satisfied: patsy>=0.5.1 in 
/usr/local/lib/python3.8/dist-packages (from category-encoders->tabgan==1.1.0) (0.5.3)\nRequirement already satisfied: scipy>=1.0.0 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from category-encoders->tabgan==1.1.0) (1.9.3)\nRequirement already satisfied: statsmodels>=0.9.0 in /usr/local/lib/python3.8/dist-packages (from category-encoders->tabgan==1.1.0) (0.13.5)\nRequirement already satisfied: joblib>=1.0.0 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from scikit-learn->tabgan==1.1.0) (1.2.0)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from scikit-learn->tabgan==1.1.0) (3.1.0)\nRequirement already satisfied: wheel in /usr/lib/python3/dist-packages (from lightgbm->tabgan==1.1.0) (0.34.2)\nRequirement already satisfied: six>=1.5 in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from python-dateutil->tabgan==1.1.0) (1.16.0)\nRequirement already satisfied: requests in /usr/lib/python3/dist-packages (from torchvision->tabgan==1.1.0) (2.22.0)\nRequirement already satisfied: pillow!=8.3.*,>=5.3.0 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from torchvision->tabgan==1.1.0) (9.2.0)\nRequirement already satisfied: pytz>=2017.3 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from pandas->tabgan==1.1.0) (2022.5)\nRequirement already satisfied: packaging>=21.3 in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from statsmodels>=0.9.0->category-encoders->tabgan==1.1.0) (21.3)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from packaging>=21.3->statsmodels>=0.9.0->category-encoders->tabgan==1.1.0) (3.0.9)\n\u001b[33mWARNING: You are using pip version 20.2.4; however, version 22.3.1 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\nNote: you may need to restart the kernel to use updated 
packages.\nRequirement already satisfied: Faker in /usr/local/lib/python3.8/dist-packages (15.2.0)\nRequirement already satisfied: python-dateutil>=2.4 in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from Faker) (2.8.2)\nRequirement already satisfied: six>=1.5 in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from python-dateutil>=2.4->Faker) (1.16.0)\n\u001b[33mWARNING: You are using pip version 20.2.4; however, version 22.3.1 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\nNote: you may need to restart the kernel to use updated packages.\nRequirement already satisfied: catboost in /usr/local/lib/python3.8/dist-packages (1.1.1)\nRequirement already satisfied: plotly in /shared-libs/python3.8/py/lib/python3.8/site-packages (from catboost) (5.10.0)\nRequirement already satisfied: six in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from catboost) (1.16.0)\nRequirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.8/dist-packages (from catboost) (1.22.4)\nRequirement already satisfied: graphviz in /usr/local/lib/python3.8/dist-packages (from catboost) (0.20.1)\nRequirement already satisfied: matplotlib in /shared-libs/python3.8/py/lib/python3.8/site-packages (from catboost) (3.6.0)\nRequirement already satisfied: scipy in /shared-libs/python3.8/py/lib/python3.8/site-packages (from catboost) (1.9.3)\nRequirement already satisfied: pandas>=0.24.0 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from catboost) (1.2.5)\nRequirement already satisfied: tenacity>=6.2.0 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from plotly->catboost) (8.1.0)\nRequirement already satisfied: fonttools>=4.22.0 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from matplotlib->catboost) (4.37.4)\nRequirement already satisfied: python-dateutil>=2.7 in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from 
matplotlib->catboost) (2.8.2)\nRequirement already satisfied: cycler>=0.10 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from matplotlib->catboost) (0.11.0)\nRequirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from matplotlib->catboost) (9.2.0)\nRequirement already satisfied: contourpy>=1.0.1 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from matplotlib->catboost) (1.0.5)\nRequirement already satisfied: packaging>=20.0 in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from matplotlib->catboost) (21.3)\nRequirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.8/py-core/lib/python3.8/site-packages (from matplotlib->catboost) (3.0.9)\nRequirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from matplotlib->catboost) (1.4.4)\nRequirement already satisfied: pytz>=2017.3 in /shared-libs/python3.8/py/lib/python3.8/site-packages (from pandas>=0.24.0->catboost) (2022.5)\n\u001b[33mWARNING: You are using pip version 20.2.4; however, version 22.3.1 is available.\nYou should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\nNote: you may need to restart the kernel to use updated packages.\n","output_type":"stream"}],"execution_count":1},{"cell_type":"code","source":"from faker import Faker\nimport random\n\nrandom.seed(10)\nDATA_SIZE = 5000","metadata":{"tags":[],"cell_id":"95fd44a9696349ae9bceaa1724f3b6a4","source_hash":"4a38f707","execution_start":1667682207582,"execution_millis":24,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":2},{"cell_type":"code","source":"fake = Faker()\nFaker.seed(42)\nnames = []\nfor i in range(0,DATA_SIZE):\n 
names.append(fake.name())","metadata":{"tags":[],"cell_id":"786d6991d489447888ee0bf50a2fd712","source_hash":"c24ea4d5","execution_start":1667682207610,"execution_millis":368,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":3},{"cell_type":"code","source":"Faker.seed(42)\nstate = []\nfor i in range(0,DATA_SIZE):\n state.append(fake.state())","metadata":{"tags":[],"cell_id":"0d3b5c4cfd9b4efaa7d5daf326019219","source_hash":"a1e932b7","execution_start":1667682207990,"execution_millis":6,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":4},{"cell_type":"code","source":"specialty = []\nfor i in range(0, DATA_SIZE):\n specialty.append(\"Emergency Medicine\")","metadata":{"tags":[],"cell_id":"cb826a0b4eab43fc821222aff5ea778e","source_hash":"f51fa424","execution_start":1667682207996,"execution_millis":1,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":5},{"cell_type":"code","source":"from faker.providers import DynamicProvider\nhealth_insurance_provider = DynamicProvider(\n provider_name=\"health_insurance\",\n elements=[\"UnitedHealth Group\", \"Anthem\", \"Aetna\", \"Cigna\", \"Humana\", \"Medicare\"],\n)\nFaker.seed(42)\n\nfake.add_provider(health_insurance_provider)\n\ninsurance = []\nfor i in range(0, DATA_SIZE):\n insurance.append(fake.health_insurance())","metadata":{"tags":[],"cell_id":"8bd583f5bbb4442d8416313f4787dd8f","allow_embed":"code_output","source_hash":"b1d2da56","execution_start":1667682208001,"execution_millis":12,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":6},{"cell_type":"code","source":"Faker.seed(42)\nsex = []\nfor i in range(0, DATA_SIZE):\n 
sex.append(fake.profile()['sex'])","metadata":{"tags":[],"cell_id":"8e352b7ff91847aeaa0992fc9572d2a6","source_hash":"5e51c6a6","execution_start":1667682208021,"execution_millis":3819,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":7},{"cell_type":"code","source":"import numpy as np\n\nnp.random.seed(42)\nmu, sigma = 50, 20\nage = np.random.normal(mu, sigma, DATA_SIZE)\nage_int = np.round(age)","metadata":{"tags":[],"cell_id":"f2b6cae348a54e33b89b89545fafe2ed","source_hash":"fcce59d1","execution_start":1667682211845,"execution_millis":0,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":8},{"cell_type":"code","source":"np.random.seed(42)\nmu, sigma = 180, 70\nweight = np.round(np.random.normal(mu, sigma, DATA_SIZE), 2)","metadata":{"tags":[],"cell_id":"9da07d6ec1a348e4878d7698f4404a7f","source_hash":"877d4a20","execution_start":1667682211846,"execution_millis":7,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":9},{"cell_type":"code","source":"np.random.seed(42)\nmu, sigma = 68, 8\nheight = np.round(np.random.normal(mu, sigma, DATA_SIZE), 2)","metadata":{"tags":[],"cell_id":"5c400f407a1943908a5406304aef08f4","allow_embed":"code_output","source_hash":"4a85cc4","execution_start":1667682211853,"execution_millis":0,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":10},{"cell_type":"code","source":"smoker = []\nfor i in range(0, DATA_SIZE):\n smoker.append(np.random.randint(0,2))\n\nsmoker_category = ['Yes' if x == 1 else 'No' for x in smoker]","metadata":{"tags":[],"cell_id":"68a3fef3032f4767883dc0656c657a64","allow_embed":"code_output","source_hash":"27c1d06b","execution_start":1667682211854,"execution_millis":10,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":11},{"cell_type":"code","source":"np.random.seed(42)\nlegnth_of_stay = []\nfor i in 
range(0, DATA_SIZE):\n legnth_of_stay.append(np.random.randint(1,10))","metadata":{"tags":[],"cell_id":"11d1a0f34ce64c969d6926ec97c7a546","allow_embed":"code_output","source_hash":"dd56c00e","execution_start":1667682211891,"execution_millis":0,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":12},{"cell_type":"code","source":"import pandas as pd \ndf = pd.DataFrame({'height':height, 'weight':weight, \n 'age':age_int, 'sex':sex, 'insurance':insurance, \n 'specialty':specialty, 'name':names, 'state':state,\n 'length_of_stay':legnth_of_stay, 'smoker':smoker_category,\n })","metadata":{"tags":[],"cell_id":"5ba86450352743a891328feffa0a56d8","allow_embed":"code_output","source_hash":"3cbbc126","execution_start":1667682211892,"execution_millis":0,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":13},{"cell_type":"code","source":"df['bmi'] = np.round((df['weight']/(df['height']**2))*703, 3)","metadata":{"tags":[],"cell_id":"60576d39941340758d8bafcefec4d7d6","allow_embed":"code_output","source_hash":"5f6a0d97","execution_start":1667682211892,"execution_millis":0,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":14},{"cell_type":"code","source":"df['readmission'] = [np.random.randint(0,2) for x in range(0, DATA_SIZE)]\ndf.drop_duplicates('name', inplace=True)\nnp.random.seed(42)\n\ndf_sample1 = df.sample(frac=0.2, replace=True, random_state=1)\n\ndf_sample2 = df[~df['name'].isin(list(set(df_sample1['name'])))]\n\ndf_sample2['readmission'] = 0 \ndf_sample2.loc[(df_sample2.bmi >=30) & (df_sample2.smoker == 'Yes') & (df_sample2.insurance == 'Medicare') & (df.length_of_stay >= 5), 'readmission'] = 1\n\ndf = 
df_sample2.append(df_sample1)","metadata":{"tags":[],"cell_id":"bf3897af187043c3a9f93ce14b46f308","allow_embed":"code","source_hash":"fe050939","execution_start":1667682211902,"execution_millis":13,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stderr","text":"/tmp/ipykernel_482/3297978708.py:9: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n df_sample2['readmission'] = 0\n/shared-libs/python3.8/py/lib/python3.8/site-packages/pandas/core/indexing.py:1720: SettingWithCopyWarning: \nA value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_indexer,col_indexer] = value instead\n\nSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n self._setitem_single_column(loc, value, pi)\n","output_type":"stream"}],"execution_count":null},{"cell_type":"code","source":"from collections import Counter\nCounter(df['readmission'])","metadata":{"tags":[],"cell_id":"e5c9ce8b73f5427a8473e8b2d1141bdb","allow_embed":"code_output","source_hash":"7a636544","execution_start":1667682211936,"execution_millis":3,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","execution_count":16,"data":{"text/plain":"Counter({0: 4405, 1: 517})"},"metadata":{}}],"execution_count":null},{"cell_type":"code","source":"df['insurance'] = df['insurance'].astype('category')\ndf['insurance'] = df['insurance'].cat.codes\n\ndf['sex'] = df['sex'].astype('category')\ndf['sex'] = df['sex'].cat.codes\n\ndf['smoker'] = df['smoker'].astype('category')\ndf['smoker'] = df['smoker'].cat.codes\n\ndf['state'] = df['state'].astype('category')\ndf['state'] = 
df['state'].cat.codes","metadata":{"tags":[],"cell_id":"07c4712e868f4ccabd8b366cbb6cecc0","allow_embed":"code_output","source_hash":"dba22df1","execution_start":1667682211937,"execution_millis":0,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":null},{"cell_type":"code","source":"X = df[['insurance', 'sex', 'smoker', 'state', 'height', 'weight', 'bmi', 'length_of_stay']]\ny = df['readmission']","metadata":{"tags":[],"cell_id":"e35f3707ec4f4fc48f22de4ba2ad6d12","allow_embed":"code_output","source_hash":"6e0c54a5","execution_start":1667682211937,"execution_millis":0,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":null},{"cell_type":"code","source":"from sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)","metadata":{"tags":[],"cell_id":"ca65ca43602d40acae72294846e67551","allow_embed":"code_output","source_hash":"4ba717ca","execution_start":1667682211943,"execution_millis":294,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":null},{"cell_type":"code","source":"from catboost import CatBoostClassifier","metadata":{"tags":[],"cell_id":"4c0b089b1a7d4c4aab6af10b1ed1fe95","allow_embed":"code_output","source_hash":"390e05a9","execution_start":1667682212282,"execution_millis":1,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[],"execution_count":null},{"cell_type":"code","source":"cats = ['insurance', 'sex', 'smoker', 'state']\nmodel1 = CatBoostClassifier(cat_features= cats, iterations=10)\nmodel1.fit(X_train, y_train)","metadata":{"tags":[],"cell_id":"dc79a692aa364e36956d37ea1c4e9d9d","allow_embed":"code_output","source_hash":"33c229c7","execution_start":1667682212283,"execution_millis":55,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Learning rate set to 0.5\n0:\tlearn: 
0.4660763\ttotal: 48.2ms\tremaining: 434ms\n1:\tlearn: 0.3911385\ttotal: 49.5ms\tremaining: 198ms\n2:\tlearn: 0.3574981\ttotal: 50.5ms\tremaining: 118ms\n3:\tlearn: 0.3447903\ttotal: 51.3ms\tremaining: 76.9ms\n4:\tlearn: 0.3374272\ttotal: 52.3ms\tremaining: 52.3ms\n5:\tlearn: 0.3343143\ttotal: 53.4ms\tremaining: 35.6ms\n6:\tlearn: 0.3336231\ttotal: 54.2ms\tremaining: 23.2ms\n7:\tlearn: 0.3302381\ttotal: 55.2ms\tremaining: 13.8ms\n8:\tlearn: 0.3272703\ttotal: 56.2ms\tremaining: 6.24ms\n9:\tlearn: 0.3271818\ttotal: 57ms\tremaining: 0us\n","output_type":"stream"},{"output_type":"execute_result","execution_count":22,"data":{"text/plain":""},"metadata":{}}],"execution_count":null},{"cell_type":"code","source":"from sklearn.metrics import precision_score\n\n# Score the baseline model on the held-out rows and compare the predicted\n# class counts with the true ones.\ny_pred = model1.predict(X_test)\nprint(Counter(y_pred))\nprint(Counter(y_test))\n# The baseline may predict no positive rows at all, which leaves precision\n# undefined (0/0). zero_division=0 reports 0.0 explicitly instead of emitting\n# an UndefinedMetricWarning.\nprint(\"precision_score: \", precision_score(y_test, y_pred, zero_division=0))","metadata":{"tags":[],"cell_id":"635e4532d41b4e4ebceb0abb79c92d95","allow_embed":"code_output","source_hash":"dc48d703","execution_start":1667682212339,"execution_millis":6,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Counter({0: 1477})\nCounter({0: 1331, 1: 146})\nprecision_score: 0.0\n/shared-libs/python3.8/py/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1334: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. 
Use `zero_division` parameter to control this behavior.\n _warn_prf(average, modifier, msg_start, len(result))\n","output_type":"stream"}],"execution_count":null},{"cell_type":"code","source":"from tabgan.sampler import GANGenerator\n\ncols = ['insurance', 'sex', 'smoker', 'state', 'height', 'weight', 'bmi', 'length_of_stay', 'readmission']\n# Only the integer-coded label columns are categorical. height, weight, bmi and\n# length_of_stay are numeric, and 'readmission' is the target rather than a\n# column of X_train, so none of them should be passed as cat_cols.\ngan_cat_cols = ['insurance', 'sex', 'smoker', 'state']\n\n# Features are every entry of cols except the trailing target name.\nX = df[cols[:-1]].reset_index(drop=True)\ny = pd.DataFrame(list(df['readmission']), columns=['readmission'])\n\n# Same test_size and random_state as the baseline split, so both models are\n# scored on the same held-out rows.\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n\n# Give the frames handed to the generator a clean 0..n-1 index.\nX_train = X_train.reset_index(drop=True)\ny_train = y_train.reset_index(drop=True)\nX_test = X_test.reset_index(drop=True)\n\n# only_generated_data=False asks for the original training rows together with\n# the synthetic ones.\nnew_train2, new_target2 = GANGenerator(cat_cols=gan_cat_cols, epochs=2, is_post_process=False).generate_data_pipe(X_train, y_train, X_test, use_adversarial=False, only_generated_data=False)\n","metadata":{"tags":[],"cell_id":"ed84f947a62248599c44332d2da38d62","allow_embed":"code_output","source_hash":"86dfda","execution_start":1667682212348,"execution_millis":19978,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stderr","text":"/shared-libs/python3.8/py/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n from .autonotebook import tqdm as notebook_tqdm\nFitting CTGAN transformers for each column: 100%|██████████| 9/9 [00:00<00:00, 1783.71it/s]\nTraining CTGAN, epochs:: 100%|██████████| 2/2 [00:15<00:00, 7.81s/it]\n","output_type":"stream"}],"execution_count":null},{"cell_type":"code","source":"print(Counter(new_target2))","metadata":{"tags":[],"cell_id":"b4652ccfb314403ab03c69b44f9f2df8","allow_embed":"code_output","source_hash":"96721c11","execution_start":1667682232332,"execution_millis":8,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Counter({0: 8102, 1: 2921})\n","output_type":"stream"}],"execution_count":null},{"cell_type":"code","source":"new_train2.head()\n","metadata":{"tags":[],"cell_id":"656087f70db24a298ac3a577e9b32507","allow_embed":"code_output","source_hash":"d39d1029","execution_start":1667682232338,"execution_millis":22,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","execution_count":27,"data":{"application/vnd.deepnote.dataframe.v3+json":{"column_count":8,"row_count":5,"columns":[{"name":"insurance","dtype":"int8","stats":{"unique_count":4,"nan_count":0,"min":"1","max":"5","histogram":[{"bin_start":1,"bin_end":1.4,"count":1},{"bin_start":1.4,"bin_end":1.8,"count":0},{"bin_start":1.8,"bin_end":2.2,"count":2},{"bin_start":2.2,"bin_end":2.6,"count":0},{"bin_start":2.6,"bin_end":3,"count":0},{"bin_start":3,"bin_end":3.4000000000000004,"count":1},{"bin_start":3.4000000000000004,"bin_end":3.8000000000000003,"count":0},{"bin_start":3.8000000000000003,"bin_end":4.2,"count":0},{"bin_start":4.2,"bin_end":4.6,"count":0},{"bin_start":4.6,"bin_end":5,"count":1}]}},{"name":"sex","dtype":"int8","stats":{"unique_count":2,"nan_count":0,"min":"0","max":"1","histogram":[{"bin_start":0,"bin_end":0.1,"count":2},{"bin_start":0.1,"bin_end":0.2,"count":0},{"bin_start":0.2,"bin_end":0.3000000000000
0004,"count":0},{"bin_start":0.30000000000000004,"bin_end":0.4,"count":0},{"bin_start":0.4,"bin_end":0.5,"count":0},{"bin_start":0.5,"bin_end":0.6000000000000001,"count":0},{"bin_start":0.6000000000000001,"bin_end":0.7000000000000001,"count":0},{"bin_start":0.7000000000000001,"bin_end":0.8,"count":0},{"bin_start":0.8,"bin_end":0.9,"count":0},{"bin_start":0.9,"bin_end":1,"count":3}]}},{"name":"smoker","dtype":"int8","stats":{"unique_count":2,"nan_count":0,"min":"0","max":"1","histogram":[{"bin_start":0,"bin_end":0.1,"count":2},{"bin_start":0.1,"bin_end":0.2,"count":0},{"bin_start":0.2,"bin_end":0.30000000000000004,"count":0},{"bin_start":0.30000000000000004,"bin_end":0.4,"count":0},{"bin_start":0.4,"bin_end":0.5,"count":0},{"bin_start":0.5,"bin_end":0.6000000000000001,"count":0},{"bin_start":0.6000000000000001,"bin_end":0.7000000000000001,"count":0},{"bin_start":0.7000000000000001,"bin_end":0.8,"count":0},{"bin_start":0.8,"bin_end":0.9,"count":0},{"bin_start":0.9,"bin_end":1,"count":3}]}},{"name":"state","dtype":"int8","stats":{"unique_count":5,"nan_count":0,"min":"0","max":"34","histogram":[{"bin_start":0,"bin_end":3.4,"count":1},{"bin_start":3.4,"bin_end":6.8,"count":0},{"bin_start":6.8,"bin_end":10.2,"count":1},{"bin_start":10.2,"bin_end":13.6,"count":1},{"bin_start":13.6,"bin_end":17,"count":0},{"bin_start":17,"bin_end":20.4,"count":0},{"bin_start":20.4,"bin_end":23.8,"count":1},{"bin_start":23.8,"bin_end":27.2,"count":0},{"bin_start":27.2,"bin_end":30.599999999999998,"count":0},{"bin_start":30.599999999999998,"bin_end":34,"count":1}]}},{"name":"height","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"54.62","max":"76.08","histogram":[{"bin_start":54.62,"bin_end":56.766,"count":1},{"bin_start":56.766,"bin_end":58.912,"count":0},{"bin_start":58.912,"bin_end":61.058,"count":0},{"bin_start":61.058,"bin_end":63.20399999999999,"count":0},{"bin_start":63.20399999999999,"bin_end":65.35,"count":1},{"bin_start":65.35,"bin_end":67.496,"count":0},{"bin_start
":67.496,"bin_end":69.642,"count":2},{"bin_start":69.642,"bin_end":71.788,"count":0},{"bin_start":71.788,"bin_end":73.934,"count":0},{"bin_start":73.934,"bin_end":76.08,"count":1}]}},{"name":"weight","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"62.9","max":"250.66","histogram":[{"bin_start":62.9,"bin_end":81.676,"count":1},{"bin_start":81.676,"bin_end":100.452,"count":0},{"bin_start":100.452,"bin_end":119.22800000000001,"count":0},{"bin_start":119.22800000000001,"bin_end":138.004,"count":0},{"bin_start":138.004,"bin_end":156.78,"count":1},{"bin_start":156.78,"bin_end":175.556,"count":0},{"bin_start":175.556,"bin_end":194.332,"count":2},{"bin_start":194.332,"bin_end":213.108,"count":0},{"bin_start":213.108,"bin_end":231.88400000000001,"count":0},{"bin_start":231.88400000000001,"bin_end":250.66,"count":1}]}},{"name":"bmi","dtype":"float64","stats":{"unique_count":5,"nan_count":0,"min":"14.822","max":"30.444","histogram":[{"bin_start":14.822,"bin_end":16.3842,"count":1},{"bin_start":16.3842,"bin_end":17.9464,"count":0},{"bin_start":17.9464,"bin_end":19.5086,"count":0},{"bin_start":19.5086,"bin_end":21.0708,"count":0},{"bin_start":21.0708,"bin_end":22.633,"count":0},{"bin_start":22.633,"bin_end":24.1952,"count":0},{"bin_start":24.1952,"bin_end":25.757399999999997,"count":1},{"bin_start":25.757399999999997,"bin_end":27.3196,"count":1},{"bin_start":27.3196,"bin_end":28.8818,"count":1},{"bin_start":28.8818,"bin_end":30.444,"count":1}]}},{"name":"length_of_stay","dtype":"int64","stats":{"unique_count":4,"nan_count":0,"min":"1","max":"6","histogram":[{"bin_start":1,"bin_end":1.5,"count":1},{"bin_start":1.5,"bin_end":2,"count":0},{"bin_start":2,"bin_end":2.5,"count":0},{"bin_start":2.5,"bin_end":3,"count":0},{"bin_start":3,"bin_end":3.5,"count":0},{"bin_start":3.5,"bin_end":4,"count":0},{"bin_start":4,"bin_end":4.5,"count":1},{"bin_start":4.5,"bin_end":5,"count":0},{"bin_start":5,"bin_end":5.5,"count":1},{"bin_start":5.5,"bin_end":6,"count":2}]}},{"name":"
_deepnote_index_column","dtype":"int64"}],"rows":[{"insurance":"2","sex":"0","smoker":"0","state":"13","height":"54.62","weight":"62.9","bmi":"14.822","length_of_stay":"5","_deepnote_index_column":"0"},{"insurance":"1","sex":"1","smoker":"1","state":"34","height":"76.08","weight":"250.66","bmi":"30.444","length_of_stay":"4","_deepnote_index_column":"1"},{"insurance":"5","sex":"0","smoker":"0","state":"0","height":"67.56","weight":"176.11","bmi":"27.124","length_of_stay":"6","_deepnote_index_column":"2"},{"insurance":"3","sex":"1","smoker":"1","state":"23","height":"63.21","weight":"138.11","bmi":"24.3","length_of_stay":"6","_deepnote_index_column":"3"},{"insurance":"2","sex":"1","smoker":"1","state":"8","height":"68.85","weight":"187.4","bmi":"27.792","length_of_stay":"1","_deepnote_index_column":"4"}]},"text/plain":" insurance sex smoker state height weight bmi length_of_stay\n0 2 0 0 13 54.62 62.90 14.822 5\n1 1 1 1 34 76.08 250.66 30.444 4\n2 5 0 0 0 67.56 176.11 27.124 6\n3 3 1 1 23 63.21 138.11 24.300 6\n4 2 1 1 8 68.85 187.40 27.792 1","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
insurancesexsmokerstateheightweightbmilength_of_stay
02001354.6262.9014.8225
11113476.08250.6630.4444
2500067.56176.1127.1246
33112363.21138.1124.3006
4211868.85187.4027.7921
\n
"},"metadata":{}}],"execution_count":null},{"cell_type":"code","source":"# Same CatBoost configuration as the baseline, fitted on the GAN-augmented\n# training set instead.\ncats = ['insurance', 'sex', 'smoker', 'state']\nmodel2 = CatBoostClassifier(cat_features=cats, iterations=10)\nmodel2.fit(new_train2, new_target2)","metadata":{"tags":[],"cell_id":"2fd9973aa39548a09690fbe0f8b73da0","allow_embed":"code_output","source_hash":"dc3d76c6","execution_start":1667684407711,"execution_millis":33,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Learning rate set to 0.5\n0:\tlearn: 0.6802753\ttotal: 1.61ms\tremaining: 14.5ms\n1:\tlearn: 0.6708204\ttotal: 3.13ms\tremaining: 12.5ms\n2:\tlearn: 0.6643163\ttotal: 4.39ms\tremaining: 10.3ms\n3:\tlearn: 0.6596202\ttotal: 5.67ms\tremaining: 8.51ms\n4:\tlearn: 0.6588420\ttotal: 6.93ms\tremaining: 6.93ms\n5:\tlearn: 0.6580290\ttotal: 8.15ms\tremaining: 5.43ms\n6:\tlearn: 0.6572360\ttotal: 9.41ms\tremaining: 4.03ms\n7:\tlearn: 0.6555131\ttotal: 10.6ms\tremaining: 2.64ms\n8:\tlearn: 0.6538899\ttotal: 11.8ms\tremaining: 1.31ms\n9:\tlearn: 0.6511306\ttotal: 13ms\tremaining: 0us\n","output_type":"stream"},{"output_type":"execute_result","execution_count":33,"data":{"text/plain":""},"metadata":{}}],"execution_count":null},{"cell_type":"code","source":"from sklearn.metrics import precision_score\n\n# Score the augmented model on the held-out rows and compare the predicted\n# class counts with the true ones.\ny_pred2 = model2.predict(X_test)\nprint(Counter(y_pred2))\nprint(Counter(y_test['readmission']))\n# If a re-run yields a model that predicts no positive rows, precision is\n# undefined (0/0); zero_division=0 reports 0.0 explicitly instead of warning.\nprint(\"precision_score: \", precision_score(y_test, y_pred2, zero_division=0))","metadata":{"tags":[],"cell_id":"576566cfca4e4822a30fbf42f452c6c4","allow_embed":"code_output","source_hash":"b45bd8c7","execution_start":1667684412101,"execution_millis":5,"deepnote_to_be_reexecuted":false,"deepnote_cell_type":"code"},"outputs":[{"name":"stdout","text":"Counter({0: 1471, 1: 6})\nCounter({0: 1331, 1: 146})\nprecision_score: 0.16666666666666666\n","output_type":"stream"}],"execution_count":null},{"cell_type":"markdown","source":"\nCreated in deepnote.com \nCreated in 
Deepnote","metadata":{"tags":[],"created_in_deepnote_cell":true,"deepnote_cell_type":"markdown"}}],"nbformat":4,"nbformat_minor":0,"metadata":{"deepnote":{},"orig_nbformat":2,"deepnote_notebook_id":"b55c6a4c4fc1498c924a7875e7880421","deepnote_persisted_session":{"createdAt":"2022-11-05T22:02:21.023Z"},"deepnote_execution_queue":[]}} --------------------------------------------------------------------------------