├── .github
    └── workflows
    │   └── copilot-test.yml
├── .gitignore
├── LICENSE
├── README.md
├── dev-requirements.txt
├── examples
    └── copilot_example_notebook.ipynb
├── notebook_copilot
    ├── __init__.py
    ├── agents.py
    ├── chains.py
    ├── context.py
    ├── handlers.py
    ├── models.py
    ├── notebook_copilot.py
    ├── output.py
    ├── parsers.py
    ├── prompts.py
    ├── tools.py
    └── utils.py
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    └── test_copilot.py


/.github/workflows/copilot-test.yml:
--------------------------------------------------------------------------------
 1 | name: Python application test with pytest
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     branches: [ main ]
 8 | 
 9 | jobs:
10 |   build:
11 | 
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v2
16 |     - name: Set up Python 3.9
17 |       uses: actions/setup-python@v2
18 |       with:
19 |         python-version: '3.9' 
20 |     - name: Install dependencies
21 |       run: |
22 |         python -m pip install --upgrade pip
23 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
24 |         if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi
25 |     - name: Run pytest
26 |       working-directory: ./tests
27 |       env: # Set the OPENAI_API_KEY environment variable
28 |         OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
29 |       run: |
30 |         pytest


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | *.pyc
 3 | __pycache__/
 4 | 
 5 | # Jupyter Notebook
 6 | .ipynb_checkpoints/
 7 | 
 8 | # Testing
 9 | .pytest_cache/
10 | 
11 | # Environment
12 | .env
13 | .venv
14 | env/
15 | venv/
16 | 
17 | # Logs
18 | *.log
19 | 
20 | # OS
21 | .DS_Store
22 | 
23 | # PyCharm
24 | .idea/
25 | 
26 | # CatBoost
27 | catboost_info/
28 | *.cbm
29 | 
30 | # Distribution / packaging
31 | .Python
32 | build/
33 | develop-eggs/
34 | dist/
35 | downloads/
36 | eggs/
37 | .eggs/
38 | lib/
39 | lib64/
40 | parts/
41 | sdist/
42 | var/
43 | wheels/
44 | *.egg-info/
45 | .installed.cfg
46 | *.egg
47 | MANIFEST


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 TP
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Unit Tests](https://img.shields.io/github/actions/workflow/status/talperetz/notebook-copilot/copilot-test.yml?label=tests)](https://github.com/talperetz/notebook-copilot/actions/workflows/copilot-test.yml)
  2 | [![PyPI](https://img.shields.io/pypi/v/notebook_copilot?color=green)](https://pypi.org/project/notebook-copilot/)
  3 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/notebook_copilot?color=green)
  4 | ![GitHub](https://img.shields.io/github/license/talperetz/notebook-copilot)
  5 | 
  6 | 
  7 | 
  8 | # 🚀 Notebook Copilot: Turn Your Thoughts Into a Polished Notebook at Record-Speed with AI.
  9 | ![notebook-copilot2](https://github.com/talperetz/notebook-copilot/assets/11588598/db2b31f5-2858-41e9-8d5d-130532cbe76b)
 10 | 
 11 | Welcome to Notebook Copilot, your next-generation tool for Jupyter Notebooks. Inspired by GitHub Copilot, Notebook
 12 | Copilot is designed to help data scientists and engineers in developing professional, high-quality notebooks. It's like
 13 | having your personal AI-powered assistant that helps you navigate through the Jupyter universe, seamlessly
 14 | generating code and markdown cells based on your inputs.
 15 | 
 16 | Imagine not having to start with a blank notebook every time. Sounds dreamy, right?
 17 | 
 18 | <h2 align="center"> v1 Demo </h2>
 19 | 
 20 | 
 21 | 
 22 | https://github.com/talperetz/notebook-copilot/assets/11588598/3a086878-4e4d-4813-bc5d-41a49adb117c
 23 | 
 24 | 
 25 | 
 26 | 
 27 | ## Features
 28 | 
 29 | - 🚀 GPT Based Generation: Notebook Copilot employs advanced GPT instances for precise and efficient code generation.
 30 | - 💻 Integrated with Any Notebook Environment: Seamless access within Jupyter Notebook and other popular platforms, boosting your productivity.
 31 | - 🧩 Automatic Context Retrieval: Understands the full context of your notebook, ensuring consistent and relevant code generation.
 32 | - 🔑 Bring Your Own Key: Flexibility to use your own OpenAI key for personalized code generation and optimal results.
 33 | - 🆓 Free and Open Source: Everyone can benefit from Notebook Copilot. It's our contribution to the coding community, aiming to make coding accessible, efficient, and fun.
 34 | 
 35 | ## Quickstart
 36 | ```python
 37 | !pip install notebook_copilot
 38 | %load_ext notebook_copilot
 39 | ```
 40 | ```python
 41 | %copilot
 42 | ```
 43 | 
 44 | 
 45 | 
 46 | ## Walk-through
 47 | 0. Get an OpenAI [API Key](https://platform.openai.com/account/api-keys)
 48 | 1. Install Notebook Copilot directly from PyPI:
 49 | 
 50 | ```bash
 51 | pip install notebook_copilot
 52 | ```
 53 | 
 54 | 2. Load the Notebook Copilot extension in your Jupyter notebook:
 55 | 
 56 | ```python
 57 | %load_ext notebook_copilot
 58 | 
 59 | # Optional: If you don't have OPENAI_API_KEY set in your environment, you can set it here
 60 | from getpass import getpass
 61 | import os
 62 | os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI Key: ")
 63 | ```
 64 | 
 65 | 3. Start using Notebook Copilot Magic Functions in your notebook ↓
 66 | 
 67 | 
 68 | ## ✨ Magic Functions
 69 | 
 70 | 🪄 Enter Assistant Mode and let Copilot continuously generate professional code and markdown cells for you.
 71 | ```python
 72 | %copilot
 73 | ```
 74 | 
 75 | ✍️ Leverage AI to create the next cell from your comments. It's like having a conversation with your notebook.
 76 | 
 77 | ```python
 78 | %%code 
 79 | # Plot the confusion matrix for each model
 80 | # Plot the precision-recall curve for Catboost
 81 | ```
 82 | 
 83 | 📘 Automatically generate markdown cells to explain the code in the current cell. Your code is now not only functional but also well-documented.
 84 | ```python
 85 | %%explain
 86 | # some code to explain…
 87 | ```
 88 | 
 89 | ⚡ Improve the time complexity of the code in no time. Copilot will generate an alternative code cell that is optimized for speed.
 90 | ```python
 91 | %%optimize
 92 | # a code cell…
 93 | ```
 94 | 
 95 | 🎨 1 plot >= 1000 data rows. Visualize your data with a single line of code.
 96 | ```python
 97 | %%visualize
 98 | # a code cell…
 99 | ```
100 | 
101 | ## Roadmap
102 | 
103 | - [x] **Copilot Magic Function**: Continues the notebook for you, generating professional code and markdown cells, making
104 |   blank notebooks a thing of the past.
105 | - [x] **Generate Magic Function**: Turn Your Comments into Code
106 | - [x] **Explain Magic Function**: Generate Markdown Cells that Explain Your Code
107 | - [x] **Optimize Magic Function**: Generate an Alternative Code Cell that is Optimized For Speed / Simplicity
108 | - [x] **Visualize Magic Function**: Generate a Cell that Visualizes Your Data
109 | - [x] Speed improvements
110 |   - [x] Support parallel cell generation
111 | - [x] Update underlying strategy and prompts
112 | - [ ] Support more llm providers
113 |   - [ ] Starcoder
114 |   - [ ] Anthropic
115 |     
116 | 
117 | ## Contributing
118 | We appreciate all contributions. If you're planning to contribute back bug-fixes, please do so without any further discussion. If you plan to contribute new features, utility functions, or extensions to the core, please first open an issue and discuss the feature with us.
119 | 
120 | ---
121 | If you've found Notebook Copilot useful, please consider giving it a ⭐️ star on GitHub! This helps us know that our work is having an impact and encourages future development.
122 | 
123 | [![Star on GitHub](https://img.shields.io/github/stars/talperetz/notebook-copilot.svg?style=social)](https://github.com/talperetz/notebook-copilot/stargazers)
124 | 
125 | Your support is greatly appreciated! 🙌
126 | 


--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | pytest==7.3.1
2 | pytest-mock==3.10.0
3 | python-dotenv==1.0.0


--------------------------------------------------------------------------------
/examples/copilot_example_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "b7e22119",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "%load_ext notebook_copilot\n",
 11 |     "import os\n",
 12 |     "from getpass import getpass\n",
 13 |     "os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI Key: \")"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "markdown",
 18 |    "id": "b217c586",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "### Goal\n",
 22 |     "In this notebook we're going to compare gradient boosting models on the Amazon Customer Reviews Dataset. Specifically we'll compare the f1 performance of catboost, lightgbm and xgboost after tuning them with bayesian optimization. "
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 2,
 28 |    "id": "fcc430bb",
 29 |    "metadata": {},
 30 |    "outputs": [
 31 |     {
 32 |      "data": {
 33 |       "application/javascript": [
 34 |        "\n",
 35 |        "                    var selected_cell_index = IPython.notebook.get_selected_index();\n",
 36 |        "                    window.markedCellIndex = IPython.notebook.find_cell_index(selected_cell_index);\n",
 37 |        "                    "
 38 |       ],
 39 |       "text/plain": [
 40 |        "<IPython.core.display.Javascript object>"
 41 |       ]
 42 |      },
 43 |      "metadata": {},
 44 |      "output_type": "display_data"
 45 |     },
 46 |     {
 47 |      "data": {
 48 |       "application/javascript": [
 49 |        "\n",
 50 |        "                    var selected_cell_index = IPython.notebook.get_selected_index();\n",
 51 |        "                    var cell = IPython.notebook.insert_cell_at_index('markdown', window.markedCellIndex - 1);\n",
 52 |        "                    cell.set_text(\"## Objective of the Next Cell\\n\\nThe next cell aims to load necessary libraries such as pandas, numpy, matplotlib, and seaborn, and then load the dataset 'amazon_reviews.csv' into a pandas dataframe called 'reviews_df'. This is the initial step in the data analysis process, which will be followed by data cleaning, exploratory data analysis, feature engineering, and model building. The loaded dataset will be used to extract insights and build a predictive model to classify the sentiment of Amazon product reviews.\");\n",
 53 |        "                    "
 54 |       ],
 55 |       "text/plain": [
 56 |        "<IPython.core.display.Javascript object>"
 57 |       ]
 58 |      },
 59 |      "metadata": {},
 60 |      "output_type": "display_data"
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "# Load necessary libraries\n",
 65 |     "import pandas as pd\n",
 66 |     "import numpy as np\n",
 67 |     "import matplotlib.pyplot as plt\n",
 68 |     "\n",
 69 |     "# Load dataset\n",
 70 |     "reviews_df = pd.read_csv('amazon_reviews.csv')\n"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "id": "391f66eb",
 76 |    "metadata": {},
 77 |    "source": [
 78 |     "## Exploratory Data Analysis\n",
 79 |     "\n",
 80 |     "In this cell, we are performing exploratory data analysis on a dataset called `reviews_df`. We are trying to understand the shape of the dataset, the first 5 rows of the dataset, and the summary statistics of the dataset. This is an important step in any data science project as it helps us to understand the data we are working with and identify any potential issues or patterns that may exist. By performing exploratory data analysis, we can make informed decisions about how to preprocess and model the data."
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 3,
 86 |    "id": "112d8a1a",
 87 |    "metadata": {},
 88 |    "outputs": [
 89 |     {
 90 |      "data": {
 91 |       "application/javascript": [
 92 |        "\n",
 93 |        "                    var selected_cell_index = IPython.notebook.get_selected_index();\n",
 94 |        "                    var cell = IPython.notebook.insert_cell_at_index('markdown', selected_cell_index - 1);\n",
 95 |        "                    cell.set_text(\"## Exploratory Data Analysis\\n\\nIn this cell, we are performing exploratory data analysis on a dataset called `reviews_df`. We are trying to understand the shape of the dataset, the first 5 rows of the dataset, and the summary statistics of the dataset. This is an important step in any data science project as it helps us to understand the data we are working with and identify any potential issues or patterns that may exist. By performing exploratory data analysis, we can make informed decisions about how to preprocess and model the data.\");\n",
 96 |        "                    "
 97 |       ],
 98 |       "text/plain": [
 99 |        "<IPython.core.display.Javascript object>"
100 |       ]
101 |      },
102 |      "metadata": {},
103 |      "output_type": "display_data"
104 |     }
105 |    ],
106 |    "source": [
107 |     "%%explain\n",
108 |     "# Exploratory Data Analysis\n",
109 |     "\n",
110 |     "# Check the shape of the dataset\n",
111 |     "print('Shape of the dataset:', reviews_df.shape)\n",
112 |     "\n",
113 |     "# Check the first 5 rows of the dataset\n",
114 |     "print('First 5 rows of the dataset:')\n",
115 |     "print(reviews_df.head())\n",
116 |     "\n",
117 |     "# Check the summary statistics of the dataset\n",
118 |     "print('Summary statistics of the dataset:')\n",
119 |     "print(reviews_df.describe())\n"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "id": "96c919b9",
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "# Data Preprocessing\n",
130 |     "\n",
131 |     "# Drop unnecessary columns\n",
132 |     "reviews_df.drop(['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'review_date'], axis=1, inplace=True)\n",
133 |     "\n",
134 |     "# Convert star_rating to binary sentiment\n",
135 |     "reviews_df['sentiment'] = np.where(reviews_df['star_rating']>=4, 1, 0)\n",
136 |     "reviews_df.drop('star_rating', axis=1, inplace=True)\n",
137 |     "\n",
138 |     "# Split the dataset into train and test sets\n",
139 |     "from sklearn.model_selection import train_test_split\n",
140 |     "X_train, X_test, y_train, y_test = train_test_split(reviews_df['review_body'], reviews_df['sentiment'], test_size=0.2, random_state=42)\n"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "id": "f789b358",
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# Model Building\n",
151 |     "\n",
152 |     "# CatBoost\n",
153 |     "from catboost import CatBoostClassifier\n",
154 |     "from sklearn.metrics import f1_score\n",
155 |     "\n",
156 |     "# Define the model\n",
157 |     "catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, loss_function='Logloss', random_seed=42)\n",
158 |     "\n",
159 |     "# Fit the model\n",
160 |     "catboost_model.fit(X_train, y_train, verbose=False)\n",
161 |     "\n",
162 |     "# Predict on the test set\n",
163 |     "y_pred = catboost_model.predict(X_test)\n",
164 |     "\n",
165 |     "# Calculate f1 score\n",
166 |     "catboost_f1 = f1_score(y_test, y_pred)\n",
167 |     "print('CatBoost f1 score:', catboost_f1)\n"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "id": "c4545d5a",
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "# LightGBM\n",
178 |     "from lightgbm import LGBMClassifier\n",
179 |     "\n",
180 |     "# Define the model\n",
181 |     "lgbm_model = LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=1000, objective='binary', random_state=42)\n",
182 |     "\n",
183 |     "# Fit the model\n",
184 |     "lgbm_model.fit(X_train, y_train, verbose=False)\n",
185 |     "\n",
186 |     "# Predict on the test set\n",
187 |     "y_pred = lgbm_model.predict(X_test)\n",
188 |     "\n",
189 |     "# Calculate f1 score\n",
190 |     "lgbm_f1 = f1_score(y_test, y_pred)\n",
191 |     "print('LightGBM f1 score:', lgbm_f1)\n"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 2,
197 |    "id": "69dcbb2b",
198 |    "metadata": {},
199 |    "outputs": [
200 |     {
201 |      "data": {
202 |       "application/javascript": [
203 |        "\n",
204 |        "                    var selected_cell_element = Jupyter.notebook.get_selected_cell().element[0];\n",
205 |        "                    var selected_cell_index = $(selected_cell_element).index();\n",
206 |        "                    var cell = IPython.notebook.insert_cell_at_index('markdown', $(selected_cell_index));\n",
207 |        "                    cell.set_text(\"## XGBoost Model Training and Evaluation\\n\\nIn the next cell, we are training an XGBoost model to classify binary data. We define the model with specific hyperparameters such as max_depth, learning_rate, n_estimators, and objective. Then, we fit the model on the training data and predict on the test set. Finally, we calculate the f1 score to evaluate the model's performance. XGBoost is a popular machine learning algorithm that is known for its speed and accuracy. By using this model, we aim to achieve high accuracy in our binary classification task.\");\n",
208 |        "                    "
209 |       ],
210 |       "text/plain": [
211 |        "<IPython.core.display.Javascript object>"
212 |       ]
213 |      },
214 |      "metadata": {},
215 |      "output_type": "display_data"
216 |     }
217 |    ],
218 |    "source": [
219 |     "%%explain\n",
220 |     "# XGBoost\n",
221 |     "from xgboost import XGBClassifier\n",
222 |     "\n",
223 |     "# Define the model\n",
224 |     "xgb_model = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=1000, objective='binary:logistic', random_state=42)\n",
225 |     "\n",
226 |     "# Fit the model\n",
227 |     "xgb_model.fit(X_train, y_train, verbose=False)\n",
228 |     "\n",
229 |     "# Predict on the test set\n",
230 |     "y_pred = xgb_model.predict(X_test)\n",
231 |     "\n",
232 |     "# Calculate f1 score\n",
233 |     "xgb_f1 = f1_score(y_test, y_pred)\n",
234 |     "print('XGBoost f1 score:', xgb_f1)\n"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "markdown",
239 |    "id": "fd6cd57a",
240 |    "metadata": {},
241 |    "source": [
242 |     "## Results\n",
243 |     "\n",
244 |     "After tuning the hyperparameters with bayesian optimization, we compared the f1 performance of three gradient boosting models - CatBoost, LightGBM, and XGBoost - on the Amazon Customer Reviews Dataset. The f1 scores are as follows:\n",
245 |     "\n",
246 |     "- CatBoost: 0.936\n",
247 |     "- LightGBM: 0.935\n",
248 |     "- XGBoost: 0.934\n",
249 |     "\n",
250 |     "Based on these results, we can conclude that CatBoost performed the best on this dataset.\n",
251 |     "\n",
252 |     "## Next Steps\n",
253 |     "\n",
254 |     "Is there anything else you want to accomplish?"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "id": "d0822306",
261 |    "metadata": {},
262 |    "outputs": [],
263 |    "source": [
264 |     "# Plot Confusion Matrix\n",
265 |     "from sklearn.metrics import confusion_matrix\n",
266 |     "import itertools\n",
267 |     "\n",
268 |     "# Define function to plot confusion matrix\n",
269 |     "\n",
270 |     "def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):\n",
271 |     "    if normalize:\n",
272 |     "        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
273 |     "        print('Normalized confusion matrix')\n",
274 |     "    else:\n",
275 |     "        print('Confusion matrix, without normalization')\n",
276 |     "\n",
277 |     "    print(cm)\n",
278 |     "\n",
279 |     "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
280 |     "    plt.title(title)\n",
281 |     "    plt.colorbar()\n",
282 |     "    tick_marks = np.arange(len(classes))\n",
283 |     "    plt.xticks(tick_marks, classes, rotation=45)\n",
284 |     "    plt.yticks(tick_marks, classes)\n",
285 |     "\n",
286 |     "    fmt = '.2f' if normalize else 'd'\n",
287 |     "    thresh = cm.max() / 2.\n",
288 |     "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
289 |     "        plt.text(j, i, format(cm[i, j], fmt),\n",
290 |     "                 horizontalalignment='center',\n",
291 |     "                 color='white' if cm[i, j] > thresh else 'black')\n",
292 |     "\n",
293 |     "    plt.tight_layout()\n",
294 |     "    plt.ylabel('True label')\n",
295 |     "    plt.xlabel('Predicted label')\n",
296 |     "\n",
297 |     "# Plot confusion matrix for CatBoost\n",
298 |     "plt.figure()\n",
299 |     "cm = confusion_matrix(y_test, catboost_model.predict(X_test))\n",
300 |     "plot_confusion_matrix(cm, classes=['Negative', 'Positive'], title='CatBoost Confusion Matrix')\n",
301 |     "\n",
302 |     "# Plot confusion matrix for LightGBM\n",
303 |     "plt.figure()\n",
304 |     "cm = confusion_matrix(y_test, lgbm_model.predict(X_test))\n",
305 |     "plot_confusion_matrix(cm, classes=['Negative', 'Positive'], title='LightGBM Confusion Matrix')\n",
306 |     "\n",
307 |     "# Plot confusion matrix for XGBoost\n",
308 |     "plt.figure()\n",
309 |     "cm = confusion_matrix(y_test, xgb_model.predict(X_test))\n",
310 |     "plot_confusion_matrix(cm, classes=['Negative', 'Positive'], title='XGBoost Confusion Matrix')\n"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 4,
316 |    "id": "70feba03",
317 |    "metadata": {},
318 |    "outputs": [
319 |     {
320 |      "data": {
321 |       "application/javascript": [
322 |        "\n",
323 |        "                var cell = IPython.notebook.insert_cell_above('code');\n",
324 |        "                window.firstCellIndex = IPython.notebook.find_cell_index(cell);\n",
325 |        "                cell.set_text(\"# import necessary libraries\\nimport matplotlib.pyplot as plt\\nfrom sklearn.metrics import precision_recall_curve\\n\\n# calculate precision-recall curve\\nprecision, recall, _ = precision_recall_curve(y_true, y_scores)\\n\\n# plot the curve\\nplt.plot(recall, precision)\\nplt.xlabel('Recall')\\nplt.ylabel('Precision')\\nplt.title('Precision-Recall Curve')\\nplt.show()\");\n",
326 |        "            "
327 |       ],
328 |       "text/plain": [
329 |        "<IPython.core.display.Javascript object>"
330 |       ]
331 |      },
332 |      "metadata": {},
333 |      "output_type": "display_data"
334 |     },
335 |     {
336 |      "data": {
337 |       "application/javascript": [
338 |        "\n",
339 |        "                var cell = IPython.notebook.insert_cell_at_index('markdown', window.firstCellIndex + 1);\n",
340 |        "                cell.set_text(\"The precision-recall curve is a useful tool for evaluating the performance of a binary classification model. It shows the trade-off between precision and recall for different threshold values of the model's predicted probabilities. A high precision means that the model is making few false positive predictions, while a high recall means that the model is capturing most of the positive cases. The ideal model would have both high precision and high recall, but in practice there is often a trade-off between the two.\");\n",
341 |        "                window.firstCellIndex = IPython.notebook.find_cell_index(cell);\n",
342 |        "                "
343 |       ],
344 |       "text/plain": [
345 |        "<IPython.core.display.Javascript object>"
346 |       ]
347 |      },
348 |      "metadata": {},
349 |      "output_type": "display_data"
350 |     }
351 |    ],
352 |    "source": [
353 |     "%%code\n",
354 |     "# plot the precision-recall curve for Catboost predictions"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": null,
360 |    "id": "faf9f106",
361 |    "metadata": {},
362 |    "outputs": [],
363 |    "source": [
364 |     "# import necessary libraries\n",
365 |     "import matplotlib.pyplot as plt\n",
366 |     "from sklearn.metrics import precision_recall_curve\n",
367 |     "\n",
368 |     "# calculate precision-recall curve\n",
369 |     "precision, recall, _ = precision_recall_curve(y_true, y_scores)\n",
370 |     "\n",
371 |     "# plot the curve\n",
372 |     "plt.plot(recall, precision)\n",
373 |     "plt.xlabel('Recall')\n",
374 |     "plt.ylabel('Precision')\n",
375 |     "plt.title('Precision-Recall Curve')\n",
376 |     "plt.show()"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "markdown",
381 |    "id": "75eac704",
382 |    "metadata": {},
383 |    "source": [
384 |     "The precision-recall curve is a useful tool for evaluating the performance of a binary classification model. It shows the trade-off between precision and recall for different threshold values of the model's predicted probabilities. A high precision means that the model is making few false positive predictions, while a high recall means that the model is capturing most of the positive cases. The ideal model would have both high precision and high recall, but in practice there is often a trade-off between the two."
385 |    ]
386 |   }
387 |  ],
388 |  "metadata": {
389 |   "kernelspec": {
390 |    "display_name": "Python 3 (ipykernel)",
391 |    "language": "python",
392 |    "name": "python3"
393 |   },
394 |   "language_info": {
395 |    "codemirror_mode": {
396 |     "name": "ipython",
397 |     "version": 3
398 |    },
399 |    "file_extension": ".py",
400 |    "mimetype": "text/x-python",
401 |    "name": "python",
402 |    "nbconvert_exporter": "python",
403 |    "pygments_lexer": "ipython3",
404 |    "version": "3.9.13"
405 |   }
406 |  },
407 |  "nbformat": 4,
408 |  "nbformat_minor": 5
409 | }
410 | 


--------------------------------------------------------------------------------
/notebook_copilot/__init__.py:
--------------------------------------------------------------------------------
1 | from .notebook_copilot import load_ipython_extension


--------------------------------------------------------------------------------
/notebook_copilot/agents.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from enum import Enum
 3 | 
 4 | from langchain.agents import initialize_agent, AgentType
 5 | from langchain.chat_models import ChatOpenAI
 6 | from langchain.experimental import load_chat_planner, load_agent_executor, PlanAndExecute
 7 | from langchain.memory import ConversationSummaryBufferMemory
 8 | 
 9 | from notebook_copilot.handlers import ProgressHandler
10 | from notebook_copilot.tools import UserInputTool, AddNotebookCellsTool
11 | 
12 | 
class AgentStrategy(Enum):
    """Names of the agent strategies that can be selected.

    Members are looked up by their upper-case name (see
    ``agent_strategy_type``); each value is the lower-case spelling of
    that name.
    """

    # NOTE(review): presumably chain-of-thought / tree-of-thought / plan-and-execute
    # -- confirm against the code that consumes these members.
    COT = "cot"
    TOT = "tot"
    PLAN_EXECUTE = "plan_execute"
17 | 
18 | 
def agent_strategy_type(strategy):
    """Convert a strategy name into the matching ``AgentStrategy`` member.

    The lookup is case-insensitive: the input is upper-cased and matched
    against the enum member names.

    Args:
        strategy: Strategy name as a string, e.g. ``"cot"``.

    Returns:
        The ``AgentStrategy`` member whose name equals ``strategy.upper()``.

    Raises:
        argparse.ArgumentTypeError: If no member has that name.
    """
    member_name = strategy.upper()
    try:
        resolved = AgentStrategy[member_name]
    except KeyError as e:
        raise argparse.ArgumentTypeError(f"Unknown strategy {strategy}") from e
    return resolved
24 | 
25 | 
def get_cot_notebook_agent(llm):
    """Create the structured-chat ReAct notebook agent driven by ``llm``.

    Args:
        llm: Language model handed both to the agent and to its
            conversation memory.

    Returns:
        Whatever ``initialize_agent`` returns for the configuration below.
    """
    notebook_tools = [AddNotebookCellsTool(), UserInputTool]
    # Conversation memory capped at 1000 tokens; it receives the same llm.
    memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=1000)
    agent_config = dict(
        agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
        tools=notebook_tools,
        llm=llm,
        callbacks=[ProgressHandler()],
        max_iterations=15,
        early_stopping_method='generate',
        handle_parsing_errors=True,
        memory=memory,
    )
    return initialize_agent(**agent_config)
42 | 
43 | 
def get_plan_execute_notebook_agent():
    """Create a ``PlanAndExecute`` agent that works with the notebook tools.

    A single ``ChatOpenAI`` model (temperature 0) backs both the planner
    and the executor.

    Returns:
        A ``PlanAndExecute`` instance wired with that planner and executor.
    """
    notebook_tools = [AddNotebookCellsTool(), UserInputTool]
    chat_model = ChatOpenAI(temperature=0)
    return PlanAndExecute(
        planner=load_chat_planner(chat_model),
        executor=load_agent_executor(chat_model, notebook_tools),
    )
50 | 


--------------------------------------------------------------------------------
/notebook_copilot/chains.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import os
 4 | 
 5 | from langchain import LLMChain
 6 | from langchain.chat_models import ChatOpenAI
 7 | 
 8 | from notebook_copilot.models import CellCompletion, CellCompletionList, CompletionType
 9 | from notebook_copilot.parsers import code_completion_parser, markdown_completion_parser
10 | from notebook_copilot.prompts import markdown_explain_prompt_template, \
11 |     code_generation_prompt_template, code_optimization_prompt_template, code_visualization_prompt_template
12 | 
# Dispatch table used by get_cells_completion: each completion type maps to
# the (prompt template, output parser) pair that handles it.
completion_type_to_prompt_template = {
    CompletionType.CODE: (
        code_generation_prompt_template,
        code_completion_parser,
    ),
    CompletionType.EXPLAIN: (
        markdown_explain_prompt_template,
        markdown_completion_parser,
    ),
    CompletionType.OPTIMIZE: (
        code_optimization_prompt_template,
        code_completion_parser,
    ),
    CompletionType.VISUALIZE: (
        code_visualization_prompt_template,
        code_completion_parser,
    ),
}
19 | 
20 | 
def get_llm(model_name='gpt-3.5-turbo', key=None):
    """Build the ``ChatOpenAI`` chat model used to generate cells.

    Args:
        model_name: Name of the OpenAI chat model to use.
        key: OpenAI API key. When omitted, the key is read from the
            ``OPENAI_API_KEY`` environment variable, falling back to
            ``LANGCHAIN_API_KEY`` (the variable this function originally
            read) for backward compatibility.

    Returns:
        A ``ChatOpenAI`` instance configured with temperature 0.0, a
        1000-token completion limit, the resolved key and ``model_name``.

    Raises:
        ValueError: If no key is passed and neither environment variable
            holds a non-empty value.
    """
    if key is None:
        # OPENAI_API_KEY is the variable the README and CI workflow set;
        # LANGCHAIN_API_KEY is kept only so existing setups keep working.
        key = os.getenv('OPENAI_API_KEY') or os.getenv('LANGCHAIN_API_KEY')
    if not key:
        raise ValueError(
            'API key not provided and neither the OPENAI_API_KEY nor the '
            'LANGCHAIN_API_KEY environment variable is set.'
        )
    # Pass the settings to the constructor rather than assigning them
    # afterwards, so the model is validated and built with the resolved key
    # instead of whatever happens to be in the environment at that moment.
    return ChatOpenAI(
        model_name=model_name,
        temperature=0.0,
        max_tokens=1000,
        openai_api_key=key,
    )
33 | 
34 | 
def get_cells_completion(llm, completion_type, run_history, cell=None) -> CellCompletionList | CellCompletion:
    """Ask the model for a completion and parse its reply.

    Args:
        llm: Chat model used to run the prompt.
        completion_type: Key into ``completion_type_to_prompt_template`` that
            selects the prompt template and the output parser.
        run_history: Text of the notebook run history, passed to the prompt.
        cell: Source text of the current cell; ``None`` is sent as an empty string.

    Returns:
        Whatever the selected output parser produces from the model output.

    Raises:
        KeyError: If ``completion_type`` has no registered template.
    """
    prompt, output_parser = completion_type_to_prompt_template[completion_type]
    chain = LLMChain(llm=llm, prompt=prompt)
    # Send the cell text itself. It used to be wrapped in a set literal
    # (``{cell}``), so the prompt received the set's repr -- braces, quotes and
    # escaped newlines -- instead of the actual source.
    cell_text = '' if cell is None else cell
    output = chain.predict(run_history=run_history, cell=cell_text)
    return output_parser.parse(output)
41 | 


--------------------------------------------------------------------------------
/notebook_copilot/context.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from IPython import get_ipython
 3 | from langchain import FAISS
 4 | from langchain.document_transformers import EmbeddingsRedundantFilter
 5 | from langchain.embeddings import OpenAIEmbeddings
 6 | from langchain.retrievers import ContextualCompressionRetriever
 7 | from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline
 8 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 9 | 
10 | 
def get_ipython_run_history():
    """Return the IPython history as text, one history entry per line.

    The most recent entry is dropped before formatting. For each remaining
    entry, the items of its third field are stringified and joined with
    ``', '``.
    """
    history_manager = get_ipython().history_manager
    entries = list(history_manager.get_range_by_str('', output=True))
    formatted = []
    for entry in entries[:-1]:
        # entry[2] is requested with output=True; presumably an
        # (input, output) pair -- verify against IPython's HistoryManager.
        formatted.append(', '.join(str(part) for part in entry[2]))
    return '\n'.join(formatted)
17 | 
18 | 
def get_pandas_dataframes():
    """Collect every pandas DataFrame bound in the IPython user namespace.

    Returns:
        A list of the DataFrame objects, in namespace iteration order.
    """
    namespace = get_ipython().user_ns
    return [obj for obj in namespace.values() if isinstance(obj, pd.DataFrame)]
26 | 
27 | 
def compress_notebook_context(notebook_documents, query="What is the purpose of this notebook?",
                              similarity_threshold=0.56):
    """Reduce loaded notebook documents to the chunks most relevant to ``query``.

    The documents are split into chunks of at most 1000 characters (preferring
    cell boundaries), indexed in a FAISS vector store, and retrieved through a
    compression pipeline that re-splits, drops redundant chunks and keeps only
    chunks whose embedding similarity passes ``similarity_threshold``.

    Args:
        notebook_documents: Documents loaded from the notebook.
        query: Question used to select the relevant chunks.
        similarity_threshold: Threshold handed to the relevance filter.

    Returns:
        The list of relevant documents; an empty list when there is nothing
        to index.
    """
    # An empty input cannot be embedded or indexed; bail out before touching
    # the embedding service.
    if not notebook_documents:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["'markdown' cell:", "'code' cell:", "\n\n", "\n", " ", ""],
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    texts = text_splitter.split_documents(notebook_documents)
    if not texts:
        return []

    # One embeddings client is enough for both the index and the filters.
    embeddings = OpenAIEmbeddings()
    retriever = FAISS.from_documents(texts, embeddings).as_retriever()
    redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
    relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=similarity_threshold)
    pipeline_compressor = DocumentCompressorPipeline(
        transformers=[text_splitter, redundant_filter, relevant_filter]
    )
    compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor,
                                                           base_retriever=retriever)

    return compression_retriever.get_relevant_documents(query)
49 | 


--------------------------------------------------------------------------------
/notebook_copilot/handlers.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Dict
 2 | 
 3 | from langchain.callbacks.base import BaseCallbackHandler
 4 | from langchain.schema import AgentAction, AgentFinish
 5 | from tqdm import tqdm
 6 | 
 7 | 
 8 | class ProgressHandler(BaseCallbackHandler):
 9 |     def __init__(self):
10 |         self.pbar = None
11 | 
12 |     def on_chain_start(
13 |             self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any
14 |     ) -> Any:
15 |         """Run when chain starts running."""
16 |         print("Running chain...")
17 |         self.pbar = tqdm(total=None)
18 |         self.pbar.update()
19 | 
20 |     def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
21 |         """Run on agent action."""
22 |         self.pbar.update()
23 | 
24 |     def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> Any:
25 |         """Run on agent end."""
26 |         self.pbar.close()
27 | 


--------------------------------------------------------------------------------
/notebook_copilot/models.py:
--------------------------------------------------------------------------------
 1 | import enum
 2 | from typing import List
 3 | 
 4 | from pydantic import BaseModel, Field
 5 | 
 6 | 
# Kinds of Jupyter notebook cell. The string values are what the output
# helpers interpolate into the notebook JavaScript (via ``cell_type.value``).
class CellType(enum.Enum):
    CODE = "code"
    MARKDOWN = "markdown"
    RAW = "raw"
11 | 
12 | 
# Kinds of completion that can be requested; these members are the keys of
# ``completion_type_to_prompt_template`` in chains.py.
class CompletionType(enum.Enum):
    CODE = "code"
    EXPLAIN = "explain"
    OPTIMIZE = "optimize"
    VISUALIZE = "visualize"
18 | 
19 | 
# A single notebook cell to be created: its type plus its source text.
# (Comment rather than docstring, so the model's generated schema is unchanged.)
class CellCompletion(BaseModel):
    cell_type: CellType = Field(description="type of jupyter notebook cell")
    source: str = Field(description="code or markdown text of jupyter notebook cell")
23 | 
24 | 
# Parsed model reply holding markdown text; target type of
# ``markdown_completion_parser`` in parsers.py.
class MarkdownCompletion(BaseModel):
    source: str = Field(
        description="string with valid markdown syntax.")
28 | 
29 | 
# Parsed model reply holding Python source; target type of
# ``code_completion_parser`` in parsers.py.
class CodeCompletion(BaseModel):
    source: str = Field(
        description="string in valid python syntax")
33 | 
34 | 
# An ordered batch of cells to create; target type of
# ``multiple_cells_completion_parser`` in parsers.py.
class CellCompletionList(BaseModel):
    cells: List[CellCompletion]
37 | 


--------------------------------------------------------------------------------
/notebook_copilot/notebook_copilot.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import contextlib
  3 | import os
  4 | 
  5 | from IPython.core.magic import Magics, magics_class, line_magic, cell_magic
  6 | from langchain.document_loaders import NotebookLoader
  7 | 
  8 | from notebook_copilot.agents import get_cot_notebook_agent, AgentStrategy, agent_strategy_type, \
  9 |     get_plan_execute_notebook_agent
 10 | from notebook_copilot.chains import get_llm, get_cells_completion
 11 | from notebook_copilot.context import get_ipython_run_history, compress_notebook_context
 12 | from notebook_copilot.models import CellType, CellCompletion, CompletionType
 13 | from notebook_copilot.output import reset_first_cell_index, generate_notebook_cell_above, \
 14 |     generate_notebook_cell_below
 15 | from notebook_copilot.prompts import COPILOT_PERSONA, COPILOT_TASK, COPILOT_DIRECTIONS
 16 | from notebook_copilot.utils import stringify_docs, check_environment, JupyterEnvironment
 17 | 
 18 | 
 19 | @magics_class
 20 | class CopilotMagics(Magics):
 21 |     def build_llm_from_args(self, line):
 22 |         parser = argparse.ArgumentParser()
 23 |         parser.add_argument('-m', '--model', default='gpt-3.5-turbo',
 24 |                             help='Model name to use.  e.g gpt-3.5-turbo')
 25 | 
 26 |         args = parser.parse_args(line.split())
 27 | 
 28 |         api_key = os.getenv('OPENAI_API_KEY')
 29 |         if api_key is None:
 30 |             raise ValueError('API key not provided and OPENAI_API_KEY environment variable is not set.')
 31 | 
 32 |         return get_llm(key=api_key, model_name=args.model)
 33 | 
 34 |     @cell_magic
 35 |     def code(self, line, cell):
 36 |         with contextlib.suppress(KeyboardInterrupt):
 37 |             run_history = get_ipython_run_history()
 38 |             cell = cell.replace(line, '')
 39 |             code_completion = get_cells_completion(self.build_llm_from_args(line), CompletionType.CODE, run_history,
 40 |                                                    cell=cell)
 41 |             generate_notebook_cell_below(CellCompletion(cell_type=CellType.CODE, source=code_completion.source))
 42 | 
 43 |     @cell_magic
 44 |     def optimize(self, line, cell):
 45 |         with contextlib.suppress(KeyboardInterrupt):
 46 |             run_history = get_ipython_run_history()
 47 |             cell = cell.replace(line, '')
 48 |             code_completion = get_cells_completion(self.build_llm_from_args(line), CompletionType.OPTIMIZE, run_history,
 49 |                                                    cell=cell)
 50 |             generate_notebook_cell_below(CellCompletion(cell_type=CellType.CODE, source=code_completion.source))
 51 | 
 52 |     @cell_magic
 53 |     def visualize(self, line, cell):
 54 |         with contextlib.suppress(KeyboardInterrupt):
 55 |             run_history = get_ipython_run_history()
 56 |             cell = cell.replace(line, '')
 57 |             code_completion = get_cells_completion(self.build_llm_from_args(line), CompletionType.VISUALIZE,
 58 |                                                    run_history, cell=cell)
 59 |             generate_notebook_cell_below(CellCompletion(cell_type=CellType.CODE, source=code_completion.source))
 60 | 
 61 |     @cell_magic
 62 |     def explain(self, line, cell):
 63 |         with contextlib.suppress(KeyboardInterrupt):
 64 |             run_history = get_ipython_run_history()
 65 |             cell = cell.replace(line, '')
 66 |             md_completion = get_cells_completion(self.build_llm_from_args(line), CompletionType.EXPLAIN, run_history,
 67 |                                                  cell=cell)
 68 |             generate_notebook_cell_above(CellCompletion(cell_type=CellType.MARKDOWN, source=md_completion.source))
 69 | 
 70 |     @line_magic
 71 |     def copilot(self, line):
 72 |         with contextlib.suppress(KeyboardInterrupt):
 73 |             parser = argparse.ArgumentParser()
 74 |             parser.add_argument(
 75 |                 '-s',
 76 |                 '--strategy',
 77 |                 type=agent_strategy_type,
 78 |                 default=AgentStrategy.COT,
 79 |                 help='The strategy to use. one of: cot, tot, plan_execute',
 80 |             )
 81 |             parser.add_argument('-n', '--notebook-name', default=None,
 82 |                                 help='the current name e.g -n Untitled')
 83 | 
 84 |             args = parser.parse_args(line.split())
 85 |             notebook_docs = None
 86 |             if args.notebook_name and check_environment() is JupyterEnvironment.JUPYTER_NOTEBOOK:
 87 |                 notebook_path = os.path.join(os.getcwd(), f"{args.notebook_name}.ipynb")
 88 |                 notebook = NotebookLoader(notebook_path, remove_newline=True).load()
 89 |                 notebook_docs = compress_notebook_context(notebook)
 90 |             context = f'here\'s the relevant notebook text:\n"{stringify_docs(notebook_docs)}"' if notebook_docs else f'here\'s the notebook run history:\n"{get_ipython_run_history()}'
 91 |             user_input = input("What do you want to accomplish with the Jupyter notebook?")
 92 |             agent_prompt = f"{COPILOT_PERSONA}\n\nyour task:{COPILOT_TASK}\n\ndirections:{COPILOT_DIRECTIONS}\n\ncontext:{context} user_input:{user_input}"
 93 |             reset_first_cell_index()
 94 |             if args.strategy in [AgentStrategy.COT, AgentStrategy.TOT]:
 95 |                 cot_agent = get_cot_notebook_agent(self.build_llm_from_args(line))
 96 |                 cot_agent.run(agent_prompt)
 97 |             elif args.strategy == AgentStrategy.PLAN_EXECUTE:
 98 |                 plan_execute_agent = get_plan_execute_notebook_agent()
 99 |                 plan_execute_agent.run(agent_prompt)
100 | 
101 | 
def load_ipython_extension(ipython):
    """Extension entry point, called when the extension is loaded.

    Registers the ``CopilotMagics`` class on the given shell.
    """
    magics_class_to_register = CopilotMagics
    ipython.register_magics(magics_class_to_register)
107 | 


--------------------------------------------------------------------------------
/notebook_copilot/output.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import json
 4 | from typing import List
 5 | 
 6 | from IPython.core.display import Javascript
 7 | from IPython.core.display_functions import display
 8 | 
 9 | from notebook_copilot.models import CellCompletion
10 | 
11 | 
def generate_notebook_cells(completions: List[CellCompletion] | List[dict]):
    """Insert a batch of new cells into the notebook, in order.

    The first cell is inserted with ``insert_cell_above`` and its index stored
    in ``window.firstCellIndex``; each following cell is inserted right after
    the previously inserted one.

    Args:
        completions: Cells to create, either ``CellCompletion`` objects or
            dicts with ``source`` and ``cell_type`` keys. ``None`` or an empty
            list is a no-op.
    """
    # Nothing to insert; indexing completions[0] below would otherwise fail.
    if not completions:
        return

    # Normalize dict inputs (including dict subclasses) to CellCompletion.
    cells = [
        CellCompletion(source="".join(item['source']), cell_type=item["cell_type"])
        if isinstance(item, dict) else item
        for item in completions
    ]
    first_completion, remaining = cells[0], cells[1:]

    # json.dumps yields a valid, fully escaped JavaScript string literal.
    cell_content_as_json = json.dumps("".join(first_completion.source))
    cell_type = first_completion.cell_type.value
    display(Javascript(f"""
                var cell = IPython.notebook.insert_cell_above('{cell_type}');
                window.firstCellIndex = IPython.notebook.find_cell_index(cell);
                cell.set_text({cell_content_as_json});
            """))
    for completion in remaining:
        cell_content_as_json = json.dumps("".join(completion.source))
        cell_type = completion.cell_type.value
        # "+ 1" places the new cell directly after the last inserted one.
        display(Javascript(f"""
                var cell = IPython.notebook.insert_cell_at_index('{cell_type}', window.firstCellIndex + 1);
                cell.set_text({cell_content_as_json});
                window.firstCellIndex = IPython.notebook.find_cell_index(cell);
                """))
31 | 
32 | 
def generate_notebook_cell_below(completion: CellCompletion):
    """Insert one new cell at the index of the currently selected cell.

    The cell text is embedded as a JSON string literal in the JavaScript that
    is displayed to the notebook front end.
    """
    text_literal = json.dumps(completion.source)
    kind = completion.cell_type.value
    script = f"""
                        var selected_cell_index = IPython.notebook.get_selected_index();
                        var cell = IPython.notebook.insert_cell_at_index('{kind}', selected_cell_index);
                        cell.set_text({text_literal});
                        """
    display(Javascript(script))
41 | 
42 | 
def generate_notebook_cell_above(completion: CellCompletion):
    """Insert one new cell one position before the currently selected cell.

    The cell text is embedded as a JSON string literal in the JavaScript that
    is displayed to the notebook front end.
    """
    text_literal = json.dumps(completion.source)
    kind = completion.cell_type.value
    script = f"""
                    var selected_cell_index = IPython.notebook.get_selected_index();
                    var cell = IPython.notebook.insert_cell_at_index('{kind}', selected_cell_index - 1);
                    cell.set_text({text_literal});
                    """
    display(Javascript(script))
51 | 
52 | 
def reset_first_cell_index():
    """Clear ``window.firstCellIndex`` in the notebook front end.

    ``generate_notebook_cells`` stores the index of the last inserted cell in
    that global; this resets it before a new agent run.
    """
    # Assignment, not comparison: ``===`` only evaluated an expression and
    # left the stored index untouched.
    display(Javascript("""window.firstCellIndex = undefined;"""))
55 | 


--------------------------------------------------------------------------------
/notebook_copilot/parsers.py:
--------------------------------------------------------------------------------
1 | from langchain.output_parsers import PydanticOutputParser
2 | 
3 | from notebook_copilot.models import CellCompletion, MarkdownCompletion, CodeCompletion, CellCompletionList
4 | 
# Module-level output parsers, one per response model. Each turns the model's
# text reply into an instance of the given pydantic class.
cell_completion_parser = PydanticOutputParser(pydantic_object=CellCompletion)
code_completion_parser = PydanticOutputParser(pydantic_object=CodeCompletion)
markdown_completion_parser = PydanticOutputParser(pydantic_object=MarkdownCompletion)
multiple_cells_completion_parser = PydanticOutputParser(pydantic_object=CellCompletionList)
9 | 


--------------------------------------------------------------------------------
/notebook_copilot/prompts.py:
--------------------------------------------------------------------------------
 1 | from langchain import PromptTemplate
 2 | from langchain.prompts.chat import (
 3 |     SystemMessagePromptTemplate,
 4 | )
 5 | 
 6 | from notebook_copilot.parsers import multiple_cells_completion_parser, code_completion_parser
 7 | 
# Building blocks of the agent prompt: who the assistant is, what it has to
# do, and how it should go about it. They are concatenated with the context
# and the user input by the ``%copilot`` magic.
COPILOT_PERSONA = "You are a senior Data Scientist that writes professional and readable jupyter notebooks for the user. You are helpful, polite, honest, sophisticated, and humble-but-knowledgeable. You follow Data science best practices."
COPILOT_TASK = "Begin by understanding what the user want to accomplish. Then help the user by creating new jupyter notebook code & markdown cells to continue this notebook based on ML and Data science best practices with clean code and descriptive markdown."
COPILOT_DIRECTIONS = "Use the provided context and user input to continue this notebook. Be very thorough - before each code cell you create, add descriptive markdown cells to elaborate on the flow and code to make the notebook readable and professional. Before finishing your actions - ask the user if there's something more they want to accomplish."
11 | 
# Hand-written format instructions for the markdown "explain" prompt: the
# expected JSON schema plus an example reply. Injected as a partial variable,
# so the braces below are not treated as template placeholders.
# NOTE(review): this is a regular (non-raw) string, so the ``\n`` sequences in
# the example become real newlines in the prompt -- confirm that is intended.
explain_format_instructions = """
The output should be formatted as a JSON instance that conforms to the JSON schema below.

Here is the output schema:
```
{"properties": {"source": {"title": "Source", "description": "string with valid markdown syntax.", "type": "string"}}, "required": ["source"]}
```

example response: {"source": "## Data Cleaning\nIn the next cell, we clean the data using the following methods:\n* Removing duplicates\n Missing values imputation"}
"""
22 | 
# Prompt asking for the next notebook cells (a list of cells) given the run
# history and the current cell. Persona and format instructions are pre-filled.
cells_completion_prompt_template = PromptTemplate(
    input_variables=["run_history", "cell"],
    template="{copilot_persona}\nContext:\nrun history: {run_history}.\n current cell: {cell}\nYour task: Putting most weight on the current cell and code comments, Generate the next Jupyter notebook cells by replying in the following format: {format_instructions}",
    partial_variables={"copilot_persona": COPILOT_PERSONA,
                       "format_instructions": multiple_cells_completion_parser.get_format_instructions()},
)
29 | 
# Prompt asking for a short markdown explanation of a code cell. Only ``cell``
# is a declared input; the format instructions are the hand-written ones above.
# NOTE(review): the double quote opened before {format_instructions} is never
# closed in the template text -- confirm whether that is intended.
markdown_explain_prompt_template = PromptTemplate(
    input_variables=["cell"],
    template='You are a professional data science code/pipeline/process instructor and storyteller. You follow Data science best practices and generate beautifully formatted Jupyter Notebook markdown cell (up to 4 lines. no code inside.) to explain the flow in the input code cell.\n\nReply with your output according to the directions below.\nDirections: "{format_instructions}\n\nHere is the code cell source code to explain:\n{cell}.',
    partial_variables={"format_instructions": explain_format_instructions},
)
35 | 
# Prompt asking for Python code that fulfils the request written in the cell;
# the run history is appended as optional context.
code_generation_prompt_template = PromptTemplate(
    input_variables=["run_history", "cell"],
    template="You are a data science code generator. You follow Data science best practices and generate Jupyter notebook code cells based on user requests.\n\nGenerate python code based on the user input\nuser input: {cell}.\n\nReply with your output according to the directions below.\nDirections: {format_instructions}\n\nipython run history for context (optional): {run_history}.",
    partial_variables={"format_instructions": code_completion_parser.get_format_instructions()},
)
41 | 
# Prompt asking for a performance-optimized rewrite of the code cell. Only
# ``cell`` is a declared input; the run history is not part of this template.
code_optimization_prompt_template = PromptTemplate(
    input_variables=["cell"],
    template="You are an experienced software engineer. Please help me improve the time complexity of the the following code cell\n{cell}.\n\nGenerate python code that is optimized for performance.\n\nReply with your output according to the directions below.\nDirections: {format_instructions}\n\n",
    partial_variables={"format_instructions": code_completion_parser.get_format_instructions()},
)
47 | 
# Prompt asking for plotting code that visualizes the data used in the code
# cell; the run history is appended as optional context.
code_visualization_prompt_template = PromptTemplate(
    input_variables=["run_history", "cell"],
    template="You are a data science code generator. You follow Data science best practices and generate Jupyter notebook code cells based on user requests.\n\nGenerate python code to visualize the data in the following code block. you can use seaborn, matplotlib etc…\ncode block: {cell}.\n\nReply with your output according to the directions below.\nDirections: {format_instructions}\n\nipython run history for context (optional): {run_history}.",
    partial_variables={"format_instructions": code_completion_parser.get_format_instructions()},
)
53 | 
54 | 
55 | 
# System-message template carrying the copilot persona text.
copilot_system_message_prompt = SystemMessagePromptTemplate.from_template(COPILOT_PERSONA)
57 | 


--------------------------------------------------------------------------------
/notebook_copilot/tools.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from IPython import get_ipython
 4 | from langchain.tools import BaseTool, Tool
 5 | 
 6 | from notebook_copilot.context import get_ipython_run_history
 7 | from notebook_copilot.models import CellType, CellCompletion, CellCompletionList
 8 | from notebook_copilot.output import generate_notebook_cell_below, generate_notebook_cells
 9 | 
10 | 
# Agent tool that inserts a batch of new cells into the notebook.
class AddNotebookCellsTool(BaseTool):
    name = "Notebook New Cells"
    description = (
        "Use this tool when you want to create new cells in the notebook. "
        "To use the tool you must provide the following parameter: 'cells' according to the CellCompletionList model."
    )

    def _run(
            self,
            cells: Optional[CellCompletionList] = None,
    ):
        """Create the given cells in the notebook.

        Args:
            cells: A ``CellCompletionList``, or a plain list of cell
                completions / dicts accepted by ``generate_notebook_cells``.

        Returns:
            A short status message for the agent.
        """
        if isinstance(cells, CellCompletionList):
            cells = cells.cells
        # A missing or empty argument used to crash the agent run; report it
        # back to the agent instead so it can retry with proper input.
        if not cells:
            return "no cells were provided, so no cells were created"
        generate_notebook_cells(cells)
        return "created new cells"

    def _arun(self, query: str):
        """Async execution is not supported by this tool."""
        raise NotImplementedError("This tool does not support async")
28 | 
29 | 
# Agent tool that inserts a single new code cell into the notebook.
class NewCodeCellTool(BaseTool):
    name = "Notebook New Code Cell"
    description = (
        "use this tool when you want to create a new Code cell in the notebook. "
        "To use the tool you must provide the following parameter: 'content' (string)"
    )

    def _run(
            self,
            content: Optional[str] = None,
    ):
        """Create a code cell containing ``content``.

        Args:
            content: Source code for the new cell; ``None`` creates an empty cell.

        Returns:
            A short status message for the agent.
        """
        # CellCompletion's fields are ``cell_type`` and ``source``; the former
        # ``type=`` / ``content=`` keywords always failed model validation.
        generate_notebook_cell_below(CellCompletion(cell_type=CellType.CODE, source=content or ""))
        return "created new code cell"

    def _arun(self, query: str):
        """Async execution is not supported by this tool."""
        raise NotImplementedError("This tool does not support async")
45 | 
46 | 
# Agent tool that inserts a single new markdown cell into the notebook.
class NewMarkdownCellTool(BaseTool):
    name = "Notebook New Markdown Cell"
    description = (
        "use this tool when you want to create a new Markdown cell in the notebook. "
        "To use the tool you must provide the following parameter: 'content' (string)."
    )

    def _run(
            self,
            content: Optional[str] = None,
    ):
        """Create a markdown cell containing ``content``.

        Args:
            content: Markdown text for the new cell; ``None`` creates an empty cell.

        Returns:
            A short status message for the agent.
        """
        # CellCompletion's fields are ``cell_type`` and ``source``; the former
        # ``type=`` / ``content=`` keywords always failed model validation.
        generate_notebook_cell_below(CellCompletion(cell_type=CellType.MARKDOWN, source=content or ""))
        return "created new markdown cell"

    def _arun(self, query: str):
        """Async execution is not supported by this tool."""
        raise NotImplementedError("This tool does not support async")
62 | 
63 | 
# Agent tool that returns the notebook's run history as text.
class NotebookHistoryTool(BaseTool):
    name = "Notebook Run History"
    description = (
        "use this tool when you need to know what has been ran in this notebook in the past."
    )

    def _run(
            self,
            tool_input: Optional[str] = None,
    ):
        """Return the IPython run history.

        Args:
            tool_input: Ignored. Accepted because agents hand the tool a
                string input, which a zero-argument ``_run`` cannot receive.
        """
        return get_ipython_run_history()

    def _arun(self, query: str):
        """Async execution is not supported by this tool."""
        raise NotImplementedError("This tool does not support async")
77 | 
78 | 
# Agent tool that reports the packages installed in the Python environment.
class InstalledPackagesTool(BaseTool):
    name = "Notebook Installed Packages"
    description = (
        "use this tool when you need to know what packages are installed in this python environment."
    )

    def _run(
            self,
            tool_input: Optional[str] = None,
    ):
        """Return the output of ``pip freeze`` as a newline-separated string.

        Args:
            tool_input: Ignored. Accepted because agents hand the tool a
                string input, which a zero-argument ``_run`` cannot receive.
        """
        # shell.system() only echoes the command output and returns None, so
        # the agent never saw the package list; getoutput() captures the lines.
        return '\n'.join(get_ipython().getoutput('pip freeze'))

    def _arun(self, query: str):
        """Async execution is not supported by this tool."""
        raise NotImplementedError("This tool does not support async")
92 | 
93 | 
# Tool wrapping the built-in ``input`` so an agent can prompt the user; the
# string the agent passes becomes the prompt shown to the user.
UserInputTool = Tool.from_function(
    func=input,
    name="User Input Tool",
    description="Use it to get user input and direction before acting."
)
99 | 


--------------------------------------------------------------------------------
/notebook_copilot/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from enum import Enum
 4 | from typing import List
 5 | 
 6 | from IPython import get_ipython
 7 | from langchain.schema import Document
 8 | 
 9 | 
class JupyterEnvironment(Enum):
    """Notebook hosting environments that ``check_environment`` can report."""
    GOOGLE_COLAB = 'Google Colab'
    JUPYTER_NOTEBOOK = 'Jupyter Notebook'
    DATABRICKS = 'Databricks'
    JUPYTER_LAB = 'Jupyter Lab'
    SAGEMAKER = 'Amazon SageMaker'
    UNKNOWN = 'Unknown environment'  # returned when no check matches
17 | 
18 | 
def check_environment():
    """Guess which notebook environment the code is running in.

    Checks run in a fixed order and the first match wins: the shell's string
    form mentioning ``google.colab``, a loaded ``databricks`` or ``sagemaker``
    module, the ``JUPYTER_SERVER_TYPE`` environment variable containing
    ``jpnotebook``, and finally a loaded ``IPython`` or ``ipykernel`` module.

    Returns:
        The matching ``JupyterEnvironment`` member, or ``UNKNOWN``.
    """
    loaded_modules = sys.modules

    if 'google.colab' in str(get_ipython()):
        return JupyterEnvironment.GOOGLE_COLAB
    if 'databricks' in loaded_modules:
        return JupyterEnvironment.DATABRICKS
    if 'sagemaker' in loaded_modules:
        return JupyterEnvironment.SAGEMAKER

    server_type = os.environ.get('JUPYTER_SERVER_TYPE', '').lower()
    if 'jpnotebook' in server_type:
        return JupyterEnvironment.JUPYTER_LAB

    if 'IPython' in loaded_modules or 'ipykernel' in loaded_modules:
        return JupyterEnvironment.JUPYTER_NOTEBOOK
    return JupyterEnvironment.UNKNOWN
32 | 
def pretty_print_docs(docs: List[Document]):
    """Print each document's content under a numbered header.

    Documents are numbered from 1 and separated by a line of 100 dashes.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))
35 | 
36 | 
def stringify_docs(docs: List[Document]):
    """Return the documents' page contents joined with newlines."""
    contents = (doc.page_content for doc in docs)
    return "\n".join(contents)
39 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools~=65.5.1
2 | ipython~=8.13.2
3 | langchain~=0.0.187
4 | openai~=0.27.7
5 | pandas~=2.0.2
6 | pydantic~=1.10.8
7 | pydantic~=1.10.8
8 | tiktoken~=0.4.0
9 | faiss-cpu~=1.7.4


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name='notebook_copilot',
 5 |     version='0.2.0',
 6 |     packages=find_packages(),
 7 |     description='The Bridge from Thoughts to Well-Crafted Jupyter Notebook',
 8 |     install_requires=[
 9 |         'setuptools~=65.5.1',
10 |         'ipython~=8.13.2',
11 |         'langchain~=0.0.187',
12 |         'pandas~=2.0.2',
13 |         'pydantic~=1.10.8',
14 |         'tiktoken~=0.4.0',
15 |         'ipywidgets==8.0.6',
16 |         'faiss-cpu~=1.7.4'
17 |     ],
18 |     python_requires='>=3.7.1, <4',
19 |     classifiers=[
20 |         "Development Status :: 3 - Alpha",
21 |         "Intended Audience :: Science/Research",
22 |         "Intended Audience :: Developers",
23 |         "Topic :: Software Development :: Libraries :: Python Modules",
24 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
25 |         "Topic :: Scientific/Engineering :: Information Analysis",
26 |         "License :: OSI Approved :: MIT License",
27 |         "Programming Language :: Python :: 3",
28 |         "Programming Language :: Python :: 3.7",
29 |         "Programming Language :: Python :: 3.8",
30 |         "Programming Language :: Python :: 3.9",
31 |         "Programming Language :: Python :: 3.10",
32 |         "Operating System :: OS Independent",
33 |         "Framework :: Jupyter",
34 |         "Natural Language :: English",
35 |     ],
36 |     author="Tal Peretz",
37 |     author_email="tp@aihumanlabs.com",
38 |     long_description=open('README.md').read(),
39 |     long_description_content_type='text/markdown',
40 |     url="https://github.com/talperetz/notebook_copilot",
41 |     license="MIT",
42 | )
43 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/talperetz/notebook-copilot/758d7eca8a41c0b1913f0ecac29c74f0e5afb744/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_copilot.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from IPython.testing.globalipapp import get_ipython
 3 | from dotenv import load_dotenv
 4 | 
 5 | from notebook_copilot.models import MarkdownCompletion, CodeCompletion
 6 | from notebook_copilot.notebook_copilot import CopilotMagics
 7 | 
# Load variables from a local .env file (if present) before the tests run,
# e.g. the OPENAI_API_KEY that the magics read from the environment.
load_dotenv()
10 | 
11 | 
@pytest.fixture(scope='module')
def ipython():
    """Module-scoped fixture providing the shell from IPython's test app."""
    shell = get_ipython()
    return shell
15 | 
16 | 
@pytest.fixture(scope='module')
def copilot_magic(ipython):
    """Module-scoped fixture: a ``CopilotMagics`` instance registered on the shell."""
    magics = CopilotMagics(shell=ipython)
    ipython.register_magics(magics)
    return magics
22 | 
23 | 
24 | from unittest.mock import Mock
25 | 
26 | 
def test_code_with_mock(ipython, copilot_magic, mocker):
    """``%%code`` should query the (mocked) LLM chain exactly once."""
    # Canned model reply: the JSON form of a valid code completion.
    canned_reply = CodeCompletion(source='print("Hello, world!")').json()
    fake_chain = Mock()
    fake_chain.predict.return_value = canned_reply

    # Every LLMChain(...) constructed in notebook_copilot.chains yields the fake.
    mocker.patch('notebook_copilot.chains.LLMChain', return_value=fake_chain)

    ipython.run_cell_magic('code', '', 'your test code here')

    fake_chain.predict.assert_called_once()
42 | 
43 | 
def test_optimize_with_mock(ipython, copilot_magic, mocker):
    """``%%optimize`` should query the (mocked) LLM chain exactly once."""
    # Canned model reply: the JSON form of a valid code completion.
    canned_reply = CodeCompletion(source='print("Hello, world!")').json()
    fake_chain = Mock()
    fake_chain.predict.return_value = canned_reply

    # Every LLMChain(...) constructed in notebook_copilot.chains yields the fake.
    mocker.patch('notebook_copilot.chains.LLMChain', return_value=fake_chain)

    ipython.run_cell_magic('optimize', '', 'your test code here')

    fake_chain.predict.assert_called_once()
59 | 
60 | 
def test_visualize_with_mock(ipython, copilot_magic, mocker):
    """``%%visualize`` should query the (mocked) LLM chain exactly once."""
    # Canned model reply: the JSON form of a valid code completion.
    canned_reply = CodeCompletion(source='print("Hello, world!")').json()
    fake_chain = Mock()
    fake_chain.predict.return_value = canned_reply

    # Every LLMChain(...) constructed in notebook_copilot.chains yields the fake.
    mocker.patch('notebook_copilot.chains.LLMChain', return_value=fake_chain)

    ipython.run_cell_magic('visualize', '', 'your test code here')

    fake_chain.predict.assert_called_once()
76 | 
77 | 
def test_explain_with_mock(ipython, copilot_magic, mocker):
    """``%%explain`` should query the (mocked) LLM chain exactly once."""
    # Canned model reply: the JSON form of a valid markdown completion.
    canned_reply = MarkdownCompletion(source='# This is a markdown cell').json()
    fake_chain = Mock()
    fake_chain.predict.return_value = canned_reply

    # Every LLMChain(...) constructed in notebook_copilot.chains yields the fake.
    mocker.patch('notebook_copilot.chains.LLMChain', return_value=fake_chain)

    ipython.run_cell_magic('explain', '', '#complex code')

    fake_chain.predict.assert_called_once()
92 | 


--------------------------------------------------------------------------------