├── setup.cfg ├── pandas_llm ├── pyvenv.cfg ├── requirements.txt ├── example.py ├── example-chatbot.py └── __init__.py ├── requirements.txt ├── LICENSE ├── setup.py ├── .gitignore └── README.md /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /pandas_llm/pyvenv.cfg: -------------------------------------------------------------------------------- 1 | home = /usr/local/opt/python@3.9/bin 2 | include-system-site-packages = false 3 | version = 3.9.16 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | async-timeout==4.0.2 4 | attrs==23.1.0 5 | certifi==2023.5.7 6 | charset-normalizer==3.1.0 7 | frozenlist==1.3.3 8 | idna==3.4 9 | multidict==6.0.4 10 | numpy==1.24.3 11 | openai==0.27.6 12 | pandas==2.0.1 13 | python-dateutil==2.8.2 14 | pytz==2023.3 15 | requests==2.30.0 16 | RestrictedPython==6.0 17 | six==1.16.0 18 | tqdm==4.65.0 19 | tzdata==2023.3 20 | urllib3==2.0.2 21 | yarl==1.9.2 22 | -------------------------------------------------------------------------------- /pandas_llm/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | async-timeout==4.0.2 4 | attrs==23.1.0 5 | certifi==2023.5.7 6 | charset-normalizer==3.1.0 7 | frozenlist==1.3.3 8 | idna==3.4 9 | multidict==6.0.4 10 | numpy==1.24.3 11 | openai==0.27.6 12 | pandas==2.0.1 13 | python-dateutil==2.8.2 14 | pytz==2023.3 15 | requests==2.30.0 16 | RestrictedPython==6.0 17 | six==1.16.0 18 | tqdm==4.65.0 19 | tzdata==2023.3 20 | urllib3==2.0.2 21 | yarl==1.9.2 22 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Dashy Dash 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pandas_llm/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 7 | from pandas_llm import PandasLLM 8 | 9 | # Data 10 | # Please note that these names, ages, and donations are randomly generated 11 | # and do not correspond to real individuals or their donations. 
12 | data = [('John Doe', 25, 50), 13 | ('Jane Smith', 38, 70), 14 | ('Alex Johnson', 45, 80), 15 | ('Jessica Brown', 60, 40), 16 | ('Michael Davis', 22, 90), 17 | ('Emily Wilson', 30, 60), 18 | ('Daniel Taylor', 35, 75), 19 | ('Sophia Moore', 40, 85), 20 | ('David Thomas', 50, 65), 21 | ('Olivia Jackson', 29, 55)] 22 | df = pd.DataFrame(data, columns=['name', 'age', 'donation']) 23 | 24 | conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY")) 25 | result = conv_df.prompt("What is the average donation of people older than 40 who donated more than $50?") 26 | code = conv_df.code_block 27 | 28 | print(f"Executing the following expression of type {type(result)}:\n{code}\n\nResult is:\n {result}\n") 29 | # Executing the following expression of type : 30 | # result = df.loc[(df['age'] > 40) & (df['donation'] > 50), 'donation'].mean() 31 | 32 | # Result is: 33 | # 72.5 34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # Reads the content of your README.md into a variable to be used in the setup below 4 | with open("README.md", "r", encoding="utf-8") as fh: 5 | long_description = fh.read() 6 | 7 | setup( 8 | name='pandas_llm', # should match the package folder 9 | version='0.0.6', # important for updates 10 | license='MIT', # should match your chosen license 11 | description='Conversational Pandas Dataframes', 12 | long_description=long_description, # loads your README.md 13 | long_description_content_type="text/markdown", # README.md is of type 'markdown' 14 | author='DashyDash', 15 | author_email='alessio@dashydash.com', 16 | url='https://github.com/DashyDashOrg/pandas-llm', 17 | project_urls = { # Optional 18 | "Bug Tracker": "https://github.com/DashyDashOrg/pandas-llm/issues" 19 | }, 20 | keywords=["pypi", "pandas-llm", "pandas", "llm", "ai", "openai", "chatgpt"], 
#descriptive meta-data 21 | packages=find_packages(), 22 | classifiers=[ # https://pypi.org/classifiers 23 | 'Development Status :: 3 - Alpha', 24 | 'Intended Audience :: Developers', 25 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Programming Language :: Python :: 3', 28 | "Operating System :: OS Independent", 29 | ], 30 | python_requires='>=3.6', 31 | install_requires=[ 32 | "aiohttp", 33 | "aiosignal", 34 | "async-timeout", 35 | "attrs", 36 | "certifi", 37 | "charset-normalizer", 38 | "frozenlist", 39 | "idna", 40 | "multidict", 41 | "numpy", 42 | "openai", 43 | "pandas", 44 | "python-dateutil", 45 | "pytz", 46 | "requests", 47 | "RestrictedPython", 48 | "six", 49 | "tqdm", 50 | "tzdata", 51 | "urllib3", 52 | "yarl", 53 | ], 54 | download_url="https://github.com/DashyDashOrg/pandas-llm/releases/tag/v0.0.6", 55 | ) -------------------------------------------------------------------------------- /pandas_llm/example-chatbot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | import sys 5 | from pathlib import Path 6 | sys.path.append(str(Path(__file__).resolve().parent.parent)) 7 | 8 | from pandas_llm import PandasLLM 9 | 10 | # Data 11 | # Please note that these names, ages, and donations are randomly generated and do not correspond to real individuals or their donations. 
12 | data = [('John Doe', 25, 50), 13 | ('Jane Smith', 38, 70), 14 | ('Alex Johnson', 45, 80), 15 | ('Jessica Brown', 60, 40), 16 | ('Michael Davis', 22, 90), 17 | ('Emily Wilson', 30, 60), 18 | ('Daniel Taylor', 35, 75), 19 | ('Sophia Moore', 40, 85), 20 | ('David Thomas', 50, 65), 21 | ('Olivia Jackson', 29, 55), 22 | ('Carlos García', 22, 50), 23 | ('Ana Rodriguez', 38, 70), 24 | ('Luis Hernandez', 45, 80), 25 | ('Sofia Martinez', 60, 40), 26 | ('Miguel Lopez', 22, 90), 27 | ('Isabella Gonzalez', 30, 60), 28 | ('Diego Perez', 35, 75), 29 | ('Maria Sanchez', 40, 85), 30 | ('Juan Pena', 50, 65), 31 | ('Gabriela Ramirez', 29, 55), 32 | ('Giovanni Rossi', 22, 50), 33 | ('Maria Bianchi', 38, 70), 34 | ('Luca Ferrari', 45, 80), 35 | ('Sofia Russo', 60, 40), 36 | ('Francesco Romano', 22, 90), 37 | ('Isabella Colombo', 30, 60), 38 | ('Alessandro Ricci', 35, 75), 39 | ('Giulia Marino', 40, 85), 40 | ('Antonio Greco', 50, 65), 41 | ('Gabriella Bruno', 29, 55)] 42 | 43 | # Create DataFrame 44 | df = pd.DataFrame(data, columns=['name', 'age', 'donation']) 45 | 46 | # Print DataFrame 47 | print(df) 48 | 49 | 50 | def main(): 51 | 52 | # Initialise library and set the OpenAI API key 53 | conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY")) 54 | print() 55 | banner = """ 56 | Welcome to the Donation Data CLI. 57 | The donation dataset has three columns (name, age, donation) 58 | Please note that these names, ages, and donations are randomly generated and do not correspond to real individuals or their donations. 59 | 60 | You can ask questions like: 61 | - show me the list of names 62 | - What is the average age of people who donated? 63 | - What is the average donation amount? 64 | - What is the average donation of people older than 30? 65 | - What is the average donation of people older than 30 who donated more than $50? 
66 | """ 67 | print(banner) 68 | 69 | while True: 70 | prompt = input("Enter your query (or 'exit' to quit): ") 71 | if prompt.lower() == "exit": 72 | break 73 | 74 | result = conv_df.prompt(prompt) 75 | code = conv_df.code_block 76 | print(f"Executing the following expression of type {type(result)}:\n{code}\n\nResult is:\n {result}\n") 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Dashy dash private stuff 163 | dashy/ 164 | temp.py 165 | tmp.py 166 | test.py 167 | temp/ 168 | bin/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pandas-LLM 2 | 3 | ## Introduction 4 | pandas-llm is a lightweight Python library that extends pandas to allow querying datasets using OpenAI prompts. This powerful tool leverages the natural language processing capabilities of OpenAI to offer intuitive, language-based querying of your Pandas dataframes. 
5 | 6 | ## Key Features 7 | - **Natural Language Querying**: With pandas-llm, you can execute complex Pandas queries using natural language prompts. Instead of writing code, you can express your query in plain language and obtain the desired results. 8 | 9 | - **Data Privacy**: Your data is not sent on the Internet. Pandas-LLM works locally with your data and uses openAI to create the query based on the dataframe columns and data types, not its content. 10 | 11 | - **Seamless Integration**: The library seamlessly integrates with your existing Pandas workflow. You can continue using normal Pandas functions and syntax while leveraging the added capability of natural language queries. 12 | 13 | - **Efficiency and Performance**: pandas-LLM is designed to deliver efficient and performant querying capabilities. It uses OpenAI's language model to process queries quickly and accurately, providing rapid insights from your data. 14 | 15 | - **Flexible and Expressive**: Whether you need to filter, aggregate, sort, or transform your data, pandas-LLM allows you to express your requirements flexibly and expressively. You can perform complex operations on your dataframes with ease using human-readable language. 16 | 17 | - **Intelligent Results**: The library returns the results of your queries in a concise and understandable format. You can extract valuable insights from your data without complex code or manual analysis. 18 | 19 | With pandas-llm, you can unlock the power of natural language querying and effortlessly execute complex pandas queries. Let the library handle the intricacies of data manipulation while you focus on gaining insights and making data-driven decisions. 20 | 21 | ## Installation 22 | 23 | Install pandas-llm using pip: 24 | 25 | ```shell 26 | pip install pandas-llm 27 | ``` 28 | 29 | ## Features 30 | - Query pandas dataframes using natural language prompts. 31 | - Leverage the power of OpenAI's language models in your data analysis. 
32 | - Seamless integration with existing pandas functions. 33 | 34 | ## Usage 35 | Here's a quick [example](https://github.com/DashyDashOrg/pandas-llm/blob/main/pandas_llm/example.py) of how to use pandas-llm: 36 | 37 | ```python 38 | import os 39 | import pandas as pd 40 | from pandas_llm import PandasLLM 41 | 42 | # Data 43 | # Please note that these names, ages, and donations are randomly generated 44 | # and do not correspond to real individuals or their donations. 45 | data = [('John Doe', 25, 50), 46 | ('Jane Smith', 38, 70), 47 | ('Alex Johnson', 45, 80), 48 | ('Jessica Brown', 60, 40), 49 | ('Michael Davis', 22, 90), 50 | ('Emily Wilson', 30, 60), 51 | ('Daniel Taylor', 35, 75), 52 | ('Sophia Moore', 40, 85), 53 | ('David Thomas', 50, 65), 54 | ('Olivia Jackson', 29, 55)] 55 | df = pd.DataFrame(data, columns=['name', 'age', 'donation']) 56 | 57 | conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY")) 58 | result = conv_df.prompt("What is the average donation of people older than 40 who donated more than $50?") 59 | code = conv_df.code_block 60 | 61 | print(f"Executing the following expression of type {type(result)}:\n{code}\n\nResult is:\n {result}\n") 62 | # Executing the following expression of type : 63 | # result = df.loc[(df['age'] > 40) & (df['donation'] > 50), 'donation'].mean() 64 | 65 | # Result is: 66 | # 72.5 67 | 68 | ``` 69 | 70 | There is also a chatbot available in the repository using the same dataset. 71 | Look at [Chatbot example](https://github.com/DashyDashOrg/pandas-llm/blob/main/pandas_llm/example-chatbot.py) 72 | 73 | ## PandasLLM Class Constructor 74 | 75 | The constructor for the PandasLLM class has been enhanced in this release to provide more flexibility and control over the language model interaction. The constructor accepts the following arguments: 76 | 77 | **data** (mandatory): The data to be used. It can be a Pandas DataFrame, a list of lists, tuples, dictionaries, a dictionary, a string, or a list. 
78 | 79 | **llm_engine** (optional): The name of the LLM engine to use. Currently, only OpenAI is supported. Defaults to "openai". 80 | 81 | **llm_params** (optional): A dictionary of parameters to be used with the OpenAI API. This allows customization of the LLM behavior. Defaults to model=gpt-3.5-turbo and temperature=0.2. 82 | 83 | **prompt_override** (optional): A boolean that determines whether or not the prompt is overridden. If set to True, the custom prompt becomes the main prompt. Defaults to False. 84 | 85 | **custom_prompt** (optional): A string that can be provided if prompt_override is False. The custom prompt will be added to the default pandas_llm prompt. Defaults to an empty string. 86 | 87 | **path** (optional): The path to the file where the debug data will be saved. If not specified, debug data files will not be generated. 88 | 89 | **verbose** (optional): A boolean determines whether debugging information will be printed. If set to True, additional debugging info will be displayed. Defaults to False. 90 | 91 | **data_privacy** (optional): A boolean determines whether the data is treated as private. If set to True, the function will not send the data content to OpenAI. Defaults to True. 92 | 93 | **llm_api_key** (optional): The OpenAI API key to be used. The library will attempt to use the default API key configured if not provided. 94 | 95 | **force_sandbox** (optional): A boolean determining the fallback behaviour if the sandbox environment fails. If set to False and the sandbox fails, the library will retry using eval, which is less safe. Defaults to False. 96 | 97 | 98 | ## Contributing 99 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. Please make sure to update tests as appropriate. 
import pandas as pd
import datetime
import numpy as np
import openai
import os
import re
import json

# sandbox.py
from RestrictedPython import compile_restricted
from RestrictedPython.Guards import safe_builtins, guarded_iter_unpack_sequence
from RestrictedPython.Eval import default_guarded_getattr, default_guarded_getitem, default_guarded_getiter


class Sandbox:
    """Execute generated Python code under RestrictedPython.

    The sandbox exposes a whitelist of builtins, a curated set of pandas
    Series methods, and any modules explicitly enabled via allow_import().
    """

    def __init__(self):
        # module_name -> imported module object, populated via allow_import()
        self._allowed_imports = {}

    def allow_import(self, module_name):
        """Make `module_name` available to sandboxed code.

        Modules that cannot be imported are silently ignored, so callers
        may optimistically allow optional dependencies.
        """
        try:
            self._allowed_imports[module_name] = __import__(module_name)
        except ImportError:
            pass

    def execute(self, code, local_vars=None):
        """Compile `code` with RestrictedPython and exec it.

        Args:
            code: Python source to run.
            local_vars: optional dict of names visible to the executed code
                (e.g. {"df": dataframe}). A fresh dict is created when None;
                a shared mutable default would leak state between calls.

        Returns:
            The (mutated) local_vars dict, from which callers read the
            'result' variable assigned by the executed code.
        """
        if local_vars is None:
            local_vars = {}

        builtin_mappings = {
            "__import__": __import__,
            "_getattr_": default_guarded_getattr,
            "_getitem_": default_guarded_getitem,
            "_getiter_": default_guarded_getiter,
            "_iter_unpack_sequence_": guarded_iter_unpack_sequence,
            "list": list,
            "set": set,
            "pd": pd,
        }

        # pandas Series methods the generated code is allowed to call.
        series_methods = [
            "sum", "mean", "any", "argmax", "argmin", "count", "cumsum", "cumprod", "diff",
            "dropna", "fillna", "head", "idxmax", "idxmin", "last", "max", "min", "notna",
            "prod", "quantile", "rename", "round", "tail", "to_frame", "to_list", "to_numpy",
            "to_string", "unique", "sort_index", "sort_values", "aggregate",
        ]
        builtin_mappings.update(
            {method: getattr(pd.Series, method) for method in series_methods}
        )

        # Work on a COPY of safe_builtins: updating RestrictedPython's shared
        # safe_builtins dict in place (as the previous implementation did)
        # permanently leaks __import__ and the extra builtins to every other
        # user of RestrictedPython in the process.
        allowed_builtins = dict(safe_builtins)
        allowed_builtins.update(builtin_mappings)

        restricted_globals = {"__builtins__": allowed_builtins}
        restricted_globals.update(self._allowed_imports)

        byte_code = compile_restricted(source=code, filename='', mode='exec')

        # Execute the restricted code
        exec(byte_code, restricted_globals, local_vars)

        return local_vars
class PandasLLM(pd.DataFrame):
    """
    PandasLLM is a subclass of the Pandas DataFrame class. It is designed to provide a
    wrapper around the OpenAI API: a natural-language request is turned into a pandas
    code snippet by the LLM, and the snippet is then executed against the dataframe
    itself (in a RestrictedPython sandbox, with an optional eval fallback).
    """

    # Regexps tried in order to extract a code snippet from the LLM reply.
    code_blocks = [r'```python(.*?)```', r'```(.*?)```']

    llm_default_model = "gpt-3.5-turbo"
    llm_default_temperature = 0.2
    llm_engine = "openai"
    llm_default_params = {"model": llm_default_model,
                          "temperature": llm_default_temperature}
    llm_api_key = None

    prompt_override = False
    custom_prompt = ""
    data_privacy = True
    path = None
    verbose = False
    code_block = ""      # last raw code returned by the LLM, exposed for inspection
    force_sandbox = False

    def __init__(self,
                 data,
                 llm_engine: str = "openai", llm_params: dict = None,
                 prompt_override: bool = False,
                 custom_prompt: str = "",
                 path: str = None,
                 verbose: bool = False,
                 data_privacy: bool = True,
                 llm_api_key: str = None,
                 force_sandbox: bool = False,
                 *args, **kwargs):
        """
        Constructor for the PandasLLM class. It also calls the parent class's
        constructor with `data`.

        Args:
            data (pandas dataframe, mandatory): dataset to query. It can be a Pandas
                DataFrame, a list of lists, a list of tuples, a list of dictionaries,
                a dictionary, a string, or a list.
            llm_engine (str, optional): LLM engine, currently only OpenAI is
                supported. Defaults to "openai".
            llm_params (dict, optional): LLM engine parameters. Defaults to
                model=gpt-3.5-turbo and temperature=0.2. The default is None rather
                than a shared dict so the class-level default cannot be mutated
                through the parameter.
            prompt_override (bool, optional): if True, the custom prompt is mandatory
                and it will become the main prompt. Defaults to False.
            custom_prompt (str, optional): if prompt_override is False, the custom
                prompt will be added to the default pandas_llm prompt. Defaults to "".
            path (str, optional): the path where the files containing debug data will
                be saved. Defaults to None (no debug files are generated).
            verbose (bool, optional): if True, debugging info will be printed.
                Defaults to False.
            data_privacy (bool, optional): if True, the function will not send the
                data content to OpenAI. Defaults to True.
            llm_api_key (str, optional): the OpenAI API key. Falls back to the
                OPENAI_API_KEY environment variable when None. Defaults to None.
            force_sandbox (bool, optional): if False and the sandbox fails, it will
                retry using eval (less safe). Defaults to False.
        """
        super().__init__(data, *args, **kwargs)

        # Set up the OpenAI API key from the argument or the environment.
        self.llm_api_key = llm_api_key or os.environ.get("OPENAI_API_KEY")

        self.llm_engine = llm_engine
        self.llm_params = llm_params or {}
        self.model = self.llm_params.get("model", self.llm_default_model)
        self.temperature = self.llm_params.get("temperature", self.llm_default_temperature)

        self.prompt_override = prompt_override
        self.custom_prompt = custom_prompt

        self.data_privacy = data_privacy
        self.path = path
        self.verbose = verbose
        self.force_sandbox = force_sandbox

    def _buildPromptForRole(self):
        # System-role prompt: describes the dataframe schema (columns and
        # dtypes only — never row content, preserving data privacy).
        prompt_role = f"""
I want you to act as a data scientist and Python coder. I want you code for me. 
I have a dataset of {len(self)} rows and {len(self.columns)} columns.
Columns and their type are the following:
"""

        for col in self.columns:
            col_type = self.dtypes[col]
            prompt_role += f"{col} ({col_type})\n"

        return prompt_role

    def _buildPromptForProblemSolving(self, request):
        # User-role prompt: schema plus the user's request and code-shape
        # guidelines. When prompt_override is set, the custom prompt fully
        # replaces this default prompt.
        if self.prompt_override:
            return self.custom_prompt

        columns = ""
        for col in self.columns:
            col_type = self.dtypes[col]
            columns += f"{col} ({col_type})\n"

        prompt_problem = f"""
Given a DataFrame named 'df' of {len(self)} rows and {len(self.columns)} columns,
Its columns are the following:

{columns}

I want you to solve the following problem:
write a Python code snippet that addresses the following request:
{request}

While crafting the code, please follow these guidelines:
1. When comparing or searching for strings, use lower case letters, ignore case sensitivity, and apply a "contains" search.
2. Ensure that the answer is a single line of code without explanations, comments, or additional details.
3. If a single line solution is not possible, multiline solutions or functions are acceptable, but the code must end with an assignment to the variable 'result'.
4. Assign the resulting code to the variable 'result'.
5. Avoid importing any additional libraries than pandas and numpy.

"""
        if self.custom_prompt is not None and len(self.custom_prompt) > 0:

            prompt_problem += f"""
Also:
{self.custom_prompt}
"""

        return prompt_problem

    def _extractPythonCode(self, text: str, regexp: str) -> str:
        """Return the first code span in `text` matching `regexp`, stripped.

        Returns "" when no match is found.
        """
        # Search for the pattern in the input text (DOTALL: code spans lines).
        match = re.search(regexp, text, re.DOTALL)

        # If a match is found, return the extracted code (without the markers).
        if match:
            return match.group(1).strip()

        # If no match is found, return an empty string.
        return ""

    def _print(self, *args, **kwargs):
        # Debug print, gated on the verbose flag.
        if self.verbose:
            print(*args, **kwargs)

    def _save(self, name, value):
        # Persist a debug artifact under self.path. A no-op when no path is
        # configured; best-effort otherwise (failures are only logged).
        if self.path is None or self.path == "":
            return
        try:
            with open(f"{self.path}/{name}", 'w') as file:
                file.write(value)
        except Exception as e:
            self._print(f"error {e}")

    def _execInSandbox(self, df, generated_code: str):
        """Run `generated_code` in a RestrictedPython sandbox with `df` bound.

        Returns the value the generated code assigned to 'result'
        (None when the code did not assign it).
        """
        # Create a Sandbox instance and allow pandas/numpy to be imported.
        sandbox = Sandbox()
        sandbox.allow_import("pandas")
        sandbox.allow_import("numpy")

        # Prelude executed before the generated code to set up common names.
        initial_code = """
import pandas as pd
import datetime
from pandas import Timestamp
import numpy as np

"""

        # Combine the prelude and the generated code.
        full_code = initial_code + "\n" + generated_code

        self._save("temp/prompt_code.py", full_code)
        # Execute the combined code in the Sandbox.
        sandbox_result = sandbox.execute(full_code, {"df": df})

        # Get the result from the local_vars dictionary.
        result = sandbox_result.get("result")
        return result

    def prompt(self, request: str):
        """
        Ask the LLM to solve `request` against this dataframe.

        Args:
            request (str): prompt containing the request. It must be expressed
                as a question or a problem to solve.

        Returns:
            Any: the result or solution of the problem. Typically the result
            data type is a dataframe, a Series or a float. Returns the string
            "Please try later" when the OpenAI API could not be reached, and
            None when no usable code was produced.
        """
        # Set up OpenAI API key.
        openai.api_key = self.llm_api_key

        messages = [
            {"role": "system",
             "content": self._buildPromptForRole()},
            {"role": "user",
             "content": self._buildPromptForProblemSolving(request)}
        ]

        # Up to 3 attempts against the OpenAI API; transient errors are retried.
        response = None
        for _attempt in range(3):
            try:
                response = openai.ChatCompletion.create(
                    model=self.model,
                    temperature=self.temperature,
                    messages=messages
                )
                break
            except Exception as e:
                self._print(f"error {e}")
                continue

        if response is None:
            return "Please try later"

        self._save("temp/prompt_cmd.json", json.dumps(messages, indent=4))

        generated_code = response.choices[0].message.content
        if generated_code == "" or generated_code is None:
            self.code_block = ""
            return None

        self.code_block = generated_code

        # Candidate snippets: fenced code blocks first, then the raw reply as
        # a last resort, so `results` is never empty past this point.
        results = []
        for regexp in self.code_blocks:
            cleaned_code = self._extractPythonCode(generated_code, regexp)
            if cleaned_code == "" or cleaned_code is None:
                continue
            results.append(cleaned_code)
        results.append(generated_code)

        result = None
        for cleaned_code in results:

            try:
                result = self._execInSandbox(self, cleaned_code)
            except Exception as e:
                self._print(f"error {e}")
                if not self.force_sandbox:
                    # SECURITY: eval of LLM-generated code outside the sandbox
                    # is inherently unsafe; it only runs when force_sandbox is
                    # False and the sandboxed execution failed.
                    try:
                        expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip()
                        result = eval(expression, {'df': self, 'pd': pd, 'np': np,
                                                   'datetime': datetime, 'result': result})
                    except Exception as e:
                        self._print(f"error {e}")

            if result is not None and str(result) != "":
                break

        if self.data_privacy:
            # Non formatted result.
            return result

        # Currently the privacy option is not needed.
        # In the future, we can choose to send data to the LLM if privacy is
        # set to False.
        return result