├── setup.cfg
├── pandas_llm
    ├── pyvenv.cfg
    ├── requirements.txt
    ├── example.py
    ├── example-chatbot.py
    └── __init__.py
├── requirements.txt
├── LICENSE
├── setup.py
├── .gitignore
└── README.md


/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md


--------------------------------------------------------------------------------
/pandas_llm/pyvenv.cfg:
--------------------------------------------------------------------------------
1 | home = /usr/local/opt/python@3.9/bin
2 | include-system-site-packages = false
3 | version = 3.9.16
4 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | aiohttp==3.8.4
 2 | aiosignal==1.3.1
 3 | async-timeout==4.0.2
 4 | attrs==23.1.0
 5 | certifi==2023.5.7
 6 | charset-normalizer==3.1.0
 7 | frozenlist==1.3.3
 8 | idna==3.4
 9 | multidict==6.0.4
10 | numpy==1.24.3
11 | openai==0.27.6
12 | pandas==2.0.1
13 | python-dateutil==2.8.2
14 | pytz==2023.3
15 | requests==2.30.0
16 | RestrictedPython==6.0
17 | six==1.16.0
18 | tqdm==4.65.0
19 | tzdata==2023.3
20 | urllib3==2.0.2
21 | yarl==1.9.2
22 | 


--------------------------------------------------------------------------------
/pandas_llm/requirements.txt:
--------------------------------------------------------------------------------
 1 | aiohttp==3.8.4
 2 | aiosignal==1.3.1
 3 | async-timeout==4.0.2
 4 | attrs==23.1.0
 5 | certifi==2023.5.7
 6 | charset-normalizer==3.1.0
 7 | frozenlist==1.3.3
 8 | idna==3.4
 9 | multidict==6.0.4
10 | numpy==1.24.3
11 | openai==0.27.6
12 | pandas==2.0.1
13 | python-dateutil==2.8.2
14 | pytz==2023.3
15 | requests==2.30.0
16 | RestrictedPython==6.0
17 | six==1.16.0
18 | tqdm==4.65.0
19 | tzdata==2023.3
20 | urllib3==2.0.2
21 | yarl==1.9.2
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Dashy Dash 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/pandas_llm/example.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | 
 4 | import sys
 5 | from pathlib import Path
 6 | sys.path.append(str(Path(__file__).resolve().parent.parent))
 7 | from pandas_llm import PandasLLM
 8 | 
 9 | # Data
10 | # Please note that these names, ages, and donations are randomly generated 
11 | # and do not correspond to real individuals or their donations.
12 | data = [('John Doe', 25, 50), 
13 |         ('Jane Smith', 38, 70),
14 |         ('Alex Johnson', 45, 80),
15 |         ('Jessica Brown', 60, 40),
16 |         ('Michael Davis', 22, 90),
17 |         ('Emily Wilson', 30, 60),
18 |         ('Daniel Taylor', 35, 75),
19 |         ('Sophia Moore', 40, 85),
20 |         ('David Thomas', 50, 65),
21 |         ('Olivia Jackson', 29, 55)]
22 | df = pd.DataFrame(data, columns=['name', 'age', 'donation'])
23 | 
24 | conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY"))
25 | result = conv_df.prompt("What is the average donation of people older than 40 who donated more than $50?")
26 | code = conv_df.code_block
27 | 
28 | print(f"Executing the following expression of type {type(result)}:\n{code}\n\nResult is:\n {result}\n")
29 | # Executing the following expression of type <class 'numpy.float64'>:
30 | # result = df.loc[(df['age'] > 40) & (df['donation'] > 50), 'donation'].mean()
31 | 
32 | # Result is:
33 | #  72.5
34 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | # Reads the content of your README.md into a variable to be used in the setup below
 4 | with open("README.md", "r", encoding="utf-8") as fh:
 5 |     long_description = fh.read()
 6 | 
 7 | setup(
 8 |     name='pandas_llm',                           # should match the package folder
 9 |     version='0.0.6',                                # important for updates
10 |     license='MIT',                                  # should match your chosen license
11 |     description='Conversational Pandas Dataframes',
12 |     long_description=long_description,              # loads your README.md
13 |     long_description_content_type="text/markdown",  # README.md is of type 'markdown'
14 |     author='DashyDash',
15 |     author_email='alessio@dashydash.com',
16 |     url='https://github.com/DashyDashOrg/pandas-llm', 
17 |     project_urls = {                                # Optional
18 |         "Bug Tracker": "https://github.com/DashyDashOrg/pandas-llm/issues"
19 |     },
20 |     keywords=["pypi", "pandas-llm", "pandas", "llm", "ai", "openai", "chatgpt"], #descriptive meta-data
21 |     packages=find_packages(),
22 |     classifiers=[                                   # https://pypi.org/classifiers
23 |         'Development Status :: 3 - Alpha',
24 |         'Intended Audience :: Developers',
25 |         'Topic :: Scientific/Engineering :: Artificial Intelligence',
26 |         'License :: OSI Approved :: MIT License',
27 |         'Programming Language :: Python :: 3',
28 |         "Operating System :: OS Independent",
29 |     ],
30 |     python_requires='>=3.6',
31 |     install_requires=[
32 |         "aiohttp",
33 |         "aiosignal",
34 |         "async-timeout",
35 |         "attrs",
36 |         "certifi",
37 |         "charset-normalizer",
38 |         "frozenlist",
39 |         "idna",
40 |         "multidict",
41 |         "numpy",
42 |         "openai",
43 |         "pandas",
44 |         "python-dateutil",
45 |         "pytz",
46 |         "requests",
47 |         "RestrictedPython",
48 |         "six",
49 |         "tqdm",
50 |         "tzdata",
51 |         "urllib3",
52 |         "yarl",
53 |     ],   
54 |     download_url="https://github.com/DashyDashOrg/pandas-llm/releases/tag/v0.0.6",
55 | )


--------------------------------------------------------------------------------
/pandas_llm/example-chatbot.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | 
 4 | import sys
 5 | from pathlib import Path
 6 | sys.path.append(str(Path(__file__).resolve().parent.parent))
 7 | 
 8 | from pandas_llm import PandasLLM
 9 | 
10 | # Data
11 | #  Please note that these names, ages, and donations are randomly generated and do not correspond to real individuals or their donations.
12 | data = [('John Doe', 25, 50), 
13 |         ('Jane Smith', 38, 70),
14 |         ('Alex Johnson', 45, 80),
15 |         ('Jessica Brown', 60, 40),
16 |         ('Michael Davis', 22, 90),
17 |         ('Emily Wilson', 30, 60),
18 |         ('Daniel Taylor', 35, 75),
19 |         ('Sophia Moore', 40, 85),
20 |         ('David Thomas', 50, 65),
21 |         ('Olivia Jackson', 29, 55),
22 |         ('Carlos García', 22, 50),
23 |         ('Ana Rodriguez', 38, 70),
24 |         ('Luis Hernandez', 45, 80),
25 |         ('Sofia Martinez', 60, 40),
26 |         ('Miguel Lopez', 22, 90),
27 |         ('Isabella Gonzalez', 30, 60),
28 |         ('Diego Perez', 35, 75),
29 |         ('Maria Sanchez', 40, 85),
30 |         ('Juan Pena', 50, 65),
31 |         ('Gabriela Ramirez', 29, 55),
32 |         ('Giovanni Rossi', 22, 50),
33 |         ('Maria Bianchi', 38, 70),
34 |         ('Luca Ferrari', 45, 80),
35 |         ('Sofia Russo', 60, 40),
36 |         ('Francesco Romano', 22, 90),
37 |         ('Isabella Colombo', 30, 60),
38 |         ('Alessandro Ricci', 35, 75),
39 |         ('Giulia Marino', 40, 85),
40 |         ('Antonio Greco', 50, 65),
41 |         ('Gabriella Bruno', 29, 55)]
42 | 
43 | # Create DataFrame
44 | df = pd.DataFrame(data, columns=['name', 'age', 'donation'])
45 | 
46 | # Print DataFrame
47 | print(df)
48 | 
49 | 
50 | def main():
51 | 
52 |     # Initialise library and set the OpenAI API key
53 |     conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY"))
54 |     print()
55 |     banner = """
56 |     Welcome to the Donation Data CLI.
57 |     The donation dataset has three columns (name, age, donation)
58 |     Please note that these names, ages, and donations are randomly generated and do not correspond to real individuals or their donations.
59 |     
60 |     You can ask questions like:
61 |     - show me the list of names
62 |     - What is the average age of people who donated?
63 |     - What is the average donation amount?
64 |     - What is the average donation of people older than 30?
65 |     - What is the average donation of people older than 30 who donated more than $50?
66 |     """
67 |     print(banner)
68 | 
69 |     while True:
70 |         prompt = input("Enter your query (or 'exit' to quit): ")
71 |         if prompt.lower() == "exit":
72 |             break
73 | 
74 |         result = conv_df.prompt(prompt)
75 |         code = conv_df.code_block
76 |         print(f"Executing the following expression of type {type(result)}:\n{code}\n\nResult is:\n {result}\n")
77 |         
78 | 
79 | if __name__ == "__main__":
80 |     main()
81 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | # Dashy dash private stuff
163 | dashy/
164 | temp.py
165 | tmp.py
166 | test.py
167 | temp/
168 | bin/


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # pandas-LLM
  2 | 
  3 | ## Introduction
  4 | pandas-llm is a lightweight Python library that extends pandas to allow querying datasets using OpenAI prompts. This powerful tool leverages the natural language processing capabilities of OpenAI to offer intuitive, language-based querying of your Pandas dataframes.
  5 | 
  6 | ## Key Features
  7 | - **Natural Language Querying**: With pandas-llm, you can execute complex Pandas queries using natural language prompts. Instead of writing code, you can express your query in plain language and obtain the desired results.
  8 | 
  9 | - **Data Privacy**: Your data is not sent on the Internet. Pandas-LLM works locally with your data and uses openAI to create the query based on the dataframe columns and data types, not its content.
 10 | 
 11 | - **Seamless Integration**: The library seamlessly integrates with your existing Pandas workflow. You can continue using normal Pandas functions and syntax while leveraging the added capability of natural language queries.
 12 | 
 13 | - **Efficiency and Performance**: pandas-LLM is designed to deliver efficient and performant querying capabilities. It uses OpenAI's language model to process queries quickly and accurately, providing rapid insights from your data.
 14 | 
 15 | - **Flexible and Expressive**: Whether you need to filter, aggregate, sort, or transform your data, pandas-LLM allows you to express your requirements flexibly and expressively. You can perform complex operations on your dataframes with ease using human-readable language.
 16 | 
 17 | - **Intelligent Results**: The library returns the results of your queries in a concise and understandable format. You can extract valuable insights from your data without complex code or manual analysis.
 18 | 
 19 | With pandas-llm, you can unlock the power of natural language querying and effortlessly execute complex pandas queries. Let the library handle the intricacies of data manipulation while you focus on gaining insights and making data-driven decisions.
 20 | 
 21 | ## Installation
 22 | 
 23 | Install pandas-llm using pip:
 24 | 
 25 | ```shell
 26 | pip install pandas-llm
 27 | ```
 28 | 
 29 | ## Features
 30 | - Query pandas dataframes using natural language prompts.
 31 | - Leverage the power of OpenAI's language models in your data analysis.
 32 | - Seamless integration with existing pandas functions.
 33 | 
 34 | ## Usage
 35 | Here's a quick [example](https://github.com/DashyDashOrg/pandas-llm/blob/main/pandas_llm/example.py) of how to use pandas-llm:
 36 | 
 37 | ```python
 38 | import os
 39 | import pandas as pd
 40 | from pandas_llm import PandasLLM
 41 | 
 42 | # Data
 43 | # Please note that these names, ages, and donations are randomly generated 
 44 | # and do not correspond to real individuals or their donations.
 45 | data = [('John Doe', 25, 50), 
 46 |         ('Jane Smith', 38, 70),
 47 |         ('Alex Johnson', 45, 80),
 48 |         ('Jessica Brown', 60, 40),
 49 |         ('Michael Davis', 22, 90),
 50 |         ('Emily Wilson', 30, 60),
 51 |         ('Daniel Taylor', 35, 75),
 52 |         ('Sophia Moore', 40, 85),
 53 |         ('David Thomas', 50, 65),
 54 |         ('Olivia Jackson', 29, 55)]
 55 | df = pd.DataFrame(data, columns=['name', 'age', 'donation'])
 56 | 
 57 | conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY"))
 58 | result = conv_df.prompt("What is the average donation of people older than 40 who donated more than $50?")
 59 | code = conv_df.code_block
 60 | 
 61 | print(f"Executing the following expression of type {type(result)}:\n{code}\n\nResult is:\n {result}\n")
 62 | # Executing the following expression of type <class 'numpy.float64'>:
 63 | # result = df.loc[(df['age'] > 40) & (df['donation'] > 50), 'donation'].mean()
 64 | 
 65 | # Result is:
 66 | #  72.5
 67 | 
 68 | ```
 69 | 
 70 | There is also a chatbot available in the repository using the same dataset. 
 71 | Look at [Chatbot example](https://github.com/DashyDashOrg/pandas-llm/blob/main/pandas_llm/example-chatbot.py)
 72 | 
 73 | ## PandasLLM Class Constructor
 74 | 
 75 | The constructor for the PandasLLM class has been enhanced in this release to provide more flexibility and control over the language model interaction. The constructor accepts the following arguments:
 76 | 
 77 | **data** (mandatory): The data to be used. It can be a Pandas DataFrame, a list of lists, tuples, dictionaries, a dictionary, a string, or a list.
 78 | 
 79 | **llm_engine** (optional): The name of the LLM engine to use. Currently, only OpenAI is supported. Defaults to "openai".
 80 | 
 81 | **llm_params** (optional): A dictionary of parameters to be used with the OpenAI API. This allows customization of the LLM behavior. Defaults to model=gpt-3.5-turbo and temperature=0.2.
 82 | 
 83 | **prompt_override** (optional): A boolean that determines whether or not the prompt is overridden. If set to True, the custom prompt becomes the main prompt. Defaults to False.
 84 | 
 85 | **custom_prompt** (optional): A string that can be provided if prompt_override is False. The custom prompt will be added to the default pandas_llm prompt. Defaults to an empty string.
 86 | 
 87 | **path** (optional): The path to the file where the debug data will be saved. If not specified, debug data files will not be generated.
 88 | 
 89 | **verbose** (optional): A boolean determines whether debugging information will be printed. If set to True, additional debugging info will be displayed. Defaults to False.
 90 | 
 91 | **data_privacy** (optional): A boolean determines whether the data is treated as private. If set to True, the function will not send the data content to OpenAI. Defaults to True.
 92 | 
 93 | **llm_api_key** (optional): The OpenAI API key to be used. The library will attempt to use the default API key configured if not provided.
 94 | 
 95 | **force_sandbox** (optional): A boolean determining the fallback behaviour if the sandbox environment fails. If set to False and the sandbox fails, the library will retry using eval, which is less safe. Defaults to False.
 96 | 
 97 | 
 98 | ## Contributing
 99 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. Please make sure to update tests as appropriate.
100 | 
101 | ## License
102 | MIT
103 | 


--------------------------------------------------------------------------------
/pandas_llm/__init__.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import datetime
  3 | import numpy as np
  4 | import openai
  5 | import os
  6 | import re
  7 | import json
  8 | 
  9 | # sandbox.py
 10 | from RestrictedPython import compile_restricted
 11 | from RestrictedPython.Guards import safe_builtins,guarded_iter_unpack_sequence
 12 | from  RestrictedPython.Eval import default_guarded_getattr, default_guarded_getitem, default_guarded_getiter
 13 | import pandas as pd
 14 | 
 15 | class Sandbox:
 16 |     def __init__(self):
 17 |         self._allowed_imports = {}
 18 | 
 19 |     def allow_import(self, module_name):
 20 |         try:
 21 |             module = __import__(module_name)
 22 |             self._allowed_imports[module_name] = module
 23 |         except ImportError:
 24 |             pass
 25 | 
 26 |     def execute(self, code, local_vars = {}):
 27 |         allowed_builtins = safe_builtins
 28 |         # Add __builtins__, __import__, and allowed imports to the globals
 29 |         restricted_globals = {"__builtins__": allowed_builtins}
 30 |         restricted_globals.update(self._allowed_imports)
 31 | 
 32 |         builtin_mappings = {
 33 |             "__import__": __import__,
 34 |             "_getattr_": default_guarded_getattr,
 35 |             "_getitem_": default_guarded_getitem,
 36 |             "_getiter_": default_guarded_getiter,
 37 |             "_iter_unpack_sequence_": guarded_iter_unpack_sequence,
 38 |             "list": list,
 39 |             "set": set,
 40 |             "pd": pd,
 41 |         }
 42 | 
 43 |         series_methods = [
 44 |             "sum", "mean", "any", "argmax", "argmin", "count", "cumsum", "cumprod", "diff",
 45 |             "dropna", "fillna", "head", "idxmax", "idxmin", "last", "max", "min", "notna",
 46 |             "prod", "quantile", "rename", "round", "tail", "to_frame", "to_list", "to_numpy",
 47 |             "to_string","unique",  "sort_index", "sort_values", "aggregate"
 48 |         ]
 49 | 
 50 | 
 51 |         builtin_mappings.update({method: getattr(pd.Series, method) for method in series_methods})
 52 | 
 53 |         restricted_globals["__builtins__"].update(builtin_mappings)
 54 | 
 55 |         byte_code = compile_restricted(source=code, filename='<inline>', mode='exec')
 56 | 
 57 |         # Execute the restricted code
 58 |         exec(byte_code, restricted_globals, local_vars)
 59 | 
 60 |         return local_vars
 61 | 
 62 | 
 63 | class PandasLLM(pd.DataFrame):
 64 |     """
 65 |     PandasLLM is a subclass of the Pandas DataFrame class. It is designed to provide a
 66 |     wrapper around the OpenAI API. 
 67 |     """
 68 | 
 69 |     code_blocks = [r'```python(.*?)```',r'```(.*?)```']
 70 | 
 71 |     llm_default_model = "gpt-3.5-turbo"
 72 |     llm_default_temperature = 0.2
 73 |     llm_engine = "openai"
 74 |     llm_default_params = { "model": llm_default_model,
 75 |                             "temperature": llm_default_temperature}
 76 |     llm_api_key = None
 77 |     
 78 |     prompt_override = False
 79 |     custom_prompt = ""
 80 |     data_privacy = True
 81 |     path = None
 82 |     verbose = False 
 83 |     code_block = ""
 84 |     force_sandbox = False
 85 |     def __init__(self, 
 86 |                  data, 
 87 |                  llm_engine:str = "openai", llm_params=llm_default_params, 
 88 |                  prompt_override:bool = False,     
 89 |                  custom_prompt:str = "", 
 90 |                  path:str = None,
 91 |                  verbose:bool = False,
 92 |                  data_privacy:bool = True,
 93 |                  llm_api_key:str = None,
 94 |                  force_sandbox:bool = False,
 95 |                  *args, **kwargs):
 96 |         """
 97 |         This is the constructor for the PandasLLM class. It takes in the following arguments:
 98 |         data: The data to be used. It can be a Pandas DataFrame, a list of lists, a list of tuples,
 99 |         a list of dictionaries, a dictionary, a string, or a list.
100 |         llm_engine: The name of the OpenAI engine to use.
101 |         llm_params: A dictionary of parameters to be used with the OpenAI API.
102 |         prompt_override: A boolean that determines whether or not the prompt is overridden.
103 |         custom_prompt: A string that overrides the prompt.
104 |         path: The path to the file to be used.
105 |         verbose: A boolean that determines whether or not the output is verbose.
106 |         data_privacy: A boolean that determines whether or not the data is private.
107 |         llm_api_key: The OpenAI API key to be used.
108 |         force_sandbox: if False and the sandbox fails, it will retry using eval (less safe)
109 | 
110 |         The constructor also calls the parent class's constructor.
111 | 
112 |         
113 |         Args:
114 |             data (pandas dataframe, mandatory): dataset to query. Defaults to None.
115 |             llm_engine (str, optional): LLM engine, currently only OpenAI is supported. Defaults to "openai".
116 |             llm_params (dict, optional): LLM engine parameters. Defaults to model=gpt-3.5-turbo and temperature=0.2".
117 |             prompt_override (bool, optional): if True, the custom prompt is mandatory and it will became the main prompt. Defaults to False.
118 |             custom_prompt (str, optional): if prompt_override is False, the custom prompt will be added to the default pandas_llm prompt. Defaults to "".
119 |             path (str, optional): the path where the files containing debug data will be save. Defaults to None.
120 |             verbose (bool, optional): if True debugging info will be printed. Defaults to False.
121 |             data_privacy (bool, optional): if True, the function will not send the data content to OpenAI. Defaults to True.
122 |             llm_api_key (str, optional): the Open API key. Defaults to None.
123 |             force_sandbox (bool, optional): if False and the sandbox fails, it will retry using eval (less safe). Defaults to False.
124 |         """
125 | 
126 | 
127 |         super().__init__(data, *args, **kwargs)
128 |         
129 |         self.llm_params = llm_params or {}
130 | 
131 |         # Set up OpenAI API key from the environment or the config
132 |         self.llm_api_key = llm_api_key or os.environ.get("OPENAI_API_KEY")
133 | 
134 |         self.llm_engine = llm_engine
135 |         self.llm_params = llm_params or {}
136 |         self.model = self.llm_params.get("model", self.llm_default_model)
137 |         self.temperature = self.llm_params.get("temperature", self.llm_default_temperature)
138 | 
139 |         self.prompt_override = prompt_override
140 |         self.custom_prompt = custom_prompt
141 | 
142 |         self.data_privacy = data_privacy
143 |         self.path = path
144 |         self.verbose = verbose
145 |         self.force_sandbox = force_sandbox
146 | 
147 |     def _buildPromptForRole(self):
148 |         prompt_role = f"""
149 | I want you to act as a data scientist and Python coder. I want you code for me. 
150 | I have a dataset of {len(self)} rows and {len(self.columns)} columns.
151 | Columns and their type are the following:
152 |         """
153 | 
154 |         for col in self.columns:
155 |             col_type = self.dtypes[col]
156 |             prompt_role += f"{col} ({col_type})\n"
157 | 
158 |         return prompt_role
159 | 
160 |     def _buildPromptForProblemSolving(self, request):
161 | 
162 |         if self.prompt_override:
163 |             return self.custom_prompt
164 | 
165 |         columns = ""
166 |         for col in self.columns:
167 |             col_type = self.dtypes[col]
168 |             columns += f"{col} ({col_type})\n"
169 | 
170 |         prompt_problem = f"""
171 | Given a DataFrame named 'df' of {len(self)} rows and {len(self.columns)} columns,
172 | Its columns are the following:
173 | 
174 | {columns}
175 | 
176 | I want you to solve the following problem:
177 | write a Python code snippet that addresses the following request:
178 | {request}
179 | 
180 | While crafting the code, please follow these guidelines:
181 | 1. When comparing or searching for strings, use lower case letters, ignore case sensitivity, and apply a "contains" search.
182 | 2. Ensure that the answer is a single line of code without explanations, comments, or additional details. 
183 | 3. If a single line solution is not possible, multiline solutions or functions are acceptable, but the code must end with an assignment to the variable 'result'.
184 | 4. Assign the resulting code to the variable 'result'.
185 | 5. Avoid importing any additional libraries than pandas and numpy.
186 | 
187 | """
188 |         if not self.custom_prompt is None and len(self.custom_prompt) > 0:
189 |              
190 |             prompt_problem += f"""
191 |             Also:
192 |             {self.custom_prompt}
193 |             """
194 | 
195 |         return prompt_problem
196 | 
197 |     def _extractPythonCode(self, text: str, regexp: str) -> str:
198 |         # Define the regular expression pattern for the Python code block
199 |         pattern = regexp
200 |         
201 |         # Search for the pattern in the input text
202 |         match = re.search(pattern, text, re.DOTALL)
203 |         
204 |         # If a match is found, return the extracted code (without the markers)
205 |         if match:
206 |             return match.group(1).strip()
207 |         
208 |         # If no match is found, return an empty string
209 |         return ""
210 | 
211 |     def _print(self,  *args, **kwargs):
212 |         if self.verbose:
213 |             print(*args, **kwargs)
214 | 
215 |     # def _variable_to_string(self, variable):
216 |     #     if variable is None: return None
217 |     #     try:
218 | 
219 |     #         if isinstance(variable, pd.Series):
220 |     #             # convert to dataframe
221 |     #             variable = variable.to_frame()
222 | 
223 |     #         if isinstance(variable, pd.DataFrame):
224 |     #             variable = variable.drop_duplicates()
225 |     #             if len(variable) == 0: return None
226 |     #             return str(variable)
227 | 
228 |     #         elif isinstance(variable, np.ndarray):
229 |     #             if len(variable) == 0: return None
230 |     #             return  np.array2string(variable)
231 |     #         else:
232 |     #             # Convert the variable to a string
233 |     #             return str(variable)
234 |     #     except Exception as e:
235 |     #         return str(variable)
236 |         
237 | 
238 |     def _save(self,name,value):
239 |         if self.path is None or self.path == "":
240 |             return  
241 |         try:
242 |             with open(f"{self.path}/{name}", 'w') as file:
243 |                 file.write(value)
244 |         except Exception as e:
245 |             self._print(f"error {e}")
246 |         return
247 | 
248 |     def _execInSandbox(self, df, generated_code:str):
249 | 
250 |         # Create a Sandbox instance and allow pandas to be imported
251 |         sandbox = Sandbox()
252 |         sandbox.allow_import("pandas")
253 |         sandbox.allow_import("numpy")
254 | 
255 |         # Define the initial code to set up the DataFrame
256 |         initial_code = f"""
257 | import pandas as pd
258 | import datetime
259 | from pandas import Timestamp
260 | import numpy as np
261 | 
262 |         """
263 | 
264 |         # Combine the initial code and the generated code
265 |         full_code = initial_code + "\n" + generated_code
266 | 
267 |         self._save("temp/prompt_code.py",full_code)
268 |         # Execute the combined code in the Sandbox
269 |         sandbox_result = sandbox.execute(full_code, {"df":df})
270 | 
271 |         # Get the result from the local_vars dictionary
272 |         result = sandbox_result.get("result")
273 |         return result
274 | 
275 |     def prompt(self, request: str):
276 |         """
277 | 
278 |         Args:
279 |             request (str): prompt containing the request. it must be expressed as a question or a problem to solve
280 | 
281 |         Returns:
282 |             Any: contains the result or solution of the problem. Tipically the result data type is a dataframe, a Series or a float
283 |         """
284 |         
285 |         # Set up OpenAI API key
286 |         openai.api_key = self.llm_api_key
287 | 
288 |         messages=[
289 |                 {"role": "system", 
290 |                 "content": self._buildPromptForRole()},
291 |                 {"role": "user", 
292 |                 "content": self._buildPromptForProblemSolving(request)
293 |                 }
294 |             ]
295 | 
296 |         response = None
297 |         for times in range(0,3):
298 |             try:
299 |                 response = openai.ChatCompletion.create(
300 |                 model=self.model,
301 |                 temperature=self.temperature,
302 |                 messages = messages
303 |                 )
304 |                 break;
305 |             except Exception as e:
306 |                 self._print(f"error {e}")
307 |                 continue
308 | 
309 |         if response is None:
310 |             return "Please try later"
311 | 
312 |         self._save("temp/prompt_cmd.json",json.dumps(messages, indent=4))
313 | 
314 |         generated_code = response.choices[0].message.content
315 |         if generated_code == "" or generated_code is None:
316 |             self.code_block = ""
317 |             return None
318 |         
319 |         self.code_block = generated_code
320 | 
321 |         results=[]
322 |         for regexp in self.code_blocks:
323 |             cleaned_code = self._extractPythonCode(generated_code,regexp)
324 |             if cleaned_code == "" or cleaned_code is None:
325 |                 continue
326 |             results.append(cleaned_code)
327 |         results.append(generated_code)
328 | 
329 |         if len(results) == 0:
330 |             return None
331 | 
332 |         result = None
333 |         for cleaned_code in results:
334 |     
335 |             try:
336 |                 result = self._execInSandbox(self, cleaned_code)
337 |             except Exception as e:
338 |                 self._print(f"error {e}")
339 |                 if not self.force_sandbox:
340 |                     try:
341 |                         expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip()
342 |                         result = eval(expression, {'df': self, 'pd': pd, 'np': np, 'datetime': datetime, 'result': result})
343 |                     except Exception as e:
344 |                         self._print(f"error {e}")
345 |                         pass
346 | 
347 |             if result is not None and str(result) != "":
348 |                 break
349 | 
350 |         if self.data_privacy == True:
351 |             # non formatted result
352 |             return result
353 |         
354 |         # currently the privacy option is not needed.
355 |         # in the future, we can choose to send data to LLM if privacy is set to false
356 | 
357 |         return result
358 | 
359 |         


--------------------------------------------------------------------------------