├── .gitignore ├── .vscode └── ltex.dictionary.en-US.txt ├── LICENSE ├── README.md ├── env-setup └── aws │ └── ubuntu-22.04 │ ├── README.md │ ├── docker-install.sh │ ├── nvidia-container-install.sh │ ├── nvidia-container-runtime-script.sh │ └── nvidia-driver-install.sh ├── pdf2md ├── .gitignore ├── Dockerfile ├── README.md ├── docker-compose.yaml ├── requirements.txt ├── run.sh ├── shutdown.sh └── src │ ├── __init__.py │ ├── llamaparse_pdf2md.py │ ├── llmsherpa_pdf2md.py │ ├── main.py │ ├── openai_utils.py │ └── unstructured_pdf2md.py ├── tabular-data-analysis ├── .gitignore ├── Dockerfile ├── Dockerfile.lit ├── README.md ├── conf │ └── config.json ├── docker-compose-lit.yaml ├── docker-compose.yaml ├── download-models.sh ├── requirements.txt ├── run.sh ├── shutdown.sh └── src │ ├── __init__.py │ ├── backend.py │ ├── constants.py │ └── main.py └── text2sql ├── Dockerfile ├── Dockerfile.lit ├── README.md ├── conf └── config.json ├── db └── mysqlsampledatabase.sql ├── docker-compose-lit.yaml ├── docker-compose.yaml ├── load_data.sh ├── requirements.txt ├── run.sh ├── shutdown.sh └── src ├── __init__.py ├── constants.py ├── db_engine.py ├── main.py └── query_engine.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # directories created during the runtime 10 | LocalAI/ 11 | models/ 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | **/.DS_Store -------------------------------------------------------------------------------- /.vscode/ltex.dictionary.en-US.txt: -------------------------------------------------------------------------------- 1 | LLMs 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 LinkTime Corp 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # LLM in Containers - Examples and Guides
2 |
3 | ## Overview
4 |
5 | Welcome to the LLM in Containers repository! This project is dedicated to providing comprehensive examples and guidelines for running Large Language Models (LLMs) and related ecosystem tools in containers. Our primary focus is on utilizing tools like Docker and Kubernetes to create scalable, reproducible, and easily deployable LLM applications.
6 |
7 | ## Motivation
8 |
9 | The advent of LLMs has revolutionized various sectors, from natural language processing to AI-driven content creation. However, deploying these models efficiently and effectively remains a challenge due to their size and complexity. Containerization offers a solution by encapsulating the model, its dependencies, and the runtime environment, ensuring that it runs consistently across different computing environments. This repository aims to lower the barrier to entry for utilizing LLMs in a containerized setup, fostering innovation and experimentation.
10 |
11 | ## What You'll Find Here
12 |
13 | - **Examples**: Ready-to-use examples demonstrating how to containerize different LLMs for various use cases.
14 | - **Best Practices**: Guidelines on optimizing performance, managing resources, and ensuring security while running LLMs in containers.
15 | - **Tutorials**: Step-by-step instructions to get you started with containerizing LLMs, tailored for both beginners and experienced users.
16 | - **Community Contributions**: A collaborative space for users to share their own examples, tips, and tricks related to LLM containerization.
17 |
18 | ## Getting Started
19 |
20 | 1. **Clone the Repository**: Get the latest examples and documentation.
21 | 2. **Choose an Example**: Navigate through various examples to find one that suits your needs.
22 | 3. **Follow the Tutorial**: Each example comes with a detailed tutorial to guide you through the process.
23 | 4. **Deploy and Experiment**: Use these examples as a starting point for your projects or as a learning tool.
24 |
25 | ## License
26 |
27 | Distributed under the MIT License. See `LICENSE` for more information.
28 |
29 | ## Contact
30 |
31 | [admin@linktimecorp.com] - [https://twitter.com/linktimecorp]
32 |
-------------------------------------------------------------------------------- /env-setup/aws/ubuntu-22.04/README.md: --------------------------------------------------------------------------------
1 | # Setup environment on AWS
2 |
3 | We suggest the following AWS EC2 instances as the environment to run the demo:
4 |
5 | - OS: Ubuntu 22.04 LTS
6 | - CPU instance: c5.2xlarge with 8 vCPUs and 16GB of RAM, $0.34 per hour as of 02/01/2024.
7 | - GPU instance: g5.xlarge with an NVIDIA A10G GPU, 4 vCPUs, and 16GB of RAM, $1.006 per hour as of 02/01/2024.
8 |
9 | By default, AWS EC2 creates an 'ubuntu' user, and we run all the setup scripts under this user:
10 |
11 | ```
12 | git clone https://github.com/LinkTime-Corp/llm-in-containers.git
13 |
14 | cd llm-in-containers/env-setup/aws/ubuntu-22.04
15 |
16 | bash docker-install.sh
17 | ```
18 |
19 | To enable Docker commands for the 'ubuntu' user, please log out and log back in.
For running this demo on GPU instances, proceed as follows: 20 | 21 | ``` 22 | bash nvidia-driver-install.sh 23 | ``` 24 | 25 | After the reboot is done, go back to the 'llm-in-containers/env-setup/aws/ubuntu-22.04' directory and run: 26 | 27 | ``` 28 | bash nvidia-container-install.sh 29 | 30 | docker run -it --rm --gpus all ubuntu nvidia-smi 31 | ``` 32 | 33 | If everything is installed successfully, the final command will display Nvidia GPU and driver details. 34 | -------------------------------------------------------------------------------- /env-setup/aws/ubuntu-22.04/docker-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u 3 | 4 | sudo apt update 5 | 6 | sudo apt install -y apt-transport-https ca-certificates curl software-properties-common unzip 7 | 8 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg 9 | 10 | echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 11 | 12 | sudo apt update 13 | 14 | apt-cache policy docker-ce 15 | 16 | sudo apt install -y docker-ce docker-compose 17 | 18 | sudo usermod -aG docker "${USER}" 19 | -------------------------------------------------------------------------------- /env-setup/aws/ubuntu-22.04/nvidia-container-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u 3 | 4 | bash nvidia-container-runtime-script.sh 5 | 6 | sudo apt-get install -y nvidia-container-runtime 7 | 8 | sudo systemctl restart docker 9 | -------------------------------------------------------------------------------- /env-setup/aws/ubuntu-22.04/nvidia-container-runtime-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -u 4 | 5 | curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | sudo apt-key add - 6 | 7 | distribution=$(. 
/etc/os-release; echo $ID$VERSION_ID) 8 | 9 | curl -s -L https://nvidia.github.io/nvidia-container-runtime/$distribution/nvidia-container-runtime.list | \ 10 | 11 | sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list 12 | 13 | sudo apt-get update -------------------------------------------------------------------------------- /env-setup/aws/ubuntu-22.04/nvidia-driver-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u 3 | 4 | sudo apt-get update 5 | 6 | sudo apt install -y ubuntu-drivers-common 7 | 8 | ubuntu-drivers devices 9 | 10 | sudo ubuntu-drivers autoinstall 11 | 12 | sudo reboot -------------------------------------------------------------------------------- /pdf2md/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | pdf-inputs/ 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 
110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | #.idea/ 162 | -------------------------------------------------------------------------------- /pdf2md/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim-bookworm 2 | 3 | WORKDIR /app 4 | COPY requirements.txt . 5 | RUN apt-get update && apt-get install -y \ 6 | libgl1-mesa-glx \ 7 | libglib2.0-0 \ 8 | poppler-utils \ 9 | tesseract-ocr \ 10 | tesseract-ocr-eng \ 11 | tesseract-ocr-chi-sim \ 12 | libmagic-dev \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | RUN pip install --no-cache-dir -r requirements.txt 16 | RUN pip install -U llama-index --upgrade --no-cache-dir --force-reinstall 17 | RUN pip install llama-parse 18 | COPY src src 19 | CMD ["streamlit", "run", "src/main.py"] -------------------------------------------------------------------------------- /pdf2md/README.md: -------------------------------------------------------------------------------- 1 | # Running PDF Parsers on Docker Containers 2 | ## Overview 3 | This demo showcases running different PDF parsers in the same docker container. Three 4 | approaches for parsing PDF files are included: [LlmSherpa](https://github.com/nlmatics/llmsherpa), [Unstructured](https://github.com/Unstructured-IO/unstructured), and [LlamaParse](https://github.com/run-llama/llama_parse). 5 | 6 | ## Prerequisites 7 | Before diving into this demo, please ensure that your system meets the following prerequisites: 8 | 1. **Operating System**: The demo is compatible with Linux operating systems and tested on Ubuntu 22.04. 9 | 10 | 2. **Docker**: It's required to have `docker` installed on your system. Specifically, we have tested this demo with Docker Engine Community version 25.0.1 on Linux. 11 | 12 | 3. **OpenAI API Key for ChatGPT**: If you wish to use the ChatGPT functionality within this demo, an OpenAI API key is required. Please note that usage of this API is subject to OpenAI's pricing and usage policies. We use OpenAI text generation models to optimize the parsing of some special components like titles or tables etc. Without this API key, you can still try all three approaches. 13 | 14 | 4. **LlamaParse API Key**: If you wish to try the newly launched LlamaParse API service, you need to get an API key from [here](https://cloud.llamaindex.ai/). 
As of this writing (02/26/2024), this API service is in free preview. Without this API key, you will not be able to try LlamaParse.
15 |
16 | ## Quick Start
17 | ### Setup environment on AWS
18 | Please follow this [README file](../env-setup/aws/ubuntu-22.04/README.md) to set up the demo environment on AWS EC2. Note that a GPU is not required for this demo, so you can run it on any instance that has Docker installed.
19 |
20 | ### Running the demo
21 | 1. Start by cloning this repo to your instance with Docker installed:
22 | ```
23 | git clone https://github.com/LinkTime-Corp/llm-in-containers.git
24 | cd llm-in-containers/pdf2md
25 | ```
26 | 2. In the following commands, replace '{your-openai-api-key}' and '{your-llamaparse-api-key}' with your API keys, then launch the demo.
27 | ```
28 | export OPENAI_API_KEY={your-openai-api-key}
29 | export LLAMAPARSE_API_KEY={your-llamaparse-api-key}
30 | bash run.sh
31 | ```
32 | 3. Visit the UI at http://{IP of Host instance}:8501. On the UI, you can choose "LlamaParse", "LlmSherpa", or "Unstructured" to parse the uploaded PDF file.
33 | 4. Shut down the demo.
34 | ```
35 | bash shutdown.sh
36 | ```
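37 |
38 | ## Programmatic Usage
39 | Each parser is exposed as a small Python class with a `parse_pdf` method, which is what the Streamlit UI in `src/main.py` calls under the hood. The snippet below is a minimal sketch, assuming it runs inside the `pdf2md` container (where the dependencies are installed and `src` is on the import path) and that `pdf-inputs/sample.pdf` is a placeholder for your own file; the LlmSherpa parser additionally needs the `nlm-ingestor` service from docker-compose.yaml to be reachable.
40 | ```python
41 | from unstructured_pdf2md import UnstructuredPDFParser
42 |
43 | # Parse a PDF and get its Markdown rendering back as a string.
44 | parser = UnstructuredPDFParser()
45 | markdown = parser.parse_pdf("pdf-inputs/sample.pdf")  # placeholder path
46 | print(markdown)
47 | ```
48 | Each module is also runnable as a CLI, e.g. `python src/llamaparse_pdf2md.py -i input.pdf -o output.md`.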
-------------------------------------------------------------------------------- /pdf2md/docker-compose.yaml: --------------------------------------------------------------------------------
1 | version: "3.8"
2 | services:
3 |   nlm-ingestor:
4 |     image: ghcr.io/nlmatics/nlm-ingestor:v0.1.6
5 |     ports:
6 |       - "5001:5001"
7 |   pdf2md:
8 |     image: pdf2md:1.0
9 |     environment:
10 |       - OPENAI_API_KEY=${OPENAI_API_KEY}
11 |       - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
12 |     ports:
13 |       - "8501:8501"
14 |
-------------------------------------------------------------------------------- /pdf2md/requirements.txt: --------------------------------------------------------------------------------
1 | click==8.1.7
2 | llmsherpa==0.1.4
3 | openai==1.11.1
4 | pydantic==2.6.1
5 | streamlit==1.30.0
6 | unstructured[pdf]==0.12.4
7 |
-------------------------------------------------------------------------------- /pdf2md/run.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -u
3 |
4 | docker build -t pdf2md:1.0 .
5 | docker-compose up -d
6 |
-------------------------------------------------------------------------------- /pdf2md/shutdown.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -u
3 |
4 | docker-compose down
5 |
-------------------------------------------------------------------------------- /pdf2md/src/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinkTime-Corp/llm-in-containers/ca85bbc3f024a2732041ed53aca7d57c1f24e095/pdf2md/src/__init__.py
-------------------------------------------------------------------------------- /pdf2md/src/llamaparse_pdf2md.py: --------------------------------------------------------------------------------
1 | import click, os
2 | import nest_asyncio
3 | nest_asyncio.apply()
4 |
5 | from llama_parse import LlamaParse
6 | from pydantic import BaseModel
7 |
8 | class LlamaParsePDFParser(BaseModel):
9 |     client: LlamaParse = None
10 |
11 |     class Config:
12 |         arbitrary_types_allowed = True
13 |
14 |     def __init__(self, **data):
15 |         super().__init__(**data)
16 |         self.client = LlamaParse(
17 |             api_key=os.getenv("LLAMAPARSE_API_KEY", "your_llamaparse_api_key"),
18 |             result_type="markdown",
19 |             num_workers=4,
20 |             verbose=True)
21 |
22 |     def parse_pdf(self, input_file: str):
23 |         print(f"Processing {input_file}...")
24 |         try:
25 |             documents = self.client.load_data(input_file)
26 |             if len(documents) > 0:
27 |                 return documents[0].text
28 |             else:
29 |                 return ""
30 |         except Exception as e:
31 |             print(f"Error processing {input_file}: {e}")
32 |             # Return empty text so the caller can still write an output file.
33 |             return ""
34 |
35 | @click.command()
36 | @click.option(
37 |     "-i",
38 |     "--input_file",
39 |     "input_file",
40 |     required=True,
41 |     help="The input pdf file to parse.",
42 | )
43 | @click.option(
44 |     "-o",
45 |     "--output_file",
46 |     "output_file",
47 |     required=True,
48 |     help="The output markdown file.",
49 | )
50 | def llamaparse_pdf2md(input_file: str, output_file: str) -> None:
51 |     pdf_parser = LlamaParsePDFParser()
52 |     md_text = pdf_parser.parse_pdf(input_file)
53 |     with open(output_file, "w") as f:
54 |         f.write(md_text)
55 |
56 | if __name__ == "__main__":
57 |     llamaparse_pdf2md()
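58 |
59 | # Example CLI invocation (paths are placeholders):
60 | #   python src/llamaparse_pdf2md.py -i pdf-inputs/sample.pdf -o sample.md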
-------------------------------------------------------------------------------- /pdf2md/src/llmsherpa_pdf2md.py: --------------------------------------------------------------------------------
1 | import click, re
2 | from llmsherpa.readers import LayoutPDFReader
3 | from llmsherpa.readers import Paragraph, Table, ListItem, Section, Block
4 | from openai_utils import parse_table, extract_title
5 | from pydantic import BaseModel
6 |
7 | UPPERCASE_LINE_PATTERN = r"^[A-Z\s]+$"
8 | UPPERCASE_FIRST_PATTERN = r'^([A-Z][a-z]+)(\s[A-Z][a-z]+)*$'
9 | NUM_HEADING_PATTERN = r"^(\d+(\.\d+)*)\s+(.*)"
10 | SECTION_HEADING_PATTERN = r"^SECTION \d+:"
11 | START_HEADING_LEVEL = 1
12 |
13 | LLMSHERPA_API_URL = "http://nlm-ingestor:5001/api/parseDocument?renderFormat=all"
14 |
15 | class LayoutPDFParser(BaseModel):
16 |     class Config:
17 |         arbitrary_types_allowed = True
18 |
19 |     pdf_reader: LayoutPDFReader = None
20 |     def __init__(self, **kwargs):
21 |         super().__init__(**kwargs)
22 |         self.pdf_reader = LayoutPDFReader(LLMSHERPA_API_URL)
23 |
24 |     def convert_line_to_markdown(self, line: str, previous_heading_level) -> tuple:
25 |         prefix = ""
26 |         # Use regular expressions to check if the line matches a heading or subheading
27 |         if prefix == "":
28 |             match = re.match(SECTION_HEADING_PATTERN, line)
29 |             if match:
30 |                 # Convert to a heading if the line matches "SECTION <number>:"
31 |                 prefix = "#" * (previous_heading_level + 1)
32 |
33 |         if prefix == "":
34 |             match = re.match(NUM_HEADING_PATTERN, line)
35 |             if match:
36 |                 prefix = self.replacement(match, previous_heading_level)
37 |
38 |         if prefix == "":
39 |             match = re.match(UPPERCASE_LINE_PATTERN, line)
40 |             if match:
41 |                 prefix = "#" * (previous_heading_level + 1)
42 |
43 |         if prefix == "":
44 |             match = re.match(UPPERCASE_FIRST_PATTERN, line)
45 |             if match:
46 |                 prefix = "#" * (previous_heading_level + 1)
47 |
48 |         if prefix != "":
49 |             line = f"{prefix} {line}"
50 |             previous_heading_level += 1
51 |         return (line, previous_heading_level)
52 |
53 |     def replacement(self, match, previous_heading_level: int):
54 |         level = match.group(1).count('.') + previous_heading_level + 1  # Count the dots to determine the heading level
55 |         return '#' * level
56 |
57 |     def traversal_doc(self, node: Block, previous_heading_level: int) -> str:
58 |         md_text = ""
59 |         node_type = ""
60 |         if isinstance(node, Section):
61 |             node_type = "Section"
62 |             (node_text_with_heading, previous_heading_level) = \
63 |                 self.convert_line_to_markdown(node.to_text(), previous_heading_level)
64 |             md_text += node_text_with_heading + "\n\n"
65 |         elif isinstance(node, Table):
66 |             node_type = "Table"
67 |             table_content = parse_table(node.to_text())
68 |             md_text += table_content + "\n\n"
69 |         elif isinstance(node, Paragraph):
70 |             node_type = "Paragraph"
71 |             md_text += node.to_text() + "\n\n"
72 |         elif isinstance(node, ListItem):
73 |             node_type = "ListItem"
74 |             md_text += node.to_html() + "\n\n"
75 |         elif isinstance(node, Block):
76 |             node_type = "Block"
77 |         else:
78 |             print(type(node))
79 |             raise ValueError("Unknown node type")
80 |
81 |         # Leaf node types carry their own content; only container nodes are descended into.
82 |         if node_type not in ['ListItem', 'Paragraph', 'Table']:
83 |             for child in node.children:
84 |                 md_text += self.traversal_doc(child, previous_heading_level)
85 |         return md_text
86 |
87 |     def parse_pdf(self, pdf_filepath: str) -> str:
88 |         try:
89 |             doc = self.pdf_reader.read_pdf(pdf_filepath)
90 |             md_text = self.traversal_doc(doc.root_node, START_HEADING_LEVEL)
91 |             title = extract_title(md_text)
92 |             return f"{title}\n\n{md_text}"
93 |         except Exception as e:
94 |             print(f"Error processing {pdf_filepath}: {e}")
95 |             # Return empty text so the caller can still write an output file.
96 |             return ""
97 |
98 | @click.command()
99 | @click.option(
100 |     "-i",
101 |     "--input_file",
102 |     "input_file",
103 |     required=True,
104 |     help="The input pdf file to parse.",
105 | )
106 | @click.option(
107 |     "-o",
108 |     "--output_file",
109 |     "output_file",
110 |     required=True,
111 |     help="The output markdown file.",
112 | )
113 | def llmsherpa_pdf2md(input_file, output_file):
114 |     pdf_parser = LayoutPDFParser()
115 |     md_content = pdf_parser.parse_pdf(input_file)
116 |     with open(output_file, "w") as f:
117 |         f.write(md_content)
118 |
119 | if __name__ == "__main__":
120 |     llmsherpa_pdf2md()
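121 |
122 | # Heading-level illustration (with previous_heading_level == 1):
123 | #   "SECTION 2: SCOPE"  -> "## SECTION 2: SCOPE"
124 | #   "2.1 Requirements"  -> "### 2.1 Requirements"  (each '.' adds one heading level)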
-------------------------------------------------------------------------------- /pdf2md/src/main.py: --------------------------------------------------------------------------------
1 | import streamlit as st
2 | import os
3 |
4 | from llamaparse_pdf2md import LlamaParsePDFParser
5 | from llmsherpa_pdf2md import LayoutPDFParser
6 | from unstructured_pdf2md import UnstructuredPDFParser
7 |
8 | INPUT_DIR = "pdf-inputs"
9 |
10 | LLAMAPARSE_PARSER = "LlamaParse"
11 | UNSTRUCTURED_PARSER = "Unstructured"
12 | LLMSHERPA_PARSER = "LLMSherpa"
13 |
14 | def clear_dirs():
15 |     # Make sure the input directory exists and contains no leftover files.
16 |     if not os.path.exists(INPUT_DIR):
17 |         os.makedirs(INPUT_DIR)
18 |     else:
19 |         for file in os.listdir(INPUT_DIR):
20 |             os.remove(os.path.join(INPUT_DIR, file))
21 |
22 | def process_pdf(file, parser_type=LLAMAPARSE_PARSER):
23 |     clear_dirs()
24 |
25 |     # Save the uploaded file into the input directory.
26 |     filepath = f"{INPUT_DIR}/{file.name}"
27 |     with open(filepath, "wb") as f:
28 |         f.write(file.getbuffer())
29 |
30 |     if parser_type == LLAMAPARSE_PARSER:
31 |         parser = LlamaParsePDFParser()
32 |     elif parser_type == UNSTRUCTURED_PARSER:
33 |         parser = UnstructuredPDFParser()
34 |     elif parser_type == LLMSHERPA_PARSER:
35 |         parser = LayoutPDFParser()
36 |     else:
37 |         raise ValueError(f"Unknown parser type: {parser_type}")
38 |     return parser.parse_pdf(filepath)
39 |
40 | # Create a file uploader
41 | uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
42 |
43 | parser_type = st.selectbox("PDF Parser Type", [LLMSHERPA_PARSER, UNSTRUCTURED_PARSER, LLAMAPARSE_PARSER])
44 |
45 | # Create a button
46 | if st.button("Convert PDF to Markdown"):
47 |     # Check if a file has been uploaded
48 |     if uploaded_file is not None:
49 |         # Convert the PDF file
50 |         md_text = process_pdf(uploaded_file, parser_type)
51 |
52 |         # Display the Markdown text
53 |         st.markdown(md_text, unsafe_allow_html=True)
54 |     else:
55 |         st.write("Please upload a PDF file.")
56 |
-------------------------------------------------------------------------------- /pdf2md/src/openai_utils.py: --------------------------------------------------------------------------------
1 | import os
2 | from openai import OpenAI
3 |
4 | TABLE_PROMPT = """
5 | Given the following piece of text obtained from a PDF file, which represents a table,
6 | please return a table in markdown format without changing its content.
7 | If it is not a table, then return the text as is. Don't return anything else.
8 |
9 | If the first line of this text starts with something like this: "| 1.1.1 introduction name | ...",
10 | please extract "1.1.1 introduction" as a heading in the markdown format and put
11 | it at the beginning of the table. The "name" should be treated as part of the table header.
12 | So "| 1.1.1 introduction name | ..." will be converted to:
13 | ### 1.1.1 introduction
14 | | name | ...
15 |
16 | If the first line starts like "| introduction name | ..." then there is no heading to extract.
17 | So "| introduction name | ..." will be converted to:
18 | |introduction name | ...
19 |
20 | The extracted text from PDF you need to process is:
21 | {table_text}
22 | """
23 |
24 | TITLE_PROMPT = """
25 | Given the following first few lines of text obtained from a PDF file,
26 | please extract the title and return it in markdown format,
27 | remembering to add only one # in front of the title text:
28 | # Some Title
29 |
30 | Some Examples:
31 | ----------------
32 | 1. If the first few lines of the text are:
33 | ```
34 | Product Name
35 | Version 1.0
36 | Manual
37 | ```
38 | Then the title to be extracted is "Product Name Version 1.0 Manual".
39 | ----------------
40 | 2. If the first few lines of the text are:
41 | ```
42 | # Some unrelated text
43 |
44 | We are presenting a new product called VWP 1.0, its ...
45 | ```
46 | Then the title to be extracted is "Introducing VWP 1.0", which means you need to read
47 | the paragraph to infer the purpose of this article.
48 | ----------------
49 | 3. If the first few lines of the text are:
50 | ```
51 | We, Company ABC, provide product CDF, as a major product in our line of products...
52 | ```
53 | Then the title to be extracted is "Introducing CDF from ABC", which means you need to read
54 | the paragraph to infer the purpose of this article.
55 | ----------------
56 |
57 |
58 | The extracted text you need to process is:
59 | {text}
60 | """
61 |
62 | OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", None)
63 | OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-3.5-turbo-0125")
64 | EMBEDDING_MODEL = os.environ.get("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")
65 |
66 | # Only instantiate the client when a key is configured; the helpers below
67 | # fall back to pass-through behavior when it is missing.
68 | client = OpenAI(api_key=OPENAI_API_KEY) if OPENAI_API_KEY else None
69 |
70 | def extract_title(text: str) -> str:
71 |     # get the first 5 lines of the text
72 |     lines = text.split("\n")[:5]
73 |     title_prompt = TITLE_PROMPT.format(text="\n".join(lines))
74 |     if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
75 |         return lines[0]
76 |     else:
77 |         return query_openai(title_prompt)
78 |
79 | def parse_table(table_text: str) -> str:
80 |     chat_prompt = TABLE_PROMPT.format(table_text=table_text)
81 |     if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
82 |         return table_text
83 |     else:
84 |         return query_openai(chat_prompt)
85 |
86 | def query_openai(prompt):
87 |     chat_completion = client.chat.completions.create(
88 |         messages=[
89 |             {
90 |                 "role": "user",
91 |                 "content": prompt,
92 |             }
93 |         ], model=OPENAI_MODEL)
94 |     return chat_completion.choices[0].message.content
95 |
96 | def embed(text: str) -> list[float]:
97 |     response = client.embeddings.create(
98 |         input=[text],
99 |         model=EMBEDDING_MODEL)
100 |     return response.data[0].embedding
101 |
-------------------------------------------------------------------------------- /pdf2md/src/unstructured_pdf2md.py: --------------------------------------------------------------------------------
1 | import click, re
2 | from openai_utils import parse_table, extract_title
3 | from pydantic import BaseModel
4 | from unstructured.partition.pdf import partition_pdf
5 |
6 | NUM_HEADING_PATTERN = r"^(\d+(\.\d+)*)\s+(.*)"
7 | IGNORE_LIST = ["Page ", "Copyright "]
8 | ALLOWED_TYPES = ["Title", "NarrativeText", "Table"]
9 |
10 | class UnstructuredPDFParser(BaseModel):
11 |     class Config:
12 |         arbitrary_types_allowed = True
13 |
14 |     def __init__(self, **kwargs):
15 |         super().__init__(**kwargs)
16 |
17 |     def replacement(self, match):
18 |         level = match.group(1).count('.') + 1  # Count the dots to determine the heading level
19 |         return '#' * level
20 |
21 |     def parse_pdf(self, pdf_filepath):
22 |         try:
23 |             rtn_text = ""
24 |             print(f"Processing {pdf_filepath}")
25 |             elements = partition_pdf(filename=pdf_filepath, strategy="hi_res")
26 |             print(f"Number of elements: {len(elements)}")
27 |             for el in elements:
28 |                 el_dict = el.to_dict()
29 |                 el_type = el_dict["type"]
30 |                 el_text = el_dict["text"]
31 |                 if el_type not in ALLOWED_TYPES:
32 |                     continue
33 |
34 |                 if el_type == "Table":
35 |                     rtn_text += parse_table(el_text) + "\n\n"
36 |                     continue
37 |                 if any(el_text.startswith(ignore) for ignore in IGNORE_LIST):
38 |                     continue
39 |                 else:
40 |                     match = re.match(NUM_HEADING_PATTERN, el_text)
41 |                     if match:
42 |                         markdown_prefix = self.replacement(match)
43 |                         rtn_text += f"\n\n{markdown_prefix} {el_text}\n\n"
44 |                     else:
45 |                         rtn_text += el_text + "\n\n"
46 |             if rtn_text != "":
47 |                 title = extract_title(rtn_text)
48 |                 return f"{title}\n\n{rtn_text}"
49 |             else:
50 |                 return "Empty text returned."
51 |         except Exception as e:
52 |             print(f"Error processing {pdf_filepath}: {e}")
53 |             # Return empty text so the caller can still write an output file.
54 |             return ""
55 |
56 | @click.command()
57 | @click.option(
58 |     "-i",
59 |     "--input_file",
60 |     "input_file",
61 |     required=True,
62 |     help="The input pdf file to parse.",
63 | )
64 | @click.option(
65 |     "-o",
66 |     "--output_file",
67 |     "output_file",
68 |     required=True,
69 |     help="The output markdown file.",
70 | )
71 | def unstructured_pdf2md(input_file: str, output_file: str) -> None:
72 |     pdf_parser = UnstructuredPDFParser()
73 |     rtn_text = pdf_parser.parse_pdf(input_file)
74 |     with open(output_file, "w") as f:
75 |         f.write(rtn_text)
76 |
77 | if __name__ == "__main__":
78 |     unstructured_pdf2md()
-------------------------------------------------------------------------------- /tabular-data-analysis/.gitignore: --------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # directories created during the runtime
10 | LocalAI/
11 | models/
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 | cover/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | .pybuilder/
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | # For a library or package, you might want to ignore these files since the code is
91 | # intended to run in multiple environments; otherwise, check them in:
92 | # .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # poetry
102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | # This is especially recommended for binary packages to ensure reproducibility, and is more
104 | # commonly ignored for libraries.
105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 |
108 | # pdm
109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | # in version control.
113 | # https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 |
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 |
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 |
123 | # SageMath parsed files
124 | *.sage.py
125 |
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
-------------------------------------------------------------------------------- /tabular-data-analysis/Dockerfile: --------------------------------------------------------------------------------
1 | FROM python:3.11-slim-bookworm
2 |
3 | WORKDIR /app
4 | COPY requirements.txt .
5 | RUN pip install -r requirements.txt
6 | COPY src src
7 | COPY conf conf
8 | CMD ["streamlit", "run", "src/main.py"]
-------------------------------------------------------------------------------- /tabular-data-analysis/Dockerfile.lit: --------------------------------------------------------------------------------
1 | FROM python:3.11-slim-bookworm
2 |
3 | WORKDIR /app
4 | COPY requirements.txt .
5 | RUN pip install --no-cache-dir -r requirements.txt \
6 |     --extra-index-url https://download.pytorch.org/whl/cpu
7 | COPY src src
8 | COPY conf conf
9 | CMD ["streamlit", "run", "src/main.py"]
-------------------------------------------------------------------------------- /tabular-data-analysis/README.md: --------------------------------------------------------------------------------
1 | # Tabular Data Analysis
2 | ## Overview
3 | This demo is designed to transform the way we interact with tabular data, particularly CSV files, using the advanced capabilities of Large Language Models and the LlamaIndex framework.
4 |
5 | Furthermore, we delve into the practical aspects of deploying Large Language Model applications using Docker. This process involves several key steps that ensure your LLM application is encapsulated within a Docker container, providing a consistent and isolated environment for operation, regardless of the underlying host system.
6 |
7 | ## Context
8 | In this demo, we explore two innovative methods for querying tabular data, each with its unique approach and theoretical underpinnings. The first method, "mix self-consistency," originates from the paper ["Rethinking Tabular Data Understanding with Large Language Models"](https://arxiv.org/abs/2312.16702) by Liu et al., and offers a novel perspective on data interpretation through LLMs. The second method, "Chain-of-Table," detailed in ["Chain-of-Table: Evolving Tables in the Reasoning Chain for Table Understanding"](https://arxiv.org/abs/2401.04398) by Wang et al., presents an advanced technique for evolving table-based data reasoning.
9 |
10 | Both methods are implemented using [Llama Packs](https://github.com/run-llama/llama-hub/tree/main/llama_hub/llama_packs/tables), a versatile and community-driven collection of prepackaged modules designed to enhance LLM applications.
11 |
12 | We've brought these approaches to life through an intuitive WebUI, resembling a chatbot interface, where users can interact with either ChatGPT or local models to execute their data queries. For local model deployment, we use LocalAI to launch Docker containers that host local models behind an OpenAI-compatible API for efficient inference.
13 |
14 | This setup not only showcases the practical applications of these theoretical approaches but also provides an accessible platform for users to experience the cutting edge in tabular data querying.
15 |
16 | ## Prerequisites
17 | Before diving into this demo, please ensure that your system meets the following prerequisites:
18 | 1. **Operating System**: The demo is compatible with Linux operating systems and tested on Ubuntu 22.04.
19 |
20 | 2. **Docker, docker-compose and wget**: It's required to have `docker`, `docker-compose` and `wget` installed on your system. Specifically, we have tested this demo with Docker Engine Community version 25.0.1 on Linux.
21 |
22 | 3. **OpenAI API Key for ChatGPT**: If you wish to use the ChatGPT functionality within this demo, an OpenAI API key is required. Please note that usage of this API is subject to OpenAI's pricing and usage policies.
23 |
24 | ## Quick Start
25 | ### Setup environment on AWS
26 | Please follow this [README file](../env-setup/aws/ubuntu-22.04/README.md) to set up the demo environment on AWS EC2.
27 |
28 | ### Running the demo on CPU
29 | 1. Start by cloning this repo to your EC2 CPU instance:
30 | ```
31 | git clone https://github.com/LinkTime-Corp/llm-in-containers.git
32 | cd llm-in-containers/tabular-data-analysis
33 | ```
34 | 2. Insert your OpenAI API Key into conf/config.json for "OPENAI_API_KEY". This step can be skipped if you don't want to evaluate against the OpenAI backend.
35 | 3. Download the local model:
36 | ```
37 | bash download-models.sh
38 | ```
39 | 4. Launch the demo:
40 | ```
41 | bash run.sh
42 | ```
43 | 5. Visit the UI at http://{IP of EC2 CPU instance}:8501. On the UI, you can choose either "ChatGPT" or "Local_LLM" (the local model you downloaded) to query the tabular data.
44 | 6. Shut down the demo.
45 | ```
46 | bash shutdown.sh
47 | ```
48 |
49 | ### Running the demo on GPU
50 | 1. Start by cloning this repo to your EC2 GPU instance:
51 | ```
52 | git clone https://github.com/LinkTime-Corp/llm-in-containers.git
53 | cd llm-in-containers/tabular-data-analysis
54 | ```
55 | 2. Insert your OpenAI API Key into conf/config.json for "OPENAI_API_KEY".
56 | 3. Launch the demo:
57 | ```
58 | bash run.sh -gpu
59 | ```
60 | 4. Visit the UI at http://{IP of EC2 GPU instance}:8501. On the UI, you can choose either "ChatGPT" or "Local_LLM" (the default local model "mistral-openorca" is configured to run on GPU) to query the tabular data.
61 | 5. Shut down the demo.
62 | ```
63 | bash shutdown.sh -gpu
64 | ```
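65 |
66 | ## How the Query Engines Are Wired Up
67 | Under the hood, `src/backend.py` instantiates one of the two Llama Pack query engines per request. The sketch below shows the core of that wiring, not a complete program: it assumes a pandas DataFrame `df` is already loaded and an LLM object `llm` has been constructed (via `OpenAI` or `OpenAILike` from llama-index), and that a global `ServiceContext` has been registered, exactly as `backend.py` does.
68 | ```python
69 | from llama_hub.llama_packs.tables.mix_self_consistency.base import (
70 |     MixSelfConsistencyQueryEngine,
71 | )
72 | from llama_hub.llama_packs.tables.chain_of_table.base import ChainOfTableQueryEngine
73 |
74 | # Mix self-consistency: sample several textual and symbolic reasoning
75 | # paths over the table and aggregate answers by self-consistency voting.
76 | engine = MixSelfConsistencyQueryEngine(
77 |     df=df, llm=llm, text_paths=3, symbolic_paths=3,
78 |     aggregation_mode="self-consistency", verbose=True)
79 |
80 | # Chain-of-Table: iteratively transform the table along a reasoning chain.
81 | # engine = ChainOfTableQueryEngine(table=df, llm=llm, verbose=True)
82 |
83 | response = engine.query("Which row has the highest value?")  # example question
84 | ```
85 | On CPU with the local model, `backend.py` drops to a single text and symbolic path to keep latency manageable.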
-------------------------------------------------------------------------------- /tabular-data-analysis/conf/config.json: --------------------------------------------------------------------------------
1 | {
2 |     "CPU_Only": true,
3 |     "SHOW_TRACE_ON_UI": true,
4 |     "API_BASE": "http://localai:8080/v1",
5 |     "API_KEY": "anykey",
6 |     "OPENAI_API_KEY": "your key",
7 |     "OPENAI_API_MODEL": "gpt-4-1106-preview",
8 |     "INPUT_DIR": "csv-input",
9 |     "MODELS_DIR": "models",
10 |     "models": [
11 |         {
12 |             "type": "CPU_Only",
13 |             "name": "mistral-7b-instruct-v0.2.Q6_K.gguf",
14 |             "url": "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q6_K.gguf"
15 |         },
16 |         {
17 |             "type": "GPU_Enabled",
18 |             "name": "mistral-openorca",
19 |             "url": "https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF"
20 |         }
21 |     ]
22 | }
-------------------------------------------------------------------------------- /tabular-data-analysis/docker-compose-lit.yaml: --------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | services:
4 |   localai:
5 |     image: localai/localai:v2.6.1-ffmpeg-core
6 |     command: --models-path /models --context-size 4096 --threads 2
7 |     volumes:
8 |       - $PWD/models:/models
9 |     ports:
10 |       - "10080:8080"
11 |   tabular-data-analysis:
12 |     image: tabular-data-analysis:lit-1.0
13 |     volumes:
14 |       - $PWD/conf:/app/conf
15 |     ports:
16 |       - "8501:8501"
-------------------------------------------------------------------------------- /tabular-data-analysis/docker-compose.yaml: --------------------------------------------------------------------------------
1 | version: '3.8'
2 |
3 | services:
4 |   localai:
5 |     image: localai/localai:v2.6.1-cublas-cuda12-core
6 |     command: ${MODEL_NAME}
7 |     deploy:
8 |       resources:
9 |         reservations:
10 |           devices:
11 |             - driver: nvidia
12 |               count: 1
13 |               capabilities: [gpu]
14 |     ports:
15 |       - "10080:8080"
16 |   tabular-data-analysis:
17 |     image: tabular-data-analysis:1.0
18 |     volumes:
19 |       - $PWD/conf:/app/conf
20 |     ports:
21 |       - "8501:8501"
-------------------------------------------------------------------------------- /tabular-data-analysis/download-models.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -u
3 |
4 | CONFIG_FILE="conf/config.json"
5 |
6 | find_model_info() {
7 |     local file_path="$1"
8 |     local found=0
9 |
10 |     while IFS= read -r line; do
11 |         if [[ "$line" == *'"type": "CPU_Only"'* ]]; then
12 |             found=1
13 |         fi
14 |         if [[ $found -eq 1 && "$line" == *'"url":'* ]]; then
15 |             CPU_MODEL_URL=$(echo "$line" | sed -e 's/.*"url": *"\(.*\)".*/\1/')
16 |             break
17 |         fi
18 |     done < "$file_path"
19 | }
20 |
21 | find_model_info "$CONFIG_FILE"
22 |
23 | mkdir -p models
24 | wget ${CPU_MODEL_URL} -P models/
25 |
-------------------------------------------------------------------------------- /tabular-data-analysis/requirements.txt: --------------------------------------------------------------------------------
1 | llama-hub==0.0.77
2 | llama-index==0.9.39
3 | streamlit==1.30.0
4 | tabulate==0.9.0
5 | torch==2.1.2
6 | transformers==4.37.1
-------------------------------------------------------------------------------- /tabular-data-analysis/run.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -u
3 |
4 | CONFIG_FILE="conf/config.json"
5 |
6 | find_model_info() {
7 |     local file_path="$1"
8 |     local found=0
9 |
10 |     while IFS= read -r line; do
11 |         if [[ "$line" == *'"type": "GPU_Enabled"'* ]]; then
12 |             found=1
13 |         fi
14 |         if [[ $found -eq 1 && "$line" == *'"name":'* ]]; then
15 |             GPU_MODEL_NAME=$(echo "$line" | sed -e 's/.*"name": *"\(.*\)".*/\1/')
16 |             break
17 |         fi
18 |     done < "$file_path"
19 | }
20 |
21 | find_model_info "$CONFIG_FILE"
22 |
23 | if [ "${1:-}" == "-gpu" ]; then
24 |     echo "GPU option provided. Running on GPU..."
25 |     sed -i'' -e "s/command:.*/command: $GPU_MODEL_NAME/" docker-compose.yaml
26 |
27 |     # Set CPU_Only to false
28 |     sed -i'' -e 's/"CPU_Only": true/"CPU_Only": false/' "$CONFIG_FILE"
29 |     docker build -t tabular-data-analysis:1.0 .
30 |     docker-compose up -d
31 | else
32 |     echo "Running on CPU by default..."
33 |     # Set CPU_Only to true
34 |     sed -i'' -e 's/"CPU_Only": false/"CPU_Only": true/' "$CONFIG_FILE"
35 |     docker build -f Dockerfile.lit -t tabular-data-analysis:lit-1.0 .
36 |     docker-compose -f docker-compose-lit.yaml up -d
37 | fi
-------------------------------------------------------------------------------- /tabular-data-analysis/shutdown.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -u
3 |
4 | echo "Shutting down app..."
5 | # Default to an empty string so 'set -u' doesn't abort when no flag is given.
6 | if [ "${1:-}" == "-gpu" ]; then
7 |     docker-compose down
8 | else
9 |     docker-compose -f docker-compose-lit.yaml down
10 | fi
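11 |
12 | # Usage note: pass -gpu only if the demo was started with 'bash run.sh -gpu',
13 | # so the matching compose file is torn down (see the README).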
-------------------------------------------------------------------------------- /tabular-data-analysis/src/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinkTime-Corp/llm-in-containers/ca85bbc3f024a2732041ed53aca7d57c1f24e095/tabular-data-analysis/src/__init__.py
-------------------------------------------------------------------------------- /tabular-data-analysis/src/backend.py: --------------------------------------------------------------------------------
1 | import json, os, time
2 | from constants import MIXSC_ENGINE, CHAINOFTABLE_ENGINE, GPT_LLM, LOCAL_LLM
3 | from llama_index import ServiceContext
4 | from llama_index import set_global_service_context
5 | from llama_index.embeddings import OpenAIEmbedding
6 | from llama_index.llms import OpenAILike, OpenAI
7 | from llama_hub.llama_packs.tables.mix_self_consistency.base import (
8 |     MixSelfConsistencyQueryEngine
9 | )
10 | from llama_hub.llama_packs.tables.chain_of_table.base import (
11 |     ChainOfTableQueryEngine, serialize_table
12 | )
13 |
14 | script_dir = os.path.dirname(os.path.realpath(__file__))
15 | upper_dir = os.path.dirname(script_dir)
16 | config_path = os.path.join(f"{upper_dir}/conf", "config.json")
17 | with open(config_path) as f:
18 |     config = json.load(f)
19 | API_BASE = config["API_BASE"]
20 | API_KEY = config["API_KEY"]
21 | CPU_ONLY = config["CPU_Only"]
22 | os.environ["OPENAI_API_KEY"] = config["OPENAI_API_KEY"]
23 | OPENAI_API_MODEL = config["OPENAI_API_MODEL"]
24 | # Pick the model entry that matches the configured hardware profile.
25 | for model in config["models"]:
26 |     if CPU_ONLY:
27 |         if model["type"] == "CPU_Only":
28 |             MODEL_NAME = model["name"]
29 |     else:
30 |         if model["type"] == "GPU_Enabled":
31 |             MODEL_NAME = model["name"]
32 |
33 | MAC_M1_LUNADEMO_CONSERVATIVE_TIMEOUT = 10 * 60  # sec
34 |
35 | class QueryEngineWrapper:
36 |     local_llm = None
37 |     openai_llm = None
38 |     def __init__(self):
39 |         self.openai_llm = OpenAI(model=OPENAI_API_MODEL)
40 |         self.local_llm = OpenAILike(
41 |             api_base=API_BASE,
42 |             api_key=API_KEY,
43 |             model=MODEL_NAME,
44 |             is_chat_model=True,
45 |             is_function_calling_model=True,
46 |             context_window=3900,
47 |             timeout=MAC_M1_LUNADEMO_CONSERVATIVE_TIMEOUT,
48 |         )
49 |
50 |     def get_query_engine(self, table, llm_type, query_engine_type):
51 |         query_engine = None
52 |         chosen_llm = None
53 |         if llm_type == GPT_LLM:
54 |             chosen_llm = self.openai_llm
55 |             embed_model = OpenAIEmbedding(embed_batch_size=10)
56 |             service_context = ServiceContext.from_defaults(
57 |                 chunk_size=1024, llm=chosen_llm, embed_model=embed_model)
58 |         else:
59 |             chosen_llm = self.local_llm
60 |             service_context = ServiceContext.from_defaults(
61 |                 chunk_size=1024, llm=chosen_llm, embed_model="local")
62 |         set_global_service_context(service_context)
63 |
64 |         # On CPU with the local model, sample a single reasoning path of
65 |         # each kind to keep latency manageable.
66 |         text_paths = 3
67 |         symbolic_paths = 3
68 |         if CPU_ONLY and llm_type == LOCAL_LLM:
69 |             text_paths = 1
70 |             symbolic_paths = 1
71 |
72 |         if query_engine_type == MIXSC_ENGINE:
73 |             query_engine = MixSelfConsistencyQueryEngine(
74 |                 df=table,
75 |                 llm=chosen_llm,
76 |                 text_paths=text_paths,
77 |                 symbolic_paths=symbolic_paths,
78 |                 aggregation_mode="self-consistency",
79 |                 verbose=True)
80 |         elif query_engine_type == CHAINOFTABLE_ENGINE:
81 |             query_engine = ChainOfTableQueryEngine(
82 |                 table=table, llm=chosen_llm, verbose=True)
83 |         else:
84 |             raise Exception("Invalid query engine type")
85 |         return query_engine
86 |
87 |     def process_query(self, question, table, llm_type, query_engine_type):
88 |         query_engine = self.get_query_engine(table, llm_type, query_engine_type)
89 |
90 |         start_time = time.time()  # Start time recording
91 |         response = query_engine.query(question)
92 |         end_time = time.time()  # End time recording
93 |         time_spent = end_time - start_time  # Calculate duration
94 |         # add time spent to the end of the response
95 |         final_response = str(response) + "\nTime spent: {:.2f} seconds".format(time_spent)
96 |         return final_response
97 |
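98 | # Example usage (hypothetical CSV path; mirrors what src/main.py does):
99 | #   import pandas as pd
100 | #   from constants import GPT_LLM, MIXSC_ENGINE
101 | #   wrapper = QueryEngineWrapper()
102 | #   table = pd.read_csv("sales.csv")
103 | #   print(wrapper.process_query("How many rows are there?", table, GPT_LLM, MIXSC_ENGINE))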
-------------------------------------------------------------------------------- /tabular-data-analysis/src/constants.py: --------------------------------------------------------------------------------
1 | MIXSC_ENGINE = 'MixSelfConsistency'
2 | CHAINOFTABLE_ENGINE = 'ChainOfTable'
3 | GPT_LLM = "ChatGPT"
4 | LOCAL_LLM = "Local_LLM"
-------------------------------------------------------------------------------- /tabular-data-analysis/src/main.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | import streamlit as st
3 | import json, io, os, shutil, sys, traceback
4 | from constants import MIXSC_ENGINE, CHAINOFTABLE_ENGINE, GPT_LLM, LOCAL_LLM
5 | from backend import QueryEngineWrapper
6 |
7 | script_dir = os.path.dirname(os.path.realpath(__file__))
8 | upper_dir = os.path.dirname(script_dir)
9 | config_path = os.path.join(f"{upper_dir}/conf", "config.json")
10 | with open(config_path) as f:
11 |     config = json.load(f)
12 | INPUT_DIR = config["INPUT_DIR"]
13 | SHOW_TRACE_ON_UI = config["SHOW_TRACE_ON_UI"]
14 |
15 | wrapper = QueryEngineWrapper()
16 |
17 | class OutputCapture:
18 |     def __init__(self):
19 |         self.buffer = io.StringIO()
20 |
21 |     def isatty(self):
22 |         return False
23 |
24 |     def write(self, message):
25 |         self.buffer.write(message)
26 |
27 |     def flush(self):
28 |         pass
29 |
30 |     def get_output(self):
31 |         return self.buffer.getvalue()
32 |
33 | def check_dirs():
34 |     if not os.path.exists(INPUT_DIR):
35 |         os.makedirs(INPUT_DIR)
36 |     else:
37 |         shutil.rmtree(INPUT_DIR)
38 |         os.makedirs(INPUT_DIR)
39 |
40 | def process_csv(csv_file):
41 |     check_dirs()
42 |     # Save the uploaded file into the input directory.
43 |     filepath = f"{INPUT_DIR}/{csv_file.name}"
44 |     with open(filepath, "wb") as f:
45 |         f.write(csv_file.getbuffer())
46 |     return pd.read_csv(filepath)
47 |
48 | def process_query(question, table, llm_type, query_engine_type):
49 |     captured_output_str = "No trace available!"
50 |     response = ""
51 |     try:
52 |         if SHOW_TRACE_ON_UI:
53 |             captured_output = OutputCapture()
54 |             sys.stdout = captured_output
55 |         response = wrapper.process_query(question, table,
56 |             llm_type, query_engine_type)
57 |         if SHOW_TRACE_ON_UI:
58 |             sys.stdout = sys.__stdout__
59 |
60 |         if SHOW_TRACE_ON_UI and captured_output is not None:
61 |             captured_output_str = captured_output.get_output()
62 |
63 |     except Exception as e:
64 |         response = f"Error:\n{str(e)}"
65 |         traceback.print_exc()
66 |     return (response, captured_output_str)
67 |
68 | st.sidebar.title("Inference Traces")
69 | uploaded_file = st.file_uploader("Upload CSV file", type=['csv'])
70 | if uploaded_file is not None:
71 |     debug_info = st.sidebar.empty()
72 |     table = process_csv(uploaded_file)
73 |     st.write("Table Preview")
74 |     st.dataframe(table.head(5))
75 |
76 |     llm_type = st.selectbox("LLM Type", [GPT_LLM, LOCAL_LLM])
77 |     query_engine_type = st.selectbox("Query Engine", [MIXSC_ENGINE, CHAINOFTABLE_ENGINE])
78 |     question = st.text_input("Question", "")
79 |     if question and st.button("Query"):
80 |         (response, captured_output_str) = process_query(question, table, llm_type, query_engine_type)
81 |         st.text_area("Response", response, height=200)
82 |         debug_info.text_area("", captured_output_str, height=600)
83 | else:
84 |     st.write("Awaiting CSV file to be uploaded.")
85 |
-------------------------------------------------------------------------------- /text2sql/Dockerfile: --------------------------------------------------------------------------------
1 | FROM python:3.11-slim-bookworm
2 |
3 | WORKDIR /app
4 | COPY requirements.txt .
5 | RUN pip install -r requirements.txt
6 | COPY src src
7 | COPY conf conf
8 | CMD ["streamlit", "run", "src/main.py"]
-------------------------------------------------------------------------------- /text2sql/Dockerfile.lit: --------------------------------------------------------------------------------
1 | FROM python:3.11-slim-bookworm
2 |
3 | WORKDIR /app
4 | COPY requirements.txt .
5 | RUN pip install --no-cache-dir -r requirements.txt \
6 |     --extra-index-url https://download.pytorch.org/whl/cpu
7 | COPY src src
8 | COPY conf conf
9 | CMD ["streamlit", "run", "src/main.py"]
-------------------------------------------------------------------------------- /text2sql/README.md: --------------------------------------------------------------------------------
1 | # Running Text2SQL with LLMs on Docker Containers
2 | ## Overview
3 | This demo showcases querying databases through Text2SQL technology, leveraging the advanced features of Large Language Models (LLMs) with the LangChain and Ollama frameworks. Additionally, it explores the deployment of LLM applications using Docker, outlining crucial steps for encapsulating your LLM application in a Docker container. This ensures a consistent and isolated operational environment across different host systems.
4 |
5 | ## Prerequisites
6 | Before diving into this demo, please ensure that your system meets the following prerequisites:
7 | 1. **Operating System**: The demo is compatible with Linux operating systems and tested on Ubuntu 22.04.
8 |
9 | 2. **Docker, unzip and wget**: It's required to have `docker`, `docker-compose`, `unzip` and `wget` installed on your system.
--------------------------------------------------------------------------------
/text2sql/README.md:
--------------------------------------------------------------------------------
1 | # Running Text2SQL with LLMs on Docker Containers
2 | ## Overview
3 | This demo showcases querying databases through Text2SQL technology, leveraging the advanced features of Large Language Models (LLMs) with the LangChain and Ollama frameworks. Additionally, it explores the deployment of LLM applications using Docker, outlining crucial steps for encapsulating your LLM application in a Docker container. This ensures a consistent and isolated operational environment across different host systems.
4 | 
5 | ## Prerequisites
6 | Before diving into this demo, please ensure that your system meets the following prerequisites:
7 | 1. **Operating System**: The demo is compatible with Linux operating systems and has been tested on Ubuntu 22.04.
8 | 
9 | 2. **Docker and Docker Compose**: You need `docker` and `docker-compose` installed on your system. Specifically, we have tested this demo with Docker Engine Community version 25.0.1 on Linux.
10 | 
11 | 3. **OpenAI API Key for ChatGPT**: If you wish to use the ChatGPT functionality within this demo, an OpenAI API key is required. Please note that usage of this API is subject to OpenAI's pricing and usage policies.
12 | 
13 | ## Quick Start
14 | ### Setup environment on AWS
15 | Please follow this [README file](../env-setup/aws/ubuntu-22.04/README.md) to set up the demo environment on AWS EC2.
16 | 
17 | ### Running the demo on CPU
18 | 1. Start by cloning this repo to your EC2 CPU instance:
19 | ```
20 | git clone https://github.com/LinkTime-Corp/llm-in-containers.git
21 | cd llm-in-containers/text2sql
22 | ```
23 | 2. Insert your OpenAI API Key into conf/config.json for "OPENAI_API_KEY". This step can be skipped if you don't want to evaluate against the OpenAI backend.
24 | 3. Launch the demo. If everything runs smoothly, the last command in run.sh should show three containers actively running.
25 | ```
26 | bash run.sh
27 | ```
28 | 4. Load the sample data into MySQL:
29 | ```
30 | bash load_data.sh
31 | ```
32 | 5. Visit the UI at http://{IP of EC2 CPU instance}:8501. On the UI, you can choose either "ChatGPT" or "Local_LLM" (the default local model "sqlcoder:15b-q6_K" is configured to run on CPU) to query the MySQL database.
33 | 6. Shut down the demo:
34 | ```
35 | bash shutdown.sh
36 | ```
37 | 
38 | ### Running the demo on GPU
39 | 1. Start by cloning this repo to your EC2 GPU instance:
40 | ```
41 | git clone https://github.com/LinkTime-Corp/llm-in-containers.git
42 | cd llm-in-containers/text2sql
43 | ```
44 | 2. Insert your OpenAI API Key into conf/config.json for "OPENAI_API_KEY".
45 | 3. Launch the demo:
46 | ```
47 | bash run.sh -gpu
48 | ```
49 | 4. Load the sample data into MySQL:
50 | ```
51 | bash load_data.sh
52 | ```
53 | 5. Visit the UI at http://{IP of EC2 GPU instance}:8501. On the UI, you can choose either "ChatGPT" or "Local_LLM" (the default local model "sqlcoder:15b-q8_0" is configured to run on GPU) to query the MySQL database.
54 | 6. Shut down the demo:
55 | ```
56 | bash shutdown.sh -gpu
57 | ```
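Between steps 4 and 5 of either walkthrough, it is worth confirming that the sample database actually landed in MySQL. A quick check, assuming the text2sql_mysql_1 container name used by load_data.sh and the default root password from conf/config.json:

```
docker exec text2sql_mysql_1 mysql -uroot -p123456 -e "SHOW TABLES IN classicmodels;"
```

If tables such as customers, orders, and products are listed, the UI queries have data to work with.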
--------------------------------------------------------------------------------
/text2sql/conf/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "CPU_Only": true,
3 |     "SHOW_TRACE_ON_UI": true,
4 |     "API_BASE": "http://ollama:11434/v1/",
5 |     "API_KEY": "anykey",
6 |     "OPENAI_API_KEY": "your key",
7 |     "OPENAI_API_MODEL": "gpt-4-1106-preview",
8 |     "database": {
9 |         "host": "mysql",
10 |         "user": "root",
11 |         "password": "123456",
12 |         "db": "classicmodels",
13 |         "port": 3306
14 |     },
15 |     "models": [
16 |         {
17 |             "type": "CPU_Only",
18 |             "name": "sqlcoder:15b-q6_K"
19 |         },
20 |         {
21 |             "type": "GPU_Enabled",
22 |             "name": "sqlcoder:15b-q8_0"
23 |         }
24 |     ]
25 | }
--------------------------------------------------------------------------------
/text2sql/docker-compose-lit.yaml:
--------------------------------------------------------------------------------
1 | version: "3.8"
2 | services:
3 |   ollama:
4 |     image: ollama/ollama:0.1.24
5 |     ports:
6 |       - "11434:11434"
7 |     volumes:
8 |       - ./ollama:/root/.ollama
9 |   mysql:
10 |     image: mysql:8.3.0
11 |     environment:
12 |       MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD}
13 |     ports:
14 |       - "3306:3306"
15 |     volumes:
16 |       - ./data:/var/lib/mysql
17 |   text2sql:
18 |     image: text2sql:lit-1.0
19 |     volumes:
20 |       - $PWD/conf:/app/conf
21 |     ports:
22 |       - "8501:8501"
23 | 
--------------------------------------------------------------------------------
/text2sql/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.8"
2 | services:
3 |   ollama:
4 |     image: ollama/ollama:0.1.24
5 |     ports:
6 |       - "11434:11434"
7 |     volumes:
8 |       - ./ollama:/root/.ollama
9 |     deploy:
10 |       resources:
11 |         reservations:
12 |           devices:
13 |             - driver: nvidia
14 |               count: 1
15 |               capabilities: [gpu]
16 |   mysql:
17 |     image: mysql:8.3.0
18 |     environment:
19 |       MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD}
20 |     ports:
21 |       - "3306:3306"
22 |     volumes:
23 |       - ./data:/var/lib/mysql
24 |   text2sql:
25 |     image: text2sql:1.0
26 |     volumes:
27 |       - $PWD/conf:/app/conf
28 |     ports:
29 |       - "8501:8501"
30 | 
--------------------------------------------------------------------------------
/text2sql/load_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -u
3 | 
4 | CUR_PATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
5 | CONFIG_FILE="${CUR_PATH}/conf/config.json"
6 | DB_PATH="${CUR_PATH}/db"
7 | 
8 | find_password() {
9 |     local file_path="$1"
10 |     local found=0
11 | 
12 |     while IFS= read -r line; do
13 |         if [[ "$line" == *'"database":'* ]]; then
14 |             found=1
15 |         fi
16 |         if [[ $found -eq 1 && "$line" == *'"password":'* ]]; then
17 |             PASSWORD=$(echo "$line" | sed -e 's/.*"password": *"\(.*\)".*/\1/')
18 |             break
19 |         fi
20 |     done < "$file_path"
21 | }
22 | 
23 | find_password "$CONFIG_FILE"
24 | 
25 | docker exec -i text2sql_mysql_1 sh -c "exec mysql -uroot -p$PASSWORD" < "$DB_PATH/mysqlsampledatabase.sql"
26 | 
--------------------------------------------------------------------------------
/text2sql/requirements.txt:
--------------------------------------------------------------------------------
1 | cryptography==42.0.2
2 | langchain==0.1.5
3 | langchain-community==0.0.18
4 | langchain-openai==0.0.5
5 | pydantic==2.6.1
6 | pymysql==1.1.0
7 | streamlit==1.30.0
--------------------------------------------------------------------------------
/text2sql/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -u
3 | 
4 | CUR_PATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
5 | CONFIG_FILE="${CUR_PATH}/conf/config.json"
6 | 
7 | find_settings() {
8 |     local file_path="$1"
9 |     local found=0
10 | 
11 |     while IFS= read -r line; do
12 |         if [[ "$line" == *'"database":'* ]]; then
13 |             found=1
14 |         fi
15 |         if [[ $found -eq 1 && "$line" == *'"password":'* ]]; then
16 |             PASSWORD=$(echo "$line" | sed -e 's/.*"password": *"\(.*\)".*/\1/')
17 |         fi
18 |         if [[ "$line" == *'"type": "CPU_Only"'* ]]; then
19 |             found=2
20 |         fi
21 |         if [[ $found -eq 2 && "$line" == *'"name":'* ]]; then
22 |             CPU_LLM=$(echo "$line" | sed -e 's/.*"name": *"\(.*\)".*/\1/')
23 |         fi
24 |         if [[ "$line" == *'"type": "GPU_Enabled"'* ]]; then
25 |             found=3
26 |         fi
27 |         if [[ $found -eq 3 && "$line" == *'"name":'* ]]; then
28 |             GPU_LLM=$(echo "$line" | sed -e 's/.*"name": *"\(.*\)".*/\1/')
29 |         fi
30 |     done < "$file_path"
31 | }
32 | 
33 | find_settings "$CONFIG_FILE"
34 | 
35 | if [ $# -eq 0 ]; then
36 |     echo "Running on CPU by default..."
37 |     sed -i'' -e "s/MYSQL_ROOT_PASSWORD:.*/MYSQL_ROOT_PASSWORD: \"$PASSWORD\"/" docker-compose-lit.yaml
38 | 
39 |     # Set CPU_Only to true
40 |     sed -i'' -e 's/"CPU_Only": false/"CPU_Only": true/' "$CONFIG_FILE"
41 |     docker build -f Dockerfile.lit -t text2sql:lit-1.0 .
42 |     docker-compose -f docker-compose-lit.yaml up -d
43 |     echo "waiting for containers to start..."
44 |     sleep 5
45 |     docker exec text2sql_ollama_1 sh -c "ollama run $CPU_LLM --verbose"
46 | else
47 |     if [ "$1" == "-gpu" ]; then
48 |         echo "GPU option provided. Running on GPU..."
49 |         sed -i'' -e "s/MYSQL_ROOT_PASSWORD:.*/MYSQL_ROOT_PASSWORD: \"$PASSWORD\"/" docker-compose.yaml
50 | 
51 |         # Set CPU_Only to false
52 |         sed -i'' -e 's/"CPU_Only": true/"CPU_Only": false/' "$CONFIG_FILE"
53 |         docker build -t text2sql:1.0 .
54 |         docker-compose up -d
55 |         echo "waiting for containers to start..."
56 |         sleep 5
57 |         docker exec text2sql_ollama_1 sh -c "ollama run $GPU_LLM --verbose"
58 |     else
59 |         echo "Usage: bash run.sh [-gpu]" >&2
60 |         exit 1
61 |     fi
62 | fi
63 | docker ps -a | grep text2sql
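run.sh and load_data.sh both read conf/config.json line by line with a while/sed loop, which only holds up while the file keeps its current one-key-per-line layout. If jq happens to be installed on the host, the same values can be extracted more robustly; a sketch, not part of the repo's scripts:

```
PASSWORD=$(jq -r '.database.password' conf/config.json)
CPU_LLM=$(jq -r '.models[] | select(.type == "CPU_Only") | .name' conf/config.json)
GPU_LLM=$(jq -r '.models[] | select(.type == "GPU_Enabled") | .name' conf/config.json)
```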
--------------------------------------------------------------------------------
/text2sql/shutdown.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e -u
3 | 
4 | echo "Shutting down app..."
5 | if [ $# -eq 0 ]; then
6 |     docker-compose -f docker-compose-lit.yaml down
7 | else
8 |     if [ "$1" == "-gpu" ]; then
9 |         docker-compose down
10 |     fi
11 | fi
12 | 
--------------------------------------------------------------------------------
/text2sql/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinkTime-Corp/llm-in-containers/ca85bbc3f024a2732041ed53aca7d57c1f24e095/text2sql/src/__init__.py
--------------------------------------------------------------------------------
/text2sql/src/constants.py:
--------------------------------------------------------------------------------
1 | GPT_LLM = "ChatGPT"
2 | LOCAL_LLM = "Local_LLM"
--------------------------------------------------------------------------------
/text2sql/src/db_engine.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import create_engine
2 | from langchain_community.utilities import SQLDatabase
3 | from pydantic import BaseModel, ConfigDict, Field
4 | from typing import Any, Dict, Iterable, Sequence
5 | import ast, json, os
6 | import pandas as pd
7 | 
8 | script_dir = os.path.dirname(os.path.realpath(__file__))
9 | upper_dir = os.path.dirname(script_dir)
10 | config_path = os.path.join(f"{upper_dir}/conf", "config.json")
11 | with open(config_path) as f:
12 |     config = json.load(f)
13 | database_settings = config["database"]
14 | USER_NAME = database_settings["user"]
15 | PASSWORD = database_settings["password"]
16 | HOST = database_settings["host"]
17 | PORT = database_settings["port"]
18 | DB = database_settings["db"]
19 | 
20 | class DBEngine(BaseModel):
21 |     model_config = ConfigDict(arbitrary_types_allowed=True)
22 | 
23 |     db: SQLDatabase = Field(..., description="engine to use.")
24 |     def __init__(self, **kwargs: Any):
25 |         db_url = f"mysql+pymysql://{USER_NAME}:{PASSWORD}@{HOST}:{PORT}/{DB}"
26 |         db = SQLDatabase(create_engine(db_url), max_string_length=1000)
27 |         super().__init__(db=db, **kwargs)
28 | 
29 |     def get_db(self) -> SQLDatabase:
30 |         return self.db
31 | 
32 |     def get_dialect(self) -> str:
33 |         return self.db.dialect
34 | 
35 |     def get_table_names(self) -> Iterable[str]:
36 |         return self.db.get_usable_table_names()
37 | 
38 |     def get_table_sample(self, table_name: str, limit: int = 3) -> pd.DataFrame:
39 |         samples = self.exec_db(f"select * from {table_name} limit {limit}")
40 |         return pd.DataFrame(samples)
41 | 
42 |     def get_table_schema(self, table_name: str) -> str:
43 |         schema_str = self.query_db(f"show create table {table_name}")
44 |         schema = ast.literal_eval(schema_str)
45 |         return schema[0][1]
46 | 
47 |     def query_db(self, query: str) -> str:
48 |         return self.db.run(query)
49 | 
50 |     def exec_db(self, query: str) -> Sequence[Dict[str, Any]]:
51 |         # SQLDatabase does not expose a public cursor API, so this relies on
52 |         # the private _execute helper to get rows back as dictionaries
53 |         return self.db._execute(query)
54 | 
55 | if __name__ == "__main__":
56 |     db_engine = DBEngine()
57 |     print(db_engine.get_dialect())
58 |     print("------------------")
59 |     tables = db_engine.get_table_names()
60 |     print(tables[0])
61 |     print("------------------")
62 |     print(db_engine.get_table_schema(tables[0]))
63 |     print("------------------")
64 |     print(db_engine.get_table_sample(tables[0]))
65 |     print("------------------")
66 | 
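db_engine.py doubles as a connectivity smoke test through its __main__ block. Once the containers are up and the sample data is loaded, it can be run inside the app container; a sketch, assuming the default Compose container name text2sql_text2sql_1, following the same naming scheme as the text2sql_mysql_1 and text2sql_ollama_1 names used elsewhere in these scripts:

```
docker exec text2sql_text2sql_1 python src/db_engine.py
```

It should print the dialect (mysql), a table name, that table's CREATE TABLE statement, and a three-row sample.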
--------------------------------------------------------------------------------
/text2sql/src/main.py:
--------------------------------------------------------------------------------
1 | import os, json, io, sys
2 | import streamlit as st
3 | from constants import LOCAL_LLM, GPT_LLM
4 | from db_engine import DBEngine
5 | from query_engine import SQLEngineWrapper
6 | 
7 | 
8 | script_dir = os.path.dirname(os.path.realpath(__file__))
9 | upper_dir = os.path.dirname(script_dir)
10 | config_path = os.path.join(f"{upper_dir}/conf", "config.json")
11 | with open(config_path) as f:
12 |     config = json.load(f)
13 | SHOW_TRACE_ON_UI = config["SHOW_TRACE_ON_UI"]
14 | 
15 | class OutputCapture:
16 |     def __init__(self):
17 |         self.buffer = io.StringIO()
18 | 
19 |     def isatty(self):
20 |         return False
21 | 
22 |     def write(self, message):
23 |         self.buffer.write(message)
24 | 
25 |     def flush(self):
26 |         pass
27 | 
28 |     def get_output(self):
29 |         return self.buffer.getvalue()
30 | 
31 | def main():
32 |     st.title("💬 Text2SQL Demo")
33 |     st.sidebar.title("Inference Traces")
34 | 
35 |     query_engine = SQLEngineWrapper()
36 |     db_engine = DBEngine()
37 | 
38 |     with st.expander("🐬 DATABASE Information", expanded=False):
39 |         col1, col2 = st.columns([1, 2])
40 |         # show all the table names in col1 and a sample of the selected
41 |         # table in col2 after the user picks a table name
42 |         with col1:
43 |             table_names = db_engine.get_table_names()
44 |             table_name = st.selectbox("Select a table", table_names)
45 |         with col2:
46 |             #table_schema = db_engine.get_table_schema(table_name)
47 |             #st.write(table_schema)
48 |             st.dataframe(db_engine.get_table_sample(table_name))
49 | 
50 |     if "messages" not in st.session_state:
51 |         st.session_state.messages = []
52 | 
53 |     for message in st.session_state.messages:
54 |         with st.chat_message(message["role"]):
55 |             if message["role"] == "user":
56 |                 st.markdown(message["content"])
57 |             else:
58 |                 st.code(message["content"])
59 | 
60 |     llm_type = st.selectbox("LLM Type", [GPT_LLM, LOCAL_LLM])
61 |     debug_info = st.sidebar.empty()
62 |     if prompt := st.chat_input("Where is the office in San Francisco?"):
63 |         st.session_state.messages.append({"role": "user", "content": prompt})
64 |         with st.chat_message("user"):
65 |             st.markdown(prompt)
66 | 
67 |         with st.chat_message("assistant"):
68 |             result = None
69 |             captured_output = None
70 |             try:
71 |                 if SHOW_TRACE_ON_UI:
72 |                     captured_output = OutputCapture()
73 |                     sys.stdout = captured_output
74 |                 result = query_engine.process_query(prompt, llm_type)
75 |             except Exception as e:
76 |                 result = str(e)
77 |             finally:
78 |                 # restore stdout even when the query raises, so the app keeps logging
79 |                 if SHOW_TRACE_ON_UI and captured_output is not None:
80 |                     sys.stdout = sys.__stdout__
81 |                     captured_output_str = captured_output.get_output()
82 |                     debug_info.text_area("Trace", captured_output_str, height=600)
83 |             st.write(result)
84 | 
85 |         st.session_state.messages.append(
86 |             {
87 |                 "role": "assistant",
88 |                 "content": result,
89 |             }
90 |         )
91 | 
92 | if __name__ == "__main__":
93 |     main()
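main.py is the Streamlit entry point that both Dockerfiles launch. A quick way to confirm the UI container is serving before opening a browser, assuming the default 8501 port mapping from the compose files:

```
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8501
```

A 200 means Streamlit is up; a refused connection usually means the text2sql container is still starting.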
--------------------------------------------------------------------------------
/text2sql/src/query_engine.py:
--------------------------------------------------------------------------------
1 | from constants import LOCAL_LLM, GPT_LLM
2 | from db_engine import DBEngine
3 | from langchain.chains import create_sql_query_chain
4 | from langchain_community.agent_toolkits import create_sql_agent
5 | from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool
6 | from langchain_openai import ChatOpenAI
7 | from pydantic import BaseModel
8 | import os, json, re, yaml, time
9 | 
10 | script_dir = os.path.dirname(os.path.realpath(__file__))
11 | upper_dir = os.path.dirname(script_dir)
12 | config_path = os.path.join(f"{upper_dir}/conf", "config.json")
13 | with open(config_path) as f:
14 |     config = json.load(f)
15 | API_BASE = config["API_BASE"]
16 | API_KEY = config["API_KEY"]
17 | CPU_ONLY = config["CPU_Only"]
18 | os.environ["OPENAI_API_KEY"] = config["OPENAI_API_KEY"]
19 | OPENAI_API_MODEL = config["OPENAI_API_MODEL"]
20 | model_list = config["models"]
21 | for model in model_list:
22 |     if CPU_ONLY and model["type"] == "CPU_Only":
23 |         LOCAL_MODEL_NAME = model["name"]
24 |     if not CPU_ONLY and model["type"] == "GPU_Enabled":
25 |         LOCAL_MODEL_NAME = model["name"]
26 | 
27 | class SQLEngineWrapper:
28 |     gpt_agent = None
29 |     local_chain = None
30 |     db_engine = None
31 |     def __init__(self):
32 |         self.db_engine = DBEngine()
33 |         db = self.db_engine.get_db()
34 |         self.local_chain = self.init_local_chain(db)
35 |         self.gpt_agent = self.init_openai_agent(db)
36 | 
37 |     def init_openai_agent(self, db):
38 |         openai_llm = ChatOpenAI(model_name=OPENAI_API_MODEL)
39 |         return create_sql_agent(openai_llm, db=db,
40 |                                 agent_type="openai-tools", verbose=True)
41 | 
42 |     def init_local_chain(self, db):
43 |         local_llm = ChatOpenAI(model_name=LOCAL_MODEL_NAME, openai_api_base=API_BASE)
44 |         write_query = create_sql_query_chain(local_llm, db)
45 |         return write_query
46 | 
47 |     def process_query(self, question, llm_type):
48 |         start_time = time.time()  # start time recording
49 |         response = None
50 |         if llm_type == GPT_LLM:
51 |             response_dict = self.gpt_agent.invoke(question)
52 |             # flatten the agent's response dict into a readable string
53 |             response = ""
54 |             for key, value in response_dict.items():
55 |                 response = response + key + ": " + str(value) + " \n"
56 |         else:
57 |             text = self.local_chain.invoke({"question": question})
58 |             sql_text = self.extract_sql_statements(text)
59 |             response = self.db_engine.query_db(sql_text)
60 | 
61 |         end_time = time.time()  # end time recording
62 |         time_spent = end_time - start_time  # calculate duration
63 |         final_response = response + " \nTime spent: {:.2f} seconds".format(time_spent)
64 |         return final_response
65 | 
66 |     def extract_sql_statements(self, text):
67 |         # Regular expression to find substrings that could be SQL statements.
68 |         # This pattern assumes SQL statements end with a semicolon and attempts to exclude common comment patterns.
69 |         pattern = r"(?
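query_engine.py talks to the local model through Ollama's OpenAI-compatible endpoint (the API_BASE value in conf/config.json). If the local chain misbehaves, that endpoint can be probed directly from the host; a sketch, assuming the published 11434 port and the default CPU model name from the config:

```
curl -s http://localhost:11434/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "sqlcoder:15b-q6_K", "messages": [{"role": "user", "content": "Say OK."}]}'
```

A JSON completion coming back means Ollama is serving the model; an unknown-model error usually means the ollama run step in run.sh did not finish pulling it.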