├── restart.bat
├── audio_samples
│   ├── Blaze.wav
│   ├── Serenity.wav
│   ├── Thunder.wav
│   ├── stop_sound.mp3
│   ├── Blaze_sample.wav
│   ├── start_sound.mp3
│   ├── Serenity_sample.wav
│   └── Thunder_sample.wav
├── GITHUB
│   └── Chatbot_preview.png
├── .gitignore
├── restart.sh
├── install_docker.ps1
├── cmd_windows.bat
├── cmd_linux.sh
├── enable__windows_features.bat
├── requirements.txt
├── update_linux.sh
├── start_linux.sh
├── update_windows.bat
├── start_windows.bat
├── README.md
├── webui.py
├── LICENSE
└── main.py
/restart.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | python "%~dp0main.py"
--------------------------------------------------------------------------------
/audio_samples/Blaze.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darthph0enix7/DocPOI_repo/HEAD/audio_samples/Blaze.wav
--------------------------------------------------------------------------------
/GITHUB/Chatbot_preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darthph0enix7/DocPOI_repo/HEAD/GITHUB/Chatbot_preview.png
--------------------------------------------------------------------------------
/audio_samples/Serenity.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darthph0enix7/DocPOI_repo/HEAD/audio_samples/Serenity.wav
--------------------------------------------------------------------------------
/audio_samples/Thunder.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darthph0enix7/DocPOI_repo/HEAD/audio_samples/Thunder.wav
--------------------------------------------------------------------------------
/audio_samples/stop_sound.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darthph0enix7/DocPOI_repo/HEAD/audio_samples/stop_sound.mp3
--------------------------------------------------------------------------------
/audio_samples/Blaze_sample.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darthph0enix7/DocPOI_repo/HEAD/audio_samples/Blaze_sample.wav
--------------------------------------------------------------------------------
/audio_samples/start_sound.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darthph0enix7/DocPOI_repo/HEAD/audio_samples/start_sound.mp3
--------------------------------------------------------------------------------
/audio_samples/Serenity_sample.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darthph0enix7/DocPOI_repo/HEAD/audio_samples/Serenity_sample.wav
--------------------------------------------------------------------------------
/audio_samples/Thunder_sample.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Darthph0enix7/DocPOI_repo/HEAD/audio_samples/Thunder_sample.wav
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | installer_files
2 | test.ipynb
3 | TTS
4 | XTTS-v2
5 | process.log
6 | record_manager_cache.sql
7 | params.json
8 | output_combined.wav
9 | temp_output*.wav
10 |
--------------------------------------------------------------------------------
/restart.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Get the directory of the current script
4 | SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
5 |
6 | # Run the main.py script
7 | python "$SCRIPT_DIR/main.py"
--------------------------------------------------------------------------------
/install_docker.ps1:
--------------------------------------------------------------------------------
1 | param(
2 | [string]$dockerInstallerPath
3 | )
4 |
5 | if (-not (Test-Path $dockerInstallerPath)) {
6 | Write-Output "Docker installer not found at $dockerInstallerPath"
7 | exit 1
8 | }
9 |
10 | # Install Docker Desktop
11 | Write-Output "Installing Docker Desktop..."
12 |
13 | # Directly run the Docker installer command
14 | & "$dockerInstallerPath" install --quiet --norestart
15 |
16 | Write-Output "Docker Desktop installed successfully."
17 |
--------------------------------------------------------------------------------
/cmd_windows.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | cd /D "%~dp0"
4 |
5 | echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end
6 |
7 | set PATH=%PATH%;%SystemRoot%\system32
8 |
9 | @rem config
10 | set CONDA_ROOT_PREFIX=%cd%\installer_files\conda
11 | set INSTALL_ENV_DIR=%cd%\installer_files\env
12 |
13 | @rem activate installer env
14 | call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end )
15 |
16 | @rem enter commands
17 | cmd /k "%*"
18 |
19 | :end
20 | pause
21 |
--------------------------------------------------------------------------------
/cmd_linux.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 |
4 | cd "$(dirname "${BASH_SOURCE[0]}")"
5 |
6 | if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
7 |
8 | # deactivate existing conda envs as needed to avoid conflicts
9 | { conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null
10 |
11 | # config
12 | CONDA_ROOT_PREFIX="$(pwd)/installer_files/conda"
13 | INSTALL_ENV_DIR="$(pwd)/installer_files/env"
14 |
15 | # environment isolation
16 | export PYTHONNOUSERSITE=1
17 | unset PYTHONPATH
18 | unset PYTHONHOME
19 | export CUDA_PATH="$INSTALL_ENV_DIR"
20 | export CUDA_HOME="$CUDA_PATH"
21 |
22 | # activate env
23 | bash --init-file <(echo "source \"$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh\" && conda activate \"$INSTALL_ENV_DIR\"")
24 |
--------------------------------------------------------------------------------
/enable__windows_features.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | cd /D "%~dp0"
3 | 
4 | :: Check if the script is running as administrator
5 | net session >nul 2>&1
6 | if %errorLevel% NEQ 0 (
7 |     echo Requesting administrative privileges...
8 |     powershell start-process '%0' -verb runas
9 |     exit /B
10 | )
11 | 
12 | :: Jump over the helper function so it only runs when called
13 | goto :main
14 | 
15 | :: Function to check and enable a Windows feature
16 | :CheckAndEnableFeature
17 | setlocal
18 | set "featureName=%~1"
19 | echo Checking feature: %featureName%
20 | Dism /online /Get-FeatureInfo /FeatureName:%featureName% | findstr /C:"State : Enabled" >nul
21 | if %ERRORLEVEL% EQU 0 (
22 |     echo %featureName% is already enabled.
23 | ) else (
24 |     echo Enabling %featureName%...
25 |     Dism /online /Enable-Feature /FeatureName:%featureName% /All
26 |     if errorlevel 1 (
27 |         echo Failed to enable %featureName%.
28 |     ) else (
29 |         echo %featureName% enabled successfully.
30 |     )
31 | )
32 | endlocal
33 | exit /B
34 | 
35 | :main
36 | :: Check and enable Hyper-V features
37 | echo Checking and enabling Hyper-V features...
38 | call :CheckAndEnableFeature Microsoft-Hyper-V-Tools-All
39 | pause
40 | call :CheckAndEnableFeature Microsoft-Hyper-V-Management-PowerShell
41 | pause
42 | call :CheckAndEnableFeature Microsoft-Hyper-V-Hypervisor
43 | pause
44 | call :CheckAndEnableFeature Microsoft-Hyper-V-Services
45 | pause
46 | call :CheckAndEnableFeature Microsoft-Hyper-V-Management-Clients
47 | pause
48 | 
49 | :: Check and enable Virtual Machine Platform
50 | echo Checking and enabling Virtual Machine Platform...
51 | call :CheckAndEnableFeature HypervisorPlatform
52 | pause
53 | 
54 | :: Check and enable Windows Subsystem for Linux
55 | echo Checking and enabling Windows Subsystem for Linux...
56 | call :CheckAndEnableFeature Microsoft-Windows-Subsystem-Linux
57 | pause
58 | 
59 | echo All required features are enabled.
60 | 
61 | :: Done
62 | echo Done!
63 | pause
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | soundfile==0.12.1
2 | pytesseract==0.3.10
3 | pycountry==24.6.1
4 | langdetect==1.0.9
5 | opencv-python==4.10.0.84
6 | openai-whisper==20231117
7 | pynput==1.7.7
8 | pdf2image==1.17.0
9 | pillow==10.3.0
10 | pydub==0.25.1
11 | sounddevice==0.5.0
12 | scipy==1.11.4
13 | gradio==4.41.0
14 | PyMuPDF==1.24.9
15 | PyPDF2==3.0.1
16 | langchain_core==0.2.28
17 | langchain_community==0.2.11
18 | langchain_experimental==0.0.62
19 | langchain_huggingface==0.0.3
20 | langchain==0.2.12
21 | langchain_elasticsearch==0.2.2
22 | #coqui_tts==0.24.1
23 | PyAudio==0.2.14
24 | httpx==0.25.0
25 | ffmpeg-python==0.2.0
26 | yarl==1.9.7
27 | langsmith==0.1.108
28 | kiwisolver==1.4.5
29 | mkl_random==1.2.4
30 | fastapi==0.112.2
31 | simsimd==5.0.1
32 | inflect==7.3.1
33 | fsspec==2024.6.1
34 | ruff==0.6.3
35 | importlib-resources==6.4.4
36 | mkl_fft==1.3.8
37 | pydantic==2.8.2
38 | more-itertools==10.4.0
39 | cffi==1.17.0
40 | safetensors==0.4.4
41 | platformdirs==4.2.2
42 | pydantic-core==2.20.1
43 | sqlalchemy==2.0.32
44 | starlette==0.38.4
45 | pypdf==4.3.1
46 | numpy==1.26.4
47 |
48 | # scipy>=1.11.2  (duplicate of the scipy==1.11.4 pin above)
49 | librosa>=0.10.0
50 | scikit-learn>=1.3.0
51 | numba==0.55.1;python_version<"3.9"
52 | numba>=0.57.0;python_version>="3.9"
53 | # inflect>=5.6.0  (duplicate of the inflect==7.3.1 pin above)
54 | tqdm>=4.64.1
55 | anyascii>=0.3.0
56 | pyyaml>=6.0
57 | # fsspec>=2023.6.0  (duplicate of the fsspec==2024.6.1 pin above; original note: <= 2023.9.1 makes aux tests fail)
58 | aiohttp>=3.8.1
59 | packaging>=23.1
60 | mutagen==1.47.0
61 | # deps for examples
62 | flask>=2.0.1
63 | # deps for inference
64 | pysbd>=0.3.4
65 | # deps for notebooks
66 | umap-learn>=0.5.1
67 | pandas>=1.4,<2.0
68 | # deps for training
69 | matplotlib>=3.7.0
70 | # coqui stack
71 | trainer>=0.0.36
72 | # config management
73 | coqpit>=0.0.16
74 | # chinese g2p deps
75 | jieba
76 | pypinyin
77 | # korean
78 | hangul_romanize
79 | # gruut+supported langs
80 | gruut[de,es,fr]==2.2.3
81 | # deps for korean
82 | jamo
83 | nltk
84 | g2pkk>=0.1.1
85 | # deps for bangla
86 | bangla
87 | bnnumerizer
88 | bnunicodenormalizer
89 | #deps for tortoise
90 | einops>=0.6.0
91 | #deps for bark
92 | encodec>=0.1.1
93 | # deps for XTTS
94 | unidecode>=1.3.2
95 | num2words
96 | spacy[ja]>=3
--------------------------------------------------------------------------------
/update_linux.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Navigate to the script's directory
4 | cd "$(dirname "${BASH_SOURCE[0]}")"
5 |
6 | # Check if the script is located in a directory with spaces
7 | if [[ "$(pwd)" =~ " " ]]; then
8 | echo "This script relies on Miniconda which cannot be silently installed under a path with spaces."
9 | exit 1
10 | fi
11 |
12 | # Deactivate existing conda environments to avoid conflicts
13 | { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null
14 |
15 | # Determine the system architecture
16 | OS_ARCH=$(uname -m)
17 | case "${OS_ARCH}" in
18 | x86_64*) OS_ARCH="x86_64";;
19 | arm64*|aarch64*) OS_ARCH="aarch64";;
20 | *) echo "Unknown system architecture: $OS_ARCH! This script runs only on x86_64 or arm64." && exit 1;;
21 | esac
22 |
23 | # Configuration
24 | INSTALL_DIR="$(pwd)/installer_files"
25 | CONDA_ROOT_PREFIX="$INSTALL_DIR/conda"
26 | INSTALL_ENV_DIR="$INSTALL_DIR/env"
27 | TTS_REPO_DIR="$(pwd)/DocPOI_repo/XTTS-v2"
28 | MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-${OS_ARCH}.sh"
29 | TESSERACT_INSTALL_CMD="sudo -S apt-get install -y tesseract-ocr"
30 | POPPLER_INSTALL_CMD="sudo -S apt-get install -y poppler-utils"
31 | DOCKER_INSTALL_CMD="sudo -S apt-get install -y docker.io"
32 | OLLAMA_INSTALL_CMD="curl -fsSL https://ollama.com/install.sh | sudo -S sh"
33 | OLLAMA_PULL_CMD="ollama pull llama3.1:8b"
34 | conda_exists="F"
35 |
36 | # Check if Conda needs to be installed
37 | if "$CONDA_ROOT_PREFIX/bin/conda" --version &>/dev/null; then
38 | conda_exists="T"
39 | fi
40 |
41 | # Install Conda if necessary
42 | if [ "$conda_exists" == "F" ]; then
43 | echo "Downloading Miniconda from $MINICONDA_DOWNLOAD_URL to $INSTALL_DIR/miniconda_installer.sh"
44 |
45 | mkdir -p "$INSTALL_DIR"
46 | curl -L "$MINICONDA_DOWNLOAD_URL" > "$INSTALL_DIR/miniconda_installer.sh"
47 |
48 | chmod u+x "$INSTALL_DIR/miniconda_installer.sh"
49 | bash "$INSTALL_DIR/miniconda_installer.sh" -b -p "$CONDA_ROOT_PREFIX"
50 |
51 | echo "Miniconda version:"
52 | "$CONDA_ROOT_PREFIX/bin/conda" --version
53 |
54 | # Optionally, remove the Miniconda installer
55 | rm "$INSTALL_DIR/miniconda_installer.sh"
56 | fi
57 |
58 | # Create the installer environment if it doesn't exist
59 | if [ ! -d "$INSTALL_ENV_DIR" ]; then
60 | "$CONDA_ROOT_PREFIX/bin/conda" create -y -k --prefix "$INSTALL_ENV_DIR" python=3.11
61 | fi
62 |
63 | # Check if Conda environment was actually created
64 | if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
65 | echo "Conda environment is empty."
66 | exit 1
67 | fi
68 |
69 | # Environment isolation
70 | export PYTHONNOUSERSITE=1
71 | unset PYTHONPATH
72 | unset PYTHONHOME
73 |
74 | # Activate the installer environment
75 | source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh"
76 | conda activate "$INSTALL_ENV_DIR"
77 |
78 | # Ensure necessary Python modules are installed
79 | python -c "import requests" 2>/dev/null || python -m pip install requests
80 |
81 | # Check if Tesseract, Poppler, Docker, and Ollama are installed
82 | tesseract_installed=$(command -v tesseract &> /dev/null && echo "yes" || echo "no")
83 | poppler_installed=$(command -v pdftocairo &> /dev/null && echo "yes" || echo "no")
84 | docker_installed=$(command -v docker &> /dev/null && echo "yes" || echo "no")
85 | ollama_installed=$(command -v ollama &> /dev/null && echo "yes" || echo "no")
86 |
87 | # Prompt for sudo password if any of the required software is not installed
88 | if [ "$tesseract_installed" == "no" ] || [ "$poppler_installed" == "no" ] || [ "$docker_installed" == "no" ] || [ "$ollama_installed" == "no" ]; then
89 | read -sp "Enter your sudo password: " sudo_password
90 | echo
91 | fi
92 |
93 | # Install Tesseract if not installed
94 | if [ "$tesseract_installed" == "no" ]; then
95 | echo "Installing Tesseract OCR..."
96 | echo "$sudo_password" | eval "$TESSERACT_INSTALL_CMD"
97 | else
98 | echo "Tesseract is already installed."
99 | fi
100 |
101 | # Install Poppler if not installed
102 | if [ "$poppler_installed" == "no" ]; then
103 | echo "Installing Poppler utilities..."
104 | echo "$sudo_password" | eval "$POPPLER_INSTALL_CMD"
105 | else
106 | echo "Poppler utilities are already installed."
107 | fi
108 |
109 | # Install Docker if not installed
110 | if [ "$docker_installed" == "no" ]; then
111 | echo "Installing Docker..."
112 | echo "$sudo_password" | eval "$DOCKER_INSTALL_CMD"
113 | echo "$sudo_password" | sudo -S systemctl start docker
114 | echo "$sudo_password" | sudo -S systemctl enable docker
115 | else
116 | echo "Docker is already installed."
117 | fi
118 |
119 | # Install Ollama if not installed
120 | if [ "$ollama_installed" == "no" ]; then
121 | echo "Installing Ollama..."
122 | echo "$sudo_password" | eval "$OLLAMA_INSTALL_CMD"
123 |
124 | # Pull the model directly
125 | echo "Pulling Ollama model llama3.1:8b..."
126 | eval "$OLLAMA_PULL_CMD"
127 | else
128 | echo "Ollama is already installed."
129 | fi
130 |
131 | # Run the Python script to update dependencies
132 | python webui.py --update
133 |
134 |
135 | # End of the script
136 | echo "Done!"
--------------------------------------------------------------------------------
/start_linux.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Navigate to the script's directory
4 | cd "$(dirname "${BASH_SOURCE[0]}")"
5 |
6 | # Check if the script is located in a directory with spaces
7 | if [[ "$(pwd)" =~ " " ]]; then
8 | echo "This script relies on Miniconda which cannot be silently installed under a path with spaces."
9 | exit 1
10 | fi
11 |
12 | # Deactivate existing conda environments to avoid conflicts
13 | { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null
14 |
15 | # Determine the system architecture
16 | OS_ARCH=$(uname -m)
17 | case "${OS_ARCH}" in
18 | x86_64*) OS_ARCH="x86_64";;
19 | arm64*|aarch64*) OS_ARCH="aarch64";;
20 | *) echo "Unknown system architecture: $OS_ARCH! This script runs only on x86_64 or arm64." && exit 1;;
21 | esac
22 |
23 | # Configuration
24 | INSTALL_DIR="$(pwd)/installer_files"
25 | CONDA_ROOT_PREFIX="$INSTALL_DIR/conda"
26 | INSTALL_ENV_DIR="$INSTALL_DIR/env"
27 | TTS_REPO_DIR="$(pwd)/DocPOI_repo/XTTS-v2"
28 | MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-${OS_ARCH}.sh"
29 | TESSERACT_INSTALL_CMD="sudo -S apt-get install -y tesseract-ocr"
30 | POPPLER_INSTALL_CMD="sudo -S apt-get install -y poppler-utils"
31 | DOCKER_INSTALL_CMD="sudo -S apt-get install -y docker.io"
32 | OLLAMA_INSTALL_CMD="curl -fsSL https://ollama.com/install.sh | sudo -S sh"
33 | OLLAMA_PULL_CMD="ollama pull llama3.1:8b"
34 | conda_exists="F"
35 |
36 | # Check if Conda needs to be installed
37 | if "$CONDA_ROOT_PREFIX/bin/conda" --version &>/dev/null; then
38 | conda_exists="T"
39 | fi
40 |
41 | # Install Conda if necessary
42 | if [ "$conda_exists" == "F" ]; then
43 | echo "Downloading Miniconda from $MINICONDA_DOWNLOAD_URL to $INSTALL_DIR/miniconda_installer.sh"
44 |
45 | mkdir -p "$INSTALL_DIR"
46 | curl -L "$MINICONDA_DOWNLOAD_URL" > "$INSTALL_DIR/miniconda_installer.sh"
47 |
48 | chmod u+x "$INSTALL_DIR/miniconda_installer.sh"
49 | bash "$INSTALL_DIR/miniconda_installer.sh" -b -p "$CONDA_ROOT_PREFIX"
50 |
51 | echo "Miniconda version:"
52 | "$CONDA_ROOT_PREFIX/bin/conda" --version
53 |
54 | # Optionally, remove the Miniconda installer
55 | rm "$INSTALL_DIR/miniconda_installer.sh"
56 | fi
57 |
58 | # Create the installer environment if it doesn't exist
59 | if [ ! -d "$INSTALL_ENV_DIR" ]; then
60 | "$CONDA_ROOT_PREFIX/bin/conda" create -y -k --prefix "$INSTALL_ENV_DIR" python=3.11.5
61 | fi
62 |
63 | # Check if Conda environment was actually created
64 | if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
65 | echo "Conda environment is empty."
66 | exit 1
67 | fi
68 |
69 | # Environment isolation
70 | export PYTHONNOUSERSITE=1
71 | unset PYTHONPATH
72 | unset PYTHONHOME
73 |
74 | # Activate the installer environment
75 | source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh"
76 | conda activate "$INSTALL_ENV_DIR"
77 |
78 | # Ensure necessary Python modules are installed
79 | python -c "import requests" 2>/dev/null || python -m pip install requests
80 |
81 | # Check if Tesseract, Poppler, Docker, and Ollama are installed
82 | tesseract_installed=$(command -v tesseract &> /dev/null && echo "yes" || echo "no")
83 | poppler_installed=$(command -v pdftocairo &> /dev/null && echo "yes" || echo "no")
84 | docker_installed=$(command -v docker &> /dev/null && echo "yes" || echo "no")
85 | ollama_installed=$(command -v ollama &> /dev/null && echo "yes" || echo "no")
86 |
87 | # Prompt for sudo password if any of the required software is not installed
88 | if [ "$tesseract_installed" == "no" ] || [ "$poppler_installed" == "no" ] || [ "$docker_installed" == "no" ] || [ "$ollama_installed" == "no" ]; then
89 | read -sp "Enter your sudo password: " sudo_password
90 | echo
91 | fi
92 |
93 | # Install Tesseract if not installed
94 | if [ "$tesseract_installed" == "no" ]; then
95 | echo "Installing Tesseract OCR..."
96 | echo "$sudo_password" | eval "$TESSERACT_INSTALL_CMD"
97 | else
98 | echo "Tesseract is already installed."
99 | fi
100 |
101 | # Install Poppler if not installed
102 | if [ "$poppler_installed" == "no" ]; then
103 | echo "Installing Poppler utilities..."
104 | echo "$sudo_password" | eval "$POPPLER_INSTALL_CMD"
105 | else
106 | echo "Poppler utilities are already installed."
107 | fi
108 |
109 | # Install Docker if not installed
110 | if [ "$docker_installed" == "no" ]; then
111 | echo "Installing Docker..."
112 | echo "$sudo_password" | eval "$DOCKER_INSTALL_CMD"
113 | echo "$sudo_password" | sudo -S systemctl start docker
114 | echo "$sudo_password" | sudo -S systemctl enable docker
115 | else
116 | echo "Docker is already installed."
117 | fi
118 |
119 | # Install Ollama if not installed
120 | if [ "$ollama_installed" == "no" ]; then
121 | echo "Installing Ollama..."
122 | echo "$sudo_password" | eval "$OLLAMA_INSTALL_CMD"
123 |
124 | # Pull the model directly
125 | echo "Pulling Ollama model llama3.1:8b..."
126 | eval "$OLLAMA_PULL_CMD"
127 | else
128 | echo "Ollama is already installed."
129 | fi
130 |
131 | # Set up and start the Elasticsearch container via webui.py
132 | python webui.py --linux-setup-elasticsearch
133 |
134 | # Start Ollama via webui.py
135 | python webui.py --run-ollama
136 |
137 | # Run the main Python script
138 | python webui.py "$@"
139 |
140 | # End of the script
141 | echo "Done!"
--------------------------------------------------------------------------------
/update_windows.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | cd /D "%~dp0"
4 |
5 | set "currentDir=%CD%"
6 | echo %currentDir% | findstr " " >nul
7 | if "%ERRORLEVEL%" == "0" (
8 | echo This script relies on Miniconda which cannot be silently installed under a path with spaces.
9 | goto end
10 | )
11 |
12 | set PATH=%PATH%;%SystemRoot%\system32
13 |
14 | @rem config
15 | set DISTUTILS_USE_SDK=1
16 |
17 | set INSTALL_DIR=%cd%\installer_files
18 | set CONDA_ROOT_PREFIX=%cd%\installer_files\conda
19 | set INSTALL_ENV_DIR=%cd%\installer_files\env
20 | set TTS_REPO_DIR=%cd%\DocPOI_repo\XTTS-v2
21 | set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Windows-x86_64.exe
22 | set TESSERACT_PATH=%ProgramFiles%\Tesseract-OCR\tesseract.exe
23 | set OLLAMA_PATH=%LocalAppData%\Programs\Ollama\ollama app.exe
24 | set POPPLER_PATH=%INSTALL_DIR%\poppler-24.07.0
25 | set DOCKER_PATH=%ProgramFiles%\Docker\Docker\Docker Desktop.exe
26 | set TESSERACT_DOWNLOAD_URL=https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-5.4.0.20240606.exe
27 | set OLLAMA_DOWNLOAD_URL=https://ollama.com/download/OllamaSetup.exe
28 | set POPPLER_DOWNLOAD_URL=https://github.com/oschwartz10612/poppler-windows/releases/download/v24.07.0-0/Release-24.07.0-0.zip
29 | set conda_exists=F
30 | set VS_BUILD_TOOLS_URL=https://aka.ms/vs/17/release/vs_BuildTools.exe
31 | set "MSBUILD_PATH=C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Current\Bin"
32 |
33 | @rem figure out whether conda needs to be installed
34 | call "%CONDA_ROOT_PREFIX%\_conda.exe" --version >nul 2>&1
35 | if "%ERRORLEVEL%" EQU "0" set conda_exists=T
36 |
37 | @rem (if necessary) install conda into a contained environment
38 | if "%conda_exists%" == "F" (
39 | echo Downloading Miniconda from %MINICONDA_DOWNLOAD_URL% to %INSTALL_DIR%\miniconda_installer.exe
40 |
41 | mkdir "%INSTALL_DIR%"
42 | curl -L -o "%INSTALL_DIR%\miniconda_installer.exe" %MINICONDA_DOWNLOAD_URL% || ( echo. && echo Miniconda failed to download. && goto end )
43 |
44 | echo Installing Miniconda to %CONDA_ROOT_PREFIX%
45 | "%INSTALL_DIR%\miniconda_installer.exe" /InstallationType=JustMe /NoShortcuts=1 /AddToPath=0 /RegisterPython=0 /NoRegistry=1 /S /D=%CONDA_ROOT_PREFIX%
46 |
47 | @rem test the conda binary
48 | echo Miniconda version:
49 | call "%CONDA_ROOT_PREFIX%\_conda.exe" --version || ( echo. && echo Miniconda not found. && goto end )
50 | )
51 |
52 | @rem create the installer env if it doesn't exist
53 | if not exist "%INSTALL_ENV_DIR%" (
54 | echo Creating the conda environment...
55 | call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python=3.10 || ( echo. && echo Conda environment creation failed. && goto end )
56 | )
57 |
58 | @rem check if conda environment was actually created
59 | if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )
60 |
61 | @rem activate installer env
62 | call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end )
63 |
64 | @rem Check if requests is installed; if not, install it
65 | call python -c "import requests" 2>nul || (
66 | echo Installing requests module...
67 | call python -m pip install requests || ( echo. && echo Failed to install requests. && goto end )
68 | )
69 |
70 | @rem Check for and install Tesseract if not installed
71 | if not exist "%TESSERACT_PATH%" (
72 | echo Downloading Tesseract from %TESSERACT_DOWNLOAD_URL% to %INSTALL_DIR%\tesseract_installer.exe
73 | curl -L -o "%INSTALL_DIR%\tesseract_installer.exe" %TESSERACT_DOWNLOAD_URL% || ( echo. && echo Tesseract failed to download. && goto end )
74 |
75 | echo Installing Tesseract silently
76 | pushd "%INSTALL_DIR%"
77 | tesseract_installer.exe /S
78 | popd
79 | ) else (
80 | echo Tesseract is already installed at %TESSERACT_PATH%.
81 | )
82 |
83 | @rem Check for and install Ollama if not installed
84 | if not exist "%OLLAMA_PATH%" (
85 | echo Downloading Ollama from %OLLAMA_DOWNLOAD_URL% to %INSTALL_DIR%\ollama_installer.exe
86 | curl -L -o "%INSTALL_DIR%\ollama_installer.exe" %OLLAMA_DOWNLOAD_URL% || ( echo. && echo Ollama failed to download. && goto end )
87 |
88 | echo Installing Ollama
89 | "%INSTALL_DIR%\ollama_installer.exe" /S
90 |
91 | @rem Pull the model in a new terminal and continue with the rest of the installation
92 | start "" cmd /c "ollama pull llama3.1:8b"
93 | ) else (
94 | echo Ollama is already installed at %OLLAMA_PATH%.
95 | )
96 |
97 | @rem Check for and unzip Poppler if not already unzipped
98 | if not exist "%POPPLER_PATH%" (
99 | echo Downloading Poppler from %POPPLER_DOWNLOAD_URL% to %INSTALL_DIR%\poppler.zip
100 | curl -L -o "%INSTALL_DIR%\poppler.zip" %POPPLER_DOWNLOAD_URL% || ( echo. && echo Poppler failed to download. && goto end )
101 |
102 | echo Unzipping Poppler to %INSTALL_DIR%
103 | tar -xf "%INSTALL_DIR%\poppler.zip" -C "%INSTALL_DIR%"
104 | ) else (
105 | echo Poppler is already unzipped at %POPPLER_PATH%.
106 | )
107 |
108 | @rem Check if MSBuild exists in the specified path
109 | if exist "%MSBUILD_PATH%\MSBuild.exe" (
110 | echo MSBuild found in the specified path.
111 | ) else (
112 | echo MSBuild not found in the specified path, checking Visual Studio installation...
113 |
114 | @rem Check for Visual Studio Build Tools installation using vswhere
115 | vswhere -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 >nul 2>&1
116 |
117 | if "%ERRORLEVEL%" NEQ "0" (
118 | echo Visual Studio c++ Build Tools not found. Downloading and installing...
119 |
120 | curl -L -o "%INSTALL_DIR%\vs_buildtools.exe" %VS_BUILD_TOOLS_URL% || (
121 | echo.
122 | echo Visual Studio Build Tools failed to download.
123 | goto end
124 | )
125 |
126 | echo Installing Visual Studio Build Tools silently. This process may take a while.
127 | "%INSTALL_DIR%\vs_buildtools.exe" --quiet --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Workload.ManagedDesktopBuildTools --add Microsoft.VisualStudio.Workload.WebBuildTools --add Microsoft.VisualStudio.Workload.NetCoreBuildTools --add Microsoft.VisualStudio.Component.Windows10SDK.18362 --add Microsoft.Net.Component.4.7.TargetingPack --includeRecommended --wait || (
128 | echo.
129 | echo Visual Studio Build Tools installation failed.
130 | goto end
131 | )
132 | ) else (
133 | echo Visual Studio c++ Build Tools are already installed.
134 | )
135 | )
136 |
137 | @rem Check for and install Docker if not installed
138 | if not exist "%DOCKER_PATH%" (
139 | echo Docker is not installed. Proceeding with Docker installation...
140 |
141 | @rem Download Docker installer using Python
142 | call python webui.py --download-docker
143 |
144 | @rem Run the PowerShell script to install Docker
145 | powershell -ExecutionPolicy Bypass -File "%cd%\install_docker.ps1" -dockerInstallerPath "%INSTALL_DIR%\docker-installer.exe"
146 | ) else (
147 | echo Docker is already installed at %DOCKER_PATH%.
148 | )
149 |
150 | @rem Run the Python script to update dependencies
151 | call python webui.py --update
152 |
153 | echo.
154 | echo Dependencies updated!
155 | pause
156 |
--------------------------------------------------------------------------------
/start_windows.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | cd /D "%~dp0"
4 |
5 | set "currentDir=%CD%"
6 | echo %currentDir% | findstr " " >nul
7 | if "%ERRORLEVEL%" == "0" (
8 | echo This script relies on Miniconda which cannot be silently installed under a path with spaces.
9 | goto end
10 | )
11 |
12 | @echo off
13 | set PATH=%PATH%;%SystemRoot%\system32
14 |
15 | @rem config
16 | set DISTUTILS_USE_SDK=1
17 |
18 | set INSTALL_DIR=%cd%\installer_files
19 | set CONDA_ROOT_PREFIX=%cd%\installer_files\conda
20 | set INSTALL_ENV_DIR=%cd%\installer_files\env
21 | set TTS_REPO_DIR=%cd%\DocPOI_repo\XTTS-v2
22 | set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Windows-x86_64.exe
23 | set TESSERACT_PATH=%ProgramFiles%\Tesseract-OCR\tesseract.exe
24 | set OLLAMA_PATH=%LocalAppData%\Programs\Ollama\ollama app.exe
25 | set POPPLER_PATH=%INSTALL_DIR%\poppler-24.07.0
26 | set DOCKER_PATH=%ProgramFiles%\Docker\Docker\Docker Desktop.exe
27 | set TESSERACT_DOWNLOAD_URL=https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-5.4.0.20240606.exe
28 | set OLLAMA_DOWNLOAD_URL=https://ollama.com/download/OllamaSetup.exe
29 | set POPPLER_DOWNLOAD_URL=https://github.com/oschwartz10612/poppler-windows/releases/download/v24.07.0-0/Release-24.07.0-0.zip
30 | set TESSDATA_REPO_URL=https://github.com/Darthph0enix7/Tesseract_Tessdata_current.git
31 | set conda_exists=F
32 | set VS_BUILD_TOOLS_URL=https://aka.ms/vs/17/release/vs_BuildTools.exe
33 | set "MSBUILD_PATH=C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Current\Bin"
34 |
35 | @rem figure out whether conda needs to be installed
36 | call "%CONDA_ROOT_PREFIX%\_conda.exe" --version >nul 2>&1
37 | if "%ERRORLEVEL%" EQU "0" set conda_exists=T
38 |
39 | @rem (if necessary) install conda into a contained environment
40 | if "%conda_exists%" == "F" (
41 | echo Downloading Miniconda from %MINICONDA_DOWNLOAD_URL% to %INSTALL_DIR%\miniconda_installer.exe
42 |
43 | mkdir "%INSTALL_DIR%"
44 | curl -L -o "%INSTALL_DIR%\miniconda_installer.exe" %MINICONDA_DOWNLOAD_URL% || ( echo. && echo Miniconda failed to download. && goto end )
45 |
46 | echo Installing Miniconda to %CONDA_ROOT_PREFIX%
47 | "%INSTALL_DIR%\miniconda_installer.exe" /InstallationType=JustMe /NoShortcuts=1 /AddToPath=0 /RegisterPython=0 /NoRegistry=1 /S /D=%CONDA_ROOT_PREFIX%
48 |
49 | @rem test the conda binary
50 | echo Miniconda version:
51 | call "%CONDA_ROOT_PREFIX%\_conda.exe" --version || ( echo. && echo Miniconda not found. && goto end )
52 | )
53 |
54 | @rem create the installer env if it doesn't exist
55 | if not exist "%INSTALL_ENV_DIR%" (
56 | echo Creating the conda environment...
57 | call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python=3.11.5 || ( echo. && echo Conda environment creation failed. && goto end )
58 | )
59 |
60 | @rem check if conda environment was actually created
61 | if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )
62 |
63 | @rem activate installer env
64 | call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end )
65 |
66 | @rem Check if requests is installed; if not, install it
67 | call python -c "import requests" 2>nul || (
68 | echo Installing requests module...
69 | call python -m pip install requests || ( echo. && echo Failed to install requests. && goto end )
70 | )
71 | @rem Set up the environment for Microsoft C++ Build Tools
72 | call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64 || ( echo. && echo Failed to set up the environment for Microsoft C++ Build Tools. && goto end )
73 |
74 | @rem Check for and install Tesseract if not installed
75 | if not exist "%TESSERACT_PATH%" (
76 | echo Downloading Tesseract from %TESSERACT_DOWNLOAD_URL% to %INSTALL_DIR%\tesseract_installer.exe
77 | curl -L -o "%INSTALL_DIR%\tesseract_installer.exe" %TESSERACT_DOWNLOAD_URL% || ( echo. && echo Tesseract failed to download. && goto end )
78 |
79 | echo Installing Tesseract silently
80 | pushd "%INSTALL_DIR%"
81 | tesseract_installer.exe /S
82 | popd
83 | ) else (
84 | echo Tesseract is already installed at %TESSERACT_PATH%.
85 | )
86 |
87 | @rem Check for and download tessdata if not already downloaded
88 | if not exist "%INSTALL_DIR%\tessdata" (
89 | echo Cloning tessdata repository from %TESSDATA_REPO_URL% to %INSTALL_DIR%\tessdata
90 | git clone %TESSDATA_REPO_URL% "%INSTALL_DIR%\tessdata" || ( echo. && echo Tessdata repository failed to clone. && goto end )
91 |
92 | echo Tessdata successfully cloned.
93 | ) else (
94 | echo Tessdata is already downloaded at %INSTALL_DIR%\tessdata.
95 | )
96 |
97 | @rem Check for and install Ollama if not installed
98 | if not exist "%OLLAMA_PATH%" (
99 | echo Downloading Ollama from %OLLAMA_DOWNLOAD_URL% to %INSTALL_DIR%\ollama_installer.exe
100 | curl -L -o "%INSTALL_DIR%\ollama_installer.exe" %OLLAMA_DOWNLOAD_URL% || ( echo. && echo Ollama failed to download. && goto end )
101 |
102 | echo Installing Ollama
103 | "%INSTALL_DIR%\ollama_installer.exe" /S
104 |
105 | @rem Pull the model in a new terminal and continue with the rest of the installation
106 | start "" cmd /c "ollama pull llama3.1:8b"
107 | ) else (
108 | echo Ollama is already installed at %OLLAMA_PATH%.
109 | )
110 |
111 | @rem Check for and unzip Poppler if not already unzipped
112 | if not exist "%POPPLER_PATH%" (
113 | echo Downloading Poppler from %POPPLER_DOWNLOAD_URL% to %INSTALL_DIR%\poppler.zip
114 | curl -L -o "%INSTALL_DIR%\poppler.zip" %POPPLER_DOWNLOAD_URL% || ( echo. && echo Poppler failed to download. && goto end )
115 |
116 | echo Unzipping Poppler to %INSTALL_DIR%
117 | tar -xf "%INSTALL_DIR%\poppler.zip" -C "%INSTALL_DIR%"
118 | ) else (
119 | echo Poppler is already unzipped at %POPPLER_PATH%.
120 | )
121 |
122 | @rem Check if MSBuild exists in the specified path
123 | if exist "%MSBUILD_PATH%\MSBuild.exe" (
124 | echo MSBuild found in the specified path.
125 | ) else (
126 | echo MSBuild not found in the specified path, checking Visual Studio installation...
127 |
128 | @rem Check for Visual Studio Build Tools installation using vswhere
129 | vswhere -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 >nul 2>&1
130 |
131 | if "%ERRORLEVEL%" NEQ "0" (
132 | echo Visual Studio c++ Build Tools not found. Downloading and installing...
133 |
134 | curl -L -o "%INSTALL_DIR%\vs_buildtools.exe" %VS_BUILD_TOOLS_URL% || (
135 | echo.
136 | echo Visual Studio Build Tools failed to download.
137 | goto end
138 | )
139 |
140 | echo Installing Visual Studio Build Tools silently. This process may take a while.
141 | "%INSTALL_DIR%\vs_buildtools.exe" --quiet --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Workload.ManagedDesktopBuildTools --add Microsoft.VisualStudio.Workload.WebBuildTools --add Microsoft.VisualStudio.Workload.NetCoreBuildTools --add Microsoft.VisualStudio.Component.Windows10SDK.18362 --add Microsoft.Net.Component.4.7.TargetingPack --includeRecommended --wait || (
142 | echo.
143 | echo Visual Studio Build Tools installation failed.
144 | goto end
145 | )
146 | ) else (
147 | echo Visual Studio c++ Build Tools are already installed.
148 | )
149 | )
150 |
151 | @rem Check for and install Docker if not installed
152 | if not exist "%DOCKER_PATH%" (
153 | echo Docker is not installed. Proceeding with Docker installation...
154 |
155 | @rem Download Docker installer using Python
156 | call python webui.py --download-docker
157 |
158 | @rem Run the PowerShell script to install Docker
159 | powershell -ExecutionPolicy Bypass -File "%cd%\install_docker.ps1" -dockerInstallerPath "%INSTALL_DIR%\docker-installer.exe"
160 | ) else (
161 | echo Docker is already installed at %DOCKER_PATH%.
162 | )
163 |
164 | @rem Set up and start the Elasticsearch container in Docker
165 | call python webui.py --setup-elasticsearch
166 | 
167 | @rem Start Ollama
168 | call python webui.py --run-ollama
169 | 
170 | @rem Run the main Python script
171 | call python webui.py %*
172 |
173 | echo.
174 | echo Done!
175 |
176 | :end
177 | pause
178 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DocPOI
2 |
3 | **Contact**
4 | Please contact me or our group for any issues, feature recommendations, ideas, or even criticism of the code. Everything is welcome!
5 | You can join our Discord for the best experience.
6 | Website: [eren.enpoi.com](https://eren.enpoi.com)
7 | Discord: [Join our Discord](https://discord.gg/D5M6WpWB)
8 | Email: eren@enpoi.com
9 | Feel free to contribute and just hang out ;)
10 |
11 | ## Requirements
12 | - **Over 35 GB of free storage space**
13 | - **An NVIDIA GPU with more than 6 GB of VRAM** (it would theoretically work with CPU offloading, but performance would be significantly worse; a quick VRAM check is shown below)
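
To confirm the GPU requirement, you can query the installed card with `nvidia-smi` (this assumes the NVIDIA driver is already installed):

```bash
# Prints each GPU's name and total VRAM; anything above 6 GB meets the requirement
nvidia-smi --query-gpu=name,memory.total --format=csv
```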
14 |
15 | ## The Problem
16 |
17 | I have always struggled with managing my hundreds of thousands of documents, never being able to find the information I needed without sifting through the chaos I call my paper piles of official documents, certificates, and more. For someone who tends to procrastinate, this is not sustainable. To solve this problem, I wanted to create an assistant that not only acts as a personal assistant but also retrieves documents based on my query, eliminating the need for manual searches.
18 |
19 | Overall, this project aims to solve the following common problems:
20 |
21 | - **Scanning documents but never OCRing them:** It's a pain to OCR documents one by one, batch by batch, or manually enter the information when needed.
22 | - **Not naming documents properly:** I often tell myself I’ll do it later or remember where I saved them, but I never do.
23 | - **Security concerns:** Using drives, clouds, or other AI agents like ChatGPT for personal files can be risky.
24 | - **Lack of a quick personal assistant:** There's a need for a personal assistant that is always available to help, securely and locally.
25 |
26 | ---
27 |
28 | 
29 |
30 | ---
31 |
32 | DocPOI is a personal assistant that runs locally on your computer, utilizing your GPU to access and manage your personal documents securely. The focus is on maintaining absolute security by ensuring that everything runs locally, with the option to host it locally for everyone. While this is an amateur project, I believe in its potential and plan to expand and improve it significantly. The goal is to give everyone a personal "Jarvis" for their documents and tasks.
33 |
34 | **Quick Links**
35 | - [Installation](#installation)
36 | - [Usage](#usage)
37 | - [Project Goals](#project-goals)
38 | - [FAQ](#faq)
39 | - [Acknowledgements](#acknowledgements)
40 | - [Contributing](#contributing)
41 |
42 | # Features
43 |
44 | - **Normal Conversation:** Chat with the assistant just like you would with any chatbot.
45 | - **Document Retrieval:** Quickly find and retrieve documents from your collection.
46 | - **Auto Metadata Extraction:** Automatically extract and create metadata for your documents.
47 | - **Auto Naming:** Automatically generate appropriate names for your documents.
48 | - **PDF to PDF OCR:** Convert PDFs to searchable PDFs using OCR.
49 | - **Speech Recognition:** Interact with the assistant using voice commands.
50 | - **Voice Over (TTS):** Have documents and responses read aloud.
51 |
52 | ### Coming Soon
53 | - **Document Categorization:** Automatically categorize documents based on their content.
54 | - **Custom Labeling:** Label documents with custom tags (e.g., school, work, etc.).
55 | - **Financial Document Management:** Special handling of financial documents, tracking income and expenses.
56 | - **Image Recognition and Labeling:** Extend document handling to images, including people recognition, object detection, and characterization.
57 | - **Wake Word Detection:** Activate the assistant with a wake word, enabling hands-free interaction.
58 | - **Multiple Agents:** Use different agents for different tasks, with support for multiple wake words.
59 | - **One-Click Installer for Linux:** Simplifying the installation process for Linux users. (DONE)
60 | ### Significant Quality Improvements Planned
61 | - **Better OCR Engine:** Exploring Surya OCR for improved accuracy.
62 | - **Structured Information Extraction:** Extract structured data from documents more effectively.
63 | - **Enhanced Retrieval Logic:** Improve the logic for retrieving documents.
64 | - **More Tools and Custom Tool Calling:** Expand the available tools and allow easy customization.
65 | - **Multiple Agents Support:** Detect and interact with multiple agents simultaneously using different wake words.
66 |
67 | # Installation
68 |
69 | ### Auto Installation (Strongly Recommended)
70 | One-click installers are available for Windows and Linux. They install all the required programs, including Docker, Ollama, C++ build tools, PyTorch, CUDA, PyTesseract, and more.
71 |
72 | ## For Linux
73 | Just run `start_linux.sh`; that's it.
74 | 
75 | ## For Windows
76 | **Important:**
77 | Virtualization must be enabled in the BIOS before running the installer.
78 | 
79 | - Step 1: Run `enable__windows_features.bat` once if you have not yet enabled the required Windows features.
80 | - Step 2: Run `start_windows.bat` and confirm any installation prompts and questions that come up along the way.
81 | 
82 | For updates, run `update_windows.bat`.
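
On a fresh machine, a typical first run looks roughly like this (a sketch only; it assumes `git` is available, uses the repository URL from `webui.py`, and keeps the install path free of spaces, which the installer scripts require):

```bash
# Clone the repository into a path without spaces, then run the installer for your OS
git clone https://github.com/Darthph0enix7/DocPOI_repo.git
cd DocPOI_repo

# Linux
bash start_linux.sh

# Windows: run start_windows.bat from Command Prompt instead
```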
83 |
84 | ### Manual Installation
85 | Ensure virtualization is enabled in your BIOS. Then follow these steps:
86 |
87 | 1. Install Docker: [Docker Installation Guide](https://docs.docker.com/get-started/get-docker/)
88 | 2. Install C++ Build Tools: [Download C++ Build Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe)
89 | 3. Install Tesseract OCR Engine: [Tesseract OCR Installation](https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-v5.3.0.20221214.exe)
90 | 4. Install Ollama: [Download Ollama](https://ollama.com/download/OllamaSetup.exe)
91 | 5. Download Poppler and unzip it, then set the `POPPLER_PATH` variable in `main.py` to the Poppler `bin` folder: [Download Poppler](https://github.com/oschwartz10612/poppler-windows/releases/download/v24.07.0-0/Release-24.07.0-0.zip)
92 | 6. Run Docker (a quick connectivity check is shown after this list):
93 | ```bash
94 | docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -e "xpack.security.http.ssl.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.12.1
95 | ```
96 |
97 | 7. Get the XTTS model:
98 | ```bash
99 | cd DocPOI_repo
100 | git lfs install
101 | git clone https://huggingface.co/coqui/XTTS-v2
102 | ```
103 |
104 | 8. Set up your Python environment. Install CUDA and PyTorch accordingly. Here’s an example using Anaconda:
105 | ```bash
106 | conda create --name torch python=3.11.5
107 | conda install nvidia/label/cuda-12.1.0::cuda-toolkit
108 | conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
109 | ```
110 |
111 | 9. Install the required Python packages:
112 | ```bash
113 | pip install -r requirements.txt
114 | ```
115 |
116 | 10. Run `main.py` in your environment.
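
For step 6, you can verify that the Elasticsearch container is reachable before moving on (a quick check that assumes the default port mapping and disabled security from the command above):

```bash
# Should return a small JSON document describing the cluster once Elasticsearch is ready
curl http://localhost:9200
```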
117 |
118 | # Usage
119 |
120 | After running the initial code, a setup environment will open. Follow the prompts to set everything up.
121 |
122 | Once setup is complete:
123 | - Refresh the page or wait for it to open a new tab.
124 | - First, press "Process Files" to OCR, create metadata, rename, and add documents to the vector store.
125 | - The rest is up to you! If Voice Over is activated, press the key combination to record your voice, press again to stop recording, and interact with the assistant. You can also type directly in the chat environment.
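
After the initial setup you do not need to go through the installer for every launch; `restart.bat` and `restart.sh` in the repository root simply run `main.py`, assuming the Python environment created during installation is already active:

```bash
# Linux: from the repository root, inside the installer's environment
bash restart.sh      # equivalent to: python main.py

# Windows: run restart.bat from Command Prompt (or double-click it)
```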
126 |
127 | # Project Goals
128 |
129 | - **Absolute Security:** Ensure all operations run locally to protect user data.
130 | - **Accessibility:** Make it easy for anyone, regardless of technical skill, to set up and use a personal assistant.
131 | - **Expandability:** Continue to add features and improve existing ones, with a focus on user feedback and community contributions.
132 | - **Open-Source Development:** Encourage collaboration and transparency in the development process.
133 |
134 | # FAQ
135 |
136 | **Is there a Linux installer?**
137 | - Yes. Run `start_linux.sh`; the installation process is generally easier on Linux.
138 |
139 | **What is your setup for testing these projects?**
140 | - Nothing crazy at the moment: a mid-range desktop PC with a Ryzen 7, a dual-GPU setup (an RTX 3070 Ti and a Tesla P40), 48 GB of DDR4 RAM, a 2 TB PCIe 5.0 SSD, and overall enough RGB (even my monitors have ambient lighting) to light up an entire Taylor Swift concert.
141 |
142 | # Acknowledgements
143 |
144 | Special thanks to:
145 | - [oobabooga](https://github.com/oobabooga) for the basis of the installer.
146 | - The Ollama team.
147 | - The Langchain team.
148 | - [dscripka](https://github.com/dscripka/openWakeWord/tree/main) for the future wake word implementation.
149 |
150 | # Contributing
151 |
152 | Contributions are welcome! Please fork the repository and submit pull requests. :)
153 |
--------------------------------------------------------------------------------
/webui.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | import sys
5 | import shutil
6 | import time
7 | import requests
8 |
9 | script_dir = os.getcwd()
10 | repo_dir = os.path.join(script_dir)
11 | remote_url = "https://github.com/Darthph0enix7/DocPOI_repo.git"
12 | xtts_repo_dir = os.path.join(repo_dir, "XTTS-v2")
13 | tts_repo_dir = os.path.join(repo_dir, "TTS")
14 |
15 |
16 | def run_cmd(cmd, capture_output=False, env=None):
17 |     # Run shell commands
18 |     return subprocess.run(cmd, shell=True, capture_output=capture_output, env=env)
19 | 
20 | 
21 | def check_env():
22 |     # If we have access to conda, we are probably in an environment
23 |     conda_not_exist = run_cmd("conda", capture_output=True).returncode
24 |     if conda_not_exist:
25 |         print("Conda is not installed. Exiting...")
26 |         sys.exit()
27 | 
28 |     # Ensure this is a new environment and not the base environment
29 |     if os.environ["CONDA_DEFAULT_ENV"] == "base":
30 |         print("Create an environment for this project and activate it. Exiting...")
31 |         sys.exit()
32 |
33 |
34 | def install_dependencies():
35 |     # Select your GPU, or choose to run in CPU mode
36 |     print("What is your GPU")
37 |     print()
38 |     print("A) NVIDIA")
39 |     print("B) AMD")
40 |     print("C) Apple M Series")
41 |     print("D) None (I want to run in CPU mode)")
42 |     print()
43 |     gpuchoice = input("Input> ").lower()
44 | 
45 |     # Install the version of PyTorch needed
46 |     if gpuchoice == "a":
47 |         run_cmd(
48 |             "conda install -y -k nvidia/label/cuda-12.1.0::cuda-toolkit"
49 |         )
50 |         run_cmd(
51 |             "conda install -y -k pytorch torchvision torchaudio pytorch-cuda=12.1 ninja git -c pytorch -c nvidia"
52 |         )
53 |     elif gpuchoice == "b":
54 |         print("AMD GPUs are not supported. Exiting...")
55 |         sys.exit()
56 |     elif gpuchoice == "c" or gpuchoice == "d":
57 |         run_cmd(
58 |             "conda install -y -k pytorch torchvision torchaudio cpuonly git -c pytorch"
59 |         )
60 |     else:
61 |         print("Invalid choice. Exiting...")
62 |         sys.exit()
63 | 
64 |     run_cmd("conda install -y -c pytorch ffmpeg") # LGPL
65 | 
66 |     # Install the webui dependencies
67 |     update_dependencies()
68 | 
69 |     # Install Git LFS if not installed
70 |     if run_cmd("git lfs --version", capture_output=True).returncode != 0:
71 |         print("Git LFS is not installed. Installing Git LFS...")
72 |         run_cmd("git lfs install")
73 | 
74 |     # Clone the XTTS-v2 repository if it doesn't already exist
75 |     if not os.path.exists(xtts_repo_dir):
76 |         print(f"Cloning the XTTS-v2 repository into {xtts_repo_dir}...")
77 |         run_cmd(f"git clone https://huggingface.co/coqui/XTTS-v2 {xtts_repo_dir}")
78 |     else:
79 |         print("XTTS-v2 repository already exists.")
80 |     # Clone the Coqui TTS fork if it doesn't already exist
81 |     if not os.path.exists(tts_repo_dir):
82 |         print(f"Cloning the Coqui TTS repository into {tts_repo_dir}...")
83 |         run_cmd(f"git clone https://github.com/Darthph0enix7/TTS.git {tts_repo_dir}")
84 |     else:
85 |         print("TTS repository already exists.")
86 |
87 |
88 | def setup_elasticsearch():
89 |     # Use environment variable to construct the Docker Desktop path
90 |     docker_path = os.path.join(os.environ.get("ProgramFiles", r"C:\Program Files"), "Docker", "Docker", "Docker Desktop.exe")
91 | 
92 |     if not os.path.exists(docker_path):
93 |         print(f"Docker Desktop is not installed at the expected path: {docker_path}")
94 |         sys.exit(1)
95 | 
96 |     # Check if Docker is running
97 |     print("Checking if Docker is running...")
98 |     docker_running = subprocess.run(["docker", "info"], capture_output=True).returncode == 0
99 | 
100 |     if not docker_running:
101 |         print("Docker is not running. Starting Docker Desktop...")
102 |         subprocess.Popen([docker_path])
103 | 
104 |         # Wait for Docker to be ready
105 |         print("Waiting for Docker to start...")
106 |         while not docker_running:
107 |             time.sleep(5) # Check every 5 seconds
108 |             docker_running = subprocess.run(["docker", "info"], capture_output=True).returncode == 0
109 |         print("Docker is now running.")
110 |     else:
111 |         print("Docker is already running.")
112 | 
113 |     # Check if an Elasticsearch container exists
114 |     print("Checking if an Elasticsearch container already exists...")
115 |     result = subprocess.run(["docker", "ps", "-a", "--filter", "ancestor=docker.elastic.co/elasticsearch/elasticsearch:8.12.1", "--format", "{{.ID}}"], capture_output=True, text=True)
116 | 
117 |     container_id = result.stdout.strip()
118 | 
119 |     if container_id:
120 |         # Check if the container is running
121 |         print("Checking if the existing Elasticsearch container is running...")
122 |         result = subprocess.run(["docker", "ps", "--filter", f"id={container_id}", "--format", "{{.ID}}"], capture_output=True, text=True)
123 | 
124 |         if result.stdout.strip():
125 |             print("Elasticsearch container is already running.")
126 |         else:
127 |             # Start the existing container
128 |             print("Starting the existing Elasticsearch container...")
129 |             subprocess.run(["docker", "start", container_id])
130 |             print("Elasticsearch container started.")
131 |     else:
132 |         # Run a new Elasticsearch Docker container in detached mode (background)
133 |         print("No existing Elasticsearch container found. Running a new one in the background...")
134 |         subprocess.Popen('docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -e "xpack.security.http.ssl.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.12.1', shell=True)
135 |         print("New Elasticsearch container started in the background.")
136 | 
137 |     # Close the command prompt
138 |     print("Closing command prompt...")
139 |     sys.exit(0)
140 |
141 | def run_ollama():
142 |     # Run the Ollama command
143 |     print("Starting Ollama with llama3.1:8b...")
144 |     process = subprocess.Popen(["ollama", "run", "llama3.1:8b"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
145 |
146 |
147 | def update_conda():
148 |     # Update conda
149 |     run_cmd("conda update -y -n base -c defaults conda")
150 |
151 |
152 | def update_dependencies():
153 |     # Update the webui dependencies
154 |     os.chdir(repo_dir)
155 | 
156 |     # Check if the .git directory exists
157 |     if not os.path.isdir(".git"):
158 |         print("Initializing new Git repository...")
159 |         run_cmd("git init")
160 |         run_cmd(f"git remote add origin {remote_url}")
161 | 
162 |     # Ensure the repository is connected to the remote
163 |     run_cmd("git fetch origin")
164 | 
165 |     # Reset any local changes and pull the latest version
166 |     run_cmd("git reset --hard origin/main")
167 |     run_cmd("git pull origin main --force")
168 | 
169 |     # Install dependencies
170 |     run_cmd("pip install -r requirements.txt")
171 | 
172 |     os.chdir(script_dir)
173 |
174 |
175 | def download_docker_installer():
176 |     docker_installer_url = "https://desktop.docker.com/win/main/amd64/Docker%20Desktop%20Installer.exe?utm_source=docker&utm_medium=webreferral&utm_campaign=docs-driven-download-win-amd64"
177 |     docker_installer_path = os.path.join(script_dir, "installer_files", "docker-installer.exe")
178 | 
179 |     # Create installer_files directory if it doesn't exist
180 |     os.makedirs(os.path.dirname(docker_installer_path), exist_ok=True)
181 | 
182 |     # Download Docker installer
183 |     print("Downloading Docker Desktop...")
184 |     try:
185 |         response = requests.get(docker_installer_url, stream=True)
186 |         response.raise_for_status()
187 |         with open(docker_installer_path, "wb") as file:
188 |             for chunk in response.iter_content(chunk_size=8192):
189 |                 if chunk:
190 |                     file.write(chunk)
191 |         print("Docker Desktop downloaded successfully.")
192 |     except requests.exceptions.RequestException as e:
193 |         print(f"Failed to download Docker: {e}")
194 |         sys.exit(1)
195 | 
196 |     return docker_installer_path
197 |
198 |
199 | def run_model():
200 |     os.chdir(repo_dir)
201 |     run_cmd("python main.py") # put your flags here!
202 |
203 |
204 | if __name__ == "__main__":
205 |     # Verify we are in a conda environment
206 |     check_env()
207 | 
208 |     parser = argparse.ArgumentParser()
209 |     parser.add_argument("--download-docker", action="store_true", help="Download Docker Desktop installer.")
210 |     parser.add_argument("--update", action="store_true", help="Update the web UI.")
211 |     parser.add_argument("--setup-elasticsearch", action="store_true", help="Setup and run Elasticsearch in Docker.")
212 |     parser.add_argument("--run-ollama", action="store_true", help="Start Ollama with llama3.1:8b.")
213 |     args = parser.parse_args()
214 | 
215 |     if args.update:
216 |         update_dependencies()
217 |     elif args.download_docker:
218 |         download_docker_installer()
219 |     elif args.setup_elasticsearch:
220 |         setup_elasticsearch()
221 |     elif args.run_ollama:
222 |         run_ollama()
223 |     else:
224 |         # If the webui has already been installed, skip installation and just run
225 |         if not os.path.exists(xtts_repo_dir):
226 |             install_dependencies()
227 |         os.chdir(script_dir)
228 | 
229 |         # Run the model with webui
230 |         run_model()
231 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year> <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import io
3 | import gc
4 | import time
5 | import json
6 | from typing import List, Optional
7 | import wave
8 | import uuid
9 | import shutil
10 | import torch
11 | import random
12 | import logging
13 | import webbrowser
14 | import threading
15 | import soundfile as sf
16 | import pytesseract
17 | import torch
18 | import pycountry
19 | from langdetect.lang_detect_exception import LangDetectException
20 | from langdetect import detect
21 | import cv2
22 | import whisper
23 | import numpy as np
24 | from datetime import datetime
25 | from os.path import splitext, exists
26 | from collections import OrderedDict
27 | from pynput import keyboard
28 | from pdf2image import convert_from_path
29 | from PIL import Image, ImageEnhance
30 | from pydub import AudioSegment
31 | from pydub.playback import play
32 | import sounddevice as sd
33 | from scipy.io.wavfile import write
34 | import gradio as gr
35 | import tempfile
36 |
37 | # PyMuPDF
38 | import fitz
39 |
40 | # PyPDF2
41 | from PyPDF2 import PdfReader, PdfWriter
42 | from PyPDF2.generic import NameObject, TextStringObject
43 |
44 | # Langchain and related imports
45 | from langchain_core.prompts import ChatPromptTemplate
46 | from langchain_core.document_loaders import BaseLoader
47 | from langchain_core.documents import Document
48 | from langchain_core.messages import SystemMessage
49 | from langchain_community.chat_models import ChatOllama
50 | from langchain_community.document_loaders import PyPDFLoader, TextLoader
51 | from langchain_experimental.text_splitter import SemanticChunker
52 | from langchain_experimental.llms.ollama_functions import OllamaFunctions
53 | from langchain_huggingface import HuggingFaceEmbeddings
54 | from langchain.indexes import SQLRecordManager, index
55 | from langchain_elasticsearch import ElasticsearchStore
56 | from langchain.chains import LLMChain
57 | from langchain.chains.combine_documents import create_stuff_documents_chain
58 | from langchain.chains.retrieval import create_retrieval_chain
59 | from langchain.retrievers.multi_query import MultiQueryRetriever
60 | from langchain_core.prompts import (
61 | ChatPromptTemplate,
62 | HumanMessagePromptTemplate,
63 | MessagesPlaceholder,
64 | )
65 | from langchain.memory import ConversationBufferMemory
66 | from TTS.tts.configs.xtts_config import XttsConfig
67 | from TTS.tts.models.xtts import Xtts
68 |
69 | import tkinter as tk
70 | from tkinter import filedialog
71 | import subprocess
72 | import platform
73 | import sys
74 | import re
75 |
76 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
77 | print(f"Using device: {device}")
78 | # Constants
79 | POPPLER_PATH = r'.\installer_files\poppler-24.07.0\Library\bin'
80 | program_files = os.environ.get('ProgramFiles')
81 | tessdata_dir = os.path.join("installer_files", "tessdata")
82 | tessdata_dir_config = f'--tessdata-dir "{tessdata_dir}"'
83 |
84 | if platform.system() == 'Windows':
85 | program_files = os.environ.get('PROGRAMFILES', 'C:\\Program Files')
86 | PYTESSERACT_CMD = os.path.join(program_files, 'Tesseract-OCR', 'tesseract.exe')
87 | pytesseract.pytesseract.tesseract_cmd = PYTESSERACT_CMD
88 |
89 | PARAM_FILE = "params.json"
90 | LOG_FILE = "process.log"
91 | is_recording = False # To track if we are currently recording
92 | recording = None # Global variable to hold the recording data
93 | stream = None # To handle the audio stream
94 | filename = "output_combined.wav" # File to save the recording
95 | metadata_llm = ChatOllama(model="llama3.1:8b", temperature=0.9)
96 | naming_llm = ChatOllama(model="llama3.1:8b", temperature=0.5, num_predict=30)
97 | embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2", model_kwargs={'device': device})
98 | session_params = {
99 | "filter_key": "",
100 | "filter_value": ""
101 | }
102 |
103 |
104 | system_prompt = """You are a helpful assistant with the name Jarvis created by Eren Kalinsazlioglu at Enpoi co. that has access to users' documents. Your primary goal is to be as helpful and precise as possible in answering the users' questions. If the user asks a specific or personalized question that you do not have knowledge of, you can retrieve the relevant documents like this:
105 |
106 | retriever_tool: [here describe what kind of document the user wants to retrieve, this will be used for the similarity search so write some queries that are likely to be in the document]
107 |
108 | only use the retriever when you need the document. Do not include any filter text or explanation, only the retriever calling.
109 | Try to answer general questions without using the retriever. If you need to provide information about a specific document."""
110 | metadata_template = """
111 | You are tasked with extracting detailed metadata from the content of a document. Follow these guidelines to ensure the metadata is comprehensive and accurately reflects the document's content.
112 |
113 | **Guidelines**:
114 |
115 | 1. **Document Type**:
116 | - Identify the type of document (e.g., research paper, article, report).
117 | - Examples: "Research Paper", "Article", "Report", "Forschungsbericht", "Artikel", "Bericht"
118 |
119 | 2. **Mentions**:
120 | - Extract the main names like persons and companies mentioned in the document.
121 | - Examples: "John Doe", "Acme Corporation", "United Nations", "Johann Schmidt", "Siemens AG", "Vereinte Nationen"
122 |
123 | 3. **Keywords**:
124 | - Identify relevant keywords central to the document's topic.
125 | - Examples: "Machine Learning", "Climate Change", "Economic Policy", "Maschinelles Lernen", "Klimawandel", "Wirtschaftspolitik"
126 |
127 | 4. **About**:
128 | - Provide a brief description of the document's purpose and main arguments/findings.
129 | - Examples: "This research paper explores the impact of AI on healthcare, focusing on predictive analytics and patient outcomes.", "Dieses Forschungspapier untersucht die Auswirkungen von KI auf das Gesundheitswesen, mit einem Fokus auf prädiktive Analysen und Patientenergebnisse."
130 |
131 | 5. **Questions**:
132 | - List questions the document can answer.
133 | - Examples: "What are the benefits of renewable energy?", "How does blockchain technology work?", "Welche Vorteile bietet erneuerbare Energie?", "Wie funktioniert die Blockchain-Technologie?"
134 |
135 | 6. **Entities**:
136 | - Identify the main entities (people, places, organizations) mentioned.
137 | - Examples: "Albert Einstein", "New York City", "World Health Organization", "Albert Einstein", "New York City", "Weltgesundheitsorganisation"
138 |
139 | 7. **Summaries**:
140 | - Provide summaries of different sections or key points.
141 | - Examples: "Introduction: Overview of AI in healthcare", "Methodology: Data collection and analysis techniques", "Conclusion: Implications of findings for future research", "Einleitung: Überblick über KI im Gesundheitswesen", "Methodik: Datenerfassungs- und Analysetechniken", "Fazit: Auswirkungen der Ergebnisse auf zukünftige Forschung"
142 |
143 | 8. **Authors**:
144 | - List the document's authors.
145 | - Examples: "Jane Smith", "John Doe", "Alice Johnson", "Hans Müller", "Peter Schmid", "Anna Meier"
146 |
147 | 9. **Source**:
148 | - Specify the source or location where the document can be found.
149 | - Examples: "https://example.com/research-paper", "Library of Congress", "Journal of Medical Research", "https://beispiel.de/forschungspapier", "Bibliothek des Kongresses", "Zeitschrift für medizinische Forschung"
150 |
151 | 10. **Language**:
152 | - Indicate the language(s) the document is written in.
153 | - Examples: "English", "German", "Spanish", "Englisch", "Deutsch", "Spanisch"
154 |
155 | 11. **Audience**:
156 | - Describe the intended audience for the document.
157 | - Examples: "Healthcare professionals", "University students", "Policy makers", "Gesundheitsfachkräfte", "Universitätsstudenten", "Politische Entscheidungsträger"
158 |
159 | **Context**:
160 | {context}
161 |
162 | **Task**:
163 | Extract and provide the following metadata from the document's content based on the above guidelines. Ensure that the extracted information is in the original language of the document.
164 |
165 | **Output Format**:
166 | Return the metadata in the following structured format, with no filler text or extra explanation; give only the extracted metadata:
167 | ```json
168 | {{
169 | "document_type": "Type of document",
170 | "mentions": ["Main names mentioned"],
171 | "keywords": ["Relevant keywords"],
172 | "about": "Brief description",
173 | "questions": ["Questions the document can answer"],
174 | "entities": ["Main entities mentioned"],
175 | "summaries": ["Summaries of key sections"],
176 | "authors": ["List of authors"],
177 | "source": "Source or location",
178 | "language": "Document language",
179 | "audience": "Intended audience"
180 | }}
181 | ```
182 | """
183 | naming_template = """
184 | You are tasked with generating appropriate and consistent names for documents based on their content. Follow these detailed guidelines to ensure the names are informative, unique, and easy to manage:
185 |
186 | 1. **Think about your files**:
187 | - Identify the group of files your naming convention will cover.
188 | - Check for established file naming conventions in your discipline or group.
189 |
190 | 2. **Identify metadata**:
191 | - Include important information to easily locate a specific file.
192 | - Consider including a combination of the following:
193 | - Experiment conditions
194 | - Type of data
195 | - Researcher name/initials
196 | - Lab name/location
197 | - Project or experiment name or acronym
198 | - Experiment number or sample ID (use leading zeros for clarity)
199 |
200 | 3. **Abbreviate or encode metadata**:
201 | - Standardize categories and/or replace them with 2- or 3-letter codes.
202 | - Document any codes used.
203 |
204 | 4. **Think about how you will search for your files**:
205 | - Decide what metadata should appear at the beginning.
206 | - Use default ordering: alphabetically, numerically, or chronologically.
207 |
208 | 5. **Deliberately separate metadata elements**:
209 | - Avoid spaces or special characters in file names.
210 | - Use dashes (-), underscores (_), or capitalize the first letter of each word.
211 |
212 | **Example Naming Convention**:
213 | - Format: [Type]_[Project]_[SampleID].[ext]
214 | - Example: FinancialReport_ProjectX_001.pdf
215 |
216 | **Context**:
217 | {context}
218 |
219 | **Extracted Metadata**:
220 | The extracted metadata contains important information such as keywords, entities, mentions, summaries, and other details that are useful for naming the document. This metadata helps in creating a name that is both descriptive and unique.
221 |
222 | {metadata}
223 |
224 | **Task**:
225 | Generate a new, unique name for this document based on its content and the provided metadata. The new name should be formal, detailed, and distinctive to avoid confusion with other documents. Ensure the name is concise yet informative, highlighting significant details such as names, firms, and companies that capture the essence and purpose of the document. Be specific.
226 |
227 | **Output Format**:
228 | Provide only the new name in the following format, with no filler text or extra explanation: [Type]_[Name]_[YearRange]
229 |
230 | **Question**: {question}
231 | """
232 | # Set up logging
233 | logging.basicConfig(filename=LOG_FILE, level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')
234 |
235 |
236 |
237 | class DocPOIDirectoryLoader(BaseLoader):
238 | def __init__(self, directory_path: str, metadata_path: Optional[str] = None) -> None:
239 | self.directory_path = directory_path
240 | self.metadata_path = metadata_path or directory_path
241 |
242 | def load(self) -> List[Document]:
243 | documents = []
244 | for filename in os.listdir(self.directory_path):
245 | file_path = os.path.join(self.directory_path, filename)
246 | metadata_file = os.path.join(self.metadata_path, f"{os.path.splitext(filename)[0]}.json")
247 |
248 | if os.path.exists(metadata_file):
249 | with open(metadata_file, 'r', encoding='utf-8') as f:
250 | metadata = json.load(f)
251 | else:
252 | metadata = {}
253 |
254 | # Ensure document_id is included in metadata
255 | if 'document_id' not in metadata:
256 | metadata['document_id'] = os.path.splitext(filename)[0]
257 |
258 | if filename.endswith('.pdf'):
259 | documents.extend(self.load_pdf(file_path, metadata))
260 | elif filename.endswith('.txt'):
261 | documents.extend(self.load_text(file_path, metadata))
262 | return documents
263 |
264 | def load_pdf(self, file_path: str, metadata: dict) -> List[Document]:
265 | with fitz.open(file_path) as pdf_document:
266 | full_text = ''.join([pdf_document.load_page(page_number).get_text() for page_number in range(len(pdf_document))])
267 |
268 | embedding = embed_model
269 | text_splitter = SemanticChunker(embedding, breakpoint_threshold_type="percentile")
270 | chunks = text_splitter.create_documents([full_text])
271 |
272 | return [
273 | Document(
274 | page_content=chunk.page_content,
275 | metadata=OrderedDict(metadata, page_number=page_number + 1, source=file_path)
276 | ) for page_number, chunk in enumerate(chunks)
277 | ]
278 |
279 | def load_text(self, file_path: str, metadata: dict) -> List[Document]:
280 | with open(file_path, 'r', encoding='utf-8') as f:
281 | text_content = f.read()
282 |
283 | embedding = embed_model
284 | text_splitter = SemanticChunker(embedding, breakpoint_threshold_type="percentile")
285 | chunks = text_splitter.create_documents([text_content])
286 |
287 | return [
288 | Document(
289 | page_content=chunk.page_content,
290 | metadata=OrderedDict(metadata, page_number=page_number + 1, source=file_path)
291 | ) for page_number, chunk in enumerate(chunks)
292 | ]
293 | class DocPOI(BaseLoader):
294 | """A custom document loader that reads and processes PDF or TXT files."""
295 |
296 | def __init__(self, file_path: str, metadata_path: str = None) -> None:
297 | """
298 | Initialize the loader with a file path and an optional metadata path.
299 | Args:
300 | file_path: Path to the PDF or TXT file.
301 | metadata_path: Path to the metadata file (optional, defaults to None).
302 | """
303 | self.file_path = file_path
304 | # Set metadata path based on file path if not provided
305 | if not metadata_path:
306 | assumed_metadata_path = splitext(file_path)[0] + '.json'
307 | if exists(assumed_metadata_path):
308 | metadata_path = assumed_metadata_path
309 | else:
310 | print("No metadata file found, proceeding without external metadata.")
311 | self.metadata_path = metadata_path
312 |
313 | def load(self) -> list:
314 | """
315 | Load and process the file, returning a list of Document objects.
316 | """
317 | # Load metadata from a JSON file if provided
318 | if self.metadata_path and exists(self.metadata_path):
319 | with open(self.metadata_path, 'r') as f:
320 | metadata = json.load(f)
321 | else:
322 | metadata = {'source': self.file_path, 'processed_date': datetime.now().isoformat()}
323 |
324 | # Ensure document_id is included in metadata
325 | if 'document_id' not in metadata:
326 | metadata['document_id'] = os.path.splitext(os.path.basename(self.file_path))[0]
327 |
328 | ordered_metadata = OrderedDict(metadata)
329 |
330 | # Set up the text chunker
331 | embedding = embed_model
332 | text_splitter = SemanticChunker(embedding, breakpoint_threshold_type="percentile")
333 |
334 | # Read and process the file
335 | if self.file_path.endswith('.pdf'):
336 | with fitz.open(self.file_path) as pdf:
337 | full_text = ''.join([page.get_text() for page in pdf])
338 | elif self.file_path.endswith('.txt'):
339 | with open(self.file_path, 'r', encoding='utf-8') as file:
340 | full_text = file.read()
341 | else:
342 | raise ValueError("Unsupported file type. Please provide a PDF or TXT file.")
343 |
344 | # Use the SemanticChunker to split the text
345 | documents = text_splitter.create_documents([full_text])
346 |
347 | # Generate Document objects
348 | return [
349 | Document(
350 | page_content=chunk.page_content,
351 | metadata=OrderedDict(ordered_metadata, page_number=page_number + 1)
352 | ) for page_number, chunk in enumerate(documents)
353 | ]
354 |
355 | class TTSStreamer:
356 | def __init__(self, model_path, config_path, vocab_path, speaker_wav="thunder"):
357 | self.model_path = model_path
358 | self.config_path = config_path
359 | self.vocab_path = vocab_path
360 | self.speaker_wav = f"audio_samples\\{speaker_wav}.wav"
361 | self.model = self.load_model()
362 | self.stop_flag = threading.Event() # To control stopping
363 | self.playback_thread = None
364 | self.text_chunks = [] # Store text chunks
365 |
366 | def load_model(self):
367 | config = XttsConfig()
368 | config.load_json(self.config_path)
369 | model = Xtts.init_from_config(config)
370 | model.load_checkpoint(config, checkpoint_dir=self.model_path, eval=True, vocab_path=self.vocab_path)
371 | model.cuda()
372 | return model
373 |
374 | def unload_model(self):
375 | del self.model
376 | gc.collect()
377 | if torch.cuda.is_available():
378 | torch.cuda.empty_cache()
379 | gc.collect()
380 | print("Model unloaded and GPU memory cleared successfully.")
381 |
382 | def estimate_times(self, text_chunk, avg_gen_time_per_char, avg_audio_time_per_char):
383 | gen_time = len(text_chunk) * avg_gen_time_per_char
384 | audio_duration = len(text_chunk) * avg_audio_time_per_char
385 | return gen_time, audio_duration
386 |
387 | def split_text_into_sentences(self, text):
388 | # Split text into sentences using regular expressions
389 | sentences = re.split(r'(?<=[.!?]) +', text.strip())
390 |
391 | final_sentences = []
392 | current_chunk = ""
393 |
394 | for sentence in sentences:
395 | if len(current_chunk) + len(sentence) + 1 <= 200: # +1 for the space or punctuation
396 | if current_chunk:
397 | current_chunk += " " + sentence
398 | else:
399 | current_chunk = sentence
400 | else:
401 | if current_chunk:
402 | final_sentences.append(current_chunk)
403 | current_chunk = sentence
404 |
405 | if current_chunk:
406 | final_sentences.append(current_chunk)
407 |
408 | return final_sentences
409 |
410 | def generate_audio_chunk(self, chunk, chunk_index, audio_buffer, playback_event, avg_gen_time_per_char, avg_audio_time_per_char, total_gen_time, language, speed):
411 | if self.stop_flag.is_set():
412 | return
413 | est_gen_time, est_audio_duration = self.estimate_times(chunk, avg_gen_time_per_char, avg_audio_time_per_char)
414 | print(f"Chunk {chunk_index + 1} estimated generation time: {est_gen_time:.2f} seconds, estimated audio duration: {est_audio_duration:.2f} seconds")
415 |
416 | print(f"Generating audio for chunk {chunk_index + 1}...")
417 | start_gen_time = time.time()
418 | outputs = self.model.synthesize(
419 | text=chunk,
420 | config=self.model.config,
421 | speaker_wav=self.speaker_wav,
422 | gpt_cond_len=10,
423 | language=language,
424 | speed=speed
425 | )
426 | end_gen_time = time.time()
427 | generation_time = end_gen_time - start_gen_time
428 | total_gen_time[0] += generation_time
429 | print(f"Chunk {chunk_index + 1} generated in {generation_time:.2f} seconds (estimated: {est_gen_time:.2f} seconds)")
430 |
431 | wav_data = outputs['wav']
432 | temp_output_file = f'temp_output_{chunk_index}.wav'
433 | sf.write(temp_output_file, wav_data, 22050)
434 | line_audio = AudioSegment.from_wav(temp_output_file)
435 |
436 | actual_audio_duration = len(line_audio) / 1000.0
437 | print(f"Chunk {chunk_index + 1} actual audio duration: {actual_audio_duration:.2f} seconds (estimated: {est_audio_duration:.2f} seconds)")
438 |
439 | audio_buffer[chunk_index] = line_audio
440 | print(f"Chunk {chunk_index + 1} audio saved and buffered")
441 |
442 | playback_event.set()
443 |
444 | def stream_audio_with_buffering(self, text, language="en", speed=1.2, speaker=None, fireup_delay=1.0, avg_gen_time_per_char=0.08058659382140704, avg_audio_time_per_char=0.1064346054068992):
445 | self.stop_flag.clear() # Clear the stop flag at the start
446 | if speaker:
447 | self.speaker_wav = f"audio_samples\\{speaker}.wav"
448 |
449 | print("Starting the audio streaming process...")
450 | start_time = time.time()
451 |
452 | self.text_chunks = self.split_text_into_sentences(text) # Store text chunks
453 | audio_buffer = [None] * len(self.text_chunks)
454 | playback_events = [threading.Event() for _ in self.text_chunks]
455 | total_gen_time = [0]
456 |
457 | def start_playback_after_delay():
458 | print(f"Waiting {fireup_delay:.2f} seconds before starting playback...")
459 | time.sleep(fireup_delay)
460 | print("Fireup delay is over, starting playback...")
461 | for chunk_index in range(len(self.text_chunks)):
462 | if self.stop_flag.is_set():
463 | break
464 | playback_events[chunk_index].wait()
465 | if audio_buffer[chunk_index] is not None:
466 | with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
467 | temp_output_file = temp_file.name
468 | self.play_audio_segment(audio_buffer[chunk_index], temp_output_file)
469 | if os.path.exists(temp_output_file):
470 | os.remove(temp_output_file)
471 |
472 | self.playback_thread = threading.Thread(target=start_playback_after_delay)
473 | self.playback_thread.start()
474 |
475 | for chunk_index, chunk in enumerate(self.text_chunks):
476 | if self.stop_flag.is_set():
477 | break
478 |
479 | print(f"Processing chunk {chunk_index + 1}/{len(self.text_chunks)}: '{chunk}'")
480 | self.generate_audio_chunk(chunk, chunk_index, audio_buffer, playback_events[chunk_index], avg_gen_time_per_char, avg_audio_time_per_char, total_gen_time, language, speed)
481 |
482 | self.playback_thread.join()
483 | print("Audio streaming process completed.")
484 | print(f"Total generation time: {total_gen_time[0]:.2f} seconds")
485 |
486 | def stop_streaming(self):
487 | """Stops the audio streaming process."""
488 | self.stop_flag.set()
489 | if self.playback_thread and self.playback_thread.is_alive():
490 | self.playback_thread.join()
491 | # Remove all temporary files
492 | for chunk_index in range(len(self.text_chunks)):
493 | temp_output_file = f'temp_output_{chunk_index}.wav'
494 | if os.path.exists(temp_output_file):
495 | os.remove(temp_output_file)
496 |
497 | def play_audio_segment(self, audio_segment, temp_output_file):
498 | audio_segment.export(temp_output_file, format="wav")
499 | play(AudioSegment.from_wav(temp_output_file))
500 |
501 | class DocumentAssistant:
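    |     """Conversational assistant backed by ChatOllama that can retrieve supporting documents from the vectorstore when the model asks for them."""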
502 | def __init__(self, model_name, temperature=0.9):
503 | self.llm = ChatOllama(
504 | model=model_name,
505 | temperature=temperature,
506 | )
507 |
508 | self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
509 |
510 | # Initialize the default agent prompt
511 | self.agent_prompt = ChatPromptTemplate.from_messages(
512 | [
513 | SystemMessage(content=system_prompt),
514 | MessagesPlaceholder(variable_name="chat_history"),
515 | HumanMessagePromptTemplate.from_template("{input}")
516 | ]
517 | )
518 |
519 | # Initialize the LLM chain with the agent prompt
520 | self.chain = LLMChain(
521 | llm=self.llm,
522 | prompt=self.agent_prompt,
523 | verbose=True,
524 | memory=self.memory
525 | )
526 |
527 | def reset_memory(self):
528 | """Resets the memory of the assistant."""
529 | self.memory.clear()
530 |
531 | def create_filter_criteria(self, filter_criteria):
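    |         """Build an Elasticsearch term filter from a {"key": ..., "value": ...} dict, or return None."""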
532 | if filter_criteria:
533 | filter_key = filter_criteria.get("key")
534 | filter_value = filter_criteria.get("value")
535 | if filter_key and filter_value:
536 | return {"term": {f"metadata.{filter_key}.keyword": filter_value}}
537 | return None
538 |
539 | def document_retriever(self, user_query, top_k, score_threshold, filter_criteria):
540 | """
541 | Retrieves relevant documents based on the user's query and returns them with their scores and metadata.
542 | """
543 | # Create filter criteria
544 | filter_query = self.create_filter_criteria(filter_criteria)
545 |
546 | # Perform the similarity search with scores
547 | results = vectorstore.similarity_search_with_score(
548 | query=user_query,
549 | k=top_k,
550 | filter=filter_query
551 | )
552 |
553 | # Filter documents based on the score threshold
554 | filtered_documents = [
555 | (doc, score) for doc, score in results if score >= score_threshold
556 | ]
557 |
558 | # Extract document names, content, metadata, and score for returning
559 | retrieved_documents = []
560 | for doc, score in filtered_documents:
561 | document_info = {
562 | "document_name": doc.metadata.get('given_document_name', 'Unnamed'),
563 | "document_content": doc.page_content,
564 | "metadata": doc.metadata, # Include all metadata
565 | "score": score
566 | }
567 | print(f"Retrieved document with score {score}: {document_info['document_name']}")
568 | retrieved_documents.append(document_info)
569 |
570 | return retrieved_documents
571 |
572 | def formulate_final_prompt(self, user_query, context):
573 | """
574 | Formulates the final input for the LLM considering the retrieved documents.
575 | """
576 | combined_input = f"Here is the context from retrieved documents. Please use this information to answer the user's question.\n\nContext:\n{context}\n\nQuestion: {user_query}"
577 | return combined_input
578 |
579 | def query_llm(self, user_query, top_k, score_threshold, filter_criteria):
580 |         # filter_criteria is converted to an Elasticsearch filter inside document_retriever,
        # so it is passed through unchanged here to avoid converting it twice
581 |
582 | # First, use the LLM chain to determine whether document retrieval is necessary
583 | response = self.chain.invoke({"input": user_query})
584 | print(f"Initial response: {response['text']}")
585 | if "retriever_tool:" in response['text'].lower():
586 | retrieval_instruction = response['text'].split("retriever_tool:")[1].strip()
587 | combined_query = f"{user_query} {retrieval_instruction}"
588 | retrieved_documents = self.document_retriever(combined_query, top_k, score_threshold, filter_criteria)
589 |
590 | context = "\n\n".join(
591 | [f"Document Name: {doc['document_name']}\nMetadata: {doc['metadata']}\nContent:\n{doc['document_content']}"
592 | for doc in retrieved_documents]
593 | )
594 | combined_input = self.formulate_final_prompt(user_query, context)
595 |
596 | # Use the LLM chain again to answer the user's query based on the retrieved documents
597 | final_response = self.chain.invoke({"input": combined_input})
598 | print(f"Final response: {final_response['text']}")
599 | parsed_response = final_response['text']
600 | else:
601 | parsed_response = response['text']
602 | retrieved_documents = []
603 |
604 | return parsed_response, retrieved_documents
605 |
606 | def convert_image_to_pdf(image_path):
607 | """Convert an image file to a PDF and return the new PDF path."""
608 | try:
609 | image = Image.open(image_path)
610 |         pdf_path = os.path.splitext(image_path)[0] + '.pdf'
611 | rgb_image = image.convert('RGB')
612 | rgb_image.save(pdf_path, 'PDF', resolution=100.0)
613 | os.remove(image_path)
614 | print(f"Converted {image_path} to {pdf_path}")
615 | return pdf_path
616 | except Exception as e:
617 | logging.error(f"Error converting image to PDF: {e}")
618 | return None
619 |
620 | def adaptive_image_processing(image):
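    |     """Prepare a page image for OCR: grayscale, CLAHE equalization, light blur, and contrast boost."""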
621 | # Convert to grayscale
622 | gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
623 |
624 | # Apply adaptive histogram equalization
625 | clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
626 | clahe_image = clahe.apply(gray)
627 |
628 | # Apply a slight Gaussian blur to reduce noise
629 | blurred_image = cv2.GaussianBlur(clahe_image, (5, 5), 0)
630 |
631 | # Enhance contrast
632 | pil_image = Image.fromarray(blurred_image)
633 | enhancer = ImageEnhance.Contrast(pil_image)
634 | enhanced_image = enhancer.enhance(1.5)
635 |
636 | return enhanced_image
637 |
638 | def ocr_pdf(input_pdf_path):
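    |     """OCR a PDF in place, detecting the document language from the first page and falling back to English."""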
639 | try:
640 | images = convert_from_path(input_pdf_path, poppler_path=POPPLER_PATH)
641 | pdf_writer = PdfWriter()
642 |
643 | # OCR the first page and detect its language
644 | first_page_text = pytesseract.image_to_string(images[0])
645 | try:
646 | detected_lang = detect(first_page_text)
647 | detected_lang_iso3 = pycountry.languages.get(alpha_2=detected_lang).alpha_3
648 | print(f"Detected language: {detected_lang_iso3}")
649 | except LangDetectException:
650 | logging.warning("Language detection failed, defaulting to English.")
651 | detected_lang_iso3 = 'eng' # Default to English if language detection fails
652 |
653 | # OCR the entire PDF with the detected language
654 | for image in images:
655 | processed_image = adaptive_image_processing(image)
656 | if processed_image is None:
657 | print("Error: processed_image is None")
658 | continue
659 |
660 | pdf_bytes = pytesseract.image_to_pdf_or_hocr(processed_image, extension='pdf', lang=detected_lang_iso3, config=tessdata_dir_config)
661 | if pdf_bytes is None:
662 | print("Error: pdf_bytes is None")
663 | continue
664 |
665 | pdf_stream = io.BytesIO(pdf_bytes)
666 | pdf = PdfReader(pdf_stream)
667 | pdf_writer.add_page(pdf.pages[0])
668 |
669 | output_pdf_path = input_pdf_path # Keep the file path consistent
670 | with open(output_pdf_path, "wb") as f_out:
671 | pdf_writer.write(f_out)
672 |
673 | print(f"OCR processed and replaced {output_pdf_path}")
674 | return output_pdf_path
675 | except Exception as e:
676 | logging.error(f"Error during OCR: {e}")
677 | return None
678 |
679 | def check_pdf_has_readable_text(file_path: str) -> bool:
680 | """Check if the PDF contains any readable text."""
681 | try:
682 | with open(file_path, "rb") as file:
683 | reader = PdfReader(file)
684 | for page in reader.pages:
685 | text = page.extract_text()
686 | if text:
687 | return True
688 | except Exception as e:
689 | logging.error(f"Error reading {file_path}: {e}")
690 | return False
691 |
692 | def check_pdf_metadata_keys(file_path: str, required_keys: list) -> bool:
693 | """Check if the PDF metadata contains all the required keys."""
694 | try:
695 | with open(file_path, "rb") as file:
696 | reader = PdfReader(file)
697 | metadata = reader.metadata
698 | return all(key in metadata for key in required_keys)
699 | except Exception as e:
700 | logging.error(f"Error reading metadata from {file_path}: {e}")
701 | return False
702 |
703 | import os
704 |
705 | def check_for_metadata_json(file_path: str) -> bool:
706 | """Check if the corresponding JSON metadata file exists."""
707 | json_file_path = f"{os.path.splitext(file_path)[0]}.json"
708 | return os.path.exists(json_file_path)
709 |
710 | def process_pdf_file(file_path: str) -> str:
711 | """Process a single PDF file and perform all checks."""
712 | print(f"Processing {file_path}...")
713 |
714 | document_name = None # Initialize document_name
715 |
716 | has_text = check_pdf_has_readable_text(file_path)
717 | metadata_keys = ["/document_id", "/original_file_name", "/given_document_name"]
718 |
719 | # Perform OCR if no readable text is found
720 | if not has_text:
721 | print(f"The PDF {file_path} does not contain readable text. Performing OCR...")
722 | file_path = ocr_pdf(file_path) # Update file path if the file was replaced
723 |
724 | # Generate metadata if necessary
725 | if not check_for_metadata_json(file_path):
726 | print(f"The corresponding metadata JSON file does not exist for {file_path}. Performing metadata extraction...")
727 |         document_name, metadata = generate_metadata_and_name(file_path)
728 |         file_path = os.path.join(os.path.dirname(file_path), f"{document_name}.pdf")  # the file was renamed
729 |     # Check for required metadata keys
730 |     if check_pdf_metadata_keys(file_path, metadata_keys):
731 | print(f"The PDF {file_path} contains the required metadata keys.")
732 | else:
733 | print(f"The PDF {file_path} is missing some required metadata keys.")
734 |
735 | return document_name
736 |
737 | def process_txt_file(file_path: str) -> str:
738 | """Process a TXT file by generating metadata if necessary."""
739 | print(f"Processing TXT file {file_path}...")
740 |
741 | document_name = None # Initialize document_name
742 |
743 | if check_for_metadata_json(file_path):
744 | print(f"The corresponding metadata JSON file exists for {file_path}.")
745 | else:
746 | print(f"The corresponding metadata JSON file does not exist for {file_path}. Generating metadata...")
747 | document_name, metadata = generate_metadata_and_name(file_path)
748 |
749 | return document_name
750 |
751 | def process_image_file(file_path: str) -> str:
752 | """Convert an image file to a PDF, perform OCR, and generate metadata."""
753 | print(f"Processing image file {file_path}...")
754 |
755 | document_name = None # Initialize document_name
756 |
757 | pdf_path = convert_image_to_pdf(file_path)
758 | if pdf_path:
759 | print(f"Converted image to PDF: {pdf_path}. Performing OCR and generating metadata...")
760 |         pdf_path = ocr_pdf(pdf_path)  # OCR the converted PDF in place
761 |         document_name, metadata = generate_metadata_and_name(pdf_path)  # the original image file no longer exists
762 |
763 | return document_name
764 |
765 | def process_files_in_directory(directory_path: str, only_pdf: bool = False) -> None:
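    |     """Walk the directory and process PDFs (plus TXT and image files unless only_pdf), retrying once on failure."""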
766 | def process():
767 | print(f"Processing files in directory {directory_path}...")
768 | for root, dirs, files in os.walk(directory_path):
769 | for file in files:
770 | file_path = os.path.join(root, file)
771 | if file.endswith(".pdf"):
772 | process_pdf_file(file_path)
773 | elif not only_pdf:
774 | if file.endswith(".txt"):
775 | process_txt_file(file_path)
776 | elif file.lower().endswith(('.png', '.jpg', '.jpeg')):
777 | process_image_file(file_path)
778 |
779 | try:
780 | process()
781 | except Exception as e:
782 | print(f"Error encountered: {e}. Retrying...")
783 | try:
784 | process()
785 | except Exception as e:
786 | print(f"Retry failed: {e}")
787 |
788 | def update_pdfmetadata(file_path: str, new_metadata: dict) -> None:
789 | """Updates the metadata of the given PDF file with new keys.
790 |
791 | Args:
792 | file_path: The path to the PDF file.
793 | new_metadata: A dictionary of new metadata to add.
794 | """
795 | # Open the existing PDF
796 | with open(file_path, "rb") as file:
797 | reader = PdfReader(file)
798 | writer = PdfWriter()
799 | writer.append_pages_from_reader(reader)
800 |
801 | # Get existing metadata
802 |         existing_metadata = reader.metadata or {}  # metadata may be missing for freshly OCRed PDFs
803 |
804 | # Update existing metadata with new keys
805 | updated_metadata = {NameObject(key): TextStringObject(value) for key, value in existing_metadata.items()}
806 |
807 | for key, value in new_metadata.items():
808 | updated_metadata[NameObject(key)] = TextStringObject(value)
809 |
810 | # Add updated metadata
811 | writer.add_metadata(updated_metadata)
812 |
813 | # Save the PDF with the updated metadata back to the same file
814 | with open(file_path, "wb") as updated_file:
815 | writer.write(updated_file)
816 | def generate_metadata_and_name(file_path):
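    |     """Use the LLMs to extract metadata and a descriptive name, write a JSON sidecar, update the PDF metadata (for PDFs), and rename the file."""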
817 |
818 | # Load the document content
819 | file_extension = os.path.splitext(file_path)[1].lower()
820 |
821 | if file_extension == ".pdf":
822 | loader = PyPDFLoader(file_path, extract_images=False)
823 | elif file_extension == ".txt":
824 | loader = TextLoader(file_path)
825 | else:
826 | raise ValueError("Unsupported file type")
827 |
828 | docs = loader.load()
829 | metadata_prompt = ChatPromptTemplate.from_template(metadata_template)
830 |
831 | # Create the chain for metadata extraction
832 | metadata_chain = metadata_prompt | metadata_llm
833 |
834 | # Invoke the chain with the document content
835 | metadata_result = metadata_chain.invoke({
836 | "context": docs
837 | })
838 |
839 | # Extract the content from the result
840 | metadata_content = metadata_result.content
841 |
842 | # Parse the JSON part of the content
843 | json_start = metadata_content.find('{')
844 | json_end = metadata_content.rfind('}') + 1
845 | json_content = metadata_content[json_start:json_end]
846 | metadata = json.loads(json_content)
847 |
848 | # Ensure all keys have values that are basic lists or primitive types
849 | def collapse_dicts(value):
850 | if isinstance(value, list):
851 | return [item['name'] if isinstance(item, dict) and 'name' in item else item for item in value]
852 | return value
853 |
854 | metadata = {key: collapse_dicts(value) for key, value in metadata.items()}
855 |
856 | # Format the metadata in a readable format
857 | formatted_metadata = json.dumps(metadata, indent=4, ensure_ascii=False)
858 | naming_prompt = ChatPromptTemplate.from_template(naming_template)
859 |
860 | # Create the chain for document naming
861 | naming_chain = naming_prompt | naming_llm
862 |
863 | # Invoke the chain with the context and metadata
864 | naming_result = naming_chain.invoke({
865 | "question": "What is the most suitable name for this document based on its content?",
866 | "context": docs,
867 | "metadata": formatted_metadata
868 | })
869 |
870 | # Extract the document name from the result
871 | naming_content = naming_result.content
872 | temp_document_name = naming_content.split('\n')[0].strip()
873 |
874 | # Replace spaces in document name with underscores
875 | document_name = temp_document_name.replace(" ", "_")
876 | 
877 |     # Parse the formatted metadata back into a dict so the extra
878 |     # bookkeeping keys below can be appended
879 | metadata = json.loads(formatted_metadata)
880 |
881 | # Ensure given_document_name is the first key
882 | ordered_metadata = OrderedDict([("given_document_name", document_name)])
883 | ordered_metadata.update(metadata)
884 |
885 | # Generate a unique ID for the document
886 | document_id = str(uuid.uuid4())
887 |
888 | # Get file details
889 | file_directory = os.path.dirname(file_path)
890 | original_file_name = os.path.basename(file_path)
891 |
892 | # Get file creation and modification dates
893 | file_creation_date = datetime.fromtimestamp(os.path.getctime(file_path)).isoformat()
894 | file_modification_date = datetime.fromtimestamp(os.path.getmtime(file_path)).isoformat()
895 | metadata_creation_date = datetime.now().isoformat()
896 |
897 | # Append additional metadata
898 | ordered_metadata['document_id'] = document_id
899 | ordered_metadata['file_directory'] = file_directory
900 | ordered_metadata['original_file_name'] = original_file_name
901 | ordered_metadata['file_creation_date'] = file_creation_date
902 | ordered_metadata['file_modification_date'] = file_modification_date
903 | ordered_metadata['metadata_creation_date'] = metadata_creation_date
904 |
905 | # Format the metadata in a readable format
906 | formatted_metadata = json.dumps(ordered_metadata, indent=4, ensure_ascii=False)
907 |
908 | # Save the metadata to a JSON file with the same name as the document
909 | metadata_file_path = os.path.join(file_directory, f"{document_name}.json")
910 | with open(metadata_file_path, 'w', encoding='utf-8') as f:
911 | f.write(formatted_metadata)
912 |
913 | if file_extension == ".pdf":
914 | # Update PDF metadata with relevant keys (only for PDFs)
915 | pdf_metadata = {
916 | "/document_id": document_id,
917 | "/original_file_name": original_file_name,
918 | "/given_document_name": document_name
919 | }
920 | update_pdfmetadata(file_path, pdf_metadata)
921 | # Rename the original PDF file to the new document name
922 | new_file_path = os.path.join(file_directory, f"{document_name}.pdf")
923 | elif file_extension == ".txt":
924 | # Rename the original TXT file to the new document name
925 | new_file_path = os.path.join(file_directory, f"{document_name}.txt")
926 |
927 | os.rename(file_path, new_file_path)
928 |
929 |
930 | return document_name, formatted_metadata
931 |
932 | # Function to save parameters to a file
933 | def save_params(params):
934 | with open(PARAM_FILE, "w") as f:
935 | json.dump(params, f)
936 |
937 | def load_params():
938 | # Define default parameters
939 | default_params = {
940 | "collection_name": "docpoi",
941 | "score_threshold": 0.7,
942 | "top_k": 5,
943 | "voiceover_speed": 1.3,
944 | "fireup_speed": 5.0,
945 | "language": "en",
946 |         "speaker": "Thunder",  # matches the sample file audio_samples/Thunder.wav
947 | "use_voiceover": False, # Default to unchecked
948 | "filter_key": "",
949 | "filter_value": ""
950 | }
951 |
952 | params_file = PARAM_FILE
953 |
954 | # Attempt to load parameters from the file
955 | try:
956 | with open(params_file, "r") as file:
957 | params = json.load(file)
958 | except (FileNotFoundError, json.JSONDecodeError):
959 | params = {}
960 |
961 | # Update missing parameters with default values
962 | for key, value in default_params.items():
963 | if key not in params:
964 | params[key] = value
965 |
966 | return params
967 |
968 | def initialize_vectorstore(collection_name="docpoi"):
969 | embedding = embed_model
970 |
971 | vectorstore = ElasticsearchStore(
972 | es_url="http://localhost:9200", index_name=collection_name, embedding=embedding, strategy=ElasticsearchStore.ExactRetrievalStrategy()
973 | )
974 |
975 | namespace = f"elasticsearch/{collection_name}"
976 | record_manager = SQLRecordManager(
977 | namespace, db_url="sqlite:///record_manager_cache.sql"
978 | )
979 |
980 | record_manager.create_schema()
981 |
982 | return vectorstore, record_manager
983 | # Function to reload the vectorstore
984 | def reload_vectorstore():
985 | global vectorstore, record_manager
986 | vectorstore, record_manager = initialize_vectorstore(params["collection_name"])
987 | return "Vectorstore reloaded with collection: " + params["collection_name"]
988 |
989 | def add_to_vectorstore():
990 | loader = DocPOIDirectoryLoader(directory_path=DIRECTORY_PATH)
991 | documents = loader.load()
992 | index(
993 | documents,
994 | record_manager,
995 | vectorstore,
996 | cleanup="incremental",
997 | source_id_key="document_id",
998 | )
999 |
1000 | def reset_vectorstore():
1001 | index(
1002 | [], # Empty list to clear the vectorstore
1003 | record_manager,
1004 | vectorstore,
1005 | cleanup="full",
1006 | source_id_key="document_id",
1007 | )
1008 | return "Vectorstore has been reset."
1009 |
1010 | def upload_file(file):
1011 | params = load_params()
1012 | UPLOAD_FOLDER = params.get("directory", "ocr/data")
1013 | if not os.path.exists(UPLOAD_FOLDER):
1014 |         os.makedirs(UPLOAD_FOLDER, exist_ok=True)
1015 | destination = shutil.copy(file, UPLOAD_FOLDER)
1016 |
1017 | if destination.endswith(".pdf"):
1018 | new_document_name = process_pdf_file(destination)
1019 | elif destination.endswith(".txt"):
1020 | new_document_name = process_txt_file(destination)
1021 | elif destination.lower().endswith(('.png', '.jpg', '.jpeg')):
1022 | new_document_name = process_image_file(destination)
1023 |
1024 | file_directory = os.path.dirname(destination)
1025 | file_extension = os.path.splitext(destination)[1]
1026 | new_destination = os.path.join(file_directory, f"{new_document_name}{file_extension}")
1027 |
1028 | if os.path.exists(destination):
1029 | os.rename(destination, new_destination)
1030 | else:
1031 | possible_new_path = os.path.join(file_directory, f"{new_document_name}{file_extension}")
1032 | if os.path.exists(possible_new_path):
1033 | new_destination = possible_new_path
1034 | else:
1035 | return "File not found during renaming."
1036 |
1037 | loader = DocPOI(file_path=new_destination)
1038 | documents = loader.load()
1039 |
1040 | index(
1041 | documents,
1042 | record_manager,
1043 | vectorstore,
1044 | cleanup="incremental",
1045 | source_id_key="document_id",
1046 | )
1047 |
1048 | return "File Uploaded and Processed!!!"
1049 |
1050 | def process_files():
1051 | # Load parameters
1052 | params = load_params()
1053 |
1054 | # Check if mode key is set to default or only_pdf
1055 | mode = params.get("mode", "default")
1056 | only_pdf = mode == "only_pdf"
1057 |
1058 | # Process files in directory with the only_pdf parameter
1059 | process_files_in_directory(DIRECTORY_PATH, only_pdf)
1060 |
1061 | add_to_vectorstore()
1062 | return "Files are being processed..."
1063 |
1064 | def print_like_dislike(x: gr.LikeData):
1065 | print(x.index, x.value, x.liked)
1066 |
1067 | def add_message(history, message):
1068 | for x in message["files"]:
1069 | history.append(((x,), None))
1070 | if message["text"] is not None:
1071 | history.append((message["text"], None))
1072 | return history, gr.MultimodalTextbox(value=None, interactive=False)
1073 |
1074 | def on_param_change(top_k, score_threshold, collection_name, filter_key=None, filter_value=None):
1075 | params["top_k"] = top_k
1076 | params["score_threshold"] = score_threshold
1077 | params["collection_name"] = collection_name
1078 |
1079 | if filter_key is not None:
1080 | params["filter_key"] = filter_key
1081 | if filter_value is not None:
1082 | params["filter_value"] = filter_value
1083 |
1084 | save_params(params)
1085 | return gr.update()
1086 |
1087 | def on_advanced_param_change(voiceover_speed, fireup_speed, language, speaker, use_voiceover):
1088 | params["voiceover_speed"] = voiceover_speed
1089 | params["fireup_speed"] = fireup_speed
1090 | params["language"] = language
1091 | params["speaker"] = speaker
1092 | params["use_voiceover"] = use_voiceover
1093 | save_params(params)
1094 | return gr.update()
1095 |
1096 | def list_microphones():
1097 | """Lists available audio input devices (microphones)."""
1098 | devices = sd.query_devices()
1099 | input_devices = [device for device in devices if device['max_input_channels'] > 0]
1100 | print("Available Microphones:")
1101 | for i, device in enumerate(input_devices):
1102 | print(f"{i}: {device['name']}")
1103 | return input_devices
1104 |
1105 | def start_recording():
1106 | """Starts recording audio."""
1107 | global is_recording, recording, stream
1108 | if not is_recording:
1109 | print("Recording started...")
1110 |         play(AudioSegment.from_file(os.path.join("audio_samples", "start_sound.mp3")))  # Play start sound
1111 | recording = []
1112 | is_recording = True
1113 | stream = sd.InputStream(callback=callback, samplerate=44100, channels=1)
1114 | stream.start()
1115 |
1116 | def stop_recording_and_save():
1117 | """Stops recording and saves the audio to a file."""
1118 | global is_recording, recording, stream
1119 | if is_recording:
1120 | print("Recording stopped. Saving to file...")
1121 | stream.stop()
1122 | stream.close()
1123 |         play(AudioSegment.from_file(os.path.join("audio_samples", "stop_sound.mp3")))  # Play stop sound
1124 | # Convert the list of recordings to a NumPy array
1125 | recording = np.concatenate(recording, axis=0)
1126 | # Save to a WAV file using scipy.io.wavfile.write
1127 | write(filename, 44100, recording)
1128 | print(f"Recording saved to {filename}")
1129 | recording = None # Reset the recording
1130 | is_recording = False
1131 |
1132 | def callback(indata, frames, time, status):
1133 | """This function is called for each audio block."""
1134 | global recording
1135 | if recording is not None:
1136 | recording.append(indata.copy())
1137 |
1138 | def on_key_press(key, history):
1139 | """Triggered when the key is pressed."""
1140 | global is_recording
1141 | if is_recording:
1142 | stop_recording_and_save()
1143 | transcribed_text = transcribe_audio_to_text(filename) # Transcribe the audio
1144 | print("Transcribed Text:", transcribed_text)
1145 | # Simulate the transcription as user input and pass it to the chatbot
1146 | history.append([transcribed_text, ""]) # Add the transcribed text as user input
1147 | for response in bot_response(history):
1148 | pass # Generate bot response based on transcribed text
1149 | else:
1150 | start_recording()
1151 | def transcribe_audio_to_text(filename):
1152 | """Transcribes the recorded audio to text using Whisper."""
1153 | whisper_model = whisper.load_model("small")
1154 | result = whisper_model.transcribe(filename)
1155 | return result["text"]
1156 |
1157 | def setup_keyboard_shortcuts(history):
1158 | """Sets up keyboard shortcuts for Ctrl + Alt + F13, F14, and F15."""
1159 | def on_press(key):
1160 | try:
1161 | if key == keyboard.Key.f13 and {keyboard.Key.ctrl_l, keyboard.Key.alt_l}.issubset(pressed_keys):
1162 | on_key_press(key, history)
1163 | elif key == keyboard.Key.f14 and {keyboard.Key.ctrl_l, keyboard.Key.alt_l}.issubset(pressed_keys):
1164 | on_key_press(key, history)
1165 | elif key == keyboard.Key.f15 and {keyboard.Key.ctrl_l, keyboard.Key.alt_l}.issubset(pressed_keys):
1166 | on_key_press(key, history)
1167 | except AttributeError:
1168 | pass
1169 |
1170 | pressed_keys = set()
1171 |
1172 | def on_press_wrapper(key):
1173 | if key in {keyboard.Key.ctrl_l, keyboard.Key.alt_l, keyboard.Key.f13, keyboard.Key.f14, keyboard.Key.f15}:
1174 | pressed_keys.add(key)
1175 | on_press(key)
1176 |
1177 | def on_release_wrapper(key):
1178 | if key in {keyboard.Key.ctrl_l, keyboard.Key.alt_l, keyboard.Key.f13, keyboard.Key.f14, keyboard.Key.f15}:
1179 | pressed_keys.discard(key)
1180 |
1181 | listener = keyboard.Listener(on_press=on_press_wrapper, on_release=on_release_wrapper)
1182 | listener.start()
1183 |
1184 | # Function to get the full path with extension
1185 | def get_full_path(doc_name):
1186 | base_path = f"{DIRECTORY_PATH}/"
1187 | pdf_path = os.path.join(base_path, f"{doc_name}.pdf")
1188 | txt_path = os.path.join(base_path, f"{doc_name}.txt")
1189 |
1190 | if os.path.exists(pdf_path):
1191 | return pdf_path
1192 | elif os.path.exists(txt_path):
1193 | return txt_path
1194 | else:
1195 | raise FileNotFoundError(f"Neither {pdf_path} nor {txt_path} exists.")
1196 |
1197 | def bot_response(history):
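    |     """Answer the latest user message, stream the reply into the chat history, and expose any retrieved documents for download."""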
1198 |
1199 | # Get the last message from the user
1200 | user_message = history[-1][0]
1201 |
1202 | # Load parameters
1203 | params = load_params()
1204 |
1205 |     # Extract the filter key and value saved by on_param_change
1206 |     # (a Gradio component's .value holds its initial value, not the live selection)
1207 |     filter_key = params.get("filter_key", "")
1208 |     filter_value = params.get("filter_value", "")
     |     filter_criteria = {"key": filter_key, "value": filter_value} if filter_key and filter_value else None
1209 |
1210 | # Simulate a query to the LLM
1211 | llm_response, retrieved_documents = assistant.query_llm(
1212 | user_query=user_message,
1213 | top_k=params["top_k"],
1214 | score_threshold=params["score_threshold"],
1215 | filter_criteria=filter_criteria
1216 | )
1217 |
1218 | # Prepare the full response text
1219 | full_response = llm_response
1220 |
1221 | # Make the retrieved documents available for download
1222 | downloadable_files = [get_full_path(doc['document_name']) for doc in retrieved_documents]
1223 |
1224 |     # If use_voiceover is True, run the TTS streamer in a separate thread so
1225 |     # speech generation does not block the text streaming below
1226 | if params["use_voiceover"]:
1227 | tts_thread = threading.Thread(
1228 | target=tts_streamer.stream_audio_with_buffering,
1229 | args=(full_response,),
1230 | kwargs={
1231 | "language": params["language"],
1232 | "speed": params["voiceover_speed"],
1233 | "speaker": params["speaker"],
1234 | "fireup_delay": params["fireup_speed"]
1235 | }
1236 | )
1237 | tts_thread.start()
1238 |
1239 | # Initialize the bot's response in the history
1240 | history[-1][1] = ""
1241 |
1242 | # Stream the response character by character
1243 | for character in full_response:
1244 | if tts_streamer.stop_flag.is_set(): # Check if stop was requested
1245 | break
1246 | history[-1][1] += character
1247 | time.sleep(0.01) # Adjust the speed of streaming if needed
1248 | yield history, None
1249 |
1250 | # After streaming the text, yield the downloadable files
1251 | yield history, gr.File(value=downloadable_files)
1252 |
1253 | # Wait for the TTS thread to complete if it was started
1254 | if params["use_voiceover"]:
1255 | tts_thread.join()
1256 |
1257 | def stop_all_streaming():
1258 | """Stops all ongoing text and voiceover streaming."""
1259 | tts_streamer.stop_streaming()
1260 |
1261 | def reset_conversation():
1262 | """Resets the conversation by clearing the memory and chat history."""
1263 | assistant.reset_memory() # Reset the assistant's memory
1264 | return [], [] # Return empty lists for the chatbot and its state
1265 |
1266 | def check_setup():
1267 | # Check if the params file exists
1268 | if not os.path.exists(PARAM_FILE):
1269 | return False
1270 |
1271 | # Load the parameters from the file
1272 | with open(PARAM_FILE, "r") as file:
1273 | params = json.load(file)
1274 |
1275 | # Check if the required parameters are set
1276 | if "use_voiceover" in params and "directory" in params:
1277 | return True
1278 |
1279 | return False
1280 |
1281 | user_responses = {}
1282 | params = load_params()
1283 | current_state = None
1284 |
1285 | # Function to save parameters to a JSON file (uses the same PARAM_FILE as above)
1286 | def save_params(params):
1287 |     with open(PARAM_FILE, "w") as f:
1288 |         json.dump(params, f)
1289 |
1290 | # Function to list available speakers (dummy list for now)
1291 | def list_speakers():
1292 | return [
1293 | {"name": "Thunder"},
1294 | {"name": "Serenity"},
1295 | {"name": "Blaze"},
1296 | ]
1297 |
1298 | # Function to handle the welcome message and ask for the user's name
1299 | def handle_welcome(history):
1300 | bot_message = "Welcome to the setup process. What is your name? (Type 'skip' to use 'User')"
1301 | history[-1][1] = bot_message
1302 | return history
1303 |
1304 | # Function to handle the user's name input
1305 | def handle_ask_name(history):
1306 | user_message = history[-1][0].strip()
1307 | if not user_message or user_message.lower() == "skip":
1308 | user_message = "User"
1309 | user_responses['name'] = user_message
1310 | save_params({"name": user_responses['name']})
1311 | bot_message = f"""Great to meet you, {user_responses['name']}! I'd love to hear your voice, too. Would you like to enable my speech function? (yes/no)\n
1312 | Note: This feature is recommended for systems with at least 10GB of GPU VRAM for optimal performance. You can disable it later if needed."""
1313 | history[-1][1] = bot_message
1314 | return history
1315 |
1316 | # Function to handle voice recognition choice
1317 | def handle_voice_recognition_choice(history):
1318 | user_message = history[-1][0].strip().lower()
1319 |     if user_message in ("no", "skip", ""):
1320 | user_responses['use_voiceover'] = False
1321 | return handle_general_setup(history)
1322 | elif user_message == "yes":
1323 | user_responses['use_voiceover'] = True
1324 | save_params({"name": user_responses['name'], "use_voiceover": True})
1325 | return handle_microphone_selection(history)
1326 | else:
1327 | bot_message = "Invalid response. Please type 'yes' or 'no'."
1328 | history[-1][1] = bot_message
1329 | return history
1330 |
1331 | # Function to handle microphone selection
1332 | def handle_microphone_selection(history):
1333 | global current_state
1334 | input_devices = list_microphones() # This should return a list of available microphones
1335 | bot_message = "Please choose your microphone from the list by entering the corresponding number:\n"
1336 |
1337 | for i, device in enumerate(input_devices):
1338 | bot_message += f"{i + 1}. {device['name']}\n"
1339 |
1340 | history[-1][1] = bot_message
1341 | current_state = "waiting_for_microphone_selection"
1342 | return history
1343 |
1344 | def handle_microphone_response(history):
1345 | global current_state
1346 | user_message = history[-1][0].strip()
1347 | input_devices = list_microphones() # Retrieve available microphones again to validate selection
1348 |
1349 | if user_message.isdigit() and 1 <= int(user_message) <= len(input_devices):
1350 | selected_device = input_devices[int(user_message) - 1]
1351 | user_responses['microphone'] = selected_device['name']
1352 | save_params(user_responses)
1353 | bot_message = f"Microphone '{selected_device['name']}' selected."
1354 | history.append([None, bot_message])
1355 | current_state = "waiting_for_speaker_selection"
1356 | return handle_speaker_selection(history)
1357 | else:
1358 | bot_message = "Invalid selection. Please choose a microphone by number."
1359 | history[-1][1] = bot_message
1360 | return history
1361 |
1362 | # Function to handle speaker selection
1363 | def handle_speaker_selection(history):
1364 | global current_state
1365 | speakers = list_speakers()
1366 | bot_message = "I have a range of voices available. Please select your preferred voice by number:\n"
1367 | for i, speaker in enumerate(speakers):
1368 | bot_message += f"{i + 1}. {speaker['name']}\n"
1369 | history[-1][1] = bot_message
1370 | current_state = "waiting_for_speaker_selection"
1371 | return history
1372 |
1373 | def handle_speaker_response(history):
1374 | global current_state
1375 | user_message = history[-1][0].strip()
1376 | speakers = list_speakers()
1377 |
1378 | if user_message.isdigit() and 1 <= int(user_message) <= len(speakers):
1379 | selected_speaker = speakers[int(user_message) - 1]
1380 | user_responses['speaker'] = selected_speaker['name']
1381 | save_params(user_responses)
1382 | bot_message = f"Speaker '{selected_speaker['name']}' selected."
1383 | history.append([None, bot_message])
1384 | return handle_key_combination_selection(history)
1385 | else:
1386 | bot_message = "Invalid selection. Please choose a speaker by number."
1387 | history[-1][1] = bot_message
1388 | return history
1389 |
1390 | # Function to handle key combination selection
1391 | def handle_key_combination_selection(history):
1392 | global current_state
1393 | bot_message = """Let's set up voice recognition. Please enter the key combination you'd like to use to activate it.
1394 | The default is Ctrl+Alt+F13 for custom keyboards.
1395 | Type 'skip' or press Enter to keep the default. Use standard key names like Ctrl, Alt, Win, Tab, Shift, etc."""
1396 | history[-1][1] = bot_message
1397 | current_state = "waiting_for_key_combination"
1398 | return history
1399 |
1400 | def handle_key_combination_response(history):
1401 | user_message = history[-1][0].strip()
1402 | if not user_message or user_message.lower() == "skip":
1403 | user_message = "Ctrl+Alt+F13"
1404 | user_responses['key_combination'] = user_message
1405 | save_params(user_responses)
1406 |
1407 | return handle_general_setup(history)
1408 |
1409 | # Function to handle the general setup (directory selection and language)
1410 | def handle_general_setup(history):
1411 | global current_state
1412 | bot_message = "Now, let's select the main directory where your files are stored. The file explorer will open shortly."
1413 | history[-1][1] = bot_message
1414 |
1415 | # Display the message first and then automatically proceed to select the directory
1416 | current_state = "waiting_for_directory_selection"
1417 |
1418 | # Trigger directory selection after displaying the message
1419 | history = handle_directory_selection(history) # Automatically call directory selection after showing the message
1420 | return history
1421 |
1422 | # Function to actually open the directory selector immediately after the message is shown
1423 | def handle_directory_selection(history):
1424 | global current_state
1425 | root = tk.Tk()
1426 | root.withdraw() # Hide the root window
1427 | selected_directory = filedialog.askdirectory() # Open the file explorer for directory selection
1428 | root.destroy() # Destroy the root window after selection
1429 |
1430 | if selected_directory:
1431 | user_responses['directory'] = selected_directory
1432 | else:
1433 | user_responses['directory'] = "Default Directory"
1434 |
1435 | save_params(user_responses)
1436 |
1437 | # Show message for only_pdf or reselect options after selecting the directory
1438 | bot_message = f"""Directory chosen: {user_responses['directory']}.
1439 | Important: All files in this directory will be processed.
1440 | Type 'only_pdf' to limit to PDF files, type 'reselect' to choose a different directory, or press Enter to proceed."""
1441 |
1442 | history.append([None, bot_message])
1443 | current_state = "waiting_for_directory_response" # Now wait for user's response
1444 |
1445 | return history
1446 |
1447 | # Function to handle the directory response after the user makes a choice
1448 | def handle_directory_response(history):
1449 | global current_state
1450 | user_message = history[-1][0].strip().lower()
1451 |
1452 | if user_message == "reselect":
1453 | # Go back to the general setup to reselect the directory
1454 | return handle_general_setup(history)
1455 | elif not user_message or user_message == "skip":
1456 | # If no input or skip, use the default mode
1457 | user_responses['mode'] = "default"
1458 | elif user_message == "only_pdf":
1459 | # If user selects only_pdf mode
1460 | user_responses['mode'] = "only_pdf"
1461 | else:
1462 | # Default mode if user presses Enter
1463 | user_responses['mode'] = "default"
1464 |
1465 | save_params(user_responses)
1466 | current_state = "waiting_for_language_selection" # Move to next state
1467 | return handle_language_selection(history)
1468 |
1469 | def handle_language_selection(history):
1470 | global current_state
1471 | bot_message = "Lastly, please enter your primary language. Type 'skip' to default to English."
1472 | history[-1][1] = bot_message
1473 | current_state = "waiting_for_language_response"
1474 | return history
1475 |
1476 | # Function to restart the script
1477 | def restart_script():
1478 | if sys.platform.startswith('win'):
1479 | subprocess.Popen(["restart.bat"])
1480 | else:
1481 | # Ensure restart.sh has execute permissions
1482 | restart_script_path = os.path.join(os.path.dirname(__file__), "restart.sh")
1483 | subprocess.run(["chmod", "+x", restart_script_path])
1484 | subprocess.Popen(["bash", restart_script_path])
1485 | sys.exit()
1486 |
1495 |
1496 | def handle_final_message(history):
1497 | global current_state
1498 |     bot_message = "The setup is complete. The assistant will restart automatically in a moment."
1499 | history[-1][1] = bot_message
1500 | current_state = "ready_for_auto_restart"
1501 |
1502 | # Schedule restart automatically without user interaction
1503 |     threading.Timer(0.5, handle_auto_restart, args=[history]).start()  # short delay before restart
1504 | return history
1505 |
1506 | # Function to handle auto-restart
1507 | def handle_auto_restart(history):
1508 | bot_message = "Restarting now..."
1509 | history.append([None, bot_message])
1510 |
1511 | restart_script()
1512 | return history
1513 |
1514 | def handle_language_response(history):
1515 | global current_state
1516 | user_message = history[-1][0].strip().lower()
1517 |
1518 | if not user_message or user_message == "skip":
1519 | user_message = "English"
1520 | user_responses['main_language'] = user_message
1521 | save_params(user_responses)
1522 |
1523 | bot_message = "Language preference saved. The setup is complete."
1524 | history[-1][1] = bot_message
1525 | current_state = "show_final_message" # Set state for the final message display
1526 |
1527 | return handle_final_message(history)
1528 |
1529 | # Bot response logic to handle the flow of messages and actions
1530 | def setup_bot_response(history):
1531 | global current_state
1532 |
1533 | if current_state is None and len(history) == 1:
1534 | return handle_welcome(history)
1535 | elif current_state is None and len(history) == 2:
1536 | return handle_ask_name(history)
1537 | elif current_state is None and len(history) == 3:
1538 | return handle_voice_recognition_choice(history)
1539 | elif current_state == "waiting_for_microphone_selection":
1540 | return handle_microphone_response(history)
1541 | elif current_state == "waiting_for_speaker_selection":
1542 | return handle_speaker_response(history)
1543 | elif current_state == "waiting_for_key_combination":
1544 | return handle_key_combination_response(history)
1545 | elif current_state == "waiting_for_directory_selection":
1546 | return handle_directory_selection(history) # Automatically open the file explorer
1547 | elif current_state == "waiting_for_directory_response":
1548 | return handle_directory_response(history)
1549 | elif current_state == "waiting_for_language_selection":
1550 | return handle_language_selection(history)
1551 | elif current_state == "waiting_for_language_response":
1552 | return handle_language_response(history)
1553 | elif current_state == "show_final_message":
1554 | return handle_final_message(history)
1555 | elif current_state == "ready_for_auto_restart":
1556 | return handle_auto_restart(history)
1557 | else:
1558 | bot_message = "I'm not sure how to respond to that. Could you please provide more details?"
1559 | history[-1][1] = bot_message
1560 | return history
1561 |
1562 | # Function to add a message to the history and reset the input box
1563 | def add_message_setup(history, message):
1564 | if message is not None:
1565 | history.append([message, None])
1566 | return history, ""
1567 |
1568 | history = []
1569 |
1570 | with gr.Blocks(theme=gr.themes.Soft(), css="footer{display:none !important} #chatbot { height: 100%; flex-grow: 1; }") as setup_demo:
1571 | # Display audio samples when speaker selection is reached
1572 | def display_audio_samples():
1573 | return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
1574 |
1575 | # Hide audio samples after speaker selection
1576 | def hide_audio_samples():
1577 | return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
1578 |
1579 | with gr.Row():
1580 | with gr.Column(scale=1):
1581 | chatbot = gr.Chatbot(elem_id="chatbot")
1582 | chat_input = gr.Textbox(interactive=True, placeholder="Enter message...", show_label=False, autoscroll=True)
1583 | chat_msg = chat_input.submit(add_message_setup, [chatbot, chat_input], [chatbot, chat_input])
1584 | bot_msg = chat_msg.then(setup_bot_response, chatbot, chatbot, api_name="setup_bot_response")
1585 |
1586 | with gr.Row():
1587 | # Audio components for speaker samples, initially hidden
1588 | audio1 = gr.Audio("audio_samples/Thunder_sample.wav", autoplay=False, format="wav", visible=False, label="Thunder")
1589 | audio2 = gr.Audio("audio_samples/Serenity_sample.wav", autoplay=False, format="wav", visible=False, label="Serenity")
1590 | audio3 = gr.Audio("audio_samples/Blaze_sample.wav", autoplay=False, format="wav", visible=False, label="Blaze")
1591 |
1592 | # When the bot reaches the speaker selection step, show audio samples
1593 | bot_msg.then(display_audio_samples, [], [audio1, audio2, audio3])
1594 |
1595 | # After selecting a speaker, hide the audio samples
1596 | chat_msg.then(hide_audio_samples, [], [audio1, audio2, audio3])
1597 |
1598 | # Start the conversation with an initial message
1599 | setup_demo.load(lambda: [[None, "Welcome to your personal assistant setup! Before we dive into our conversations, how would you like me to address you?"]], outputs=chatbot)
1600 | # Gradio app starts here
1601 | with gr.Blocks(theme=gr.themes.Soft(), css="footer{display:none !important} #chatbot { height: 100%; flex-grow: 1; }") as demo:
1602 | with gr.Row():
1603 | with gr.Column(scale=2):
1604 | chatbot = gr.Chatbot([], elem_id="chatbot")
1605 | with gr.Row():
1606 | chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...", show_label=False, autoscroll=True, scale=6)
1607 | stop_button = gr.Button("Stop", size="sm", scale=1)
1608 | reset_button = gr.Button("Reset Conversation", size="sm", scale=1)
1609 | chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
1610 | bot_msg = chat_msg.then(bot_response, chatbot, [chatbot, gr.File(label="Related Documents")], api_name="bot_response")
1611 | bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
1612 | chatbot.like(print_like_dislike, None, None)
1613 | #stop_button.click(stop_all_streaming)
1614 | reset_button.click(reset_conversation, [], [chatbot, chatbot])
1615 |
1616 | with gr.Column():
1617 | upload_button = gr.UploadButton("Click to Upload a File")
1618 | upload_button.upload(upload_file, upload_button)
1619 | process_button = gr.Button("Process Files")
1620 | process_button.click(process_files, [], [])
1621 | gr.Markdown("### Filters")
1622 | with gr.Row():
1623 | key_box = gr.Dropdown(
1624 | choices=[
1625 | "given_document_name",
1626 | "document_type",
1627 | "mentions",
1628 | "keywords",
1629 | "about",
1630 | "questions",
1631 | "entities",
1632 | "summaries",
1633 | "authors",
1634 | "source",
1635 | "language",
1636 | "audience",
1637 | "document_id",
1638 | "file_directory",
1639 | "original_file_name",
1640 | "file_creation_date",
1641 | "file_modification_date",
1642 | "metadata_creation_date"
1643 | ],
1644 | label="Key"
1645 | )
1646 | value_box = gr.Dropdown(choices=["Name1", "Name2", "Name3"], label="Value")
1647 |
1648 | # Add change event handlers to update params
1649 | key_box.change(lambda key: on_param_change(params["top_k"], params["score_threshold"], params["collection_name"], filter_key=key), key_box)
1650 | value_box.change(lambda value: on_param_change(params["top_k"], params["score_threshold"], params["collection_name"], filter_value=value), value_box)
1651 |
1652 | gr.Markdown("### Parameters")
1653 | score_threshold_slider = gr.Slider(0.01, 0.99, label="Score Threshold", value=params["score_threshold"], interactive=True)
1654 | top_k_input = gr.Number(label="Top K", value=params["top_k"], interactive=True)
1655 | with gr.Row():
1656 | collection_name_input = gr.Textbox(placeholder="Collection Name", label="Collection Name", value=params["collection_name"], interactive=True)
1657 | reload_button = gr.Button("Reload Vectorstore", scale=0)
1658 | reload_button.click(reload_vectorstore, [], None)
1659 |     score_threshold_slider.change(lambda value: on_param_change(params["top_k"], value, params["collection_name"]), score_threshold_slider)
1660 |     top_k_input.change(lambda value: on_param_change(value, params["score_threshold"], params["collection_name"]), top_k_input)
1661 |     collection_name_input.change(lambda value: on_param_change(params["top_k"], params["score_threshold"], value), collection_name_input)
1662 | with gr.Accordion("Advanced Settings", open=False):
1663 | use_voiceover_checkbox = gr.Checkbox(label="Use Voice Over", value=params["use_voiceover"], interactive=True)
1664 | voiceover_speed_slider = gr.Slider(0.5, 1.5, value=params["voiceover_speed"], step=0.1, interactive=True, label="Voiceover Speed")
1665 | fireup_speed_box = gr.Number(label="Fireup Speed (seconds)", value=params["fireup_speed"], interactive=True)
1666 | language_input = gr.Textbox(placeholder="Language", label="Language", value=params["language"], interactive=True)
1667 | speaker_dropdown = gr.Dropdown(["Thunder", "Serenity", "Blaze"], label="Speaker", value=params["speaker"], interactive=True)
1668 | voiceover_speed_slider.change(on_advanced_param_change, [voiceover_speed_slider, fireup_speed_box, language_input, speaker_dropdown, use_voiceover_checkbox])
1669 | fireup_speed_box.change(on_advanced_param_change, [voiceover_speed_slider, fireup_speed_box, language_input, speaker_dropdown, use_voiceover_checkbox])
1670 | language_input.change(on_advanced_param_change, [voiceover_speed_slider, fireup_speed_box, language_input, speaker_dropdown, use_voiceover_checkbox])
1671 | speaker_dropdown.change(on_advanced_param_change, [voiceover_speed_slider, fireup_speed_box, language_input, speaker_dropdown, use_voiceover_checkbox])
1672 | use_voiceover_checkbox.change(on_advanced_param_change, [voiceover_speed_slider, fireup_speed_box, language_input, speaker_dropdown, use_voiceover_checkbox])
1673 |
1674 | # Main execution
1675 | if check_setup():
1676 | print("Setup is already complete. Launching the main interface...")
1677 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
1678 | params = load_params()
1679 |
1680 | # Initialize vectorstore and record manager
1681 | vectorstore, record_manager = initialize_vectorstore(params["collection_name"])
1682 |
1683 | # Get the value of the key 'directory' and set it as DIRECTORY_PATH
1684 | DIRECTORY_PATH = params.get("directory", "data")
1685 |     tts_streamer = TTSStreamer(model_path="XTTS-v2", config_path=os.path.join("XTTS-v2", "config.json"), vocab_path=os.path.join("XTTS-v2", "vocab.json"))
1686 |     # If voiceover is enabled, also load the Whisper model used for transcription
1687 |     if params.get("use_voiceover", False):
1688 |         whisper_model = whisper.load_model("small")
1689 |         # tts_streamer was already created above, so the XTTS model is not loaded a second time here
1690 |         list_microphones()
1691 |
1692 | assistant = DocumentAssistant(model_name="llama3.1:8b", temperature=0.9)
1693 | history = [] # Initialize empty history
1694 | setup_keyboard_shortcuts(history)
1695 |
1696 |     # Launch the main chatbot interface
1697 | demo.launch(inbrowser=True, show_error=True)
1698 | else:
1699 | print("Starting setup process...")
1700 | setup_demo.launch(inbrowser=True, show_error=False)
1701 |
--------------------------------------------------------------------------------