├── .gitignore
├── requirements.txt
├── main.py
├── chunkify_littleprince_translate.gif
├── pyproject.toml
├── LICENSE
├── chunkify-no-kobold.bat
├── chunkify-run.bat
├── chunkify-run.sh
├── chunkify.kcppt
├── README.md
├── chunker_regex.py
├── chunkify-gui.py
└── chunkify.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /chunkify_env
2 | /ignore
3 | /__pycache__
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | regex
3 | extractous
4 | PyQt6
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | def main():
2 | print("Hello from chunkify!")
3 |
4 |
5 | if __name__ == "__main__":
6 | main()
7 |
--------------------------------------------------------------------------------
/chunkify_littleprince_translate.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jabberjabberjabber/Chunkify/HEAD/chunkify_littleprince_translate.gif
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "chunkify"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.13"
7 | dependencies = [
8 | "extractous>=0.3.0",
9 | "pyqt6>=6.10.0",
10 | "regex>=2025.11.3",
11 | "requests>=2.32.5",
12 | ]
13 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 jabberjabberjabber
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/chunkify-no-kobold.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | setlocal enabledelayedexpansion
3 |
4 | REM Set the name of your virtual environment
5 | set "VENV_NAME=chunkify_env"
6 |
7 | REM Set the path to your Python installation (update this if needed)
8 | set "PYTHON_PATH=python"
9 |
10 | REM Check if Python is installed and in PATH
11 | %PYTHON_PATH% --version >nul 2>&1
12 | if errorlevel 1 (
13 | echo Python is not found. Please ensure Python is installed and added to your PATH.
14 | pause
15 | exit /b 1
16 | )
17 |
18 |
19 | REM Check if the virtual environment exists, create if it doesn't
20 | if not exist "%VENV_NAME%\Scripts\activate.bat" (
21 | echo Creating new virtual environment: %VENV_NAME%
22 | %PYTHON_PATH% -m venv %VENV_NAME%
23 | if errorlevel 1 (
24 | echo Failed to create virtual environment. Please check your Python installation.
25 | pause
26 | exit /b 1
27 | )
28 | ) else (
29 | echo Virtual environment %VENV_NAME% already exists.
30 | )
31 |
32 | REM Activate the virtual environment
33 | call "%VENV_NAME%\Scripts\activate.bat"
34 |
35 | REM Check if requirements.txt exists
36 | if not exist requirements.txt (
37 | echo requirements.txt not found. Please create a requirements.txt file in the same directory as this script.
38 | pause
39 | exit /b 1
40 | )
41 |
42 | REM Upgrade pip to the latest version
43 | python -m pip install --upgrade pip
44 |
45 | REM Install packages from requirements.txt
46 | echo Installing packages from requirements.txt...
47 | pip install -r requirements.txt
48 | if errorlevel 1 (
49 | echo Failed to install some packages. Please check your internet connection and requirements.txt file.
50 | pause
51 | exit /b 1
52 | )
53 |
54 |
55 |
56 |
57 | python chunkify-gui.py
58 | pause
59 |
60 | REM Deactivate the virtual environment
61 | deactivate
62 |
63 | pause
--------------------------------------------------------------------------------
/chunkify-run.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | setlocal enabledelayedexpansion
3 |
4 | REM Set the name of your virtual environment
5 | set "VENV_NAME=chunkify_env"
6 |
7 | REM Set the path to your Python installation (update this if needed)
8 | set "PYTHON_PATH=python"
9 |
10 | REM Check if Python is installed and in PATH
11 | %PYTHON_PATH% --version >nul 2>&1
12 | if errorlevel 1 (
13 | echo Python is not found. Please ensure Python is installed and added to your PATH.
14 | pause
15 | exit /b 1
16 | )
17 |
18 |
19 | REM Check if the virtual environment exists, create if it doesn't
20 | if not exist "%VENV_NAME%\Scripts\activate.bat" (
21 | echo Creating new virtual environment: %VENV_NAME%
22 | %PYTHON_PATH% -m venv %VENV_NAME%
23 | if errorlevel 1 (
24 | echo Failed to create virtual environment. Please check your Python installation.
25 | pause
26 | exit /b 1
27 | )
28 | ) else (
29 | echo Virtual environment %VENV_NAME% already exists.
30 | )
31 |
32 | REM Activate the virtual environment
33 | call "%VENV_NAME%\Scripts\activate.bat"
34 |
35 | REM Check if requirements.txt exists
36 | if not exist requirements.txt (
37 | echo requirements.txt not found. Please create a requirements.txt file in the same directory as this script.
38 | pause
39 | exit /b 1
40 | )
41 |
42 | REM Upgrade pip to the latest version
43 | python -m pip install --upgrade pip
44 |
45 | REM Install packages from requirements.txt
46 | echo Installing packages from requirements.txt...
47 | pip install -r requirements.txt
48 | if errorlevel 1 (
49 | echo Failed to install some packages. Please check your internet connection and requirements.txt file.
50 | pause
51 | exit /b 1
52 | )
53 |
54 |
55 | REM Check if koboldcpp.exe exists, if not, check for koboldcpp_cu12.exe
56 | if exist koboldcpp.exe (
57 | set "KOBOLD_EXE=koboldcpp.exe"
58 | ) else if exist koboldcpp_cu12.exe (
59 | set "KOBOLD_EXE=koboldcpp_cu12.exe"
60 | ) else (
61 | echo Neither koboldcpp.exe nor koboldcpp_cu12.exe found. Please ensure one of these files exists.
62 | pause
63 | exit /b 1
64 | )
65 |
66 | REM Launch your Python script
67 | start %KOBOLD_EXE% --config chunkify.kcppt
68 |
69 | python chunkify-gui.py
70 | pause
71 |
72 | REM Deactivate the virtual environment
73 | deactivate
74 |
75 | pause
--------------------------------------------------------------------------------
/chunkify-run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Set the name of your virtual environment
4 | VENV_NAME="chunkify_env"
5 |
6 | # Function to check if a command exists
7 | command_exists() {
8 | command -v "$1" >/dev/null 2>&1
9 | }
10 |
11 | # Check if Python is installed
12 | if ! command_exists python3; then
13 | echo "Python 3 is not found. Please ensure Python 3 is installed and added to your PATH."
14 | exit 1
15 | fi
16 |
17 | # Check if the virtual environment exists, create if it doesn't
18 | if [ ! -d "$VENV_NAME" ]; then
19 | echo "Creating new virtual environment: $VENV_NAME"
20 | python3 -m venv "$VENV_NAME"
21 | if [ $? -ne 0 ]; then
22 | echo "Failed to create virtual environment. Please check your Python installation."
23 | exit 1
24 | fi
25 | else
26 | echo "Virtual environment $VENV_NAME already exists."
27 | fi
28 |
29 | # Activate the virtual environment
30 | source "$VENV_NAME/bin/activate"
31 |
32 | # Check if requirements.txt exists
33 | if [ ! -f "requirements.txt" ]; then
34 | echo "requirements.txt not found. Please create a requirements.txt file in the same directory as this script."
35 | exit 1
36 | fi
37 |
38 | # Upgrade pip to the latest version
39 | python3 -m pip install --upgrade pip
40 |
41 | # Install packages from requirements.txt
42 | echo "Installing packages from requirements.txt..."
43 | pip install -r requirements.txt
44 | if [ $? -ne 0 ]; then
45 | echo "Failed to install some packages. Please check your internet connection and requirements.txt file."
46 | exit 1
47 | fi
48 |
49 | # Determine the correct KoboldCPP binary based on the system
50 | if [[ "$(uname)" == "Darwin" ]]; then
51 | if [[ "$(uname -m)" == "arm64" ]]; then
52 | KOBOLDCPP_BINARY="./koboldcpp-mac-arm64"
53 | else
54 | KOBOLDCPP_BINARY="./koboldcpp-mac-x64"
55 | fi
56 | elif [[ "$(uname)" == "Linux" ]]; then
57 | KOBOLDCPP_BINARY="./koboldcpp-linux-x64"
58 | else
59 | echo "Unsupported operating system. Please run on macOS or Linux."
60 | exit 1
61 | fi
62 |
63 | # Check if the KoboldCPP binary exists and is executable
64 | if [ ! -x "$KOBOLDCPP_BINARY" ]; then
65 | echo "KoboldCPP binary not found or not executable. Please check the binary and its permissions."
66 | exit 1
67 | fi
68 |
69 | # Launch KoboldCPP
70 | "$KOBOLDCPP_BINARY" --config chunkify.kcppt &
71 |
72 | # Wait for KoboldCPP to start
73 | echo "Waiting for KoboldCPP to start..."
74 | while ! nc -z localhost 5001; do
75 | sleep 1
76 | done
77 |
78 | # Launch the Python GUI script
79 | python3 chunkify-gui.py
80 |
81 | # Deactivate the virtual environment when the GUI is closed
82 | deactivate
83 |
84 | # Wait for user input before closing
85 | read -p "Press Enter to exit..."
86 |
--------------------------------------------------------------------------------
/chunkify.kcppt:
--------------------------------------------------------------------------------
1 | {
2 | "benchmark": null,
3 | "blasbatchsize": 512,
4 | "blasthreads": null,
5 | "config": null,
6 | "contextsize": 8192,
7 | "debugmode": 0,
8 | "flashattention": true,
9 | "forceversion": 0,
10 | "foreground": false,
11 | "gpulayers": -1,
12 | "highpriority": false,
13 | "hordeconfig": null,
14 | "hordegenlen": 0,
15 | "hordekey": "",
16 | "hordemaxctx": 0,
17 | "hordemodelname": "",
18 | "hordeworkername": "",
19 | "host": "",
20 | "ignoremissing": false,
21 | "istemplate": true,
22 | "launch": false,
23 | "lora": null,
24 | "mmproj": "",
25 | "model": "",
26 | "model_param": "https://huggingface.co/bartowski/aya-expanse-8b-GGUF/resolve/main/aya-expanse-8b-Q6_K.gguf",
27 | "multiuser": 8,
28 | "noavx2": false,
29 | "noblas": false,
30 | "nocertify": false,
31 | "nommap": false,
32 | "noshift": true,
33 | "onready": "",
34 | "password": null,
35 | "port": 5001,
36 | "port_param": 5001,
37 | "preloadstory": {
38 | "actions": [
39 | ],
40 | "actions_metadata": {
41 | },
42 | "anotestr": 320,
43 | "anotetemplate": "[Author's note: <|>]",
44 | "authorsnote": "",
45 | "completed_imgs_meta": {
46 | },
47 | "extrastopseq": "",
48 | "gamestarted": true,
49 | "logitbiasdict": {
50 | },
51 | "memory": "",
52 | "personal_notes": "",
53 | "placeholder_tags_data": [
54 | ],
55 | "prompt": "",
56 | "regexreplace_data": [
57 | ],
58 | "savedsettings": {
59 | "adventure_context_mod": true,
60 | "adventure_is_action": false,
61 | "chat_context_mod": true,
62 | "chatname": "User",
63 | "chatopponent": "KoboldAI",
64 | "compressnewlines": false,
65 | "eos_ban_mode": "0",
66 | "gui_type_instruct": "3",
67 | "instruct_endtag": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n",
68 | "instruct_has_markdown": true,
69 | "instruct_starttag": "<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\n",
70 | "instruct_sysprompt": "",
71 | "instruct_systag": "<|start_header_id|>system<|end_header_id|>\\n\\n",
72 | "opmode": "4",
73 | "persist_session": true,
74 | "placeholder_tags": true,
75 | "render_special_tags": false,
76 | "trimsentences": true,
77 | "trimwhitespace": false
78 | },
79 | "tokenbans": "",
80 | "wifolders_d": {
81 | },
82 | "wifolders_l": [
83 | ],
84 | "wiinsertlocation": 0,
85 | "wisearchdepth": 0,
86 | "worldinfo": [
87 | ]
88 | },
89 | "quantkv": 0,
90 | "quiet": false,
91 | "remotetunnel": false,
92 | "ropeconfig": [
93 | 0.0,
94 | 10000.0
95 | ],
96 | "sdclamped": 0,
97 | "sdconfig": null,
98 | "sdlora": "",
99 | "sdloramult": 1.0,
100 | "sdmodel": "",
101 | "sdquant": false,
102 | "sdthreads": 0,
103 | "sdvae": "",
104 | "sdvaeauto": false,
105 | "skiplauncher": true,
106 | "smartcontext": false,
107 | "ssl": null,
108 | "tensor_split": null,
109 | "threads": -1,
110 | "unpack": "",
111 | "useclblast": null,
112 | "usecublas": null,
113 | "usemlock": false,
114 | "usevulkan": null,
115 | "whispermodel": "",
116 | "nofastforward": true
117 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Chunkify: A Python Script for Text Processing with Large Language Models
2 |
3 | ## Overview
4 |
5 | Chunkify was made as a proof-of-concept for a chunking method that doesn't rely on a tokenizer. The text processing features were added because they are commonly used and useful. Aya Expanse is particularly good at these tasks and is downloaded automatically the first time you run it if you use the batch file.
6 |
7 | ## Key Features
8 |
9 | - **Document Chunking:** Divides large documents into manageable chunks, intelligently identifying breaks based on chapters, headings, paragraphs, or sentences.
10 | - **Automatic Template Selection:** Adapts to the loaded model, ensuring the correct instruction template is used.
11 | - **Real-time Monitoring:** Provides continuous feedback on the generation process, allowing users to track progress.
12 | - **Compatible with multiple document formats, including PDF and HTML**
13 | - **Multiple Processing Modes:**
14 | - **Summary:** Generates concise summaries of the content.
15 | - **Translation:** Translates text into a language you can specify (default is English).
16 | - **Distillation:** Rewrites content for conciseness while retaining key information.
17 | - **Correction:** Fixes grammar, spelling, and style issues.
18 | - **File Output Support:** Saves results to specified output files.
19 |
20 | ## Requirements
21 |
22 | - Python 3.8 or later
23 | - KoboldCpp executable in the script directory
24 | - Essential Python packages:
25 | - `requests`
26 | - `extractous` (for text extraction)
27 | - `PyQt6` (for GUI)
28 |
29 | ## Installation
30 |
31 | #### Windows Installation:
32 |
33 | 1. Clone the repository or download the ZIP file from GitHub.
34 | 2. Install Python 3.8 or later if not already present.
35 | 3. Download KoboldCPP.exe from the [KoboldCPP releases](https://github.com/LostRuins/koboldcpp/releases) page and place it in the project folder.
36 | 4. Run `chunkify-run.bat`. This script will install necessary dependencies and download the Aya Expanse 8b Q6_K model.
37 | 5. Upon completion, you should see a message: "Please connect to custom endpoint at http://localhost:5001".
38 |
39 | #### macOS Installation:
40 |
41 | 1. Follow the Windows installation steps, ensuring you use the appropriate KoboldCPP binary for macOS.
42 |
43 | #### Linux Installation:
44 |
45 | 1. Similar to Windows, clone the repository, install Python 3.8 or later, and download the Linux KoboldCPP binary from the releases page.
46 | 2. Run the script using: `./chunkify-run.sh`.
47 |
48 | ## Usage
49 |
50 | 1. **GUI Launch:**
51 | - Windows: Run `chunkify-run.bat`.
52 | - macOS/Linux: Execute `python3 chunkify-gui.py`.
53 |
54 | 2. Ensure KoboldCPP is running and displaying the message: "Please connect to custom endpoint at http://localhost:5001".
55 |
56 | 3. Configure settings and API details through the GUI or a configuration JSON file.
57 |
58 | 4. Click "Process" to initiate the text processing task.
59 |
60 | 5. Monitor progress in the GUI's output area.
61 |
62 | ## Configuration
63 |
64 | Configuration can be managed through:
65 |
66 | - Command-line arguments
67 | - `chunkify_config.json` file
68 | - GUI settings
69 |
70 | ## Command-Line Usage
71 |
72 | ```bash
73 | python chunkify.py --content input.txt --task summary
74 | ```
75 |
76 | or with a configuration file:
77 |
78 | ```bash
79 | python chunkify.py --config config.json --content input.txt --task translate
80 | ```
81 |
82 | ## Output Format
83 |
84 | When using the `--file` option, the script generates a Markdown-formatted output file containing:
85 |
86 | - Document metadata
87 | - Task-specific results
88 |
89 | The default output file is `output.txt` in the script directory, or the GUI will save files with an added '_processed' suffix.
90 |
91 |
92 | ## Limitations
93 |
94 | - Context length is model-dependent.
95 | - Chunking and generation length are set to half the context size.
96 | - Speed varies based on API response time.
97 | - Consider a GPU with 8GB VRAM or a powerful CPU with 16GB RAM for optimal performance.
98 |
99 | ## Contribution and License
100 |
101 | Feel free to contribute and submit issues or pull requests. The script is licensed under the MIT license.
102 |
103 |
--------------------------------------------------------------------------------
/chunker_regex.py:
--------------------------------------------------------------------------------
1 | # code source: https://gist.github.com/hanxiao/3f60354cf6dc5ac698bc9154163b4e6a
2 | # link: https://jina.ai/tokenizer/
3 |
4 | import regex as re
5 |
6 | MAX_HEADING_LENGTH = 7
7 | MAX_HEADING_CONTENT_LENGTH = 200
8 | MAX_HEADING_UNDERLINE_LENGTH = 200
9 | MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100
10 | MAX_LIST_ITEM_LENGTH = 200
11 | MAX_NESTED_LIST_ITEMS = 6
12 | MAX_LIST_INDENT_SPACES = 7
13 | MAX_BLOCKQUOTE_LINE_LENGTH = 200
14 | MAX_BLOCKQUOTE_LINES = 15
15 | MAX_CODE_BLOCK_LENGTH = 1500
16 | MAX_CODE_LANGUAGE_LENGTH = 20
17 | MAX_INDENTED_CODE_LINES = 20
18 | MAX_TABLE_CELL_LENGTH = 200
19 | MAX_TABLE_ROWS = 20
20 | MAX_HTML_TABLE_LENGTH = 2000
21 | MIN_HORIZONTAL_RULE_LENGTH = 3
22 | MAX_SENTENCE_LENGTH = 400
23 | MAX_QUOTED_TEXT_LENGTH = 600
24 | MAX_PARENTHETICAL_CONTENT_LENGTH = 400
25 | MAX_NESTED_PARENTHESES = 5
26 | MAX_MATH_INLINE_LENGTH = 100
27 | MAX_MATH_BLOCK_LENGTH = 500
28 | MAX_PARAGRAPH_LENGTH = 1000
29 | MAX_STANDALONE_LINE_LENGTH = 800
30 | MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100
31 | MAX_HTML_TAG_CONTENT_LENGTH = 1000
32 | LOOKAHEAD_RANGE = 200;
33 |
34 | chunk_regex = re.compile(
35 | r"(" +
36 | # 1. Headings (Setext-style, Markdown, and HTML-style)
37 | rf"(?:^(?:[#*=-]{{1,{MAX_HEADING_LENGTH}}}|\w[^\r\n]{{0,{MAX_HEADING_CONTENT_LENGTH}}}\r?\n[-=]{{2,{MAX_HEADING_UNDERLINE_LENGTH}}}|
(?:)[\s\S]{{0,{MAX_CODE_BLOCK_LENGTH}}}?(?:)?)" +
52 | "|" +
53 | # 6. Tables
54 | rf"(?:(?:^|\r?\n)\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|(?:\r?\n\|[-:]{{1,{MAX_TABLE_CELL_LENGTH}}}\|)?(?:\r?\n\|[^\r\n]{{0,{MAX_TABLE_CELL_LENGTH}}}\|){{0,{MAX_TABLE_ROWS}}})" +
55 | rf"|)?(?:(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?:[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|\p{{Emoji_Presentation}}\p{{Extended_Pictographic}})?(?=\s|$))|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_PARAGRAPH_LENGTH}}}(?=[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))?))(?:
)?(?:\r?\n[ \t]+[^\r\n]*)*)" + 78 | "|" + 79 | 80 | # 12. HTML-like tags and their content 81 | rf"(?:<[a-zA-Z][^>]{{0,{MAX_HTML_TAG_ATTRIBUTES_LENGTH}}}(?:>[\s\S]{{0,{MAX_HTML_TAG_CONTENT_LENGTH}}}[a-zA-Z]+>|\s*/>))" + 82 | "|" + 83 | # 13. LaTeX-style math expressions 84 | rf"(?:(?:\$\$[\s\S]{{0,{MAX_MATH_BLOCK_LENGTH}}}?\$\$)|(?:\$[^\$\r\n]{{0,{MAX_MATH_INLINE_LENGTH}}}\$))" + 85 | "|" + 86 | # 14. Fallback for any remaining content (Keep content together if it's indented) 87 | rf"(?:(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?:[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|\p{{Emoji_Presentation}}\p{{Extended_Pictographic}})?(?=\s|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[\r\n]|$))|(?:[^\r\n]{{1,{MAX_STANDALONE_LINE_LENGTH}}}(?=[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?:.{{1,{LOOKAHEAD_RANGE}}}(?:[.!?…]|\.\.\.|[\u2026\u2047-\u2049]|\p{{Emoji_Presentation}}\p{{Extended_Pictographic}}])(?=\s|$))(?:\r?\n[ \t]+[^\r\n]*)?))" + 88 | r")", 89 | re.MULTILINE | re.UNICODE 90 | ) 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /chunkify-gui.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import asyncio 4 | from pathlib import Path 5 | from PyQt6.QtWidgets import ( 6 | QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, 7 | QPushButton, QRadioButton, QLabel, QTextEdit, QFileDialog, 8 | QDialog, QFormLayout, QLineEdit, QSpinBox, QDialogButtonBox, 9 | QGroupBox, QScrollArea, QMessageBox, QProgressBar 10 | ) 11 | from PyQt6.QtCore import Qt, QObject, pyqtSignal, pyqtSlot, QThread 12 | 13 | import chunkify 14 | 15 | 16 | class OutputRedirector(QObject): 17 | """ Redirects print output to a text widget """ 18 | text_output = pyqtSignal(str) 19 | 20 | def __init__(self): 21 | super().__init__() 22 | self.buffer = "" 23 | 24 | def write(self, text): 25 | self.buffer += text 26 | if '\n' in text: 27 | self.text_output.emit(self.buffer) 28 | self.buffer = "" 29 | return len(text) 30 | 31 | def flush(self): 32 | if self.buffer: 33 | self.text_output.emit(self.buffer) 34 | self.buffer = "" 35 | 36 | 37 | class WorkerThread(QThread): 38 | """ Thread for running the text processing operations """ 39 | finished = pyqtSignal(int) 40 | progress = pyqtSignal(int, int) # (current_chunk, total_chunks) 41 | 42 | def __init__(self, api_url, input_path, task, output_path, language, max_chunk_size, api_password): 43 | super().__init__() 44 | self.api_url = api_url 45 | self.input_path = input_path 46 | self.task = task 47 | self.output_path = output_path 48 | self.language = language 49 | self.max_chunk_size = max_chunk_size 50 | self.api_password = api_password 51 | 52 | def run(self): 53 | try: 54 | loop = asyncio.new_event_loop() 55 | asyncio.set_event_loop(loop) 56 | exit_code = loop.run_until_complete(chunkify.process_file( 57 | api_url=self.api_url, 58 | input_path=self.input_path, 59 | task=self.task, 60 | output_path=self.output_path, 61 | language=self.language, 62 | max_chunk_size=self.max_chunk_size, 63 | api_password=self.api_password 64 | )) 65 | self.finished.emit(exit_code) 66 | except Exception as e: 67 | print(f"Error in worker thread: {str(e)}") 68 | self.finished.emit(1) 69 | 70 | 71 | class ConfigDialog(QDialog): 72 | """ Dialog for configuring the API settings """ 73 | 74 | def __init__(self, parent=None, settings=None): 75 | super().__init__(parent) 76 | 77 | self.settings = settings or { 78 | "api_url": "http://localhost:5001", 79 | "api_password": "", 80 | "language": "English", 81 | "max_chunk_size": 4096 82 | } 83 | 84 | self.setWindowTitle("Configuration") 85 | self.resize(400, 200) 86 | 87 | # Create form layout for settings 88 | layout = QFormLayout(self) 89 | 90 | # API URL 91 | self.api_url_edit = QLineEdit(self.settings["api_url"]) 92 | layout.addRow("API URL:", self.api_url_edit) 93 | 94 | # API Password/Key 95 | self.api_password_edit = QLineEdit(self.settings["api_password"]) 96 | layout.addRow("API Key:", self.api_password_edit) 97 | 98 | # Default language 99 | self.language_edit = QLineEdit(self.settings["language"]) 100 | layout.addRow("Default Language:", self.language_edit) 101 | 102 | # Max chunk size 103 | self.chunk_size_spin = QSpinBox() 104 | self.chunk_size_spin.setRange(256, 8192) 105 | self.chunk_size_spin.setSingleStep(128) 106 | self.chunk_size_spin.setValue(self.settings["max_chunk_size"]) 107 | layout.addRow("Max Chunk Size:", self.chunk_size_spin) 108 | 109 | # Dialog buttons 110 | self.button_box = QDialogButtonBox(QDialogButtonBox.StandardButton.Ok | 111 | QDialogButtonBox.StandardButton.Cancel) 112 | self.button_box.accepted.connect(self.accept) 113 | self.button_box.rejected.connect(self.reject) 114 | layout.addRow(self.button_box) 115 | 116 | def get_settings(self): 117 | """ Return the current settings """ 118 | return { 119 | "api_url": self.api_url_edit.text(), 120 | "api_password": self.api_password_edit.text(), 121 | "language": self.language_edit.text(), 122 | "max_chunk_size": self.chunk_size_spin.value() 123 | } 124 | 125 | 126 | class ChunkifyGUI(QMainWindow): 127 | """ Main application window for Chunkify GUI """ 128 | 129 | def __init__(self): 130 | super().__init__() 131 | 132 | # Initialize settings 133 | self.settings = { 134 | "api_url": "http://localhost:5001", 135 | "api_password": "", 136 | "language": "English", 137 | "max_chunk_size": 4096 138 | } 139 | 140 | self.input_files = [] 141 | self.selected_task = "summary" 142 | 143 | # Main window setup 144 | self.setWindowTitle("Chunkify Text Processor") 145 | self.resize(800, 600) 146 | 147 | # Central widget and main layout 148 | central_widget = QWidget() 149 | self.setCentralWidget(central_widget) 150 | main_layout = QVBoxLayout(central_widget) 151 | 152 | # Input file selection area 153 | file_layout = QHBoxLayout() 154 | self.file_label = QLabel("No files selected") 155 | file_layout.addWidget(self.file_label) 156 | 157 | self.choose_file_btn = QPushButton("Choose Files") 158 | self.choose_file_btn.clicked.connect(self.select_input_files) 159 | file_layout.addWidget(self.choose_file_btn) 160 | 161 | main_layout.addLayout(file_layout) 162 | 163 | # Task selection group 164 | task_group = QGroupBox("Task") 165 | task_layout = QVBoxLayout(task_group) 166 | 167 | self.task_buttons = {} 168 | for task in ["summary", "translate", "correct", "distill"]: 169 | self.task_buttons[task] = QRadioButton(task.capitalize()) 170 | self.task_buttons[task].clicked.connect(self.update_selected_task) 171 | task_layout.addWidget(self.task_buttons[task]) 172 | 173 | # Set default task 174 | self.task_buttons["summary"].setChecked(True) 175 | 176 | main_layout.addWidget(task_group) 177 | 178 | # Control buttons 179 | controls_layout = QHBoxLayout() 180 | 181 | self.config_btn = QPushButton("Configuration") 182 | self.config_btn.clicked.connect(self.open_config_dialog) 183 | controls_layout.addWidget(self.config_btn) 184 | 185 | self.process_btn = QPushButton("Process Files") 186 | self.process_btn.clicked.connect(self.process_files) 187 | self.process_btn.setEnabled(False) # Disabled until files are selected 188 | controls_layout.addWidget(self.process_btn) 189 | 190 | main_layout.addLayout(controls_layout) 191 | 192 | # Progress bar 193 | self.progress_bar = QProgressBar() 194 | self.progress_bar.setRange(0, 100) 195 | self.progress_bar.setValue(0) 196 | self.progress_bar.setVisible(False) 197 | main_layout.addWidget(self.progress_bar) 198 | 199 | # Output window 200 | output_group = QGroupBox("Output") 201 | output_layout = QVBoxLayout(output_group) 202 | 203 | self.output_text = QTextEdit() 204 | self.output_text.setReadOnly(True) 205 | output_layout.addWidget(self.output_text) 206 | 207 | # Create a scroll area for the output 208 | scroll_area = QScrollArea() 209 | scroll_area.setWidget(output_group) 210 | scroll_area.setWidgetResizable(True) 211 | 212 | main_layout.addWidget(scroll_area) 213 | 214 | # Setup output redirection 215 | self.redirector = OutputRedirector() 216 | self.redirector.text_output.connect(self.update_output) 217 | sys.stdout = self.redirector 218 | 219 | def update_selected_task(self): 220 | """ Update the selected task based on radio button selection """ 221 | for task, button in self.task_buttons.items(): 222 | if button.isChecked(): 223 | self.selected_task = task 224 | break 225 | 226 | def select_input_files(self): 227 | """ Open file dialog to select input files """ 228 | files, _ = QFileDialog.getOpenFileNames( 229 | self, 230 | "Select Input Files", 231 | "", 232 | "All Files (*.*)" 233 | ) 234 | 235 | if files: 236 | self.input_files = files 237 | if len(files) == 1: 238 | self.file_label.setText(Path(files[0]).name) 239 | else: 240 | self.file_label.setText(f"{len(files)} files selected") 241 | 242 | self.process_btn.setEnabled(True) 243 | 244 | def open_config_dialog(self): 245 | """ Open the configuration dialog """ 246 | dialog = ConfigDialog(self, self.settings) 247 | if dialog.exec(): 248 | self.settings = dialog.get_settings() 249 | 250 | @pyqtSlot(str) 251 | def update_output(self, text): 252 | """ Update the output text widget """ 253 | self.output_text.append(text) 254 | # Scroll to the bottom 255 | cursor = self.output_text.textCursor() 256 | cursor.movePosition(cursor.MoveOperation.End) 257 | self.output_text.setTextCursor(cursor) 258 | 259 | def process_files(self): 260 | """ Start processing the selected files """ 261 | if not self.input_files: 262 | QMessageBox.warning(self, "Warning", "No input files selected") 263 | return 264 | 265 | # Clear output 266 | self.output_text.clear() 267 | 268 | # Disable the process button during processing 269 | self.process_btn.setEnabled(False) 270 | self.choose_file_btn.setEnabled(False) 271 | self.config_btn.setEnabled(False) 272 | 273 | # Set up progress tracking 274 | self.progress_bar.setValue(0) 275 | self.progress_bar.setVisible(True) 276 | 277 | # Process each file 278 | self.current_file_index = 0 279 | self.process_next_file() 280 | 281 | def process_next_file(self): 282 | """ Process the next file in the queue """ 283 | if self.current_file_index >= len(self.input_files): 284 | # All files processed 285 | self.process_btn.setEnabled(True) 286 | self.choose_file_btn.setEnabled(True) 287 | self.config_btn.setEnabled(True) 288 | self.progress_bar.setVisible(False) 289 | return 290 | 291 | input_path = self.input_files[self.current_file_index] 292 | input_stem = Path(input_path).stem 293 | output_path = f"{input_stem}_{self.selected_task}.txt" 294 | 295 | # Update progress 296 | progress_value = int((self.current_file_index / len(self.input_files)) * 100) 297 | self.progress_bar.setValue(progress_value) 298 | 299 | # Add file header to output 300 | self.update_output(f"\n\n--- Processing {Path(input_path).name} ({self.current_file_index + 1}/{len(self.input_files)}) ---\n") 301 | 302 | # Start processing in a separate thread 303 | self.worker = WorkerThread( 304 | api_url=self.settings["api_url"], 305 | input_path=input_path, 306 | task=self.selected_task, 307 | output_path=output_path, 308 | language=self.settings["language"], 309 | max_chunk_size=self.settings["max_chunk_size"], 310 | api_password=self.settings["api_password"] 311 | ) 312 | 313 | self.worker.finished.connect(self.on_file_processed) 314 | self.worker.start() 315 | 316 | def on_file_processed(self, exit_code): 317 | """ Handle completion of file processing """ 318 | if exit_code != 0: 319 | self.update_output(f"\nError processing file {Path(self.input_files[self.current_file_index]).name}") 320 | 321 | # Move to next file 322 | self.current_file_index += 1 323 | self.process_next_file() 324 | 325 | def closeEvent(self, event): 326 | """ Handle window close event """ 327 | # Restore stdout 328 | sys.stdout = sys.__stdout__ 329 | event.accept() 330 | 331 | 332 | if __name__ == "__main__": 333 | app = QApplication(sys.argv) 334 | window = ChunkifyGUI() 335 | window.show() 336 | sys.exit(app.exec()) 337 | -------------------------------------------------------------------------------- /chunkify.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import json 4 | import os 5 | import random 6 | import re 7 | import sys 8 | import time 9 | from datetime import datetime 10 | from pathlib import Path 11 | from typing import Dict, List, Optional, Tuple, Iterator, Union, Any 12 | import requests 13 | from requests.exceptions import RequestException 14 | import unicodedata 15 | 16 | from extractous import Extractor 17 | from chunker_regex import chunk_regex 18 | 19 | def normalize_content(content: str) -> str: 20 | """Convert fixed-width text and normalize characters.""" 21 | content = unicodedata.normalize('NFKC', content) 22 | content = content.replace('--', '—') 23 | content = content.replace('"', '"').replace('"', '"') 24 | content = content.replace(''', "'").replace(''', "'") 25 | 26 | text = content.replace('\r\n', '\n').replace('\r', '\n') 27 | paragraphs = text.split('\n\n') 28 | 29 | result = '\n\n'.join(para.replace('\n', ' ') for para in paragraphs) 30 | #result = result.replace('\n\n', '\n\n ') 31 | return result 32 | 33 | class ChunkingProcessor: 34 | """ Handles splitting content into manageable chunks using natural breaks """ 35 | 36 | def __init__(self, api_url: str, 37 | max_chunk_length: int, 38 | api_password: Optional[str] = None, 39 | max_total_chunks: int = 1000): 40 | """ Initialize the chunking processor 41 | 42 | Args: 43 | api_url: URL to the KoboldAPI server 44 | max_chunk_length: Maximum token length for a single chunk 45 | api_password: Optional API password/key 46 | max_total_chunks: Maximum number of chunks to process 47 | """ 48 | if max_chunk_length <= 0: 49 | raise ValueError("max_chunk_length must be positive") 50 | 51 | self.api_url = api_url 52 | self.max_chunk = max_chunk_length 53 | self.max_total_chunks = max_total_chunks 54 | self.api_password = api_password 55 | self.headers = { 56 | "Content-Type": "application/json", 57 | } 58 | 59 | if api_password: 60 | self.headers["Authorization"] = f"Bearer {api_password}" 61 | 62 | # Generate a unique key for this processing session 63 | self.genkey = self._create_genkey() 64 | 65 | # Verify API and get max context length if needed 66 | self.api_max_context = self._get_max_context_length() 67 | if self.max_chunk > self.api_max_context // 2: 68 | print(f"Warning: Reducing chunk size to fit model context window") 69 | self.max_chunk = self.api_max_context // 2 70 | 71 | def _create_genkey(self) -> str: 72 | """ Create a unique generation key to prevent cross-request contamination """ 73 | return f"KCPP{''.join(str(random.randint(0, 9)) for _ in range(4))}" 74 | 75 | def _get_max_context_length(self) -> int: 76 | """ Get the maximum context length from the KoboldAPI """ 77 | try: 78 | response = requests.get(f"{self.api_url}/api/extra/true_max_context_length") 79 | if response.status_code == 200: 80 | max_context = int(response.json().get("value", 8192)) 81 | print(f"Model has maximum context length of: {max_context}") 82 | return max_context 83 | else: 84 | print(f"Warning: Could not get max context length. Defaulting to 8192") 85 | return 8192 86 | except Exception as e: 87 | print(f"Error getting max context length: {str(e)}. Defaulting to 8192") 88 | return 8192 89 | 90 | def count_tokens(self, text: str) -> int: 91 | """ Count tokens in the provided text using KoboldAPI """ 92 | try: 93 | payload = {"prompt": text, "genkey": self.genkey} 94 | response = requests.post( 95 | f"{self.api_url}/api/extra/tokencount", 96 | json=payload, 97 | headers=self.headers 98 | ) 99 | if response.status_code == 200: 100 | return int(response.json().get("value", 0)) 101 | else: 102 | # Fallback estimation 103 | return len(text.split()) 104 | except Exception as e: 105 | print(f"Error counting tokens: {str(e)}. Using word count as estimate.") 106 | return len(text.split()) 107 | 108 | def chunk_text(self, content: str) -> List[Tuple[str, int]]: 109 | """ Split content into chunks using natural breakpoints 110 | 111 | Args: 112 | content: The text content to chunk 113 | 114 | Returns: 115 | List of (chunk_text, token_count) tuples 116 | """ 117 | if not content: 118 | return [] 119 | 120 | chunks = [] 121 | remaining = content 122 | chunk_num = 0 123 | 124 | while remaining and chunk_num < self.max_total_chunks: 125 | # KoboldCPP has max char limit of 50k 126 | current_section = remaining[:45000] 127 | remaining = remaining[45000:] 128 | 129 | chunk = self._get_chunk(current_section) 130 | chunk_len = len(chunk) 131 | 132 | if chunk_len == 0: 133 | continue 134 | 135 | chunk_tokens = self.count_tokens(chunk) 136 | chunks.append((chunk, chunk_tokens)) 137 | 138 | # Update remaining with what wasn't included in this chunk 139 | remaining = current_section[len(chunk):].strip() + remaining 140 | 141 | chunk_num += 1 142 | print(f"Created chunk {chunk_num}: {chunk_tokens} tokens") 143 | 144 | if remaining and chunk_num >= self.max_total_chunks: 145 | raise ValueError(f"Text exceeded maximum of {self.max_total_chunks} chunks") 146 | 147 | return chunks 148 | 149 | def _get_chunk(self, content: str) -> str: 150 | """ Get appropriately sized chunk using natural breaks 151 | 152 | Args: 153 | content: Text content to chunk 154 | 155 | Returns: 156 | A chunk of text within token limits 157 | """ 158 | total_tokens = self.count_tokens(content) 159 | if total_tokens < self.max_chunk: 160 | return content 161 | 162 | # chunk_regex is designed to break at natural language points 163 | # to preserve context and readability 164 | matches = chunk_regex.finditer(content) 165 | current_size = 0 166 | chunks = [] 167 | 168 | for match in matches: 169 | chunk = match.group(0) 170 | chunk_size = self.count_tokens(chunk) 171 | if current_size + chunk_size > self.max_chunk: 172 | if not chunks: 173 | chunks.append(chunk) 174 | break 175 | chunks.append(chunk) 176 | current_size += chunk_size 177 | 178 | return ''.join(chunks) 179 | 180 | def chunk_file(self, file_path) -> Tuple[List[Tuple[str, int]], Dict]: 181 | """ Chunk text from file 182 | 183 | Args: 184 | file_path: Path to text file (str or Path object) 185 | 186 | Returns: 187 | Tuple of (chunks with token counts, file metadata) 188 | """ 189 | extractor = Extractor() 190 | extractor = extractor.set_extract_string_max_length(100000000) 191 | 192 | try: 193 | content, metadata = extractor.extract_file_to_string(str(file_path)) 194 | normalized_content = normalize_content(content) 195 | chunks = self.chunk_text(normalized_content) 196 | return chunks, metadata 197 | except Exception as e: 198 | print(f"Error extracting file: {str(e)}") 199 | return [], {"error": str(e)} 200 | 201 | 202 | class SSEProcessingClient: 203 | """ Client for processing chunks with OpenAI-compatible endpoints via SSE streaming """ 204 | 205 | def __init__(self, api_url: str, api_password: Optional[str] = None): 206 | """ Initialize the processing client 207 | 208 | Args: 209 | api_url: URL to the OpenAI-compatible API 210 | api_password: Optional API key/password 211 | """ 212 | self.api_url = api_url 213 | self.api_password = api_password 214 | 215 | # Ensure API URL ends with correct endpoint for OpenAI compatibility 216 | if not self.api_url.endswith('/v1/chat/completions'): 217 | self.api_url = f"{self.api_url.rstrip('/')}/v1/chat/completions" 218 | 219 | self.headers = { 220 | "Content-Type": "application/json", 221 | "Accept": "text/event-stream", 222 | } 223 | 224 | if api_password: 225 | self.headers["Authorization"] = f"Bearer {api_password}" 226 | 227 | def _create_payload(self, instruction: str, content: str, 228 | max_tokens: int = 2048, 229 | temperature: float = 0.2, 230 | top_p: float = 1.0, 231 | top_k: int = 0, 232 | rep_pen: float = 1.0, 233 | min_p: float = 0.05) -> Dict: 234 | """ Create the API payload with standard parameters 235 | 236 | Args: 237 | instruction: Instruction text for the model 238 | content: Content text to process 239 | max_tokens: Maximum tokens to generate 240 | temperature: Temperature parameter 241 | top_p: Top-p parameter 242 | top_k: Top-k parameter 243 | rep_pen: Repetition penalty 244 | min_p: Minimum p parameter 245 | 246 | Returns: 247 | Dictionary payload for the API 248 | """ 249 | system_content = "You are a helpful assistant." 250 | 251 | combined_content = f"