├── .gitignore ├── README.md ├── setup.py ├── stream2sentence ├── __init__.py ├── avoid_pause_words.py ├── delimiter_ignore_prefixes.py ├── stream2sentence.py └── stream2sentence_time_based.py └── tests ├── run_test.bat ├── simpletest.py ├── test_data ├── 1.txt ├── 2.txt ├── 3.txt ├── 4.txt └── debug.py ├── test_stream2sentence.py ├── test_stream_from_llm.py ├── test_stream_from_llm_old_api.py └── test_time_based.py /.gitignore: -------------------------------------------------------------------------------- 1 | tests_private/ 2 | test_env/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116 | __pypackages__/
117 |
118 | # Celery stuff
119 | celerybeat-schedule
120 | celerybeat.pid
121 |
122 | # SageMath parsed files
123 | *.sage.py
124 |
125 | # Environments
126 | .env
127 | .venv
128 | env/
129 | myenv/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
166 | .DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Real-Time Sentence Detection
2 |
3 | Real-time processing and delivery of sentences from a continuous stream of characters or text chunks.
4 |
5 | > **Hint:** *If you're interested in state-of-the-art voice solutions you might also want to have a look at [Linguflex](https://github.com/KoljaB/Linguflex), the original project from which stream2sentence is spun off. It lets you control your environment by speaking and is one of the most capable and sophisticated open-source assistants currently available.*
6 |
7 | ## Table of Contents
8 |
9 | - [Features](#features)
10 | - [Installation](#installation)
11 | - [Usage](#usage)
12 | - [Configuration](#configuration)
13 | - [Contributing](#contributing)
14 | - [License](#license)
15 |
16 | ## Features
17 |
18 | - Generates sentences from a stream of text in real-time.
19 | - Customizable to fine-tune/balance speed vs. reliability.
20 | - Option to clean the output by removing links and emojis from the detected sentences.
21 | - Easy to configure and integrate.
22 |
23 | ## Installation
24 |
25 | ```bash
26 | pip install stream2sentence
27 | ```
28 |
29 | ## Usage
30 |
31 | Pass a generator of characters or text chunks to `generate_sentences()` to get a generator of sentences in return.
32 |
33 | Here's a basic example:
34 |
35 | ```python
36 | from stream2sentence import generate_sentences
37 |
38 | # Dummy generator for demonstration
39 | def dummy_generator():
40 |     yield "This is a sentence. And here's another! Yet, "
41 |     yield "there's more. This ends now."
42 |
43 | for sentence in generate_sentences(dummy_generator()):
44 |     print(sentence)
45 | ```
46 |
47 | This will output:
48 | ```
49 | This is a sentence.
50 | And here's another!
51 | Yet, there's more.
52 | This ends now.
53 | ```
54 |
55 | One main use case of this library is to enable fast text-to-speech synthesis for character feeds generated by large language models: it gives the fastest possible access to a complete sentence or sentence fragment (using the `quick_yield_single_sentence_fragment` flag), which can then be synthesized in real time.
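For illustration, here is a minimal sketch of that pattern (the token generator and its text below are made up; exact fragment boundaries depend on the configured delimiters and minimum lengths):

```python
from stream2sentence import generate_sentences

# Hypothetical stand-in for a streaming LLM response
def llm_chunks():
    yield "Sure, the weather today is sunny with a light breeze, "
    yield "so it is a great day for a walk. Take some sunscreen with you."

# quick_yield_single_sentence_fragment trades completeness for latency:
# the first piece of text is emitted as soon as a fragment delimiter
# (e.g. a comma) is seen, so TTS playback can start almost immediately.
for text in generate_sentences(
    llm_chunks(),
    quick_yield_single_sentence_fragment=True,
):
    print(text)  # hand each piece to the TTS engine here
```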
The usage of this is demonstrated in the test_stream_from_llm.py file in the tests directory. 56 | 57 | ## Configuration 58 | 59 | The `generate_sentences()` function offers various parameters to fine-tune its behavior: 60 | 61 | ### Core Parameters 62 | 63 | - `generator: Iterator[str]` 64 | - The primary input source, yielding chunks of text to be processed. 65 | - Can be any iterator that emits text chunks of any size. 66 | 67 | - `context_size: int = 12` 68 | - Number of characters considered for sentence boundary detection. 69 | - Larger values improve accuracy but may increase latency. 70 | - Default: 12 characters 71 | 72 | - `context_size_look_overhead: int = 12` 73 | - Additional characters to examine beyond `context_size` for sentence splitting. 74 | - Enhances sentence detection accuracy. 75 | - Default: 12 characters 76 | 77 | - `minimum_sentence_length: int = 10` 78 | - Minimum character count for a text chunk to be considered a sentence. 79 | - Shorter fragments are buffered until this threshold is met. 80 | - Default: 10 characters 81 | 82 | - `minimum_first_fragment_length: int = 10` 83 | - Minimum character count required for the first sentence fragment. 84 | - Ensures the initial output meets a specified length threshold. 85 | - Default: 10 characters 86 | 87 | ### Yield Control 88 | 89 | These parameters control how quickly and frequently the generator yields sentence fragments: 90 | 91 | - `quick_yield_single_sentence_fragment: bool = False` 92 | - When True, yields the first fragment of the first sentence as quickly as possible. 93 | - Useful for getting immediate output in real-time applications like speech synthesis. 94 | - Default: False 95 | 96 | - `quick_yield_for_all_sentences: bool = False` 97 | - When True, yields the first fragment of every sentence as quickly as possible. 98 | - Extends the quick yield behavior to all sentences, not just the first one. 99 | - Automatically sets `quick_yield_single_sentence_fragment` to True. 100 | - Default: False 101 | 102 | - `quick_yield_every_fragment: bool = False` 103 | - When True, yields every fragment of every sentence as quickly as possible. 104 | - Provides the most granular output, yielding fragments as soon as they're detected. 105 | - Automatically sets both `quick_yield_for_all_sentences` and `quick_yield_single_sentence_fragment` to True. 106 | - Default: False 107 | 108 | ### Text Cleanup 109 | 110 | - `cleanup_text_links: bool = False` 111 | - When True, removes hyperlinks from the output sentences. 112 | - Default: False 113 | 114 | - `cleanup_text_emojis: bool = False` 115 | - When True, removes emoji characters from the output sentences. 116 | - Default: False 117 | 118 | ### Tokenization 119 | 120 | - `tokenize_sentences: Callable = None` 121 | - Custom function for sentence tokenization. 122 | - If None, uses the default tokenizer specified by `tokenizer`. 123 | - Default: None 124 | 125 | - `tokenizer: str = "nltk"` 126 | - Specifies the tokenizer to use. Options: "nltk" or "stanza" 127 | - Default: "nltk" 128 | 129 | - `language: str = "en"` 130 | - Language setting for the tokenizer. 131 | - Use "en" for English or "multilingual" for Stanza tokenizer. 132 | - Default: "en" 133 | 134 | ### Debugging and Fine-tuning 135 | 136 | - `log_characters: bool = False` 137 | - When True, logs each processed character to the console. 138 | - Useful for debugging or monitoring real-time processing. 
139 | - Default: False
140 |
141 | - `sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-"`
142 | - Characters considered as potential sentence fragment delimiters.
143 | - Used for quick yielding of sentence fragments.
144 | - Default: ".?!;:,\n…)]}。-"
145 |
146 | - `full_sentence_delimiters: str = ".?!\n…。"`
147 | - Characters considered as full sentence delimiters.
148 | - Used for more definitive sentence boundary detection.
149 | - Default: ".?!\n…。"
150 |
151 | - `force_first_fragment_after_words: int = 30`
152 | - Forces the yield of the first sentence fragment after this many words.
153 | - Ensures timely output even with long opening sentences.
154 | - Default: 30 words
155 |
156 |
157 | ## Time-based strategy
158 | Instead of a purely lexical strategy, a time-based strategy is available.
159 | A target rate in tokens per second (tps) is provided, and `generate_sentences_time_based` will yield the best
160 | available output (full sentence, longest fragment, or any available buffer, in that order) whenever it is approaching the "deadline"
161 | at which what has been output so far would fall behind the target tps. If the LLM is more than
162 | two full sentences ahead of the target, it will output a sentence even if it is ahead of the "deadline".
163 |
164 | `from stream2sentence.stream2sentence_time_based import generate_sentences_time_based`
165 |
166 | ### Parameters
167 | - `generator (Iterator[str])`
168 | - A generator that yields chunks of text as a stream of characters.
169 | - `target_tps: float = 4`
170 | - The rate in tokens per second used to calculate deadlines for output.
171 | - Default is 4 (approximately the speed of human speech).
172 | - `lead_time: float = 1`
173 | - Amount of time in seconds to wait for the buffer to build before returning values.
174 | - `max_wait_for_fragments: float[] = [3, 2]`
175 | - Max amount of time in seconds that the Nth sentence will wait beyond the "deadline" for a "fragment" (text preceding a fragment delimiter), which is preferred over a piece of buffer.
176 | - The last value in the array is used for all subsequent checks.
177 | - `min_output_lengths: int[] = [2, 3, 3, 4]`
178 | - An array giving the minimum output size in words for the corresponding output sentence; the last value in the array is used for all remaining output.
179 | - For example, [4,5,6] would mean the first piece of output must have 4 words, the second 5 words, and all subsequent pieces 6.
180 | - `preferred_sentence_fragment_delimiters: str[] = ['. ', '? ', '! ', '\n']`
181 | - Array of strings that delineate a sentence fragment. "Preferred" delimiters are checked first and are always used over the other fragment delimiters if the fragment meets the length requirement.
182 | - Note the trailing spaces, added to differentiate between values like $3.5 and a proper sentence end.
183 | - `sentence_fragment_delimiters: str[] = ['; ', ': ', ', ', '* ', '**', '– ']`
184 | - Array of strings that are checked after the "preferred" delimiters.
185 | - `delimiter_ignore_prefixes: str[]`
186 | - Array of strings that cause a delimiter to be ignored when the delimiter is immediately preceded by one of them.
187 | - Used to ignore common abbreviations such as Mr., Dr., and Mrs., where we don't want to split.
188 | - Default is a long list documented in delimiter_ignore_prefixes.py.
189 | - `wait_for_if_non_fragment: str[]`
190 | - Array of strings that the algorithm will not use as the last word of the output if the whole buffer is being output (i.e. it is neither a fragment nor a sentence).
191 | - Avoids awkward pauses on common words that are unnatural to pause at.
192 | - Default is a long list of common words documented in avoid_pause_words.py 193 | - `deadline_offsets_static: float[] = [1]` 194 | - Constant amount of time in seconds to subtract from the deadline for first n sentences. 195 | - Last value applied to all subsequent sentences 196 | - `deadline_offsets_dynamic: float[] = [0]`: 197 | - Added to account for the time it takes a TTS engine to generate output. 198 | - For example, if it takes your TTS engine around 1 second to generate 10 words, you can use a value of 0.1 so that the TTS generation time is included in the deadline. 199 | - Applied to first n sentences, last value applied to all subsequent 200 | 201 | ## Contributing 202 | 203 | Any Contributions you make are welcome and **greatly appreciated**. 204 | 205 | 1. **Fork** the Project. 206 | 2. **Create** your Feature Branch (`git checkout -b feature/AmazingFeature`). 207 | 3. **Commit** your Changes (`git commit -m 'Add some AmazingFeature'`). 208 | 4. **Push** to the Branch (`git push origin feature/AmazingFeature`). 209 | 5. **Open** a Pull Request. 210 | 211 | ## License 212 | 213 | This project is licensed under the MIT License. For more details, see the [`LICENSE`](LICENSE) file. 214 | 215 | --- 216 | 217 | Project created and maintained by [Kolja Beigel](https://github.com/KoljaB). 218 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="stream2sentence", 8 | version="0.3.1", 9 | author="Kolja Beigel", 10 | author_email="kolja.beigel@web.de", 11 | description="Real-time processing and delivery of sentences from a continuous stream of characters or text chunks.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/KoljaB/stream2sentence", 15 | packages=setuptools.find_packages(), 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.6', 22 | install_requires=[ 23 | 'nltk==3.9.1', 24 | 'emoji==2.14.1', 25 | 'stanza==1.10.1' 26 | ], 27 | keywords='realtime, text streaming, stream, sentence, sentence detection, sentence generation, tts, speech synthesis, nltk, text analysis, audio processing, boundary detection, sentence boundary detection' 28 | ) -------------------------------------------------------------------------------- /stream2sentence/__init__.py: -------------------------------------------------------------------------------- 1 | from .stream2sentence import ( 2 | generate_sentences, 3 | generate_sentences_async, 4 | init_tokenizer, 5 | ) 6 | 7 | from .stream2sentence_time_based import ( 8 | generate_sentences_time_based, 9 | ) 10 | 11 | from .avoid_pause_words import ( 12 | AVOID_PAUSE_WORDS, 13 | ) 14 | -------------------------------------------------------------------------------- /stream2sentence/avoid_pause_words.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Conjunctions 4 | conjunctions = ["and", "or", "but", "so", "for", "nor", "yet"] 5 | 6 | # Prepositions 7 | prepositions = [ 8 | "in", "on", "at", "by", "with", "about", "of", "to", "for", 9 | "from", "as", "over", "under", "through", "between", "during", "there" 10 | ] 11 | 12 | # Articles 13 | articles = 
["a", "an", "the"] 14 | 15 | # Possessives and Demonstratives 16 | possessives_and_demonstratives = [ 17 | "my", "your", "his", "her", "its", "our", "their", 18 | "this", "that", "these", "those" 19 | ] 20 | 21 | # Auxiliary/Helping Verbs 22 | auxiliary_verbs = [ 23 | "is", "are", "was", "were", "am", "be", "been", "being", 24 | "do", "does", "did", "have", "has", "had", 25 | "can", "could", "shall", "should", 26 | "will", "would", "may", "might", "must" 27 | ] 28 | 29 | # Pronouns 30 | pronouns = [ 31 | "I", "we", "you", "he", "she", "it", "they", 32 | "who", "whom", "whose", "which", "that" 33 | ] 34 | 35 | # Quantifiers 36 | quantifiers = ["some", "many", "few", "all", "any", "most", "much", "none", "several"] 37 | 38 | # Adverbs that Modify Flow 39 | adverbs = ["very", "too", "just", "quite", "almost", "nearly", "only"] 40 | 41 | # Interrogatives 42 | interrogatives = ["what", "where", "when", "why", "how"] 43 | 44 | # Relative Pronouns 45 | relative_pronouns = ["who", "which", "that"] 46 | 47 | # Subordinating Conjunctions 48 | subordinating_conjunctions = [ 49 | "although", "because", "if", "since", 50 | "though", "while", "until", "unless" 51 | ] 52 | 53 | AVOID_PAUSE_WORDS = set( 54 | conjunctions + 55 | prepositions + 56 | articles + 57 | possessives_and_demonstratives + 58 | auxiliary_verbs + 59 | pronouns + 60 | quantifiers + 61 | adverbs + 62 | interrogatives + 63 | subordinating_conjunctions 64 | ) 65 | 66 | -------------------------------------------------------------------------------- /stream2sentence/delimiter_ignore_prefixes.py: -------------------------------------------------------------------------------- 1 | 2 | titles_and_abbreviations = [ 3 | "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "St.", 4 | "Ph.D.", "Phd.", "PhD.", "M.D.", "B.A.", "M.A.", "D.D.S.", "J.D.", 5 | "Co.", "Corp.", "Ave.", "Blvd.", "Rd.", "Mt.", 6 | "a.m.", "p.m.", "Jr.", "Sr.", 7 | "Gov.", "Gen.", "Capt.", "Lt.", "Maj.", "Col.", "Adm.", "Cmdr.", 8 | "Sgt.", "Cpl.", "Pvt.", "U.S.", "U.K.", "vs.", "i.e.", "e.g.", 9 | "Vol.", "Art.", "Sec.", "Chap.", "Fig.", "Ref.", "Dept." 
10 | ] 11 | 12 | dates_and_times = [ 13 | "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", 14 | "Sep.", "Oct.", "Nov.", "Dec.", 15 | "Mon.", "Tue.", "Wed.", "Thu.", "Fri.", "Sat.", "Sun.", 16 | ] 17 | 18 | financial_abbreviations = [ 19 | "Inc.", "Ltd.", "Corp.", "PLC.", "LLC.", "LLP.", 20 | "P/E.", "EPS.", "NAV.", "ROI.", "ROA.", "ROE.", 21 | ] 22 | 23 | country_abbreviations = [ 24 | "U.S.A.", "U.K.", "U.A.E.", "P.R.C.", "D.R.C.", "R.O.C.", 25 | "E.U.", "U.N.", "A.U.", 26 | "U.S.", "U.K.", "E.U.", "P.R.C.", "D.R.C.", "R.O.C.", 27 | ] 28 | 29 | DELIMITER_IGNORE_PREFIXES = set( 30 | titles_and_abbreviations + dates_and_times + 31 | financial_abbreviations + country_abbreviations 32 | ) 33 | 34 | 35 | -------------------------------------------------------------------------------- /stream2sentence/stream2sentence.py: -------------------------------------------------------------------------------- 1 | """ 2 | Real-time processing and delivery of sentences 3 | from a continuous stream of characters or text chunks 4 | """ 5 | 6 | import functools 7 | import logging 8 | import re 9 | import time 10 | from typing import ( 11 | AsyncIterable, 12 | AsyncIterator, 13 | Awaitable, 14 | Callable, 15 | Concatenate, 16 | Iterable, 17 | Iterator, 18 | ParamSpec, 19 | ) 20 | 21 | import emoji 22 | 23 | current_tokenizer = "nltk" 24 | stanza_initialized = False 25 | nltk_initialized = False 26 | nlp = None 27 | 28 | 29 | def initialize_nltk(debug=False): 30 | """ 31 | Initializes NLTK by downloading required data for sentence tokenization. 32 | """ 33 | global nltk_initialized 34 | if nltk_initialized: 35 | return 36 | 37 | logging.info("Initializing NLTK Tokenizer") 38 | 39 | try: 40 | import nltk 41 | 42 | nltk.download("punkt_tab", quiet=not debug) 43 | nltk_initialized = True 44 | except Exception as e: 45 | print(f"Error initializing nltk tokenizer: {e}") 46 | nltk_initialized = False 47 | 48 | 49 | def initialize_stanza(language: str = "en", offline=False): 50 | """ 51 | Initializes Stanza by downloading required data for sentence tokenization. 52 | """ 53 | global nlp, stanza_initialized 54 | if stanza_initialized: 55 | return 56 | 57 | logging.info("Initializing Stanza Tokenizer") 58 | 59 | try: 60 | import stanza 61 | 62 | if not offline: 63 | stanza.download(language) 64 | 65 | nlp = stanza.Pipeline(language, download_method=None) 66 | stanza_initialized = True 67 | except Exception as e: 68 | print(f"Error initializing stanza tokenizer: {e}") 69 | stanza_initialized = False 70 | 71 | 72 | def _remove_links(text: str) -> str: 73 | """ 74 | Removes any links from the input text. 75 | 76 | Args: 77 | text (str): Input text 78 | 79 | Returns: 80 | str: Text with links removed 81 | """ 82 | pattern = ( 83 | r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|" 84 | r"[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" 85 | ) 86 | 87 | return re.sub(pattern, "", text) 88 | 89 | 90 | def _remove_emojis(text: str) -> str: 91 | """ 92 | Removes emojis from the input text. 93 | 94 | Args: 95 | text (str): Input text 96 | 97 | Returns: 98 | str: Text with emojis removed 99 | """ 100 | return emoji.replace_emoji(text, "") 101 | 102 | 103 | async def _generate_characters( 104 | generator: AsyncIterable[str], log_characters: bool = False 105 | ) -> AsyncIterator[str]: 106 | """ 107 | Generates individual characters from a text generator. 
108 | 109 | Args: 110 | generator (Iterator[str]): Input text generator 111 | log_characters (bool): Whether to log the characters to the console 112 | 113 | Yields: 114 | Individual characters from the generator 115 | """ 116 | if log_characters: 117 | print("Stream: ", end="", flush=True) 118 | async for chunk in generator: 119 | for char in chunk: 120 | if log_characters: 121 | print(char, end="", flush=True) 122 | yield char 123 | if log_characters: 124 | print() 125 | 126 | 127 | def _clean_text( 128 | text: str, 129 | cleanup_text_links: bool = False, 130 | cleanup_text_emojis: bool = False, 131 | strip_text: bool = True, 132 | ) -> str: 133 | """ 134 | Cleans the text by removing links and emojis. 135 | 136 | Args: 137 | text (str): Input text 138 | cleanup_text_links (boolean, optional): Remove non-desired links from 139 | the stream. 140 | cleanup_text_emojis (boolean, optional): Remove non-desired emojis 141 | from the stream. 142 | 143 | Returns: 144 | str: Cleaned text 145 | """ 146 | if cleanup_text_links: 147 | text = _remove_links(text) 148 | if cleanup_text_emojis: 149 | text = _remove_emojis(text) 150 | if strip_text: 151 | text = text.strip() 152 | return text 153 | 154 | 155 | def _tokenize_sentences(text: str, tokenize_sentences=None) -> list[str]: 156 | """ 157 | Tokenizes sentences from the input text. 158 | 159 | Args: 160 | text (str): Input text 161 | tokenize_sentences (Callable, optional): A function that tokenizes 162 | sentences from the input text. Defaults to None. 163 | 164 | Yields: 165 | Iterator[str]: An iterator of sentences 166 | """ 167 | if tokenize_sentences: 168 | sentences = tokenize_sentences(text) 169 | else: 170 | nlp_start_time = time.time() 171 | if current_tokenizer == "nltk": 172 | import nltk 173 | 174 | sentences = nltk.tokenize.sent_tokenize(text) 175 | elif current_tokenizer == "stanza": 176 | import stanza 177 | 178 | global nlp 179 | doc = nlp(text) 180 | sentences = [sentence.text for sentence in doc.sentences] 181 | else: 182 | raise ValueError(f"Unknown tokenizer: {current_tokenizer}") 183 | nlp_end_time = time.time() 184 | logging.debug("Time to split sentences: " f"{nlp_end_time - nlp_start_time}") 185 | return sentences 186 | 187 | 188 | def init_tokenizer(tokenizer: str, language: str = "en", offline=False, debug=False): 189 | """ 190 | Initializes the sentence tokenizer. 
191 | """ 192 | if tokenizer == "nltk": 193 | initialize_nltk(debug) 194 | elif tokenizer == "stanza": 195 | initialize_stanza(language,offline=offline) 196 | else: 197 | logging.warning(f"Unknown tokenizer: {tokenizer}") 198 | 199 | async def generate_sentences_async( 200 | generator: AsyncIterable[str], 201 | context_size: int = 12, 202 | context_size_look_overhead: int = 12, 203 | minimum_sentence_length: int = 10, 204 | minimum_first_fragment_length=10, 205 | quick_yield_single_sentence_fragment: bool = False, 206 | quick_yield_for_all_sentences: bool = False, 207 | quick_yield_every_fragment: bool = False, 208 | cleanup_text_links: bool = False, 209 | cleanup_text_emojis: bool = False, 210 | tokenize_sentences=None, 211 | tokenizer: str = "nltk", 212 | language: str = "en", 213 | log_characters: bool = False, 214 | sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-", 215 | full_sentence_delimiters: str = ".?!\n…。", 216 | force_first_fragment_after_words=30, 217 | filter_first_non_alnum_characters: bool = False, 218 | debug=False, 219 | ) -> AsyncIterator[str]: 220 | """ 221 | Generates well-formed sentences from a stream of characters or text chunks 222 | provided by an input generator. 223 | 224 | Args: 225 | generator (Iterator[str]): A generator that yields chunks of text as a 226 | stream of characters. 227 | context_size (int): The number of characters used to establish context 228 | for sentence boundary detection. A larger context improves the 229 | accuracy of detecting sentence boundaries. 230 | Default is 12 characters. 231 | context_size_look_overhead: The number of characters to look 232 | over the context_size boundaries to detect sentence splitting 233 | characters (improves sentence detection). 234 | minimum_sentence_length (int): The minimum number of characters a 235 | sentence must have. If a sentence is shorter, it will be 236 | concatenated with the following one, improving the overall 237 | readability. This parameter does not apply to the first sentence 238 | fragment, which is governed by `minimum_first_fragment_length`. 239 | Default is 10 characters. 240 | minimum_first_fragment_length (int): The minimum number of characters 241 | required for the first sentence fragment before yielding. 242 | Default is 10 characters. 243 | quick_yield_single_sentence_fragment (bool): If set to True, the 244 | generator will yield the first sentence first fragment as quickly as 245 | possible. This is particularly useful for real-time applications 246 | such as speech synthesis. 247 | quick_yield_for_all_sentences (bool): If set to True, the 248 | generator will yield every sentence first fragment as quickly as 249 | possible (not only the first sentence first fragment) 250 | quick_yield_every_fragment (bool): If set to True, the 251 | generator not only yield every sentence first fragment, but also every 252 | following fragment. 253 | cleanup_text_links (bool): If True, removes hyperlinks from the text 254 | stream to ensure clean output. 255 | cleanup_text_emojis (bool): If True, filters out emojis from the text 256 | stream for clear textual content. 257 | tokenize_sentences (Callable): A function that tokenizes sentences 258 | from the input text. Defaults to None. 259 | tokenizer (str): The tokenizer to use for sentence tokenization. 260 | Default is "nltk". Can be "nltk" or "stanza". 261 | language (str): The language to use for sentence tokenization. 262 | Default is "en". Can be "multilingual" for stanze tokenizer. 
263 | log_characters (bool): If True, logs each character to the console as 264 | they are processed. 265 | sentence_fragment_delimiters (str): A string of characters that are 266 | considered sentence fragment delimiters. Default is ".?!;:,\n…)]}。-". 267 | full_sentence_delimiters (str): A string of characters that are 268 | considered full sentence delimiters. Default is ".?!\n…。". 269 | force_first_fragment_after_words (int): The number of words after 270 | which the first sentence fragment is forced to be yielded. 271 | Default is 30 words. 272 | filter_first_non_alnum_characters (bool): If True, filters out the 273 | first non-alphanumeric characters from the text stream. 274 | debug (bool): If True, enables debug mode for logging. 275 | 276 | Yields: 277 | Iterator[str]: An iterator of complete sentences constructed from the 278 | input text stream. Each yielded sentence meets the specified minimum 279 | length requirements and is cleaned up if specified. 280 | 281 | The function maintains a buffer to accumulate text chunks and applies 282 | natural language processing to detect sentence boundaries. 283 | It employs various heuristics, such as minimum sentence length and 284 | sentence delimiters, to ensure the quality of the output sentences. 285 | The function also provides options to clean up the text stream, 286 | making it versatile for different types of text processing applications. 287 | """ 288 | 289 | # Initialize the tokenizer based on the specified tokenizer and language 290 | global current_tokenizer 291 | current_tokenizer = tokenizer 292 | init_tokenizer(current_tokenizer, language, debug) 293 | 294 | buffer = "" 295 | is_first_sentence = True 296 | word_count = 0 # Initialize word count 297 | last_delimiter_position = -1 # Position of last full sentence delimiter 298 | 299 | # Adjust quick yield flags based on settings 300 | if quick_yield_every_fragment: 301 | quick_yield_for_all_sentences = True 302 | 303 | if quick_yield_for_all_sentences: 304 | quick_yield_single_sentence_fragment = True 305 | 306 | async for char in _generate_characters(generator, log_characters): 307 | 308 | if char: 309 | if len(buffer) == 0: 310 | if filter_first_non_alnum_characters: 311 | if not char.isalnum(): 312 | continue 313 | 314 | buffer += char 315 | buffer = buffer.lstrip() 316 | 317 | # Update word count on encountering space or sentence fragment delimiter 318 | if char.isspace() or char in sentence_fragment_delimiters: 319 | word_count += 1 320 | 321 | if debug: 322 | print("\033[36mDebug: Added char, buffer size: \"{}\"\033[0m".format(len(buffer))) 323 | 324 | # Check conditions to yield first sentence fragment quickly 325 | if ( 326 | is_first_sentence 327 | and len(buffer) > minimum_first_fragment_length 328 | and quick_yield_single_sentence_fragment 329 | ): 330 | 331 | if ( 332 | buffer[-1] in sentence_fragment_delimiters 333 | or char.isspace() and word_count >= force_first_fragment_after_words 334 | ): 335 | 336 | yield_text = _clean_text( 337 | buffer, 338 | cleanup_text_links, 339 | cleanup_text_emojis) 340 | if debug: 341 | if buffer[-1] in sentence_fragment_delimiters: 342 | print("\033[36mDebug: Yielding first sentence fragment: \"{}\" because buffer[-1] {} is sentence frag \033[0m".format(yield_text, buffer[-1])) 343 | else: 344 | print("\033[36mDebug: Yielding first sentence fragment: \"{}\" because word_count {} is >= force_first_fragment_after_words \033[0m".format(yield_text, word_count)) 345 | 346 | yield yield_text 347 | 348 | buffer = "" 349 | word_count = 0 
350 | if not quick_yield_every_fragment: 351 | is_first_sentence = False 352 | 353 | continue 354 | 355 | # Continue accumulating characters if buffer is under minimum sentence length 356 | if len(buffer) <= minimum_sentence_length + context_size: 357 | 358 | continue 359 | 360 | # Update last delimiter position if a new delimiter is found 361 | if char in full_sentence_delimiters: 362 | last_delimiter_position = len(buffer) - 1 363 | 364 | # Define context window for checking potential sentence boundaries 365 | context_window_end_pos = len(buffer) - context_size - 1 366 | context_window_start_pos = ( 367 | context_window_end_pos - context_size_look_overhead 368 | ) 369 | if context_window_start_pos < 0: 370 | context_window_start_pos = 0 371 | 372 | # Tokenize sentences from buffer 373 | sentences = _tokenize_sentences(buffer, tokenize_sentences) 374 | 375 | if debug: 376 | print("\033[36mbuffer: \"{}\"\033[0m".format(buffer)) 377 | print("\033[36mlast_delimiter_position: {}\033[0m".format(last_delimiter_position)) 378 | print("\033[36mlen(sentences) > 2: {}\033[0m".format(len(sentences) > 2)) 379 | print("\033[36mcontext_window_start_pos: {}\033[0m".format(context_window_start_pos)) 380 | print("\033[36mcontext_window_end_pos: {}\033[0m".format(context_window_end_pos)) 381 | 382 | # Combine sentences below minimum_sentence_length with the next sentence(s) 383 | combined_sentences = [] 384 | temp_sentence = "" 385 | 386 | for sentence in sentences: 387 | if len(sentence) < minimum_sentence_length: 388 | temp_sentence += sentence + " " 389 | else: 390 | if temp_sentence: 391 | temp_sentence += sentence 392 | combined_sentences.append(temp_sentence.strip()) 393 | temp_sentence = "" 394 | else: 395 | combined_sentences.append(sentence.strip()) 396 | 397 | # If there's a leftover temp_sentence that hasn't been appended 398 | if temp_sentence: 399 | combined_sentences.append(temp_sentence.strip()) 400 | 401 | # Replace the original sentences with the combined_sentences 402 | sentences = combined_sentences 403 | 404 | # Process and yield sentences based on conditions 405 | if len(sentences) > 2 or ( 406 | last_delimiter_position >= 0 407 | and context_window_start_pos 408 | <= last_delimiter_position 409 | <= context_window_end_pos 410 | ): 411 | 412 | if len(sentences) > 1: 413 | total_length_except_last = sum( 414 | len(sentence) for sentence in sentences[:-1] 415 | ) 416 | if total_length_except_last >= minimum_sentence_length: 417 | for sentence in sentences[:-1]: 418 | yield_text = _clean_text( 419 | sentence, 420 | cleanup_text_links, 421 | cleanup_text_emojis) 422 | if debug: 423 | print("\033[36mDebug: Yielding sentence: \"{}\"\033[0m".format(yield_text)) 424 | 425 | yield yield_text 426 | word_count = 0 427 | 428 | if quick_yield_for_all_sentences: 429 | is_first_sentence = True 430 | 431 | # we need to remember if the buffer ends with space 432 | # - sentences returned by the tokenizers are rtrimmed 433 | # - this takes any blank spaces away from the last unfinshed sentence 434 | # - we have to work around this by re-adding the blank space in this case 435 | ends_with_space = buffer.endswith(" ") 436 | 437 | # set buffer to last unfinshed sentence returned by tokenizers 438 | buffer = sentences[-1] 439 | 440 | # reset the blank space if it was there: 441 | if ends_with_space: 442 | buffer += " " 443 | 444 | # reset the last delimiter position after yielding 445 | last_delimiter_position = -1 446 | 447 | 448 | # Yield remaining buffer as final sentence(s) 449 | if buffer: 450 | 
sentences = _tokenize_sentences(buffer, tokenize_sentences) 451 | sentence_buffer = "" 452 | 453 | for sentence in sentences: 454 | sentence_buffer += sentence 455 | if len(sentence_buffer) < minimum_sentence_length: 456 | sentence_buffer += " " 457 | 458 | continue 459 | 460 | yield_text = _clean_text( 461 | sentence_buffer, cleanup_text_links, cleanup_text_emojis 462 | ) 463 | 464 | if debug: 465 | print("\033[36mDebug: Yielding final sentence(s): \"{}\"\033[0m".format(yield_text)) 466 | 467 | yield yield_text 468 | 469 | sentence_buffer = "" 470 | 471 | if sentence_buffer: 472 | yield_text = _clean_text( 473 | sentence_buffer, 474 | cleanup_text_links, 475 | cleanup_text_emojis) 476 | if debug: 477 | print("\033[36mDebug: Yielding remaining text: \"{}\"\033[0m".format(yield_text)) 478 | 479 | yield yield_text 480 | 481 | 482 | def _await_sync(f: Awaitable[str]) -> str: 483 | gen = f.__await__() 484 | try: 485 | next(gen) 486 | raise RuntimeError(f"{f} failed to be synchronous") 487 | except StopIteration as e: 488 | return e.value 489 | 490 | 491 | def _async_iter_to_sync(f: AsyncIterator[str]) -> Iterator[str]: 492 | try: 493 | while True: 494 | yield _await_sync(f.__anext__()) 495 | except StopAsyncIteration: 496 | return 497 | 498 | 499 | P = ParamSpec("P") 500 | 501 | 502 | def _dowrap( 503 | f: Callable[Concatenate[AsyncIterable[str], P], AsyncIterator[str]] 504 | ) -> Callable[Concatenate[Iterable[str], P], Iterator[str]]: 505 | @functools.wraps(f) 506 | def inner(generator: Iterable[str], *args: P.args, **kwargs: P.kwargs): 507 | async def gen_wrap(): 508 | for x in generator: 509 | yield x 510 | 511 | return _async_iter_to_sync(f(gen_wrap(), *args, **kwargs)) 512 | 513 | return inner 514 | 515 | 516 | generate_sentences = _dowrap(generate_sentences_async) 517 | generate_sentences.__name__ = "generate_sentences" 518 | generate_sentences.__qualname__ = "generate_sentences" 519 | -------------------------------------------------------------------------------- /stream2sentence/stream2sentence_time_based.py: -------------------------------------------------------------------------------- 1 | 2 | import nltk 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | import time 6 | from itertools import accumulate 7 | 8 | from stream2sentence import init_tokenizer 9 | from stream2sentence.avoid_pause_words import AVOID_PAUSE_WORDS 10 | from stream2sentence.delimiter_ignore_prefixes import DELIMITER_IGNORE_PREFIXES 11 | 12 | 13 | init_tokenizer("nltk") 14 | 15 | WORDS_PER_TOKEN = 0.75 16 | preferred_sentence_fragment_delimiters_global = [] 17 | sentence_fragment_delimiters_global = [] 18 | delimiter_ignore_prefixes_global = [] 19 | 20 | def get_index_or_last(a_list, index): 21 | return a_list[index] if index < len(a_list) else a_list[-1] 22 | 23 | def find_last_delimiter(s, delimiters): 24 | valid_indices = [] 25 | for delimiter in delimiters: 26 | index = s.rfind(delimiter) 27 | if index != -1: 28 | # Get the word preceding the delimiter 29 | preceding_word_start = s.rfind(" ", 0, index) 30 | preceding_word = s[preceding_word_start:index + 1].strip() 31 | 32 | if preceding_word not in delimiter_ignore_prefixes_global: 33 | valid_indices.append(index) 34 | 35 | return max(valid_indices, default=-1) 36 | 37 | def find_last_preferred_fragment_delimiter(s): 38 | return find_last_delimiter(s, preferred_sentence_fragment_delimiters_global) 39 | 40 | def find_last_fragment_delimiter(s): 41 | return find_last_delimiter(s, sentence_fragment_delimiters_global) 42 | 43 | def get_num_words(s): 
44 | return len(s.split()) 45 | 46 | def find_first_greater(nums, value): 47 | for index, num in enumerate(nums): 48 | if num > value: 49 | return index 50 | return -1 51 | 52 | 53 | def is_output_needed(has_output_started, start_time, lead_time, output_sentences, estimated_time_between_words, deadline_offset): 54 | cur_time = time.time() 55 | if not has_output_started and cur_time - start_time < lead_time: 56 | return False 57 | 58 | num_words_output = get_num_words(" ".join(output_sentences)) 59 | output_deadline = num_words_output * estimated_time_between_words - deadline_offset 60 | return cur_time - start_time > output_deadline 61 | 62 | def is_output_long_enough(output, min_output_length): 63 | num_words = get_num_words(output) 64 | return (num_words >= min_output_length) 65 | 66 | def get_fragment(llm_buffer, min_output_length): 67 | delimiter_index = find_last_preferred_fragment_delimiter(llm_buffer) 68 | if delimiter_index != -1 and is_output_long_enough(llm_buffer[:delimiter_index], min_output_length): 69 | return llm_buffer[:delimiter_index + 1] 70 | 71 | delimiter_index = find_last_fragment_delimiter(llm_buffer) 72 | if delimiter_index != -1 and is_output_long_enough(llm_buffer[:delimiter_index], min_output_length): 73 | return llm_buffer[:delimiter_index + 1] 74 | return "" 75 | 76 | def get_sentences_needed_for_min_length(sentences_on_buffer, min_output_length): 77 | word_lengths_of_sentences = list(map(get_num_words, sentences_on_buffer)) 78 | sums_of_word_lens = list(accumulate(word_lengths_of_sentences)) 79 | return find_first_greater(sums_of_word_lens, min_output_length) + 1 80 | 81 | 82 | def generate_sentences_time_based( 83 | generator, 84 | lead_time = 1, 85 | max_wait_for_fragments = [3, 2], 86 | target_tps = 4, 87 | min_output_lengths = [2, 3, 3, 4], 88 | preferred_sentence_fragment_delimiters = ['. ', '? ', '! ', '\n'], 89 | sentence_fragment_delimiters = ['; ', ': ', ', ', '* ', '**', '– '], 90 | delimiter_ignore_prefixes = DELIMITER_IGNORE_PREFIXES, 91 | wait_for_if_non_fragment = AVOID_PAUSE_WORDS, 92 | deadline_offsets_static = [1], 93 | deadline_offsets_dynamic = [0], 94 | ): 95 | """ 96 | Uses a time based strategy to determine whether to yield. A target tps is provided, 97 | and when the outputted values are approaching the "deadline" where output will lag behind 98 | the target then yield best available option. 99 | 100 | Args: 101 | generator (Iterator[str]): A generator that yields chunks of text as a stream of characters. 102 | lead_time (float): amount of time in seconds to wait for the buffer to build for before returning values. 103 | Default is 1. 104 | max_wait_for_fragments (float): Max amount of time in seconds that the Nth sentence will wait beyond the 105 | "deadline" for a "fragment" (text preceeding a fragment delimiter), which is preferred over a piece of buffer. 106 | The last value in the array is used for all subsequent checks. 107 | Default is [3, 2]. 108 | target_tps (float): the rate in tokens per second you want to use to calculate output deadlines. 109 | Default is 4. (approximately the speed of human speech) 110 | min_output_lengths (int[]]): An array that corresponds to the minimum output size in words 111 | for the corresponding output sentence, the last value in the array is used for all remaining output. 112 | For example [4,5,6] would mean the first piece of output must have 4 words, the second 5 words, and all subsequent 6. 
113 | Default is [2, 3, 3, 4]
114 | preferred_sentence_fragment_delimiters (str[]): Array of strings that delineate a sentence fragment. "Preferred"
115 | delimiters are checked first and are always used over the other fragment delimiters if the fragment meets the length requirement.
116 | Note the trailing spaces, added to differentiate between values like $3.5 and a proper sentence end.
117 | Default is ['. ', '? ', '! ', '\n']
118 | sentence_fragment_delimiters (str[]): Array of strings that are checked after the "preferred" delimiters.
119 | Default is ['; ', ': ', ', ', '* ', '**', '– ']
120 | delimiter_ignore_prefixes (str[]): Array of strings that cause a delimiter to be ignored when it is immediately preceded by one of them.
121 | Used to ignore common abbreviations such as Mr., Dr., and Mrs., where we don't want to split.
122 | Default is a long list documented in delimiter_ignore_prefixes.py
123 | wait_for_if_non_fragment (str[]): Array of strings that the algorithm will not use as the last word if the whole buffer
124 | is being output. Avoids awkward pauses on common words that are unnatural to pause at.
125 | Default is a long list of common words documented in avoid_pause_words.py
126 | deadline_offsets_static (float[]): Constant amount of time in seconds to subtract from the deadline for the first n sentences.
127 | Last value applied to all subsequent sentences.
128 | Default is [1].
129 | deadline_offsets_dynamic (float[]): Added to account for the time it takes a TTS engine to generate output.
130 | For example, if it takes your TTS engine around 1 second to generate 10 words, you can use a value of 0.1
131 | so that the TTS generation time is included in the deadline. Applied to the first n sentences, last value applied to all subsequent.
132 | Default is [0].
133 | Yields:
134 | Iterator[str]: An iterator of complete sentences constructed from the
135 | input text stream.
136 | """ 137 | global preferred_sentence_fragment_delimiters_global, sentence_fragment_delimiters_global, delimiter_ignore_prefixes_global 138 | preferred_sentence_fragment_delimiters_global = set(preferred_sentence_fragment_delimiters) 139 | sentence_fragment_delimiters_global = set(sentence_fragment_delimiters) 140 | delimiter_ignore_prefixes_global = set(delimiter_ignore_prefixes) 141 | punkt_sentence_tokenizer = PunktSentenceTokenizer() 142 | 143 | start_time = time.time() 144 | last_sentence_time = time.time() 145 | estimated_time_between_words = 1 / (target_tps * WORDS_PER_TOKEN) 146 | output_sentences = [] 147 | llm_buffer_full = "" 148 | has_output_started = False 149 | 150 | 151 | def handle_output(output, sentence_boundary_index=None): 152 | nonlocal has_output_started, llm_buffer_full, output_sentences, min_output_lengths, start_time, last_sentence_time 153 | if not has_output_started: 154 | #once output has started we go based on TTS start for deadline 155 | start_time = time.time() 156 | has_output_started = True 157 | 158 | end_index = len(output) 159 | if sentence_boundary_index != None: 160 | end_index = sentence_boundary_index 161 | llm_buffer_full = llm_buffer_full[end_index:] 162 | output_sentences.append(output) 163 | last_sentence_time = time.time() 164 | return output 165 | 166 | for token in generator: 167 | llm_buffer_full += token 168 | llm_buffer_full = llm_buffer_full.lstrip() 169 | if len(llm_buffer_full.split(None, 2)) < 2: 170 | #must have at least two words since last token may not be a full word 171 | continue 172 | 173 | llm_buffer = llm_buffer_full.rsplit(" ", 1)[0] #remove last word 174 | 175 | #TODO edge case with disagreement, how to identify and use len(output) as fallback? 176 | sentences_on_buffer = nltk.tokenize.sent_tokenize(llm_buffer) 177 | sentence_boundaries = list(punkt_sentence_tokenizer.span_tokenize(llm_buffer_full)) #handle white space descrepancies in full_buffer and buffer after split() 178 | 179 | num_sentences_output = len(output_sentences) 180 | min_output_length = get_index_or_last(min_output_lengths, num_sentences_output) 181 | sentences_needed_for_min_len = get_sentences_needed_for_min_length(sentences_on_buffer, min_output_length) 182 | 183 | current_output = llm_buffer 184 | use_first_sentence = len(sentences_on_buffer) > 1 and is_output_long_enough(sentences_on_buffer[0], min_output_length) 185 | if use_first_sentence: 186 | current_output = sentences_on_buffer[0] 187 | else: 188 | current_fragment = get_fragment(llm_buffer, min_output_length) 189 | if current_fragment != "": 190 | current_output = current_fragment 191 | 192 | num_words_for_offset = get_num_words(current_output) 193 | deadline_offset_dynamic = get_index_or_last(deadline_offsets_dynamic, num_sentences_output) 194 | deadline_offset_static = get_index_or_last(deadline_offsets_static, num_sentences_output) 195 | deadline_offset = (num_words_for_offset * deadline_offset_dynamic) + deadline_offset_static 196 | 197 | output_needed = is_output_needed(has_output_started, start_time, lead_time, output_sentences, estimated_time_between_words, deadline_offset) 198 | if output_needed and use_first_sentence: 199 | end_index = len(sentences_on_buffer[0]) if len(sentence_boundaries) == 1 else sentence_boundaries[1][0] #edge case where sentence_boundaries disagrees with nltk.tokenize.sent_tokenize 200 | yield handle_output(sentences_on_buffer[0], end_index) 201 | elif output_needed: 202 | output = current_fragment 203 | if output == "": 204 | output = llm_buffer 205 | 
is_not_min_length = get_num_words(output) < min_output_length 206 | max_wait_for_fragment = get_index_or_last(max_wait_for_fragments, num_sentences_output) 207 | waiting_for_fragment = (time.time() - last_sentence_time < max_wait_for_fragment) 208 | if " " in output: 209 | _, last_word = output.rsplit(" ", 1) 210 | else: 211 | last_word = output 212 | last_word_avoid_pause = last_word in wait_for_if_non_fragment 213 | 214 | if is_not_min_length or waiting_for_fragment or last_word_avoid_pause: 215 | continue 216 | 217 | yield handle_output(output) 218 | else: 219 | if sentences_needed_for_min_len == 0 or sentences_needed_for_min_len + 2 > len(sentences_on_buffer): 220 | #two sentences ahead is ideal 221 | continue 222 | end_index = sentence_boundaries[sentences_needed_for_min_len][0] 223 | output = " ".join(sentences_on_buffer[:sentences_needed_for_min_len]) 224 | yield handle_output(output, end_index) 225 | 226 | #after all tokens are processed yield whatever is left 227 | for sentence in nltk.tokenize.sent_tokenize(llm_buffer_full): 228 | yield sentence 229 | 230 | 231 | -------------------------------------------------------------------------------- /tests/run_test.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | :: switch to current execution directory 4 | cd /d %~dp0 5 | 6 | TITLE basic stream2sentence test 7 | python test_stream2sentence.py 8 | pause -------------------------------------------------------------------------------- /tests/simpletest.py: -------------------------------------------------------------------------------- 1 | from stream2sentence import generate_sentences 2 | 3 | def generator(): 4 | yield """No, the way it "cuts midway" is NOT like the audio is cut abruptly (like when you pause a video). You can check below the audio (sorry for not doing that earlier)""" 5 | # yield "Hallo, " 6 | # yield "wie geht es dir? " 7 | # yield "Mir geht es gut." 
8 | # expected = ["Hallo,", "wie geht es dir?", "Mir geht es gut."] 9 | sentences = list(generate_sentences(generator(), minimum_sentence_length = 3, context_size=5, minimum_first_fragment_length = 3, quick_yield_single_sentence_fragment=True, debug=True)) 10 | print(sentences) -------------------------------------------------------------------------------- /tests/test_data/1.txt: -------------------------------------------------------------------------------- 1 | Here!@#0.0000 2 | 's!@#0.1226 3 | an!@#0.2501 4 | article!@#0.3877 5 | on!@#0.5165 6 | contemporary!@#0.6441 7 | sculptures!@#0.7736 8 | : 9 | 10 | !@#0.9468 11 | **!@#1.1154 12 | The!@#1.2851 13 | Vibr!@#1.4344 14 | ant!@#1.5733 15 | World!@#1.7188 16 | of!@#1.8563 17 | Contemporary!@#1.9919 18 | Sculpt!@#2.1275 19 | ure!@#2.2698 20 | ** 21 | 22 | !@#2.4197 23 | Cont!@#2.5498 24 | emporary!@#2.6798 25 | sculpture!@#2.8095 26 | has!@#2.9456 27 | evolved!@#3.0844 28 | to!@#3.2220 29 | become!@#3.3618 30 | a!@#3.5000 31 | diverse!@#3.6489 32 | and!@#3.7822 33 | exciting!@#3.9078 34 | field!@#4.0370 35 | ,!@#4.1796 36 | pushing!@#4.3138 37 | the!@#4.4497 38 | boundaries!@#4.5878 39 | of!@#4.7236 40 | what!@#4.8613 41 | art!@#5.0025 42 | can!@#5.1396 43 | be!@#5.2834 44 | .!@#5.4285 45 | From!@#5.5672 46 | the!@#5.7131 47 | abstract!@#5.8658 48 | expression!@#5.9940 49 | ist!@#6.1250 50 | sculptures!@#6.2582 51 | of!@#6.3902 52 | the!@#6.5205 53 | !@#6.6523 54 | 195!@#6.7829 55 | 0!@#6.9109 56 | s!@#7.0437 57 | to!@#7.1729 58 | the!@#7.3029 59 | cutting!@#7.4321 60 | -edge!@#7.5642 61 | installations!@#7.7011 62 | of!@#7.8383 63 | today!@#7.9776 64 | ,!@#8.1163 65 | contemporary!@#8.2550 66 | sculpture!@#8.3987 67 | offers!@#8.5447 68 | a!@#8.6847 69 | wide!@#8.8347 70 | range!@#8.9662 71 | of!@#9.0988 72 | styles!@#9.2286 73 | ,!@#9.3615 74 | materials!@#9.4918 75 | ,!@#9.6230 76 | and!@#9.7524 77 | ideas!@#9.8841 78 | . 79 | 80 | !@#10.0167 81 | **!@#10.1456 82 | Key!@#10.2788 83 | Trends!@#10.4097 84 | in!@#10.5402 85 | Contemporary!@#10.6720 86 | Sculpt!@#10.8059 87 | ure!@#10.9369 88 | ** 89 | 90 | !@#11.0699 91 | Some!@#11.2104 92 | of!@#11.3505 93 | the!@#11.4903 94 | most!@#11.6298 95 | notable!@#11.7702 96 | trends!@#11.9045 97 | in!@#12.0441 98 | contemporary!@#12.1864 99 | sculpture!@#12.3252 100 | include!@#12.4633 101 | : 102 | 103 | !@#12.6048 104 | *!@#12.7449 105 | **!@#12.8872 106 | Abstract!@#13.0307 107 | Sculpt!@#13.1650 108 | ure!@#13.3063 109 | **:!@#13.4519 110 | Abstract!@#13.5971 111 | sculpture!@#13.7428 112 | emphasizes!@#13.8939 113 | the!@#14.0357 114 | idea!@#14.1810 115 | or!@#14.3167 116 | concept!@#14.4647 117 | behind!@#14.6148 118 | a!@#14.7596 119 | work!@#14.9109 120 | ,!@#15.0604 121 | often!@#15.2102 122 | over!@#15.3408 123 | its!@#15.4755 124 | physical!@#15.6045 125 | form!@#15.7351 126 | . 127 | !@#15.8680 128 | *!@#16.0013 129 | **!@#16.1327 130 | Environmental!@#16.2669 131 | Sculpt!@#16.3957 132 | ure!@#16.5278 133 | **:!@#16.6626 134 | Environmental!@#16.7933 135 | sculpture!@#16.9237 136 | explores!@#17.0570 137 | the!@#17.1869 138 | relationship!@#17.3189 139 | between!@#17.4513 140 | art!@#17.5833 141 | and!@#17.7155 142 | the!@#17.8465 143 | natural!@#17.9796 144 | world!@#18.1110 145 | . 
146 | !@#18.2436 147 | *!@#18.3752 148 | **!@#18.5068 149 | Performance!@#18.6484 150 | Sculpt!@#18.7892 151 | ure!@#18.9265 152 | **:!@#19.0698 153 | Performance!@#19.2081 154 | sculpture!@#19.3479 155 | involves!@#19.4869 156 | the!@#19.6276 157 | artist!@#19.7690 158 | 's!@#19.9091 159 | body!@#20.0474 160 | and!@#20.1885 161 | the!@#20.3252 162 | viewer!@#20.4667 163 | 's!@#20.6114 164 | experience!@#20.7561 165 | ,!@#20.8950 166 | creating!@#21.0312 167 | a!@#21.1801 168 | new!@#21.3251 169 | kind!@#21.4727 170 | of!@#21.6204 171 | dialogue!@#21.7732 172 | between!@#21.9129 173 | artist!@#22.0510 174 | and!@#22.1929 175 | audience!@#22.3374 176 | . 177 | !@#22.4842 178 | *!@#22.6363 179 | **!@#22.7843 180 | Digital!@#22.9376 181 | Sculpt!@#23.0920 182 | ure!@#23.2494 183 | **:!@#23.3886 184 | Digital!@#23.5309 185 | sculpture!@#23.6705 186 | uses!@#23.8126 187 | new!@#23.9498 188 | technologies!@#24.0893 189 | and!@#24.2271 190 | materials!@#24.3666 191 | to!@#24.5049 192 | create!@#24.6488 193 | innovative!@#24.7860 194 | and!@#24.9320 195 | interactive!@#25.0819 196 | works!@#25.2293 197 | . 198 | 199 | !@#25.3684 200 | **!@#25.5160 201 | Not!@#25.6669 202 | able!@#25.8172 203 | Contemporary!@#25.9579 204 | Sculpt!@#26.1009 205 | ors!@#26.2416 206 | ** 207 | 208 | !@#26.3834 209 | Some!@#26.5225 210 | notable!@#26.6635 211 | contemporary!@#26.8027 212 | sculpt!@#26.9417 213 | ors!@#27.0801 214 | include!@#27.2189 215 | : 216 | 217 | !@#27.3561 218 | *!@#27.4985 219 | **!@#27.6449 220 | An!@#27.7858 221 | ish!@#27.9271 222 | Kapoor!@#28.0725 223 | **:!@#28.2208 224 | A!@#28.3691 225 | British!@#28.5163 226 | sculpt!@#28.6589 227 | or!@#28.7991 228 | known!@#28.9478 229 | for!@#29.0967 230 | his!@#29.2488 231 | large!@#29.4031 232 | -scale!@#29.5505 233 | ,!@#29.6869 234 | site!@#29.8201 235 | -specific!@#29.9524 236 | works!@#30.0846 237 | that!@#30.2212 238 | explore!@#30.3509 239 | the!@#30.4849 240 | relationship!@#30.6181 241 | between!@#30.7513 242 | art!@#30.8848 243 | and!@#31.0175 244 | the!@#31.1494 245 | natural!@#31.2831 246 | world!@#31.4145 247 | . 248 | !@#31.5461 249 | *!@#31.6766 250 | **!@#31.8114 251 | Richard!@#31.9529 252 | S!@#32.0948 253 | erra!@#32.2339 254 | **:!@#32.3738 255 | An!@#32.5118 256 | American!@#32.6533 257 | sculpt!@#32.7928 258 | or!@#32.9342 259 | who!@#33.0802 260 | has!@#33.2155 261 | created!@#33.3658 262 | many!@#33.5115 263 | of!@#33.6634 264 | the!@#33.8030 265 | most!@#33.9475 266 | iconic!@#34.0965 267 | public!@#34.2502 268 | sculptures!@#34.4086 269 | of!@#34.5414 270 | the!@#34.6778 271 | past!@#34.8118 272 | century!@#34.9442 273 | ,!@#35.0773 274 | including!@#35.2099 275 | the!@#35.3420 276 | "!@#35.4783 277 | T!@#35.6104 278 | ilt!@#35.7445 279 | ed!@#35.8798 280 | Arc!@#36.0139 281 | "!@#36.1462 282 | in!@#36.2803 283 | New!@#36.4118 284 | York!@#36.5461 285 | City!@#36.6782 286 | . 
287 | !@#36.8135 288 | *!@#36.9487 289 | **!@#37.0810 290 | Y!@#37.2250 291 | ay!@#37.3674 292 | oi!@#37.5118 293 | K!@#37.6547 294 | us!@#37.7991 295 | ama!@#37.9404 296 | **:!@#38.0820 297 | A!@#38.2237 298 | Japanese!@#38.3635 299 | sculpt!@#38.5041 300 | or!@#38.6472 301 | known!@#38.7884 302 | for!@#38.9278 303 | her!@#39.0739 304 | immersive!@#39.2198 305 | installations!@#39.3672 306 | and!@#39.5015 307 | sculptures!@#39.6478 308 | that!@#39.7983 309 | explore!@#39.9441 310 | themes!@#40.0941 311 | of!@#40.2445 312 | infinity!@#40.3960 313 | and!@#40.5367 314 | self!@#40.6840 315 | -!@#40.8238 316 | obl!@#40.9735 317 | iteration!@#41.1250 318 | . 319 | !@#41.2744 320 | *!@#41.4277 321 | **!@#41.5806 322 | C!@#41.7402 323 | ai!@#41.8775 324 | Gu!@#42.0240 325 | o!@#42.1668 326 | -Q!@#42.3061 327 | iang!@#42.4487 328 | **:!@#42.5885 329 | A!@#42.7288 330 | Chinese!@#42.8691 331 | sculpt!@#43.0131 332 | or!@#43.1547 333 | who!@#43.2946 334 | has!@#43.4403 335 | created!@#43.5861 336 | many!@#43.7251 337 | innovative!@#43.8729 338 | and!@#44.0209 339 | interactive!@#44.1735 340 | works!@#44.3239 341 | using!@#44.4713 342 | fire!@#44.6138 343 | ,!@#44.7615 344 | oil!@#44.9123 345 | ,!@#45.0642 346 | and!@#45.2173 347 | other!@#45.3722 348 | materials!@#45.5095 349 | . 350 | 351 | !@#45.6437 352 | **!@#45.7766 353 | M!@#45.9128 354 | use!@#46.0471 355 | ums!@#46.1823 356 | and!@#46.3172 357 | Gall!@#46.4608 358 | eries!@#46.6032 359 | ** 360 | 361 | !@#46.7450 362 | Several!@#46.8844 363 | museums!@#47.0281 364 | and!@#47.1705 365 | galleries!@#47.3114 366 | showcase!@#47.4544 367 | contemporary!@#47.5988 368 | sculpture!@#47.7454 369 | from!@#47.8902 370 | around!@#48.0350 371 | the!@#48.1867 372 | world!@#48.3355 373 | .!@#48.4871 374 | Some!@#48.6313 375 | notable!@#48.7799 376 | institutions!@#48.9345 377 | include!@#49.0863 378 | : 379 | 380 | !@#49.2394 381 | *!@#49.3938 382 | **!@#49.5343 383 | The!@#49.6766 384 | Museum!@#49.8181 385 | of!@#49.9603 386 | Modern!@#50.1021 387 | Art!@#50.2454 388 | (!@#50.3880 389 | Mo!@#50.5310 390 | MA!@#50.6792 391 | )!@#50.8176 392 | -!@#50.9679 393 | New!@#51.1190 394 | York!@#51.2691 395 | City!@#51.4234 396 | ,!@#51.5667 397 | USA!@#51.7136 398 | ** 399 | !@#51.8644 400 | *!@#52.0164 401 | **!@#52.1699 402 | The!@#52.3167 403 | Tate!@#52.4514 404 | Modern!@#52.5861 405 | -!@#52.7210 406 | London!@#52.8571 407 | ,!@#52.9923 408 | UK!@#53.1277 409 | ** 410 | !@#53.2647 411 | *!@#53.4011 412 | **!@#53.5373 413 | The!@#53.6710 414 | Whitney!@#53.8092 415 | Museum!@#53.9437 416 | of!@#54.0809 417 | American!@#54.2188 418 | Art!@#54.3516 419 | -!@#54.4858 420 | New!@#54.6227 421 | York!@#54.7565 422 | City!@#54.8910 423 | ,!@#55.0285 424 | USA!@#55.1626 425 | ** 426 | !@#55.2999 427 | *!@#55.4347 428 | **!@#55.5705 429 | The!@#55.7071 430 | G!@#55.8412 431 | ugg!@#55.9770 432 | enheim!@#56.1141 433 | Museum!@#56.2491 434 | -!@#56.3840 435 | New!@#56.5201 436 | York!@#56.6551 437 | City!@#56.7900 438 | ,!@#56.9246 439 | USA!@#57.0604 440 | ** 441 | !@#57.1959 442 | *!@#57.3315 443 | **!@#57.4712 444 | The!@#57.6027 445 | Centre!@#57.7391 446 | Pom!@#57.8743 447 | pid!@#58.0085 448 | ou!@#58.1437 449 | -!@#58.2805 450 | Paris!@#58.4164 451 | ,!@#58.5513 452 | France!@#58.6879 453 | ** 454 | 455 | !@#58.8225 456 | **!@#58.9584 457 | Outdoor!@#59.0925 458 | Sculpt!@#59.2278 459 | ure!@#59.3642 460 | ** 461 | 462 | !@#59.5006 463 | Many!@#59.6365 464 | contemporary!@#59.7759 465 | sculpt!@#59.9112 466 | ors!@#60.0488 467 | have!@#60.1859 468 | 
also!@#60.3198 469 | been!@#60.4530 470 | exploring!@#60.5910 471 | the!@#60.7262 472 | relationship!@#60.8612 473 | between!@#60.9950 474 | art!@#61.1309 475 | and!@#61.2667 476 | the!@#61.4034 477 | natural!@#61.5384 478 | world!@#61.6753 479 | ,!@#61.8115 480 | creating!@#61.9472 481 | large!@#62.0843 482 | -scale!@#62.2215 483 | outdoor!@#62.3586 484 | sculptures!@#62.4908 485 | that!@#62.6285 486 | interact!@#62.7639 487 | with!@#62.9010 488 | their!@#63.0404 489 | surroundings!@#63.1746 490 | .!@#63.3122 491 | Some!@#63.4490 492 | notable!@#63.5847 493 | examples!@#63.7260 494 | include!@#63.8695 495 | : 496 | 497 | !@#64.0111 498 | *!@#64.1550 499 | **!@#64.2985 500 | An!@#64.4417 501 | ish!@#64.5855 502 | Kapoor!@#64.7323 503 | 's!@#64.8747 504 | "!@#65.0252 505 | Cloud!@#65.1754 506 | Gate!@#65.3307 507 | "!@#65.4791 508 | (!@#65.6280 509 | Chicago!@#65.7825 510 | ,!@#65.9369 511 | USA!@#66.0865 512 | )**!@#66.2242 513 | :!@#66.3616 514 | A!@#66.4975 515 | stainless!@#66.6360 516 | steel!@#66.7713 517 | sculpture!@#66.9128 518 | that!@#67.0503 519 | reflects!@#67.1859 520 | the!@#67.3216 521 | sky!@#67.4600 522 | and!@#67.5953 523 | the!@#67.7308 524 | city!@#67.8685 525 | ,!@#68.0034 526 | creating!@#68.1404 527 | a!@#68.2781 528 | dynamic!@#68.4122 529 | and!@#68.5475 530 | interactive!@#68.6851 531 | experience!@#68.8203 532 | . 533 | !@#68.9578 534 | *!@#69.0957 535 | **!@#69.2326 536 | Richard!@#69.3767 537 | S!@#69.5275 538 | erra!@#69.6715 539 | 's!@#69.8214 540 | "!@#69.9642 541 | T!@#70.1109 542 | ilt!@#70.2575 543 | ed!@#70.4032 544 | Arc!@#70.5552 545 | "!@#70.7009 546 | (!@#70.8488 547 | Washington!@#70.9965 548 | ,!@#71.1406 549 | D!@#71.2898 550 | .C!@#71.4363 551 | .,!@#71.5848 552 | USA!@#71.7333 553 | )**!@#71.8761 554 | :!@#72.0234 555 | A!@#72.1714 556 | large!@#72.3172 557 | -scale!@#72.4626 558 | steel!@#72.6113 559 | sculpture!@#72.7569 560 | that!@#72.9050 561 | explores!@#73.0512 562 | the!@#73.1978 563 | relationship!@#73.3465 564 | between!@#73.4904 565 | art!@#73.6389 566 | and!@#73.7908 567 | the!@#73.9402 568 | natural!@#74.0907 569 | world!@#74.2410 570 | . 571 | !@#74.3891 572 | *!@#74.5328 573 | **!@#74.6820 574 | Y!@#74.8354 575 | ay!@#74.9866 576 | oi!@#75.1384 577 | K!@#75.2882 578 | us!@#75.4411 579 | ama!@#75.5917 580 | 's!@#75.7447 581 | "!@#75.8970 582 | Infinity!@#76.0525 583 | Room!@#76.2165 584 | "!@#76.3657 585 | (!@#76.5154 586 | Tok!@#76.6622 587 | yo!@#76.8155 588 | ,!@#76.9586 589 | Japan!@#77.1104 590 | )**!@#77.2614 591 | :!@#77.4148 592 | An!@#77.5702 593 | immersive!@#77.7253 594 | installation!@#77.8788 595 | that!@#78.0346 596 | creates!@#78.1909 597 | a!@#78.3488 598 | sense!@#78.5043 599 | of!@#78.6621 600 | infinity!@#78.8225 601 | and!@#78.9936 602 | self!@#79.1372 603 | -!@#79.2866 604 | obl!@#79.4350 605 | iteration!@#79.5810 606 | . 
607 | 608 | !@#79.7277 609 | **!@#79.8714 610 | Conclusion!@#80.0149 611 | ** 612 | 613 | !@#80.1605 614 | Cont!@#80.3064 615 | emporary!@#80.4560 616 | sculpture!@#80.6023 617 | is!@#80.7513 618 | a!@#80.9028 619 | vibrant!@#81.0455 620 | and!@#81.1920 621 | diverse!@#81.3430 622 | field!@#81.4963 623 | ,!@#81.6491 624 | pushing!@#81.8071 625 | the!@#81.9552 626 | boundaries!@#82.1056 627 | of!@#82.2579 628 | what!@#82.4122 629 | art!@#82.5736 630 | can!@#82.7287 631 | be!@#82.8881 632 | .!@#83.0347 633 | From!@#83.1812 634 | abstract!@#83.3246 635 | expression!@#83.4747 636 | ist!@#83.6213 637 | to!@#83.7678 638 | environmental!@#83.9099 639 | ,!@#84.0592 640 | performance!@#84.2091 641 | ,!@#84.3536 642 | and!@#84.5056 643 | digital!@#84.6620 644 | sculpture!@#84.8152 645 | !@#84.8169 646 | -------------------------------------------------------------------------------- /tests/test_data/2.txt: -------------------------------------------------------------------------------- 1 | Here!@#0.0000 2 | 's!@#0.1291 3 | an!@#0.2670 4 | article!@#0.4081 5 | on!@#0.5451 6 | the!@#0.7283 7 | use!@#0.9064 8 | of!@#1.0996 9 | food!@#1.2352 10 | in!@#1.3745 11 | contemporary!@#1.5210 12 | art!@#1.6654 13 | : 14 | 15 | !@#1.8154 16 | **!@#1.9700 17 | Food!@#2.1085 18 | for!@#2.2481 19 | Thought!@#2.3884 20 | :!@#2.5320 21 | The!@#2.6773 22 | Rise!@#2.8268 23 | of!@#2.9765 24 | Ed!@#3.1370 25 | ible!@#3.2736 26 | Art!@#3.4119 27 | ** 28 | 29 | !@#3.5607 30 | Food!@#3.7067 31 | has!@#3.8558 32 | long!@#4.0117 33 | been!@#4.1498 34 | a!@#4.2885 35 | vital!@#4.4289 36 | part!@#4.5674 37 | of!@#4.7120 38 | human!@#4.8603 39 | culture!@#5.0081 40 | ,!@#5.1609 41 | but!@#5.3157 42 | in!@#5.4572 43 | contemporary!@#5.5956 44 | art!@#5.7346 45 | ,!@#5.8760 46 | it!@#6.0168 47 | 's!@#6.1559 48 | taking!@#6.2948 49 | center!@#6.4425 50 | stage!@#6.5885 51 | .!@#6.7329 52 | Ed!@#6.8783 53 | ible!@#7.0251 54 | art!@#7.1726 55 | ,!@#7.3175 56 | also!@#7.4698 57 | known!@#7.6227 58 | as!@#7.7795 59 | food!@#7.9321 60 | art!@#8.0834 61 | or!@#8.2418 62 | food!@#8.3976 63 | sculpture!@#8.5353 64 | ,!@#8.6745 65 | has!@#8.8140 66 | become!@#8.9518 67 | a!@#9.0910 68 | popular!@#9.2312 69 | medium!@#9.3706 70 | for!@#9.5106 71 | artists!@#9.6507 72 | to!@#9.7900 73 | express!@#9.9323 74 | themselves!@#10.0706 75 | .!@#10.2095 76 | From!@#10.3509 77 | delicate!@#10.4895 78 | past!@#10.6341 79 | ries!@#10.7826 80 | to!@#10.9294 81 | massive!@#11.0830 82 | installations!@#11.2403 83 | ,!@#11.3818 84 | food!@#11.5188 85 | is!@#11.6601 86 | being!@#11.7985 87 | used!@#11.9401 88 | in!@#12.0791 89 | innovative!@#12.2162 90 | and!@#12.3541 91 | thought!@#12.4953 92 | -pro!@#12.6354 93 | v!@#12.7769 94 | oking!@#12.9180 95 | ways!@#13.0643 96 | . 
97 | 98 | !@#13.2131 99 | **!@#13.3609 100 | Why!@#13.5054 101 | Food!@#13.6549 102 | in!@#13.8035 103 | Art!@#13.9531 104 | ?!@#14.1019 105 | ** 106 | 107 | !@#14.2548 108 | So!@#14.4013 109 | ,!@#14.5550 110 | why!@#14.7113 111 | are!@#14.8698 112 | artists!@#15.0281 113 | turning!@#15.1746 114 | to!@#15.3307 115 | food!@#15.4889 116 | as!@#15.6485 117 | a!@#15.8132 118 | medium!@#15.9617 119 | ?!@#16.1114 120 | One!@#16.2568 121 | reason!@#16.4044 122 | is!@#16.5538 123 | that!@#16.7010 124 | food!@#16.8499 125 | is!@#16.9999 126 | a!@#17.1485 127 | universal!@#17.3011 128 | language!@#17.4569 129 | ,!@#17.6079 130 | understood!@#17.7599 131 | by!@#17.9183 132 | everyone!@#18.0792 133 | .!@#18.2383 134 | It!@#18.3840 135 | 's!@#18.5319 136 | also!@#18.6825 137 | a!@#18.8407 138 | very!@#18.9869 139 | tactile!@#19.1376 140 | medium!@#19.2814 141 | ,!@#19.4333 142 | inviting!@#19.5854 143 | the!@#19.7420 144 | viewer!@#19.8919 145 | to!@#20.0541 146 | touch!@#20.1965 147 | and!@#20.3393 148 | explore!@#20.4800 149 | .!@#20.6195 150 | Additionally!@#20.7637 151 | ,!@#20.9042 152 | food!@#21.0461 153 | is!@#21.1952 154 | a!@#21.3444 155 | fleeting!@#21.4926 156 | and!@#21.6390 157 | ephem!@#21.7863 158 | eral!@#21.9357 159 | medium!@#22.0825 160 | ,!@#22.2302 161 | which!@#22.3859 162 | can!@#22.5435 163 | be!@#22.6963 164 | both!@#22.8544 165 | beautiful!@#23.0150 166 | and!@#23.1631 167 | imper!@#23.3053 168 | manent!@#23.4459 169 | . 170 | 171 | !@#23.5868 172 | **!@#23.7269 173 | Not!@#23.8668 174 | able!@#24.0074 175 | Examples!@#24.1482 176 | of!@#24.3003 177 | Ed!@#24.4497 178 | ible!@#24.5968 179 | Art!@#24.7459 180 | ** 181 | 182 | !@#24.8906 183 | Some!@#25.0386 184 | notable!@#25.1896 185 | examples!@#25.3372 186 | of!@#25.4905 187 | edible!@#25.6501 188 | art!@#25.8008 189 | include!@#25.9580 190 | : 191 | 192 | !@#26.1173 193 | *!@#26.2648 194 | **!@#26.4085 195 | Mar!@#26.5489 196 | ina!@#26.6924 197 | Abram!@#26.8362 198 | ovic!@#26.9776 199 | 's!@#27.1176 200 | "!@#27.2601 201 | The!@#27.4034 202 | Artist!@#27.5470 203 | is!@#27.6862 204 | Present!@#27.8277 205 | "!@#27.9694 206 | **:!@#28.1144 207 | A!@#28.2557 208 | performance!@#28.3975 209 | art!@#28.5364 210 | piece!@#28.6790 211 | where!@#28.8231 212 | Abram!@#28.9639 213 | ovic!@#29.1056 214 | sat!@#29.2488 215 | silently!@#29.3917 216 | for!@#29.5339 217 | !@#29.6784 218 | 736!@#29.8212 219 | hours!@#29.9615 220 | ,!@#30.1045 221 | with!@#30.2472 222 | visitors!@#30.3897 223 | seated!@#30.5325 224 | across!@#30.6736 225 | from!@#30.8151 226 | her!@#30.9557 227 | .!@#31.0981 228 | She!@#31.2409 229 | offered!@#31.3845 230 | them!@#31.5315 231 | a!@#31.6804 232 | cup!@#31.8266 233 | of!@#31.9751 234 | tea!@#32.1220 235 | ,!@#32.2683 236 | a!@#32.4190 237 | gesture!@#32.5714 238 | that!@#32.7222 239 | symbol!@#32.8796 240 | ized!@#33.0387 241 | the!@#33.1913 242 | connection!@#33.3490 243 | between!@#33.5093 244 | the!@#33.6725 245 | artist!@#33.8133 246 | and!@#33.9548 247 | the!@#34.0954 248 | viewer!@#34.2381 249 | . 
250 | !@#34.3812 251 | *!@#34.5229 252 | **!@#34.6660 253 | Y!@#34.8181 254 | ay!@#34.9694 255 | oi!@#35.1210 256 | K!@#35.2743 257 | us!@#35.4256 258 | ama!@#35.5779 259 | 's!@#35.7264 260 | "!@#35.8770 261 | Infinity!@#36.0282 262 | Room!@#36.1780 263 | "!@#36.3331 264 | **:!@#36.4841 265 | An!@#36.6342 266 | immersive!@#36.7865 267 | installation!@#36.9418 268 | that!@#37.0985 269 | features!@#37.2495 270 | a!@#37.3991 271 | room!@#37.5544 272 | filled!@#37.7117 273 | with!@#37.8663 274 | twink!@#38.0245 275 | ling!@#38.1842 276 | lights!@#38.3532 277 | and!@#38.5019 278 | a!@#38.6551 279 | mirror!@#38.8048 280 | ball!@#38.9632 281 | .!@#39.1201 282 | The!@#39.2852 283 | artist!@#39.4421 284 | has!@#39.6077 285 | also!@#39.7689 286 | created!@#39.9419 287 | edible!@#40.0867 288 | versions!@#40.2398 289 | of!@#40.3912 290 | this!@#40.5397 291 | installation!@#40.6887 292 | ,!@#40.8387 293 | using!@#40.9884 294 | food!@#41.1396 295 | to!@#41.2895 296 | create!@#41.4452 297 | a!@#41.6017 298 | sense!@#41.7629 299 | of!@#41.9177 300 | infinity!@#42.0786 301 | and!@#42.2391 302 | self!@#42.4013 303 | -!@#42.5431 304 | obl!@#42.6866 305 | iteration!@#42.8322 306 | . 307 | !@#42.9739 308 | *!@#43.1183 309 | **!@#43.2621 310 | El!@#43.4126 311 | An!@#43.5652 312 | ats!@#43.7149 313 | ui!@#43.8655 314 | 's!@#44.0153 315 | "!@#44.1675 316 | G!@#44.3176 317 | hana!@#44.4743 318 | ian!@#44.6219 319 | -G!@#44.7759 320 | erman!@#44.9327 321 | Sculpt!@#45.0920 322 | ure!@#45.2476 323 | "!@#45.3993 324 | **:!@#45.5616 325 | A!@#45.7217 326 | series!@#45.8843 327 | of!@#46.0410 328 | sculptures!@#46.1819 329 | made!@#46.3253 330 | from!@#46.4687 331 | discarded!@#46.6178 332 | materials!@#46.7589 333 | ,!@#46.9021 334 | including!@#47.0465 335 | bottle!@#47.1914 336 | caps!@#47.3350 337 | and!@#47.4791 338 | cans!@#47.6227 339 | .!@#47.7633 340 | The!@#47.9053 341 | artist!@#48.0505 342 | 's!@#48.2011 343 | use!@#48.3541 344 | of!@#48.5045 345 | food!@#48.6546 346 | waste!@#48.8051 347 | is!@#48.9562 348 | a!@#49.1074 349 | commentary!@#49.2596 350 | on!@#49.4145 351 | the!@#49.5682 352 | environment!@#49.7237 353 | and!@#49.8818 354 | the!@#50.0439 355 | value!@#50.2005 356 | of!@#50.3537 357 | waste!@#50.5167 358 | . 359 | 360 | !@#50.6776 361 | **!@#50.8431 362 | Food!@#51.0024 363 | as!@#51.1508 364 | a!@#51.3018 365 | Medium!@#51.4527 366 | ** 367 | 368 | !@#51.6042 369 | Food!@#51.7528 370 | can!@#51.9073 371 | be!@#52.0589 372 | used!@#52.2117 373 | in!@#52.3629 374 | a!@#52.5182 375 | variety!@#52.6658 376 | of!@#52.8260 377 | ways!@#52.9852 378 | in!@#53.1516 379 | art!@#53.3087 380 | ,!@#53.4581 381 | from!@#53.6210 382 | : 383 | 384 | !@#53.7843 385 | *!@#53.9525 386 | **!@#54.0992 387 | Food!@#54.2442 388 | sculpture!@#54.3892 389 | **:!@#54.5346 390 | Creating!@#54.6758 391 | intricate!@#54.8211 392 | and!@#54.9646 393 | beautiful!@#55.1096 394 | designs!@#55.2548 395 | using!@#55.3990 396 | food!@#55.5420 397 | ,!@#55.6861 398 | such!@#55.8309 399 | as!@#55.9743 400 | pastry!@#56.1183 401 | or!@#56.2625 402 | chocolate!@#56.4080 403 | . 
404 | !@#56.5511 405 | *!@#56.6956 406 | **!@#56.8412 407 | Food!@#56.9910 408 | installation!@#57.1421 409 | **:!@#57.2913 410 | Creating!@#57.4411 411 | large!@#57.5916 412 | -scale!@#57.7403 413 | installations!@#57.8966 414 | using!@#58.0448 415 | food!@#58.2051 416 | ,!@#58.3682 417 | such!@#58.5263 418 | as!@#58.6791 419 | a!@#58.8415 420 | massive!@#59.0074 421 | pile!@#59.1542 422 | of!@#59.2994 423 | fruit!@#59.4417 424 | or!@#59.5860 425 | a!@#59.7293 426 | giant!@#59.8727 427 | pastry!@#60.0159 428 | sculpture!@#60.1591 429 | . 430 | !@#60.3016 431 | *!@#60.4484 432 | **!@#60.5922 433 | Food!@#60.7452 434 | performance!@#60.8976 435 | **:!@#61.0511 436 | Using!@#61.2043 437 | food!@#61.3543 438 | as!@#61.5060 439 | a!@#61.6588 440 | medium!@#61.8100 441 | for!@#61.9650 442 | performance!@#62.1149 443 | art!@#62.2721 444 | ,!@#62.4291 445 | such!@#62.5763 446 | as!@#62.7383 447 | cooking!@#62.8969 448 | or!@#63.0562 449 | serving!@#63.2208 450 | meals!@#63.3757 451 | . 452 | !@#63.5320 453 | *!@#63.6891 454 | **!@#63.8514 455 | Food!@#64.0166 456 | photography!@#64.1791 457 | **:!@#64.3517 458 | Creating!@#64.5040 459 | photographs!@#64.6597 460 | that!@#64.8120 461 | showcase!@#64.9637 462 | food!@#65.1135 463 | as!@#65.2661 464 | a!@#65.4182 465 | medium!@#65.5717 466 | ,!@#65.7234 467 | such!@#65.8785 468 | as!@#66.0364 469 | still!@#66.1860 470 | -life!@#66.3477 471 | compositions!@#66.5084 472 | or!@#66.6728 473 | food!@#66.8267 474 | portraits!@#66.9800 475 | . 476 | 477 | !@#67.1441 478 | **!@#67.3057 479 | M!@#67.4717 480 | use!@#67.6287 481 | ums!@#67.7725 482 | and!@#67.9178 483 | Gall!@#68.0756 484 | eries!@#68.2276 485 | ** 486 | 487 | !@#68.3777 488 | Some!@#68.5285 489 | notable!@#68.6801 490 | museums!@#68.8306 491 | and!@#68.9859 492 | galleries!@#69.1395 493 | that!@#69.2955 494 | showcase!@#69.4555 495 | edible!@#69.6214 496 | art!@#69.7755 497 | include!@#69.9316 498 | : 499 | 500 | !@#70.0930 501 | *!@#70.2614 502 | **!@#70.4081 503 | The!@#70.5516 504 | Museum!@#70.6983 505 | of!@#70.8434 506 | Food!@#70.9891 507 | and!@#71.1355 508 | Drink!@#71.2797 509 | (!@#71.4256 510 | New!@#71.5736 511 | York!@#71.7186 512 | City!@#71.8663 513 | ,!@#72.0101 514 | USA!@#72.1561 515 | )!@#72.3011 516 | ** 517 | !@#72.4482 518 | *!@#72.5926 519 | **!@#72.7376 520 | The!@#72.8833 521 | National!@#73.0258 522 | Museum!@#73.1715 523 | of!@#73.3159 524 | Food!@#73.4595 525 | and!@#73.6060 526 | Drink!@#73.7515 527 | (!@#73.8944 528 | Washington!@#74.0395 529 | ,!@#74.1854 530 | D!@#74.3309 531 | .C!@#74.4773 532 | .,!@#74.6240 533 | USA!@#74.7709 534 | )!@#74.9159 535 | ** 536 | !@#75.0614 537 | *!@#75.2085 538 | **!@#75.3540 539 | The!@#75.4996 540 | Tate!@#75.6459 541 | Modern!@#75.7929 542 | (!@#75.9378 543 | London!@#76.0865 544 | ,!@#76.2306 545 | UK!@#76.3757 546 | )!@#76.5227 547 | ** 548 | !@#76.6677 549 | *!@#76.8127 550 | **!@#76.9582 551 | The!@#77.0993 552 | G!@#77.2485 553 | ugg!@#77.3940 554 | enheim!@#77.5403 555 | Museum!@#77.6866 556 | (!@#77.8332 557 | New!@#77.9781 558 | York!@#78.1218 559 | City!@#78.2663 560 | ,!@#78.4121 561 | USA!@#78.5587 562 | )**!@#78.7041 563 | 564 | 565 | !@#78.8521 566 | **!@#78.9974 567 | Conclusion!@#79.1419 568 | ** 569 | 570 | !@#79.2861 571 | Food!@#79.4337 572 | is!@#79.5812 573 | a!@#79.7267 574 | vital!@#79.8719 575 | part!@#80.0190 576 | of!@#80.1639 577 | human!@#80.3102 578 | culture!@#80.4574 579 | ,!@#80.6046 580 | and!@#80.7511 581 | in!@#80.8968 582 | contemporary!@#81.0454 583 | art!@#81.1931 584 | ,!@#81.3402 585 | 
it!@#81.4865 586 | 's!@#81.6336 587 | being!@#81.7784 588 | used!@#81.9241 589 | in!@#82.0702 590 | innovative!@#82.2160 591 | and!@#82.3620 592 | thought!@#82.5090 593 | -pro!@#82.6557 594 | v!@#82.8043 595 | oking!@#82.9499 596 | ways!@#83.0955 597 | .!@#83.2447 598 | From!@#83.3903 599 | delicate!@#83.5389 600 | past!@#83.6943 601 | ries!@#83.8487 602 | to!@#84.0024 603 | massive!@#84.1551 604 | installations!@#84.3086 605 | ,!@#84.4645 606 | food!@#84.6168 607 | is!@#84.7742 608 | a!@#84.9309 609 | medium!@#85.0896 610 | that!@#85.2493 611 | invites!@#85.4137 612 | the!@#85.5764 613 | viewer!@#85.7299 614 | to!@#85.8941 615 | touch!@#86.0596 616 | ,!@#86.2285 617 | taste!@#86.3925 618 | ,!@#86.5396 619 | and!@#86.6859 620 | explore!@#86.8322 621 | .!@#86.9794 622 | As!@#87.1264 623 | the!@#87.2733 624 | use!@#87.4292 625 | of!@#87.5839 626 | food!@#87.7418 627 | in!@#87.9005 628 | art!@#88.0579 629 | continues!@#88.2112 630 | to!@#88.3709 631 | evolve!@#88.5299 632 | ,!@#88.6858 633 | we!@#88.8419 634 | can!@#88.9980 635 | expect!@#89.1576 636 | to!@#89.3133 637 | see!@#89.4713 638 | even!@#89.6281 639 | !@#89.6298 640 | -------------------------------------------------------------------------------- /tests/test_data/3.txt: -------------------------------------------------------------------------------- 1 | even!@#0.0000 2 | more!@#0.1390 3 | creative!@#0.2878 4 | and!@#0.4349 5 | delicious!@#0.5823 6 | uses!@#0.7270 7 | of!@#0.9210 8 | food!@#1.1057 9 | in!@#1.2976 10 | art!@#1.4520 11 | .!@#1.6193 12 | Whether!@#1.7652 13 | it!@#1.9202 14 | 's!@#2.0769 15 | a!@#2.2316 16 | delicate!@#2.3931 17 | pastry!@#2.5600 18 | or!@#2.7080 19 | a!@#2.8537 20 | massive!@#3.0008 21 | installation!@#3.1482 22 | ,!@#3.2950 23 | food!@#3.4516 24 | has!@#3.6057 25 | the!@#3.7609 26 | power!@#3.9146 27 | to!@#4.0762 28 | inspire!@#4.2324 29 | and!@#4.3999 30 | provoke!@#4.5483 31 | .!@#4.6986 32 | So!@#4.8471 33 | ,!@#4.9943 34 | the!@#5.1399 35 | next!@#5.2875 36 | time!@#5.4368 37 | you!@#5.5831 38 | see!@#5.7284 39 | a!@#5.8771 40 | piece!@#6.0218 41 | of!@#6.1722 42 | edible!@#6.3298 43 | art!@#6.4825 44 | ,!@#6.6381 45 | remember!@#6.7923 46 | that!@#6.9520 47 | it!@#7.1181 48 | 's!@#7.2798 49 | not!@#7.4412 50 | just!@#7.6106 51 | a!@#7.7572 52 | tasty!@#7.9016 53 | treat!@#8.0501 54 | ,!@#8.1975 55 | but!@#8.3454 56 | also!@#8.4949 57 | a!@#8.6402 58 | work!@#8.7863 59 | of!@#8.9362 60 | art!@#9.0861 61 | in!@#9.2352 62 | its!@#9.3918 63 | own!@#9.5459 64 | right!@#9.6992 65 | .!@#9.8541 66 | !@#10.0080 67 | -------------------------------------------------------------------------------- /tests/test_data/4.txt: -------------------------------------------------------------------------------- 1 | Here!@#0.0000 2 | 's!@#0.1408 3 | an!@#0.2889 4 | article!@#0.4359 5 | on!@#0.5847 6 | the!@#0.7856 7 | relationship!@#0.9715 8 | between!@#1.1708 9 | nudity!@#1.3124 10 | and!@#1.4590 11 | artwork!@#1.6143 12 | : 13 | 14 | !@#1.7709 15 | **!@#1.9235 16 | The!@#2.0877 17 | Naked!@#2.2507 18 | Truth!@#2.4043 19 | :!@#2.5537 20 | Un!@#2.7089 21 | packing!@#2.8667 22 | the!@#3.0278 23 | Art!@#3.1937 24 | of!@#3.3412 25 | N!@#3.4893 26 | ud!@#3.6406 27 | ity!@#3.7889 28 | ** 29 | 30 | !@#3.9361 31 | N!@#4.0838 32 | ud!@#4.2376 33 | ity!@#4.3962 34 | has!@#4.5533 35 | long!@#4.7142 36 | been!@#4.8714 37 | a!@#5.0378 38 | subject!@#5.1840 39 | of!@#5.3335 40 | fascination!@#5.4838 41 | and!@#5.6331 42 | debate!@#5.7819 43 | in!@#5.9291 44 | the!@#6.0869 45 | art!@#6.2417 46 | world!@#6.4000 47 | .!@#6.5548 48 | 
From!@#6.7141 49 | ancient!@#6.8755 50 | Greek!@#7.0406 51 | sculptures!@#7.2023 52 | to!@#7.3748 53 | modern!@#7.5214 54 | -day!@#7.6706 55 | installations!@#7.8200 56 | ,!@#7.9683 57 | the!@#8.1190 58 | human!@#8.2691 59 | body!@#8.4196 60 | has!@#8.5687 61 | been!@#8.7177 62 | a!@#8.8667 63 | source!@#9.0162 64 | of!@#9.1668 65 | inspiration!@#9.3163 66 | for!@#9.4643 67 | artists!@#9.6206 68 | throughout!@#9.7770 69 | history!@#9.9374 70 | .!@#10.1029 71 | But!@#10.2523 72 | what!@#10.4050 73 | is!@#10.5536 74 | it!@#10.7001 75 | about!@#10.8504 76 | nudity!@#11.0002 77 | that!@#11.1484 78 | draws!@#11.2972 79 | artists!@#11.4546 80 | to!@#11.6155 81 | it!@#11.7744 82 | ,!@#11.9300 83 | and!@#12.0867 84 | how!@#12.2451 85 | does!@#12.4043 86 | it!@#12.5610 87 | impact!@#12.7233 88 | the!@#12.8771 89 | viewer!@#13.0407 90 | 's!@#13.2095 91 | experience!@#13.3776 92 | ? 93 | 94 | !@#13.5397 95 | **!@#13.7008 96 | The!@#13.8696 97 | Origins!@#14.0413 98 | of!@#14.2130 99 | N!@#14.3611 100 | ud!@#14.5086 101 | ity!@#14.6597 102 | in!@#14.8114 103 | Art!@#14.9622 104 | ** 105 | 106 | !@#15.1098 107 | N!@#15.2600 108 | ud!@#15.4092 109 | ity!@#15.5579 110 | has!@#15.7108 111 | its!@#15.8609 112 | roots!@#16.0129 113 | in!@#16.1661 114 | ancient!@#16.3134 115 | cultures!@#16.4633 116 | ,!@#16.6140 117 | where!@#16.7615 118 | the!@#16.9140 119 | human!@#17.0659 120 | body!@#17.2142 121 | was!@#17.3629 122 | seen!@#17.5114 123 | as!@#17.6636 124 | a!@#17.8128 125 | symbol!@#17.9627 126 | of!@#18.1123 127 | beauty!@#18.2619 128 | ,!@#18.4221 129 | strength!@#18.5781 130 | ,!@#18.7328 131 | and!@#18.8899 132 | fertility!@#19.0438 133 | .!@#19.2018 134 | In!@#19.3636 135 | ancient!@#19.5186 136 | Greece!@#19.6846 137 | ,!@#19.8552 138 | for!@#20.0145 139 | example!@#20.1835 140 | ,!@#20.3556 141 | the!@#20.5176 142 | nude!@#20.6698 143 | male!@#20.8191 144 | form!@#20.9696 145 | was!@#21.1194 146 | celebrated!@#21.2702 147 | as!@#21.4199 148 | a!@#21.5702 149 | symbol!@#21.7181 150 | of!@#21.8688 151 | masculinity!@#22.0175 152 | and!@#22.1697 153 | athletic!@#22.3188 154 | achievement!@#22.4700 155 | .!@#22.6218 156 | This!@#22.7732 157 | tradition!@#22.9243 158 | continued!@#23.0815 159 | into!@#23.2417 160 | the!@#23.4002 161 | Renaissance!@#23.5571 162 | ,!@#23.7149 163 | where!@#23.8748 164 | artists!@#24.0335 165 | such!@#24.1935 166 | as!@#24.3509 167 | Michel!@#24.5153 168 | angelo!@#24.6706 169 | and!@#24.8347 170 | Leonardo!@#25.0019 171 | da!@#25.1717 172 | Vinci!@#25.3367 173 | created!@#25.4958 174 | iconic!@#25.6630 175 | works!@#25.8319 176 | of!@#26.0051 177 | art!@#26.1823 178 | featuring!@#26.3368 179 | the!@#26.4871 180 | human!@#26.6399 181 | form!@#26.7955 182 | in!@#26.9434 183 | all!@#27.0951 184 | its!@#27.2481 185 | glory!@#27.3992 186 | . 
187 | 188 | !@#27.5505 189 | **!@#27.7015 190 | The!@#27.8528 191 | Art!@#28.0033 192 | of!@#28.1633 193 | the!@#28.3239 194 | Nude!@#28.4811 195 | :!@#28.6403 196 | A!@#28.8018 197 | Symbol!@#28.9601 198 | of!@#29.1199 199 | Truth!@#29.2865 200 | ** 201 | 202 | !@#29.4444 203 | So!@#29.6128 204 | ,!@#29.7821 205 | why!@#29.9576 206 | do!@#30.1163 207 | artists!@#30.2842 208 | choose!@#30.4572 209 | to!@#30.6282 210 | depict!@#30.7791 211 | the!@#30.9289 212 | human!@#31.0806 213 | form!@#31.2320 214 | in!@#31.3819 215 | the!@#31.5310 216 | nude!@#31.6836 217 | ?!@#31.8333 218 | One!@#31.9836 219 | reason!@#32.1351 220 | is!@#32.2919 221 | that!@#32.4529 222 | the!@#32.6130 223 | nude!@#32.7758 224 | body!@#32.9301 225 | is!@#33.0897 226 | a!@#33.2471 227 | symbol!@#33.4069 228 | of!@#33.5670 229 | truth!@#33.7263 230 | and!@#33.8901 231 | authenticity!@#34.0553 232 | .!@#34.2136 233 | When!@#34.3806 234 | an!@#34.5472 235 | artist!@#34.7211 236 | paints!@#34.8848 237 | or!@#35.0463 238 | sc!@#35.2091 239 | ul!@#35.3803 240 | pts!@#35.5498 241 | the!@#35.7295 242 | nude!@#35.8896 243 | form!@#36.0504 244 | ,!@#36.2074 245 | they!@#36.3663 246 | are!@#36.5241 247 | revealing!@#36.6825 248 | the!@#36.8469 249 | underlying!@#37.0132 250 | structure!@#37.1829 251 | and!@#37.3452 252 | beauty!@#37.5141 253 | of!@#37.6740 254 | the!@#37.8452 255 | human!@#38.0187 256 | body!@#38.1906 257 | ,!@#38.3438 258 | stripped!@#38.4940 259 | of!@#38.6473 260 | the!@#38.8009 261 | distractions!@#38.9534 262 | of!@#39.1246 263 | clothing!@#39.2792 264 | and!@#39.4521 265 | social!@#39.6596 266 | convention!@#39.8562 267 | . 268 | 269 | !@#40.0292 270 | **!@#40.2125 271 | The!@#40.3830 272 | Viewer!@#40.5408 273 | 's!@#40.7037 274 | Experience!@#40.8676 275 | ** 276 | 277 | !@#41.0302 278 | The!@#41.1946 279 | experience!@#41.3573 280 | of!@#41.5265 281 | viewing!@#41.7008 282 | a!@#41.8712 283 | nude!@#42.0470 284 | work!@#42.2306 285 | of!@#42.4021 286 | art!@#42.5775 287 | can!@#42.7657 288 | be!@#42.9419 289 | complex!@#43.1266 290 | and!@#43.3073 291 | multif!@#43.4966 292 | ac!@#43.6851 293 | eted!@#43.8749 294 | .!@#44.0592 295 | On!@#44.2471 296 | one!@#44.4405 297 | hand!@#44.6290 298 | ,!@#44.8193 299 | the!@#45.0012 300 | nude!@#45.1775 301 | body!@#45.3503 302 | can!@#45.5255 303 | evoke!@#45.7044 304 | feelings!@#45.8859 305 | of!@#46.0576 306 | comfort!@#46.2263 307 | and!@#46.3991 308 | familiarity!@#46.5659 309 | ,!@#46.7446 310 | as!@#46.9398 311 | it!@#47.1200 312 | is!@#47.3143 313 | a!@#47.5048 314 | reminder!@#47.6909 315 | of!@#47.8653 316 | our!@#48.0415 317 | own!@#48.2224 318 | physical!@#48.4112 319 | ity!@#48.6000 320 | and!@#48.7674 321 | vulnerability!@#48.9562 322 | .!@#49.1325 323 | On!@#49.3433 324 | the!@#49.5232 325 | other!@#49.7296 326 | hand!@#49.9098 327 | ,!@#50.1050 328 | the!@#50.3006 329 | nude!@#50.4939 330 | body!@#50.7042 331 | can!@#50.8902 332 | also!@#51.1030 333 | be!@#51.2906 334 | a!@#51.4791 335 | source!@#51.6754 336 | of!@#51.8503 337 | discomfort!@#52.0351 338 | and!@#52.2109 339 | self!@#52.9610 340 | -conscious!@#53.1345 341 | ness!@#53.3259 342 | ,!@#53.5074 343 | as!@#53.6989 344 | it!@#53.8765 345 | challenges!@#54.0603 346 | our!@#54.2400 347 | social!@#54.4234 348 | norms!@#54.6132 349 | and!@#54.8218 350 | expectations!@#55.0390 351 | around!@#55.2377 352 | nudity!@#55.3969 353 | . 
354 | 355 | !@#55.5609 356 | **!@#55.7210 357 | Examples!@#55.8811 358 | of!@#56.0452 359 | Not!@#56.2101 360 | able!@#56.3771 361 | N!@#56.5450 362 | udes!@#56.7120 363 | ** 364 | 365 | !@#56.8774 366 | Some!@#57.0447 367 | notable!@#57.2145 368 | examples!@#57.3815 369 | of!@#57.5494 370 | nude!@#57.7329 371 | artwork!@#57.9322 372 | include!@#58.1209 373 | : 374 | 375 | !@#58.3163 376 | *!@#58.5166 377 | **!@#58.7115 378 | Mich!@#58.9083 379 | el!@#59.0998 380 | angelo!@#59.3117 381 | 's!@#59.4892 382 | "!@#59.6862 383 | David!@#59.8816 384 | "!@#60.0637 385 | **:!@#60.2921 386 | A!@#60.4699 387 | iconic!@#60.6621 388 | sculpture!@#60.8525 389 | of!@#61.0525 390 | the!@#61.2401 391 | biblical!@#61.4277 392 | hero!@#61.6294 393 | David!@#61.8199 394 | ,!@#62.0096 395 | which!@#62.1895 396 | has!@#62.3609 397 | become!@#62.5408 398 | one!@#62.7320 399 | of!@#62.9188 400 | the!@#63.1140 401 | most!@#63.3052 402 | famous!@#63.4903 403 | works!@#63.6859 404 | of!@#63.8528 405 | art!@#64.0106 406 | in!@#64.1686 407 | history!@#64.3281 408 | . 409 | !@#64.5123 410 | *!@#64.6932 411 | **!@#64.8782 412 | Leon!@#65.0605 413 | ardo!@#65.2439 414 | da!@#65.4300 415 | Vinci!@#65.6254 416 | 's!@#65.8193 417 | "!@#66.0029 418 | V!@#66.1753 419 | it!@#66.3494 420 | ru!@#66.5277 421 | v!@#66.7018 422 | ian!@#66.8887 423 | Man!@#67.0585 424 | "!@#67.2364 425 | **:!@#67.9768 426 | A!@#68.1536 427 | drawing!@#68.3440 428 | that!@#68.5213 429 | depicts!@#68.7118 430 | the!@#68.9007 431 | human!@#69.0814 432 | form!@#69.2572 433 | ins!@#69.4431 434 | cribed!@#69.6231 435 | within!@#69.8097 436 | a!@#70.0097 437 | circle!@#70.1910 438 | and!@#70.3723 439 | square!@#70.5254 440 | ,!@#70.6826 441 | highlighting!@#70.8347 442 | the!@#70.9875 443 | body!@#71.1439 444 | 's!@#71.3004 445 | relationship!@#71.4530 446 | to!@#71.6079 447 | geometry!@#71.7595 448 | and!@#71.9140 449 | proportion!@#72.0679 450 | . 451 | !@#72.2264 452 | *!@#72.3794 453 | **!@#72.5350 454 | Mar!@#72.6993 455 | ina!@#72.8658 456 | Abram!@#73.0403 457 | ovic!@#73.2097 458 | 's!@#73.3746 459 | "!@#73.5370 460 | The!@#73.7023 461 | Artist!@#73.8649 462 | is!@#74.0287 463 | Present!@#74.1920 464 | "!@#74.3540 465 | **:!@#74.5238 466 | A!@#74.6920 467 | performance!@#74.8583 468 | art!@#75.0164 469 | piece!@#75.1905 470 | in!@#75.3609 471 | which!@#75.5343 472 | Abram!@#75.7150 473 | ovic!@#75.8826 474 | sat!@#76.0501 475 | silently!@#76.2136 476 | for!@#76.3880 477 | !@#76.5567 478 | 736!@#76.7326 479 | hours!@#76.9097 480 | ,!@#77.0839 481 | inviting!@#77.2401 482 | visitors!@#77.3944 483 | to!@#77.5509 484 | sit!@#77.7059 485 | across!@#77.8604 486 | from!@#78.0165 487 | her!@#78.1699 488 | and!@#78.3240 489 | engage!@#78.4775 490 | in!@#78.6323 491 | a!@#78.7868 492 | silent!@#78.9427 493 | conversation!@#79.1002 494 | . 
495 | 496 | !@#79.2566 497 | **!@#79.4117 498 | The!@#79.5675 499 | Future!@#79.7237 500 | of!@#79.8837 501 | N!@#80.0517 502 | ud!@#80.2178 503 | ity!@#80.3852 504 | in!@#80.5491 505 | Art!@#80.7135 506 | ** 507 | 508 | !@#80.8768 509 | As!@#81.0406 510 | we!@#81.2049 511 | move!@#81.3686 512 | forward!@#81.5319 513 | into!@#81.7021 514 | a!@#81.8696 515 | more!@#82.0299 516 | open!@#82.2006 517 | and!@#82.3723 518 | accepting!@#82.5465 519 | society!@#82.7180 520 | ,!@#82.8878 521 | it!@#83.0581 522 | 's!@#83.2342 523 | likely!@#83.4102 524 | that!@#83.5892 525 | we!@#83.7634 526 | 'll!@#83.9191 527 | see!@#84.0749 528 | more!@#84.2324 529 | and!@#84.3873 530 | more!@#84.5441 531 | artists!@#84.7019 532 | exploring!@#84.8600 533 | the!@#85.0151 534 | theme!@#85.1687 535 | of!@#85.3257 536 | nudity!@#85.4796 537 | in!@#85.6365 538 | their!@#85.7903 539 | work!@#85.9453 540 | .!@#86.1022 541 | With!@#86.2564 542 | the!@#86.4121 543 | rise!@#86.5775 544 | of!@#86.7414 545 | social!@#86.9072 546 | media!@#87.0741 547 | and!@#87.2414 548 | the!@#87.4039 549 | increasing!@#87.5683 550 | visibility!@#87.7325 551 | of!@#87.8960 552 | the!@#88.0611 553 | human!@#88.2224 554 | body!@#88.3935 555 | ,!@#88.5628 556 | artists!@#88.7284 557 | are!@#88.8931 558 | finding!@#89.0657 559 | new!@#89.2398 560 | and!@#89.4158 561 | innovative!@#89.5962 562 | ways!@#89.7638 563 | to!@#89.9253 564 | depict!@#90.0993 565 | the!@#90.2757 566 | nude!@#90.4495 567 | form!@#90.6278 568 | . 569 | 570 | !@#90.8158 571 | **!@#90.9735 572 | Conclusion!@#91.1285 573 | ** 574 | 575 | !@#91.2864 576 | N!@#91.4456 577 | ud!@#91.6100 578 | ity!@#91.7739 579 | is!@#91.9381 580 | a!@#92.1040 581 | powerful!@#92.2652 582 | symbol!@#92.4313 583 | in!@#92.5965 584 | the!@#92.7647 585 | world!@#92.9293 586 | of!@#93.0936 587 | art!@#93.2648 588 | ,!@#93.4321 589 | ev!@#93.5966 590 | oking!@#93.7677 591 | feelings!@#93.9396 592 | of!@#94.1158 593 | truth!@#94.2905 594 | ,!@#94.4633 595 | authenticity!@#94.6285 596 | ,!@#94.8053 597 | and!@#94.9809 598 | vulnerability!@#95.1617 599 | .!@#95.3493 600 | Whether!@#95.5071 601 | it!@#95.6623 602 | 's!@#95.8301 603 | through!@#95.9957 604 | sculpture!@#96.1616 605 | ,!@#96.3242 606 | painting!@#96.4884 607 | ,!@#96.6526 608 | or!@#96.8207 609 | performance!@#96.9831 610 | ,!@#97.1512 611 | the!@#97.3169 612 | human!@#97.4838 613 | body!@#97.6578 614 | continues!@#97.8319 615 | to!@#98.0087 616 | inspire!@#98.1755 617 | and!@#98.3441 618 | challenge!@#98.5205 619 | artists!@#98.6975 620 | and!@#98.8913 621 | viewers!@#99.0492 622 | alike!@#99.2067 623 | .!@#99.3658 624 | As!@#99.5284 625 | we!@#99.6863 626 | move!@#99.8507 627 | forward!@#100.0180 628 | into!@#100.1839 629 | a!@#100.3492 630 | more!@#100.5172 631 | open!@#100.6820 632 | and!@#100.8487 633 | !@#100.8504 634 | -------------------------------------------------------------------------------- /tests/test_data/debug.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | Run this script to replay actual tokens on a configuration of your choice 4 | ''' 5 | 6 | from stream2sentence.stream2sentence_time_based import generate_sentences 7 | import time 8 | 9 | records = [] 10 | buffer = "" 11 | with open("1.txt", "r") as f: 12 | for line in f: 13 | #include newlines in split 14 | if "!@#" in line: 15 | buffer += line 16 | records.append(buffer) 17 | buffer = "" 18 | else: 19 | buffer += line 20 | if buffer: 21 | records.append(buffer) 22 | 23 | token_times = [tuple(record.split("!@#", 
1)) for record in records] 24 | 25 | 26 | 27 | def get_llm_output_simulation(): 28 | start = time.time() 29 | def llm_output_simulation(): 30 | for tt in token_times: 31 | # print(tt) 32 | while (time.time() - start) < float(tt[1]): 33 | time.sleep(0.0001) 34 | yield tt[0] 35 | 36 | return llm_output_simulation() 37 | 38 | 39 | def run_test(): 40 | time_to_sentences = [] 41 | start_time = time.time() 42 | for i, sentence in enumerate( 43 | generate_sentences( 44 | get_llm_output_simulation(), 45 | lead_time = 0.3, 46 | max_wait_for_fragments = [1, 0.8, 1, 1.1, 1.5], 47 | target_tps = 3.6, 48 | min_output_lengths = [2, 3], 49 | deadline_offsets_dynamic=[.1] 50 | )): 51 | t = time.time() - start_time 52 | print(f"Sentence {i}: t={t:.1f} {sentence}") 53 | time_to_sentences.append([sentence, f"{t:.1f}"]) 54 | return time_to_sentences 55 | 56 | 57 | run_test() -------------------------------------------------------------------------------- /tests/test_stream2sentence.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from stream2sentence import generate_sentences, generate_sentences_async 3 | 4 | class TestSentenceGenerator(unittest.TestCase): 5 | 6 | def test_chinese(self): 7 | text = "我喜欢读书。天气很好。我们去公园吧。今天是星期五。早上好。这是我的朋友。请帮我。吃饭了吗?我在学中文。晚安。" 8 | #expected = ["我喜欢读书。", "天气很好。", "我们去公园吧。", "今天是星期五。", "早上好。", "这是我的朋友。", "请帮我。吃饭了吗?", "我在学中文。", "晚安。"] 9 | #expected = ["我喜欢读书。", "天气很好。", "我们去公园吧。", "今天是星期五。", "早上好。", "这是我的朋友。", "请帮我。", "吃饭了吗?", "我在学中文。", "晚安。"] 10 | expected = ["我喜欢读书。", "天气很好。", "我们去公园吧。", "今天是星期五。", "早上好。", "这是我的朋友。", "请帮我。吃饭了吗?我在学中文。", "晚安。"] # this changed with new stanza version 11 | sentences = list(generate_sentences(text, minimum_sentence_length=2, context_size=2, tokenizer="stanza", language="zh")) 12 | self.assertEqual(sentences, expected) 13 | 14 | def test_chinese2(self): 15 | text = """ 16 | 胡/爷/爷,我/来/给/您/讲/一下/下/周/每/天/的/安/排。 17 | 周/一/:/9:00-10:00:晨/练/太/极/拳/,/地点/:/活/动/室/。 18 | 10:30-11:30:园/艺/活/动/菠菜/种/植/,/地点/:/花/园/。 19 | 14:00-15:00:手/工/制/作/睡/眠/香/囊/,/地点/:/手/工/室/。 20 | 15:30-16:30:观/看/老/电/影/,/地点/:/影/音/室/。 21 | 22 | 周/二/:/9:00-10:00:八/段/锦/简/化/版/,/地点/:/大/厅/。 23 | 10:30-11:30:书/法/练/习/,/地点/:/书/画/室/。 24 | 14:00-15:00:棋/牌/娱/乐/象/棋/、/围/棋/等/,/地点/:/棋/牌/室/。 25 | 15:30-16:30:养/生/讲/座/春/天/养/生/1/,/地点/:/会/议/室/。 26 | 大/厅/""" 27 | expected = [ 28 | "胡/爷/爷,我/来/给/您/讲/一下/下/周/每/天/的/安/排。", 29 | "周/一/:/9:00-10:00:晨/练/太/极/拳/,/地点/:/活/动/室/。", 30 | "10:30-11:30:园/艺/活/动/菠菜/种/植/,/地点/:/花/园/。", 31 | "14:00-15:00:手/工/制/作/睡/眠/香/囊/,/地点/:/手/工/室/。", 32 | "15:30-16:30:观/看/老/电/影/,/地点/:/影/音/室/。", 33 | "周/二/:/9:00-10:00:八/段/锦/简/化/版/,/地点/:/大/厅/。", 34 | "10:30-11:30:书/法/练/习/,/地点/:/书/画/室/。", 35 | "14:00-15:00:棋/牌/娱/乐/象/棋/、/围/棋/等/,/地点/:/棋/牌/室/。", 36 | "15:30-16:30:养/生/讲/座/春/天/养/生/1/,/地点/:/会/议/室/。", 37 | "大/厅/", 38 | ] 39 | sentences = list(generate_sentences(text, minimum_sentence_length=2, context_size=2, tokenizer="stanza", language="zh")) 40 | self.assertEqual(sentences, expected) 41 | 42 | def test_generator(self): 43 | def generator(): 44 | yield "Hallo, " 45 | yield "wie geht es dir? " 46 | yield "Mir geht es gut." 47 | expected = ["Hallo,", "wie geht es dir?", "Mir geht es gut."] 48 | sentences = list(generate_sentences(generator(), minimum_sentence_length = 3, context_size=5, minimum_first_fragment_length = 3, quick_yield_single_sentence_fragment=True)) 49 | self.assertEqual(sentences, expected) 50 | 51 | def test_return_incomplete_last(self): 52 | text = "How I feel? 
I feel fine" 53 | expected = ["How I feel?", "I feel fine"] 54 | sentences = list(generate_sentences(text)) 55 | self.assertEqual(sentences, expected) 56 | 57 | def test_hello_world(self): 58 | text = "Hello, world." 59 | expected = ["Hello,", "world."] 60 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=3, minimum_first_fragment_length=3)) 61 | self.assertEqual(sentences, expected) 62 | 63 | def test_hello_world2(self): 64 | text = "Hello, world! Hello all, my dear friends of realtime apps." 65 | expected = ["Hello, world!", "Hello all, my dear friends of realtime apps."] 66 | sentences = list(generate_sentences(text, minimum_sentence_length=3)) 67 | self.assertEqual(sentences, expected) 68 | 69 | def test_basic(self): 70 | text = "This is a test. This is another test sentence. Just testing out the module." 71 | expected = ["This is a test.", "This is another test sentence.", "Just testing out the module."] 72 | sentences = list(generate_sentences(text)) 73 | self.assertEqual(sentences, expected) 74 | 75 | def test_tricky_sentence1(self): 76 | text = "Good muffins cost $3.88 in New York. Please buy me two of them." 77 | expected = ["Good muffins cost $3.88 in New York.", "Please buy me two of them."] 78 | sentences = list(generate_sentences(text)) 79 | self.assertEqual(sentences, expected) 80 | 81 | def test_tricky_sentence2(self): 82 | text = "I called Dr. Jones. I called Dr. Jones." 83 | expected = ["I called Dr. Jones.", "I called Dr. Jones."] 84 | sentences = list(generate_sentences(text)) 85 | self.assertEqual(sentences, expected) 86 | 87 | def test_quick_yield(self): 88 | text = "First, this. Second, this." 89 | expected = ["First,", "this.", "Second, this."] 90 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=3, minimum_first_fragment_length=3)) 91 | self.assertEqual(sentences, expected) 92 | 93 | def test_quick_yield2(self): 94 | text = "First, this. Second, this." 95 | expected = ["First,", "this. Second, this."] 96 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=6, minimum_first_fragment_length=3)) 97 | self.assertEqual(sentences, expected) 98 | 99 | def test_quick_yield3(self): 100 | text = "First, this. Second, this." 101 | expected = ["First, this.", "Second, this."] 102 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=3, minimum_first_fragment_length=6)) 103 | self.assertEqual(sentences, expected) 104 | 105 | def test_quick_yield4(self): 106 | text = "First, this. Second, this." 107 | expected = ["First, this.", "Second, this."] 108 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=6, minimum_first_fragment_length=6)) 109 | self.assertEqual(sentences, expected) 110 | 111 | def test_minimum_length1(self): 112 | text = "Short. Longer sentence." 113 | expected = ["Short.", "Longer sentence."] 114 | sentences = list(generate_sentences(text, minimum_sentence_length=6)) # two sentences, len("Short.") == 6 115 | self.assertEqual(sentences, expected) 116 | 117 | def test_minimum_length2(self): 118 | text = "Short. Longer sentence." 119 | expected = ["Short. 
Longer sentence."] 120 | sentences = list(generate_sentences(text, minimum_sentence_length=7)) # one sentences, len("Short.") == 6 121 | self.assertEqual(sentences, expected) 122 | 123 | def test_cleanup(self): 124 | text = "Text with link: https://www.example.com and emoji 😀" 125 | expected = ["Text with link: and emoji"] 126 | sentences = list(generate_sentences(text, cleanup_text_links=True, cleanup_text_emojis=True)) 127 | self.assertEqual(sentences, expected) 128 | 129 | def test_check1(self): 130 | text = "I'll go with a glass of red wine. Thank you." 131 | expected = ["I'll go with a glass of red wine.", "Thank you."] 132 | sentences = list(generate_sentences(text, minimum_sentence_length=10, minimum_first_fragment_length=10, quick_yield_single_sentence_fragment=True, cleanup_text_links=True, cleanup_text_emojis=True)) 133 | self.assertEqual(sentences, expected) 134 | 135 | def test_very_short(self): 136 | text = "Excuse me?" 137 | expected = ["Excuse me?"] 138 | sentences = list(generate_sentences(text, minimum_sentence_length=18, minimum_first_fragment_length=10, quick_yield_single_sentence_fragment=True, cleanup_text_links=True, cleanup_text_emojis=True)) 139 | self.assertEqual(sentences, expected) 140 | 141 | def test_log_characters(self): 142 | text = "Hello world" 143 | print () 144 | sentences = list(generate_sentences(text, log_characters=True)) 145 | print () 146 | print () 147 | print (f"test_log_characters succeeded, if {text} was printed above.") 148 | print () 149 | # Check characters were printed 150 | self.assertTrue(sentences) 151 | 152 | def test_not_log_characters(self): 153 | text = "Do not show these characters." 154 | expected = ["Do not show these characters."] 155 | sentences = list(generate_sentences(text, log_characters=False)) 156 | print(f"\ntest_not_log_characters succeeded, if \"{text}\" was not printed above.") 157 | self.assertEqual(sentences, expected) 158 | 159 | if __name__ == '__main__': 160 | unittest.main() -------------------------------------------------------------------------------- /tests/test_stream_from_llm.py: -------------------------------------------------------------------------------- 1 | from stream2sentence import generate_sentences 2 | from openai import OpenAI # pip install openai 3 | 4 | client = OpenAI() 5 | 6 | def write(prompt: str): 7 | stream = client.chat.completions.create( 8 | model="gpt-4", 9 | messages=[{"role": "user", "content": prompt}], 10 | stream=True, 11 | ) 12 | for chunk in stream: 13 | if (text_chunk := chunk.choices[0].delta.content): 14 | yield text_chunk 15 | 16 | 17 | text_stream = write("A three-sentence relaxing speech.") 18 | 19 | for idx, sentence in enumerate(generate_sentences(text_stream, minimum_sentence_length=5), start=1): 20 | print(f"Sentence {idx}: {sentence}") 21 | -------------------------------------------------------------------------------- /tests/test_stream_from_llm_old_api.py: -------------------------------------------------------------------------------- 1 | from stream2sentence import generate_sentences 2 | import openai # pip install openai 3 | import os 4 | 5 | openai.api_key = os.environ.get("OPENAI_API_KEY") 6 | 7 | def write(prompt: str): 8 | for chunk in openai.ChatCompletion.create( 9 | model="gpt-3.5-turbo", 10 | messages=[{"role": "user", "content" : prompt}], 11 | stream=True 12 | ): 13 | if (text_chunk := chunk["choices"][0]["delta"].get("content")) is not None: 14 | yield text_chunk 15 | 16 | text_stream = write("A three-sentence relaxing speech.") 17 | 18 | for idx, 
sentence in enumerate(generate_sentences(text_stream), start=1): 19 | print(f"Sentence {idx}: {sentence}") 20 | -------------------------------------------------------------------------------- /tests/test_time_based.py: -------------------------------------------------------------------------------- 1 | 2 | from stream2sentence.stream2sentence_time_based import generate_sentences_time_based as generate_sentences 3 | import time 4 | 5 | input_stewart_wiki = ''' 6 | In 1996 Mr. Stewart hosted a short-lived talk show entitled, Where's Elvis This Week?, which was a half-hour, weekly comedy television program. 7 | It aired on Sunday nights in the United Kingdom on BBC Two. 8 | It was filmed at the CBS Broadcast Center in New York City and featured a set of panelists, two from the UK and two from the United States, who discussed news items and cultural issues. 9 | The show premiered in the UK on October 6, 1996; five episodes aired in total. 10 | Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown. In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy. 11 | Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling). 12 | Stewart also headlined the 1997 White House Correspondents' dinner. 13 | ''' 14 | 15 | input_problematic = ''' 16 | First sentence is short. 17 | Second sentence is very long, and totally a run on, and would definitely cause problems if this is what the output of the llm was and we only had a quick yield value of one this needs to be broken up thanks. 18 | Third sentence also very long that lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 19 | Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 20 | Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
21 | ''' 22 | 23 | WORDS_PER_TOKEN = .75 24 | 25 | def get_words(current_input): 26 | return list(map(lambda word: word + ' ', current_input.split())) 27 | 28 | 29 | def print_word_targets(current_input, tps_target): 30 | target_delay_between_words = (1 / (WORDS_PER_TOKEN * tps_target)) 31 | word_targets = [] 32 | for i, word in enumerate(get_words(current_input)): 33 | t = ((i + 1) * target_delay_between_words) + target_delay_between_words 34 | word_targets.append([ word, f"{t:.1f}" ]) 35 | print(word_targets) 36 | 37 | 38 | def get_llm_output_simulation(current_input, tts): 39 | def llm_output_simulation(): 40 | for word in get_words(current_input): 41 | time.sleep(1 / (tts * WORDS_PER_TOKEN)) 42 | yield word 43 | return llm_output_simulation() 44 | 45 | 46 | def run_test(input, simulated_tts, dynamic_offset=False): 47 | time_to_sentences = [] 48 | start_time = time.time() 49 | for i, sentence in enumerate( 50 | generate_sentences( 51 | get_llm_output_simulation(input, simulated_tts), 52 | deadline_offsets_dynamic=[.5, .3, .1] if dynamic_offset else [0] 53 | )): 54 | t = time.time() - start_time 55 | print(f"Sentence {i}: t={t:.1f} {sentence}") 56 | time_to_sentences.append([sentence, f"{t:.1f}"]) 57 | print("\n\n RESULT ") 58 | print(time_to_sentences) 59 | print("\n\n") 60 | return time_to_sentences 61 | 62 | def is_within_tolerance(num1, num2, tolerance): 63 | return abs(num1 - num2) <= tolerance 64 | 65 | def compare_results(result, expected_result): 66 | for i in range(len(result)): 67 | if expected_result[i][0] != result[i][0] or not is_within_tolerance(float(expected_result[i][1]), float(result[i][1]), 0.25): 68 | raise ValueError(f"RESULT MISMATCH - expected={expected_result[i]} - actual={result[i]}") 69 | 70 | 71 | result_1 = run_test(input_stewart_wiki, 9, True) 72 | expected_result_1 = [ 73 | ['In 1996 Mr. Stewart hosted a short-lived talk show entitled,', '1.8'], 74 | ["Where's Elvis This Week?, which was a half-hour,", '2.8'], 75 | ['weekly comedy television program.', '5.2'], 76 | ['It aired on Sunday nights in the United Kingdom on BBC Two.', '6.8'], 77 | ['It was filmed at the CBS Broadcast Center in New York City and featured a set of panelists, two from the UK and two from the United States,', '9.3'], 78 | ['who discussed news items and cultural issues.', '12.6'], 79 | ['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '16.1'], 80 | ['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '19.3'], 81 | ["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '23.8'], 82 | ['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '24.9'], 83 | ["Stewart also headlined the 1997 White House Correspondents' dinner.", '24.9'] 84 | ] 85 | compare_results(result_1, expected_result_1) 86 | 87 | result_2 = run_test(input_stewart_wiki, 5) 88 | expected_result_2 = [ 89 | ['In 1996 Mr. 
Stewart hosted a short-lived talk show entitled,', '3.0'], 90 | ["Where's Elvis This Week?, which was a half-hour,", '5.4'], 91 | ['weekly comedy television program.', '8.0'], 92 | ['It aired on Sunday nights in the United Kingdom on BBC Two.', '9.4'], 93 | ['It was filmed at the CBS Broadcast Center in New York City and featured a set', '13.4'], 94 | ['of panelists, two from the UK and two from the United States, who discussed news items and cultural issues.', '18.8'], 95 | ['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '25.1'], 96 | ['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '30.0'], 97 | ["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '37.7'], 98 | ['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '44.9'], 99 | ["Stewart also headlined the 1997 White House Correspondents' dinner.", '44.9'] 100 | ] 101 | compare_results(result_2, expected_result_2) 102 | 103 | result_3 = run_test(input_problematic, 9) 104 | expected_result_3 = [ 105 | ['First sentence is short.', '1.0'], 106 | ['Second sentence is very long,', '1.5'], 107 | ['and totally a run on,', '3.1'], 108 | ['and would definitely cause problems if this is what the output of the llm was and we only had a quick', '5.2'], 109 | ['yield value of one this needs to be broken up thanks.', '11.7'], 110 | ['Third sentence also very long that lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '15.3'], 111 | ['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '25.0'], 112 | ['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '25.0'] 113 | ] 114 | compare_results(result_3, expected_result_3) 115 | 116 | result_4 = run_test(input_problematic, 5, True) 117 | expected_result_4 = [ 118 | ['First sentence is short.', '1.3'], 119 | ['Second sentence is very long,', '2.7'], 120 | ['and totally a run on,', '4.0'], 121 | ['and would definitely cause problems if this is what the output', '6.7'], 122 | ['of the llm was and we only had a quick', '9.4'], 123 | ['yield value of one this needs to be broken', '11.8'], 124 | ['up thanks. 
Third sentence also very long that lorem', '14.2'], 125 | ['ipsum dolor sit amet, consectetur adipiscing elit,', '17.4'], 126 | ['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim', '19.5'], 127 | ['ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '23.5'], 128 | ['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit,', '28.3'], 129 | ['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam,', '33.6'], 130 | ['quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '39.3'], 131 | ['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non', '42.0'], 132 | ['proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '45.2'] 133 | ] 134 | compare_results(result_4, expected_result_4) 135 | --------------------------------------------------------------------------------