├── .gitignore ├── README.md ├── setup.py ├── stream2sentence ├── __init__.py ├── avoid_pause_words.py ├── delimiter_ignore_prefixes.py ├── stream2sentence.py └── stream2sentence_time_based.py └── tests ├── run_test.bat ├── simpletest.py ├── test_data ├── 1.txt ├── 2.txt ├── 3.txt ├── 4.txt └── debug.py ├── test_stream2sentence.py ├── test_stream_from_llm.py ├── test_stream_from_llm_old_api.py └── test_time_based.py /.gitignore: -------------------------------------------------------------------------------- 1 | tests_private/ 2 | test_env/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116 | __pypackages__/
117 |
118 | # Celery stuff
119 | celerybeat-schedule
120 | celerybeat.pid
121 |
122 | # SageMath parsed files
123 | *.sage.py
124 |
125 | # Environments
126 | .env
127 | .venv
128 | env/
129 | myenv/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
166 | .DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Real-Time Sentence Detection
2 |
3 | Real-time processing and delivery of sentences from a continuous stream of characters or text chunks.
4 |
5 | > **Hint:** *If you're interested in state-of-the-art voice solutions you might also want to have a look at [Linguflex](https://github.com/KoljaB/Linguflex), the original project from which stream2sentence is spun off. It lets you control your environment by speaking and is one of the most capable and sophisticated open-source assistants currently available.*
6 |
7 | ## Table of Contents
8 |
9 | - [Features](#features)
10 | - [Installation](#installation)
11 | - [Usage](#usage)
12 | - [Configuration](#configuration)
13 | - [Contributing](#contributing)
14 | - [License](#license)
15 |
16 | ## Features
17 |
18 | - Generates sentences from a stream of text in real-time.
19 | - Customizable to fine-tune/balance speed vs. reliability.
20 | - Option to clean the output by removing links and emojis from the detected sentences.
21 | - Easy to configure and integrate.
22 |
23 | ## Installation
24 |
25 | ```bash
26 | pip install stream2sentence
27 | ```
28 |
29 | ## Usage
30 |
31 | Pass a generator of characters or text chunks to `generate_sentences()` to get a generator of sentences in return.
32 |
33 | Here's a basic example:
34 |
35 | ```python
36 | from stream2sentence import generate_sentences
37 |
38 | # Dummy generator for demonstration
39 | def dummy_generator():
40 |     yield "This is a sentence. And here's another! Yet, "
41 |     yield "there's more. This ends now."
42 |
43 | for sentence in generate_sentences(dummy_generator()):
44 |     print(sentence)
45 | ```
46 |
47 | This will output:
48 | ```
49 | This is a sentence.
50 | And here's another!
51 | Yet, there's more.
52 | This ends now.
53 | ```
54 |
55 | One main use case of this library is to enable fast text-to-speech synthesis for character feeds generated by large language models: it gives the fastest possible access to a complete sentence or sentence fragment (using the `quick_yield_single_sentence_fragment` flag), which can then be synthesized in real time.
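For illustration, here is a minimal sketch of that pattern (the token generator and its text below are made up; exact fragment boundaries depend on the configured delimiters and minimum lengths):

```python
from stream2sentence import generate_sentences

# Hypothetical stand-in for a streaming LLM response
def llm_chunks():
    yield "Sure, the weather today is sunny with a light breeze, "
    yield "so it is a great day for a walk. Take some sunscreen with you."

# quick_yield_single_sentence_fragment trades completeness for latency:
# the first piece of text is emitted as soon as a fragment delimiter
# (e.g. a comma) is seen, so TTS playback can start almost immediately.
for text in generate_sentences(
    llm_chunks(),
    quick_yield_single_sentence_fragment=True,
):
    print(text)  # hand each piece to the TTS engine here
```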
The usage of this is demonstrated in the test_stream_from_llm.py file in the tests directory. 56 | 57 | ## Configuration 58 | 59 | The `generate_sentences()` function offers various parameters to fine-tune its behavior: 60 | 61 | ### Core Parameters 62 | 63 | - `generator: Iterator[str]` 64 | - The primary input source, yielding chunks of text to be processed. 65 | - Can be any iterator that emits text chunks of any size. 66 | 67 | - `context_size: int = 12` 68 | - Number of characters considered for sentence boundary detection. 69 | - Larger values improve accuracy but may increase latency. 70 | - Default: 12 characters 71 | 72 | - `context_size_look_overhead: int = 12` 73 | - Additional characters to examine beyond `context_size` for sentence splitting. 74 | - Enhances sentence detection accuracy. 75 | - Default: 12 characters 76 | 77 | - `minimum_sentence_length: int = 10` 78 | - Minimum character count for a text chunk to be considered a sentence. 79 | - Shorter fragments are buffered until this threshold is met. 80 | - Default: 10 characters 81 | 82 | - `minimum_first_fragment_length: int = 10` 83 | - Minimum character count required for the first sentence fragment. 84 | - Ensures the initial output meets a specified length threshold. 85 | - Default: 10 characters 86 | 87 | ### Yield Control 88 | 89 | These parameters control how quickly and frequently the generator yields sentence fragments: 90 | 91 | - `quick_yield_single_sentence_fragment: bool = False` 92 | - When True, yields the first fragment of the first sentence as quickly as possible. 93 | - Useful for getting immediate output in real-time applications like speech synthesis. 94 | - Default: False 95 | 96 | - `quick_yield_for_all_sentences: bool = False` 97 | - When True, yields the first fragment of every sentence as quickly as possible. 98 | - Extends the quick yield behavior to all sentences, not just the first one. 99 | - Automatically sets `quick_yield_single_sentence_fragment` to True. 100 | - Default: False 101 | 102 | - `quick_yield_every_fragment: bool = False` 103 | - When True, yields every fragment of every sentence as quickly as possible. 104 | - Provides the most granular output, yielding fragments as soon as they're detected. 105 | - Automatically sets both `quick_yield_for_all_sentences` and `quick_yield_single_sentence_fragment` to True. 106 | - Default: False 107 | 108 | ### Text Cleanup 109 | 110 | - `cleanup_text_links: bool = False` 111 | - When True, removes hyperlinks from the output sentences. 112 | - Default: False 113 | 114 | - `cleanup_text_emojis: bool = False` 115 | - When True, removes emoji characters from the output sentences. 116 | - Default: False 117 | 118 | ### Tokenization 119 | 120 | - `tokenize_sentences: Callable = None` 121 | - Custom function for sentence tokenization. 122 | - If None, uses the default tokenizer specified by `tokenizer`. 123 | - Default: None 124 | 125 | - `tokenizer: str = "nltk"` 126 | - Specifies the tokenizer to use. Options: "nltk" or "stanza" 127 | - Default: "nltk" 128 | 129 | - `language: str = "en"` 130 | - Language setting for the tokenizer. 131 | - Use "en" for English or "multilingual" for Stanza tokenizer. 132 | - Default: "en" 133 | 134 | ### Debugging and Fine-tuning 135 | 136 | - `log_characters: bool = False` 137 | - When True, logs each processed character to the console. 138 | - Useful for debugging or monitoring real-time processing. 
139 | - Default: False
140 |
141 | - `sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-"`
142 | - Characters considered as potential sentence fragment delimiters.
143 | - Used for quick yielding of sentence fragments.
144 | - Default: ".?!;:,\n…)]}。-"
145 |
146 | - `full_sentence_delimiters: str = ".?!\n…。"`
147 | - Characters considered as full sentence delimiters.
148 | - Used for more definitive sentence boundary detection.
149 | - Default: ".?!\n…。"
150 |
151 | - `force_first_fragment_after_words: int = 30`
152 | - Forces the yield of the first sentence fragment after this many words.
153 | - Ensures timely output even with long opening sentences.
154 | - Default: 30 words
155 |
156 |
157 | ## Time-based strategy
158 | Instead of a purely lexical strategy, a time-based strategy is available.
159 | A target rate in tokens per second (tps) is provided, and `generate_sentences_time_based` will yield the best
160 | available output (full sentence, longest fragment, or any available buffer, in that order) whenever it is approaching the "deadline"
161 | at which what has been output so far would fall behind the target tps. If the LLM is more than
162 | two full sentences ahead of the target, it will output a sentence even if it is ahead of the "deadline".
163 |
164 | `from stream2sentence.stream2sentence_time_based import generate_sentences_time_based`
165 |
166 | ### Parameters
167 | - `generator (Iterator[str])`
168 | - A generator that yields chunks of text as a stream of characters.
169 | - `target_tps: float = 4`
170 | - The rate in tokens per second used to calculate deadlines for output.
171 | - Default is 4 (approximately the speed of human speech).
172 | - `lead_time: float = 1`
173 | - Amount of time in seconds to wait for the buffer to build before returning values.
174 | - `max_wait_for_fragments: float[] = [3, 2]`
175 | - Max amount of time in seconds that the Nth sentence will wait beyond the "deadline" for a "fragment" (text preceding a fragment delimiter), which is preferred over a piece of buffer.
176 | - The last value in the array is used for all subsequent checks.
177 | - `min_output_lengths: int[] = [2, 3, 3, 4]`
178 | - An array giving the minimum output size in words for the corresponding output sentence; the last value in the array is used for all remaining output.
179 | - For example, [4,5,6] would mean the first piece of output must have 4 words, the second 5 words, and all subsequent pieces 6.
180 | - `preferred_sentence_fragment_delimiters: str[] = ['. ', '? ', '! ', '\n']`
181 | - Array of strings that delineate a sentence fragment. "Preferred" delimiters are checked first and are always used over the other fragment delimiters if the fragment meets the length requirement.
182 | - Note the trailing spaces, added to differentiate between values like $3.5 and a proper sentence end.
183 | - `sentence_fragment_delimiters: str[] = ['; ', ': ', ', ', '* ', '**', '– ']`
184 | - Array of strings that are checked after the "preferred" delimiters.
185 | - `delimiter_ignore_prefixes: str[]`
186 | - Array of strings that cause a delimiter to be ignored when the delimiter is immediately preceded by one of them.
187 | - Used to ignore common abbreviations such as Mr., Dr., and Mrs., where we don't want to split.
188 | - Default is a long list documented in delimiter_ignore_prefixes.py.
189 | - `wait_for_if_non_fragment: str[]`
190 | - Array of strings that the algorithm will not use as the last word of the output if the whole buffer is being output (i.e. it is neither a fragment nor a sentence).
191 | - Avoids awkward pauses on common words that are unnatural to pause at.
192 | - Default is a long list of common words documented in avoid_pause_words.py 193 | - `deadline_offsets_static: float[] = [1]` 194 | - Constant amount of time in seconds to subtract from the deadline for first n sentences. 195 | - Last value applied to all subsequent sentences 196 | - `deadline_offsets_dynamic: float[] = [0]`: 197 | - Added to account for the time it takes a TTS engine to generate output. 198 | - For example, if it takes your TTS engine around 1 second to generate 10 words, you can use a value of 0.1 so that the TTS generation time is included in the deadline. 199 | - Applied to first n sentences, last value applied to all subsequent 200 | 201 | ## Contributing 202 | 203 | Any Contributions you make are welcome and **greatly appreciated**. 204 | 205 | 1. **Fork** the Project. 206 | 2. **Create** your Feature Branch (`git checkout -b feature/AmazingFeature`). 207 | 3. **Commit** your Changes (`git commit -m 'Add some AmazingFeature'`). 208 | 4. **Push** to the Branch (`git push origin feature/AmazingFeature`). 209 | 5. **Open** a Pull Request. 210 | 211 | ## License 212 | 213 | This project is licensed under the MIT License. For more details, see the [`LICENSE`](LICENSE) file. 214 | 215 | --- 216 | 217 | Project created and maintained by [Kolja Beigel](https://github.com/KoljaB). 218 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="stream2sentence", 8 | version="0.3.1", 9 | author="Kolja Beigel", 10 | author_email="kolja.beigel@web.de", 11 | description="Real-time processing and delivery of sentences from a continuous stream of characters or text chunks.", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/KoljaB/stream2sentence", 15 | packages=setuptools.find_packages(), 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.6', 22 | install_requires=[ 23 | 'nltk==3.9.1', 24 | 'emoji==2.14.1', 25 | 'stanza==1.10.1' 26 | ], 27 | keywords='realtime, text streaming, stream, sentence, sentence detection, sentence generation, tts, speech synthesis, nltk, text analysis, audio processing, boundary detection, sentence boundary detection' 28 | ) -------------------------------------------------------------------------------- /stream2sentence/__init__.py: -------------------------------------------------------------------------------- 1 | from .stream2sentence import ( 2 | generate_sentences, 3 | generate_sentences_async, 4 | init_tokenizer, 5 | ) 6 | 7 | from .stream2sentence_time_based import ( 8 | generate_sentences_time_based, 9 | ) 10 | 11 | from .avoid_pause_words import ( 12 | AVOID_PAUSE_WORDS, 13 | ) 14 | -------------------------------------------------------------------------------- /stream2sentence/avoid_pause_words.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Conjunctions 4 | conjunctions = ["and", "or", "but", "so", "for", "nor", "yet"] 5 | 6 | # Prepositions 7 | prepositions = [ 8 | "in", "on", "at", "by", "with", "about", "of", "to", "for", 9 | "from", "as", "over", "under", "through", "between", "during", "there" 10 | ] 11 | 12 | # Articles 13 | articles = 
["a", "an", "the"] 14 | 15 | # Possessives and Demonstratives 16 | possessives_and_demonstratives = [ 17 | "my", "your", "his", "her", "its", "our", "their", 18 | "this", "that", "these", "those" 19 | ] 20 | 21 | # Auxiliary/Helping Verbs 22 | auxiliary_verbs = [ 23 | "is", "are", "was", "were", "am", "be", "been", "being", 24 | "do", "does", "did", "have", "has", "had", 25 | "can", "could", "shall", "should", 26 | "will", "would", "may", "might", "must" 27 | ] 28 | 29 | # Pronouns 30 | pronouns = [ 31 | "I", "we", "you", "he", "she", "it", "they", 32 | "who", "whom", "whose", "which", "that" 33 | ] 34 | 35 | # Quantifiers 36 | quantifiers = ["some", "many", "few", "all", "any", "most", "much", "none", "several"] 37 | 38 | # Adverbs that Modify Flow 39 | adverbs = ["very", "too", "just", "quite", "almost", "nearly", "only"] 40 | 41 | # Interrogatives 42 | interrogatives = ["what", "where", "when", "why", "how"] 43 | 44 | # Relative Pronouns 45 | relative_pronouns = ["who", "which", "that"] 46 | 47 | # Subordinating Conjunctions 48 | subordinating_conjunctions = [ 49 | "although", "because", "if", "since", 50 | "though", "while", "until", "unless" 51 | ] 52 | 53 | AVOID_PAUSE_WORDS = set( 54 | conjunctions + 55 | prepositions + 56 | articles + 57 | possessives_and_demonstratives + 58 | auxiliary_verbs + 59 | pronouns + 60 | quantifiers + 61 | adverbs + 62 | interrogatives + 63 | subordinating_conjunctions 64 | ) 65 | 66 | -------------------------------------------------------------------------------- /stream2sentence/delimiter_ignore_prefixes.py: -------------------------------------------------------------------------------- 1 | 2 | titles_and_abbreviations = [ 3 | "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "St.", 4 | "Ph.D.", "Phd.", "PhD.", "M.D.", "B.A.", "M.A.", "D.D.S.", "J.D.", 5 | "Co.", "Corp.", "Ave.", "Blvd.", "Rd.", "Mt.", 6 | "a.m.", "p.m.", "Jr.", "Sr.", 7 | "Gov.", "Gen.", "Capt.", "Lt.", "Maj.", "Col.", "Adm.", "Cmdr.", 8 | "Sgt.", "Cpl.", "Pvt.", "U.S.", "U.K.", "vs.", "i.e.", "e.g.", 9 | "Vol.", "Art.", "Sec.", "Chap.", "Fig.", "Ref.", "Dept." 
10 | ] 11 | 12 | dates_and_times = [ 13 | "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", 14 | "Sep.", "Oct.", "Nov.", "Dec.", 15 | "Mon.", "Tue.", "Wed.", "Thu.", "Fri.", "Sat.", "Sun.", 16 | ] 17 | 18 | financial_abbreviations = [ 19 | "Inc.", "Ltd.", "Corp.", "PLC.", "LLC.", "LLP.", 20 | "P/E.", "EPS.", "NAV.", "ROI.", "ROA.", "ROE.", 21 | ] 22 | 23 | country_abbreviations = [ 24 | "U.S.A.", "U.K.", "U.A.E.", "P.R.C.", "D.R.C.", "R.O.C.", 25 | "E.U.", "U.N.", "A.U.", 26 | "U.S.", "U.K.", "E.U.", "P.R.C.", "D.R.C.", "R.O.C.", 27 | ] 28 | 29 | DELIMITER_IGNORE_PREFIXES = set( 30 | titles_and_abbreviations + dates_and_times + 31 | financial_abbreviations + country_abbreviations 32 | ) 33 | 34 | 35 | -------------------------------------------------------------------------------- /stream2sentence/stream2sentence.py: -------------------------------------------------------------------------------- 1 | """ 2 | Real-time processing and delivery of sentences 3 | from a continuous stream of characters or text chunks 4 | """ 5 | 6 | import functools 7 | import logging 8 | import re 9 | import time 10 | from typing import ( 11 | AsyncIterable, 12 | AsyncIterator, 13 | Awaitable, 14 | Callable, 15 | Concatenate, 16 | Iterable, 17 | Iterator, 18 | ParamSpec, 19 | ) 20 | 21 | import emoji 22 | 23 | current_tokenizer = "nltk" 24 | stanza_initialized = False 25 | nltk_initialized = False 26 | nlp = None 27 | 28 | 29 | def initialize_nltk(debug=False): 30 | """ 31 | Initializes NLTK by downloading required data for sentence tokenization. 32 | """ 33 | global nltk_initialized 34 | if nltk_initialized: 35 | return 36 | 37 | logging.info("Initializing NLTK Tokenizer") 38 | 39 | try: 40 | import nltk 41 | 42 | nltk.download("punkt_tab", quiet=not debug) 43 | nltk_initialized = True 44 | except Exception as e: 45 | print(f"Error initializing nltk tokenizer: {e}") 46 | nltk_initialized = False 47 | 48 | 49 | def initialize_stanza(language: str = "en", offline=False): 50 | """ 51 | Initializes Stanza by downloading required data for sentence tokenization. 52 | """ 53 | global nlp, stanza_initialized 54 | if stanza_initialized: 55 | return 56 | 57 | logging.info("Initializing Stanza Tokenizer") 58 | 59 | try: 60 | import stanza 61 | 62 | if not offline: 63 | stanza.download(language) 64 | 65 | nlp = stanza.Pipeline(language, download_method=None) 66 | stanza_initialized = True 67 | except Exception as e: 68 | print(f"Error initializing stanza tokenizer: {e}") 69 | stanza_initialized = False 70 | 71 | 72 | def _remove_links(text: str) -> str: 73 | """ 74 | Removes any links from the input text. 75 | 76 | Args: 77 | text (str): Input text 78 | 79 | Returns: 80 | str: Text with links removed 81 | """ 82 | pattern = ( 83 | r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|" 84 | r"[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" 85 | ) 86 | 87 | return re.sub(pattern, "", text) 88 | 89 | 90 | def _remove_emojis(text: str) -> str: 91 | """ 92 | Removes emojis from the input text. 93 | 94 | Args: 95 | text (str): Input text 96 | 97 | Returns: 98 | str: Text with emojis removed 99 | """ 100 | return emoji.replace_emoji(text, "") 101 | 102 | 103 | async def _generate_characters( 104 | generator: AsyncIterable[str], log_characters: bool = False 105 | ) -> AsyncIterator[str]: 106 | """ 107 | Generates individual characters from a text generator. 
108 | 109 | Args: 110 | generator (Iterator[str]): Input text generator 111 | log_characters (bool): Whether to log the characters to the console 112 | 113 | Yields: 114 | Individual characters from the generator 115 | """ 116 | if log_characters: 117 | print("Stream: ", end="", flush=True) 118 | async for chunk in generator: 119 | for char in chunk: 120 | if log_characters: 121 | print(char, end="", flush=True) 122 | yield char 123 | if log_characters: 124 | print() 125 | 126 | 127 | def _clean_text( 128 | text: str, 129 | cleanup_text_links: bool = False, 130 | cleanup_text_emojis: bool = False, 131 | strip_text: bool = True, 132 | ) -> str: 133 | """ 134 | Cleans the text by removing links and emojis. 135 | 136 | Args: 137 | text (str): Input text 138 | cleanup_text_links (boolean, optional): Remove non-desired links from 139 | the stream. 140 | cleanup_text_emojis (boolean, optional): Remove non-desired emojis 141 | from the stream. 142 | 143 | Returns: 144 | str: Cleaned text 145 | """ 146 | if cleanup_text_links: 147 | text = _remove_links(text) 148 | if cleanup_text_emojis: 149 | text = _remove_emojis(text) 150 | if strip_text: 151 | text = text.strip() 152 | return text 153 | 154 | 155 | def _tokenize_sentences(text: str, tokenize_sentences=None) -> list[str]: 156 | """ 157 | Tokenizes sentences from the input text. 158 | 159 | Args: 160 | text (str): Input text 161 | tokenize_sentences (Callable, optional): A function that tokenizes 162 | sentences from the input text. Defaults to None. 163 | 164 | Yields: 165 | Iterator[str]: An iterator of sentences 166 | """ 167 | if tokenize_sentences: 168 | sentences = tokenize_sentences(text) 169 | else: 170 | nlp_start_time = time.time() 171 | if current_tokenizer == "nltk": 172 | import nltk 173 | 174 | sentences = nltk.tokenize.sent_tokenize(text) 175 | elif current_tokenizer == "stanza": 176 | import stanza 177 | 178 | global nlp 179 | doc = nlp(text) 180 | sentences = [sentence.text for sentence in doc.sentences] 181 | else: 182 | raise ValueError(f"Unknown tokenizer: {current_tokenizer}") 183 | nlp_end_time = time.time() 184 | logging.debug("Time to split sentences: " f"{nlp_end_time - nlp_start_time}") 185 | return sentences 186 | 187 | 188 | def init_tokenizer(tokenizer: str, language: str = "en", offline=False, debug=False): 189 | """ 190 | Initializes the sentence tokenizer. 
191 | """ 192 | if tokenizer == "nltk": 193 | initialize_nltk(debug) 194 | elif tokenizer == "stanza": 195 | initialize_stanza(language,offline=offline) 196 | else: 197 | logging.warning(f"Unknown tokenizer: {tokenizer}") 198 | 199 | async def generate_sentences_async( 200 | generator: AsyncIterable[str], 201 | context_size: int = 12, 202 | context_size_look_overhead: int = 12, 203 | minimum_sentence_length: int = 10, 204 | minimum_first_fragment_length=10, 205 | quick_yield_single_sentence_fragment: bool = False, 206 | quick_yield_for_all_sentences: bool = False, 207 | quick_yield_every_fragment: bool = False, 208 | cleanup_text_links: bool = False, 209 | cleanup_text_emojis: bool = False, 210 | tokenize_sentences=None, 211 | tokenizer: str = "nltk", 212 | language: str = "en", 213 | log_characters: bool = False, 214 | sentence_fragment_delimiters: str = ".?!;:,\n…)]}。-", 215 | full_sentence_delimiters: str = ".?!\n…。", 216 | force_first_fragment_after_words=30, 217 | filter_first_non_alnum_characters: bool = False, 218 | debug=False, 219 | ) -> AsyncIterator[str]: 220 | """ 221 | Generates well-formed sentences from a stream of characters or text chunks 222 | provided by an input generator. 223 | 224 | Args: 225 | generator (Iterator[str]): A generator that yields chunks of text as a 226 | stream of characters. 227 | context_size (int): The number of characters used to establish context 228 | for sentence boundary detection. A larger context improves the 229 | accuracy of detecting sentence boundaries. 230 | Default is 12 characters. 231 | context_size_look_overhead: The number of characters to look 232 | over the context_size boundaries to detect sentence splitting 233 | characters (improves sentence detection). 234 | minimum_sentence_length (int): The minimum number of characters a 235 | sentence must have. If a sentence is shorter, it will be 236 | concatenated with the following one, improving the overall 237 | readability. This parameter does not apply to the first sentence 238 | fragment, which is governed by `minimum_first_fragment_length`. 239 | Default is 10 characters. 240 | minimum_first_fragment_length (int): The minimum number of characters 241 | required for the first sentence fragment before yielding. 242 | Default is 10 characters. 243 | quick_yield_single_sentence_fragment (bool): If set to True, the 244 | generator will yield the first sentence first fragment as quickly as 245 | possible. This is particularly useful for real-time applications 246 | such as speech synthesis. 247 | quick_yield_for_all_sentences (bool): If set to True, the 248 | generator will yield every sentence first fragment as quickly as 249 | possible (not only the first sentence first fragment) 250 | quick_yield_every_fragment (bool): If set to True, the 251 | generator not only yield every sentence first fragment, but also every 252 | following fragment. 253 | cleanup_text_links (bool): If True, removes hyperlinks from the text 254 | stream to ensure clean output. 255 | cleanup_text_emojis (bool): If True, filters out emojis from the text 256 | stream for clear textual content. 257 | tokenize_sentences (Callable): A function that tokenizes sentences 258 | from the input text. Defaults to None. 259 | tokenizer (str): The tokenizer to use for sentence tokenization. 260 | Default is "nltk". Can be "nltk" or "stanza". 261 | language (str): The language to use for sentence tokenization. 262 | Default is "en". Can be "multilingual" for stanze tokenizer. 
263 | log_characters (bool): If True, logs each character to the console as 264 | they are processed. 265 | sentence_fragment_delimiters (str): A string of characters that are 266 | considered sentence fragment delimiters. Default is ".?!;:,\n…)]}。-". 267 | full_sentence_delimiters (str): A string of characters that are 268 | considered full sentence delimiters. Default is ".?!\n…。". 269 | force_first_fragment_after_words (int): The number of words after 270 | which the first sentence fragment is forced to be yielded. 271 | Default is 30 words. 272 | filter_first_non_alnum_characters (bool): If True, filters out the 273 | first non-alphanumeric characters from the text stream. 274 | debug (bool): If True, enables debug mode for logging. 275 | 276 | Yields: 277 | Iterator[str]: An iterator of complete sentences constructed from the 278 | input text stream. Each yielded sentence meets the specified minimum 279 | length requirements and is cleaned up if specified. 280 | 281 | The function maintains a buffer to accumulate text chunks and applies 282 | natural language processing to detect sentence boundaries. 283 | It employs various heuristics, such as minimum sentence length and 284 | sentence delimiters, to ensure the quality of the output sentences. 285 | The function also provides options to clean up the text stream, 286 | making it versatile for different types of text processing applications. 287 | """ 288 | 289 | # Initialize the tokenizer based on the specified tokenizer and language 290 | global current_tokenizer 291 | current_tokenizer = tokenizer 292 | init_tokenizer(current_tokenizer, language, debug) 293 | 294 | buffer = "" 295 | is_first_sentence = True 296 | word_count = 0 # Initialize word count 297 | last_delimiter_position = -1 # Position of last full sentence delimiter 298 | 299 | # Adjust quick yield flags based on settings 300 | if quick_yield_every_fragment: 301 | quick_yield_for_all_sentences = True 302 | 303 | if quick_yield_for_all_sentences: 304 | quick_yield_single_sentence_fragment = True 305 | 306 | async for char in _generate_characters(generator, log_characters): 307 | 308 | if char: 309 | if len(buffer) == 0: 310 | if filter_first_non_alnum_characters: 311 | if not char.isalnum(): 312 | continue 313 | 314 | buffer += char 315 | buffer = buffer.lstrip() 316 | 317 | # Update word count on encountering space or sentence fragment delimiter 318 | if char.isspace() or char in sentence_fragment_delimiters: 319 | word_count += 1 320 | 321 | if debug: 322 | print("\033[36mDebug: Added char, buffer size: \"{}\"\033[0m".format(len(buffer))) 323 | 324 | # Check conditions to yield first sentence fragment quickly 325 | if ( 326 | is_first_sentence 327 | and len(buffer) > minimum_first_fragment_length 328 | and quick_yield_single_sentence_fragment 329 | ): 330 | 331 | if ( 332 | buffer[-1] in sentence_fragment_delimiters 333 | or char.isspace() and word_count >= force_first_fragment_after_words 334 | ): 335 | 336 | yield_text = _clean_text( 337 | buffer, 338 | cleanup_text_links, 339 | cleanup_text_emojis) 340 | if debug: 341 | if buffer[-1] in sentence_fragment_delimiters: 342 | print("\033[36mDebug: Yielding first sentence fragment: \"{}\" because buffer[-1] {} is sentence frag \033[0m".format(yield_text, buffer[-1])) 343 | else: 344 | print("\033[36mDebug: Yielding first sentence fragment: \"{}\" because word_count {} is >= force_first_fragment_after_words \033[0m".format(yield_text, word_count)) 345 | 346 | yield yield_text 347 | 348 | buffer = "" 349 | word_count = 0 
350 | if not quick_yield_every_fragment: 351 | is_first_sentence = False 352 | 353 | continue 354 | 355 | # Continue accumulating characters if buffer is under minimum sentence length 356 | if len(buffer) <= minimum_sentence_length + context_size: 357 | 358 | continue 359 | 360 | # Update last delimiter position if a new delimiter is found 361 | if char in full_sentence_delimiters: 362 | last_delimiter_position = len(buffer) - 1 363 | 364 | # Define context window for checking potential sentence boundaries 365 | context_window_end_pos = len(buffer) - context_size - 1 366 | context_window_start_pos = ( 367 | context_window_end_pos - context_size_look_overhead 368 | ) 369 | if context_window_start_pos < 0: 370 | context_window_start_pos = 0 371 | 372 | # Tokenize sentences from buffer 373 | sentences = _tokenize_sentences(buffer, tokenize_sentences) 374 | 375 | if debug: 376 | print("\033[36mbuffer: \"{}\"\033[0m".format(buffer)) 377 | print("\033[36mlast_delimiter_position: {}\033[0m".format(last_delimiter_position)) 378 | print("\033[36mlen(sentences) > 2: {}\033[0m".format(len(sentences) > 2)) 379 | print("\033[36mcontext_window_start_pos: {}\033[0m".format(context_window_start_pos)) 380 | print("\033[36mcontext_window_end_pos: {}\033[0m".format(context_window_end_pos)) 381 | 382 | # Combine sentences below minimum_sentence_length with the next sentence(s) 383 | combined_sentences = [] 384 | temp_sentence = "" 385 | 386 | for sentence in sentences: 387 | if len(sentence) < minimum_sentence_length: 388 | temp_sentence += sentence + " " 389 | else: 390 | if temp_sentence: 391 | temp_sentence += sentence 392 | combined_sentences.append(temp_sentence.strip()) 393 | temp_sentence = "" 394 | else: 395 | combined_sentences.append(sentence.strip()) 396 | 397 | # If there's a leftover temp_sentence that hasn't been appended 398 | if temp_sentence: 399 | combined_sentences.append(temp_sentence.strip()) 400 | 401 | # Replace the original sentences with the combined_sentences 402 | sentences = combined_sentences 403 | 404 | # Process and yield sentences based on conditions 405 | if len(sentences) > 2 or ( 406 | last_delimiter_position >= 0 407 | and context_window_start_pos 408 | <= last_delimiter_position 409 | <= context_window_end_pos 410 | ): 411 | 412 | if len(sentences) > 1: 413 | total_length_except_last = sum( 414 | len(sentence) for sentence in sentences[:-1] 415 | ) 416 | if total_length_except_last >= minimum_sentence_length: 417 | for sentence in sentences[:-1]: 418 | yield_text = _clean_text( 419 | sentence, 420 | cleanup_text_links, 421 | cleanup_text_emojis) 422 | if debug: 423 | print("\033[36mDebug: Yielding sentence: \"{}\"\033[0m".format(yield_text)) 424 | 425 | yield yield_text 426 | word_count = 0 427 | 428 | if quick_yield_for_all_sentences: 429 | is_first_sentence = True 430 | 431 | # we need to remember if the buffer ends with space 432 | # - sentences returned by the tokenizers are rtrimmed 433 | # - this takes any blank spaces away from the last unfinshed sentence 434 | # - we have to work around this by re-adding the blank space in this case 435 | ends_with_space = buffer.endswith(" ") 436 | 437 | # set buffer to last unfinshed sentence returned by tokenizers 438 | buffer = sentences[-1] 439 | 440 | # reset the blank space if it was there: 441 | if ends_with_space: 442 | buffer += " " 443 | 444 | # reset the last delimiter position after yielding 445 | last_delimiter_position = -1 446 | 447 | 448 | # Yield remaining buffer as final sentence(s) 449 | if buffer: 450 | 
sentences = _tokenize_sentences(buffer, tokenize_sentences) 451 | sentence_buffer = "" 452 | 453 | for sentence in sentences: 454 | sentence_buffer += sentence 455 | if len(sentence_buffer) < minimum_sentence_length: 456 | sentence_buffer += " " 457 | 458 | continue 459 | 460 | yield_text = _clean_text( 461 | sentence_buffer, cleanup_text_links, cleanup_text_emojis 462 | ) 463 | 464 | if debug: 465 | print("\033[36mDebug: Yielding final sentence(s): \"{}\"\033[0m".format(yield_text)) 466 | 467 | yield yield_text 468 | 469 | sentence_buffer = "" 470 | 471 | if sentence_buffer: 472 | yield_text = _clean_text( 473 | sentence_buffer, 474 | cleanup_text_links, 475 | cleanup_text_emojis) 476 | if debug: 477 | print("\033[36mDebug: Yielding remaining text: \"{}\"\033[0m".format(yield_text)) 478 | 479 | yield yield_text 480 | 481 | 482 | def _await_sync(f: Awaitable[str]) -> str: 483 | gen = f.__await__() 484 | try: 485 | next(gen) 486 | raise RuntimeError(f"{f} failed to be synchronous") 487 | except StopIteration as e: 488 | return e.value 489 | 490 | 491 | def _async_iter_to_sync(f: AsyncIterator[str]) -> Iterator[str]: 492 | try: 493 | while True: 494 | yield _await_sync(f.__anext__()) 495 | except StopAsyncIteration: 496 | return 497 | 498 | 499 | P = ParamSpec("P") 500 | 501 | 502 | def _dowrap( 503 | f: Callable[Concatenate[AsyncIterable[str], P], AsyncIterator[str]] 504 | ) -> Callable[Concatenate[Iterable[str], P], Iterator[str]]: 505 | @functools.wraps(f) 506 | def inner(generator: Iterable[str], *args: P.args, **kwargs: P.kwargs): 507 | async def gen_wrap(): 508 | for x in generator: 509 | yield x 510 | 511 | return _async_iter_to_sync(f(gen_wrap(), *args, **kwargs)) 512 | 513 | return inner 514 | 515 | 516 | generate_sentences = _dowrap(generate_sentences_async) 517 | generate_sentences.__name__ = "generate_sentences" 518 | generate_sentences.__qualname__ = "generate_sentences" 519 | -------------------------------------------------------------------------------- /stream2sentence/stream2sentence_time_based.py: -------------------------------------------------------------------------------- 1 | 2 | import nltk 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | import time 6 | from itertools import accumulate 7 | 8 | from stream2sentence import init_tokenizer 9 | from stream2sentence.avoid_pause_words import AVOID_PAUSE_WORDS 10 | from stream2sentence.delimiter_ignore_prefixes import DELIMITER_IGNORE_PREFIXES 11 | 12 | 13 | init_tokenizer("nltk") 14 | 15 | WORDS_PER_TOKEN = 0.75 16 | preferred_sentence_fragment_delimiters_global = [] 17 | sentence_fragment_delimiters_global = [] 18 | delimiter_ignore_prefixes_global = [] 19 | 20 | def get_index_or_last(a_list, index): 21 | return a_list[index] if index < len(a_list) else a_list[-1] 22 | 23 | def find_last_delimiter(s, delimiters): 24 | valid_indices = [] 25 | for delimiter in delimiters: 26 | index = s.rfind(delimiter) 27 | if index != -1: 28 | # Get the word preceding the delimiter 29 | preceding_word_start = s.rfind(" ", 0, index) 30 | preceding_word = s[preceding_word_start:index + 1].strip() 31 | 32 | if preceding_word not in delimiter_ignore_prefixes_global: 33 | valid_indices.append(index) 34 | 35 | return max(valid_indices, default=-1) 36 | 37 | def find_last_preferred_fragment_delimiter(s): 38 | return find_last_delimiter(s, preferred_sentence_fragment_delimiters_global) 39 | 40 | def find_last_fragment_delimiter(s): 41 | return find_last_delimiter(s, sentence_fragment_delimiters_global) 42 | 43 | def get_num_words(s): 
44 | return len(s.split()) 45 | 46 | def find_first_greater(nums, value): 47 | for index, num in enumerate(nums): 48 | if num > value: 49 | return index 50 | return -1 51 | 52 | 53 | def is_output_needed(has_output_started, start_time, lead_time, output_sentences, estimated_time_between_words, deadline_offset): 54 | cur_time = time.time() 55 | if not has_output_started and cur_time - start_time < lead_time: 56 | return False 57 | 58 | num_words_output = get_num_words(" ".join(output_sentences)) 59 | output_deadline = num_words_output * estimated_time_between_words - deadline_offset 60 | return cur_time - start_time > output_deadline 61 | 62 | def is_output_long_enough(output, min_output_length): 63 | num_words = get_num_words(output) 64 | return (num_words >= min_output_length) 65 | 66 | def get_fragment(llm_buffer, min_output_length): 67 | delimiter_index = find_last_preferred_fragment_delimiter(llm_buffer) 68 | if delimiter_index != -1 and is_output_long_enough(llm_buffer[:delimiter_index], min_output_length): 69 | return llm_buffer[:delimiter_index + 1] 70 | 71 | delimiter_index = find_last_fragment_delimiter(llm_buffer) 72 | if delimiter_index != -1 and is_output_long_enough(llm_buffer[:delimiter_index], min_output_length): 73 | return llm_buffer[:delimiter_index + 1] 74 | return "" 75 | 76 | def get_sentences_needed_for_min_length(sentences_on_buffer, min_output_length): 77 | word_lengths_of_sentences = list(map(get_num_words, sentences_on_buffer)) 78 | sums_of_word_lens = list(accumulate(word_lengths_of_sentences)) 79 | return find_first_greater(sums_of_word_lens, min_output_length) + 1 80 | 81 | 82 | def generate_sentences_time_based( 83 | generator, 84 | lead_time = 1, 85 | max_wait_for_fragments = [3, 2], 86 | target_tps = 4, 87 | min_output_lengths = [2, 3, 3, 4], 88 | preferred_sentence_fragment_delimiters = ['. ', '? ', '! ', '\n'], 89 | sentence_fragment_delimiters = ['; ', ': ', ', ', '* ', '**', '– '], 90 | delimiter_ignore_prefixes = DELIMITER_IGNORE_PREFIXES, 91 | wait_for_if_non_fragment = AVOID_PAUSE_WORDS, 92 | deadline_offsets_static = [1], 93 | deadline_offsets_dynamic = [0], 94 | ): 95 | """ 96 | Uses a time based strategy to determine whether to yield. A target tps is provided, 97 | and when the outputted values are approaching the "deadline" where output will lag behind 98 | the target then yield best available option. 99 | 100 | Args: 101 | generator (Iterator[str]): A generator that yields chunks of text as a stream of characters. 102 | lead_time (float): amount of time in seconds to wait for the buffer to build for before returning values. 103 | Default is 1. 104 | max_wait_for_fragments (float): Max amount of time in seconds that the Nth sentence will wait beyond the 105 | "deadline" for a "fragment" (text preceeding a fragment delimiter), which is preferred over a piece of buffer. 106 | The last value in the array is used for all subsequent checks. 107 | Default is [3, 2]. 108 | target_tps (float): the rate in tokens per second you want to use to calculate output deadlines. 109 | Default is 4. (approximately the speed of human speech) 110 | min_output_lengths (int[]]): An array that corresponds to the minimum output size in words 111 | for the corresponding output sentence, the last value in the array is used for all remaining output. 112 | For example [4,5,6] would mean the first piece of output must have 4 words, the second 5 words, and all subsequent 6. 
113 | Default is [2, 3, 3, 4]
114 | preferred_sentence_fragment_delimiters (str[]): Array of strings that delineate a sentence fragment. "Preferred"
115 | delimiters are checked first and are always used over the other fragment delimiters if the fragment meets the length requirement.
116 | Note the trailing spaces, added to differentiate between values like $3.5 and a proper sentence end.
117 | Default is ['. ', '? ', '! ', '\n']
118 | sentence_fragment_delimiters (str[]): Array of strings that are checked after the "preferred" delimiters.
119 | Default is ['; ', ': ', ', ', '* ', '**', '– ']
120 | delimiter_ignore_prefixes (str[]): Array of strings that cause a delimiter to be ignored when it is immediately preceded by one of them.
121 | Used to ignore common abbreviations such as Mr., Dr., and Mrs., where we don't want to split.
122 | Default is a long list documented in delimiter_ignore_prefixes.py
123 | wait_for_if_non_fragment (str[]): Array of strings that the algorithm will not use as the last word if the whole buffer
124 | is being output. Avoids awkward pauses on common words that are unnatural to pause at.
125 | Default is a long list of common words documented in avoid_pause_words.py
126 | deadline_offsets_static (float[]): Constant amount of time in seconds to subtract from the deadline for the first n sentences.
127 | Last value applied to all subsequent sentences.
128 | Default is [1].
129 | deadline_offsets_dynamic (float[]): Added to account for the time it takes a TTS engine to generate output.
130 | For example, if it takes your TTS engine around 1 second to generate 10 words, you can use a value of 0.1
131 | so that the TTS generation time is included in the deadline. Applied to the first n sentences, last value applied to all subsequent.
132 | Default is [0].
133 | Yields:
134 | Iterator[str]: An iterator of complete sentences constructed from the
135 | input text stream.
136 | """ 137 | global preferred_sentence_fragment_delimiters_global, sentence_fragment_delimiters_global, delimiter_ignore_prefixes_global 138 | preferred_sentence_fragment_delimiters_global = set(preferred_sentence_fragment_delimiters) 139 | sentence_fragment_delimiters_global = set(sentence_fragment_delimiters) 140 | delimiter_ignore_prefixes_global = set(delimiter_ignore_prefixes) 141 | punkt_sentence_tokenizer = PunktSentenceTokenizer() 142 | 143 | start_time = time.time() 144 | last_sentence_time = time.time() 145 | estimated_time_between_words = 1 / (target_tps * WORDS_PER_TOKEN) 146 | output_sentences = [] 147 | llm_buffer_full = "" 148 | has_output_started = False 149 | 150 | 151 | def handle_output(output, sentence_boundary_index=None): 152 | nonlocal has_output_started, llm_buffer_full, output_sentences, min_output_lengths, start_time, last_sentence_time 153 | if not has_output_started: 154 | #once output has started we go based on TTS start for deadline 155 | start_time = time.time() 156 | has_output_started = True 157 | 158 | end_index = len(output) 159 | if sentence_boundary_index != None: 160 | end_index = sentence_boundary_index 161 | llm_buffer_full = llm_buffer_full[end_index:] 162 | output_sentences.append(output) 163 | last_sentence_time = time.time() 164 | return output 165 | 166 | for token in generator: 167 | llm_buffer_full += token 168 | llm_buffer_full = llm_buffer_full.lstrip() 169 | if len(llm_buffer_full.split(None, 2)) < 2: 170 | #must have at least two words since last token may not be a full word 171 | continue 172 | 173 | llm_buffer = llm_buffer_full.rsplit(" ", 1)[0] #remove last word 174 | 175 | #TODO edge case with disagreement, how to identify and use len(output) as fallback? 176 | sentences_on_buffer = nltk.tokenize.sent_tokenize(llm_buffer) 177 | sentence_boundaries = list(punkt_sentence_tokenizer.span_tokenize(llm_buffer_full)) #handle white space descrepancies in full_buffer and buffer after split() 178 | 179 | num_sentences_output = len(output_sentences) 180 | min_output_length = get_index_or_last(min_output_lengths, num_sentences_output) 181 | sentences_needed_for_min_len = get_sentences_needed_for_min_length(sentences_on_buffer, min_output_length) 182 | 183 | current_output = llm_buffer 184 | use_first_sentence = len(sentences_on_buffer) > 1 and is_output_long_enough(sentences_on_buffer[0], min_output_length) 185 | if use_first_sentence: 186 | current_output = sentences_on_buffer[0] 187 | else: 188 | current_fragment = get_fragment(llm_buffer, min_output_length) 189 | if current_fragment != "": 190 | current_output = current_fragment 191 | 192 | num_words_for_offset = get_num_words(current_output) 193 | deadline_offset_dynamic = get_index_or_last(deadline_offsets_dynamic, num_sentences_output) 194 | deadline_offset_static = get_index_or_last(deadline_offsets_static, num_sentences_output) 195 | deadline_offset = (num_words_for_offset * deadline_offset_dynamic) + deadline_offset_static 196 | 197 | output_needed = is_output_needed(has_output_started, start_time, lead_time, output_sentences, estimated_time_between_words, deadline_offset) 198 | if output_needed and use_first_sentence: 199 | end_index = len(sentences_on_buffer[0]) if len(sentence_boundaries) == 1 else sentence_boundaries[1][0] #edge case where sentence_boundaries disagrees with nltk.tokenize.sent_tokenize 200 | yield handle_output(sentences_on_buffer[0], end_index) 201 | elif output_needed: 202 | output = current_fragment 203 | if output == "": 204 | output = llm_buffer 205 | 
is_not_min_length = get_num_words(output) < min_output_length 206 | max_wait_for_fragment = get_index_or_last(max_wait_for_fragments, num_sentences_output) 207 | waiting_for_fragment = (time.time() - last_sentence_time < max_wait_for_fragment) 208 | if " " in output: 209 | _, last_word = output.rsplit(" ", 1) 210 | else: 211 | last_word = output 212 | last_word_avoid_pause = last_word in wait_for_if_non_fragment 213 | 214 | if is_not_min_length or waiting_for_fragment or last_word_avoid_pause: 215 | continue 216 | 217 | yield handle_output(output) 218 | else: 219 | if sentences_needed_for_min_len == 0 or sentences_needed_for_min_len + 2 > len(sentences_on_buffer): 220 | #two sentences ahead is ideal 221 | continue 222 | end_index = sentence_boundaries[sentences_needed_for_min_len][0] 223 | output = " ".join(sentences_on_buffer[:sentences_needed_for_min_len]) 224 | yield handle_output(output, end_index) 225 | 226 | #after all tokens are processed yield whatever is left 227 | for sentence in nltk.tokenize.sent_tokenize(llm_buffer_full): 228 | yield sentence 229 | 230 | 231 | -------------------------------------------------------------------------------- /tests/run_test.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | :: switch to current execution directory 4 | cd /d %~dp0 5 | 6 | TITLE basic stream2sentence test 7 | python test_stream2sentence.py 8 | pause -------------------------------------------------------------------------------- /tests/simpletest.py: -------------------------------------------------------------------------------- 1 | from stream2sentence import generate_sentences 2 | 3 | def generator(): 4 | yield """No, the way it "cuts midway" is NOT like the audio is cut abruptly (like when you pause a video). You can check below the audio (sorry for not doing that earlier)""" 5 | # yield "Hallo, " 6 | # yield "wie geht es dir? " 7 | # yield "Mir geht es gut." 
8 | # expected = ["Hallo,", "wie geht es dir?", "Mir geht es gut."] 9 | sentences = list(generate_sentences(generator(), minimum_sentence_length = 3, context_size=5, minimum_first_fragment_length = 3, quick_yield_single_sentence_fragment=True, debug=True)) 10 | print(sentences) -------------------------------------------------------------------------------- /tests/test_data/1.txt: -------------------------------------------------------------------------------- 1 | Here!@#0.0000 2 | 's!@#0.1226 3 | an!@#0.2501 4 | article!@#0.3877 5 | on!@#0.5165 6 | contemporary!@#0.6441 7 | sculptures!@#0.7736 8 | : 9 | 10 | !@#0.9468 11 | **!@#1.1154 12 | The!@#1.2851 13 | Vibr!@#1.4344 14 | ant!@#1.5733 15 | World!@#1.7188 16 | of!@#1.8563 17 | Contemporary!@#1.9919 18 | Sculpt!@#2.1275 19 | ure!@#2.2698 20 | ** 21 | 22 | !@#2.4197 23 | Cont!@#2.5498 24 | emporary!@#2.6798 25 | sculpture!@#2.8095 26 | has!@#2.9456 27 | evolved!@#3.0844 28 | to!@#3.2220 29 | become!@#3.3618 30 | a!@#3.5000 31 | diverse!@#3.6489 32 | and!@#3.7822 33 | exciting!@#3.9078 34 | field!@#4.0370 35 | ,!@#4.1796 36 | pushing!@#4.3138 37 | the!@#4.4497 38 | boundaries!@#4.5878 39 | of!@#4.7236 40 | what!@#4.8613 41 | art!@#5.0025 42 | can!@#5.1396 43 | be!@#5.2834 44 | .!@#5.4285 45 | From!@#5.5672 46 | the!@#5.7131 47 | abstract!@#5.8658 48 | expression!@#5.9940 49 | ist!@#6.1250 50 | sculptures!@#6.2582 51 | of!@#6.3902 52 | the!@#6.5205 53 | !@#6.6523 54 | 195!@#6.7829 55 | 0!@#6.9109 56 | s!@#7.0437 57 | to!@#7.1729 58 | the!@#7.3029 59 | cutting!@#7.4321 60 | -edge!@#7.5642 61 | installations!@#7.7011 62 | of!@#7.8383 63 | today!@#7.9776 64 | ,!@#8.1163 65 | contemporary!@#8.2550 66 | sculpture!@#8.3987 67 | offers!@#8.5447 68 | a!@#8.6847 69 | wide!@#8.8347 70 | range!@#8.9662 71 | of!@#9.0988 72 | styles!@#9.2286 73 | ,!@#9.3615 74 | materials!@#9.4918 75 | ,!@#9.6230 76 | and!@#9.7524 77 | ideas!@#9.8841 78 | . 79 | 80 | !@#10.0167 81 | **!@#10.1456 82 | Key!@#10.2788 83 | Trends!@#10.4097 84 | in!@#10.5402 85 | Contemporary!@#10.6720 86 | Sculpt!@#10.8059 87 | ure!@#10.9369 88 | ** 89 | 90 | !@#11.0699 91 | Some!@#11.2104 92 | of!@#11.3505 93 | the!@#11.4903 94 | most!@#11.6298 95 | notable!@#11.7702 96 | trends!@#11.9045 97 | in!@#12.0441 98 | contemporary!@#12.1864 99 | sculpture!@#12.3252 100 | include!@#12.4633 101 | : 102 | 103 | !@#12.6048 104 | *!@#12.7449 105 | **!@#12.8872 106 | Abstract!@#13.0307 107 | Sculpt!@#13.1650 108 | ure!@#13.3063 109 | **:!@#13.4519 110 | Abstract!@#13.5971 111 | sculpture!@#13.7428 112 | emphasizes!@#13.8939 113 | the!@#14.0357 114 | idea!@#14.1810 115 | or!@#14.3167 116 | concept!@#14.4647 117 | behind!@#14.6148 118 | a!@#14.7596 119 | work!@#14.9109 120 | ,!@#15.0604 121 | often!@#15.2102 122 | over!@#15.3408 123 | its!@#15.4755 124 | physical!@#15.6045 125 | form!@#15.7351 126 | . 127 | !@#15.8680 128 | *!@#16.0013 129 | **!@#16.1327 130 | Environmental!@#16.2669 131 | Sculpt!@#16.3957 132 | ure!@#16.5278 133 | **:!@#16.6626 134 | Environmental!@#16.7933 135 | sculpture!@#16.9237 136 | explores!@#17.0570 137 | the!@#17.1869 138 | relationship!@#17.3189 139 | between!@#17.4513 140 | art!@#17.5833 141 | and!@#17.7155 142 | the!@#17.8465 143 | natural!@#17.9796 144 | world!@#18.1110 145 | . 
146 | !@#18.2436 147 | *!@#18.3752 148 | **!@#18.5068 149 | Performance!@#18.6484 150 | Sculpt!@#18.7892 151 | ure!@#18.9265 152 | **:!@#19.0698 153 | Performance!@#19.2081 154 | sculpture!@#19.3479 155 | involves!@#19.4869 156 | the!@#19.6276 157 | artist!@#19.7690 158 | 's!@#19.9091 159 | body!@#20.0474 160 | and!@#20.1885 161 | the!@#20.3252 162 | viewer!@#20.4667 163 | 's!@#20.6114 164 | experience!@#20.7561 165 | ,!@#20.8950 166 | creating!@#21.0312 167 | a!@#21.1801 168 | new!@#21.3251 169 | kind!@#21.4727 170 | of!@#21.6204 171 | dialogue!@#21.7732 172 | between!@#21.9129 173 | artist!@#22.0510 174 | and!@#22.1929 175 | audience!@#22.3374 176 | . 177 | !@#22.4842 178 | *!@#22.6363 179 | **!@#22.7843 180 | Digital!@#22.9376 181 | Sculpt!@#23.0920 182 | ure!@#23.2494 183 | **:!@#23.3886 184 | Digital!@#23.5309 185 | sculpture!@#23.6705 186 | uses!@#23.8126 187 | new!@#23.9498 188 | technologies!@#24.0893 189 | and!@#24.2271 190 | materials!@#24.3666 191 | to!@#24.5049 192 | create!@#24.6488 193 | innovative!@#24.7860 194 | and!@#24.9320 195 | interactive!@#25.0819 196 | works!@#25.2293 197 | . 198 | 199 | !@#25.3684 200 | **!@#25.5160 201 | Not!@#25.6669 202 | able!@#25.8172 203 | Contemporary!@#25.9579 204 | Sculpt!@#26.1009 205 | ors!@#26.2416 206 | ** 207 | 208 | !@#26.3834 209 | Some!@#26.5225 210 | notable!@#26.6635 211 | contemporary!@#26.8027 212 | sculpt!@#26.9417 213 | ors!@#27.0801 214 | include!@#27.2189 215 | : 216 | 217 | !@#27.3561 218 | *!@#27.4985 219 | **!@#27.6449 220 | An!@#27.7858 221 | ish!@#27.9271 222 | Kapoor!@#28.0725 223 | **:!@#28.2208 224 | A!@#28.3691 225 | British!@#28.5163 226 | sculpt!@#28.6589 227 | or!@#28.7991 228 | known!@#28.9478 229 | for!@#29.0967 230 | his!@#29.2488 231 | large!@#29.4031 232 | -scale!@#29.5505 233 | ,!@#29.6869 234 | site!@#29.8201 235 | -specific!@#29.9524 236 | works!@#30.0846 237 | that!@#30.2212 238 | explore!@#30.3509 239 | the!@#30.4849 240 | relationship!@#30.6181 241 | between!@#30.7513 242 | art!@#30.8848 243 | and!@#31.0175 244 | the!@#31.1494 245 | natural!@#31.2831 246 | world!@#31.4145 247 | . 248 | !@#31.5461 249 | *!@#31.6766 250 | **!@#31.8114 251 | Richard!@#31.9529 252 | S!@#32.0948 253 | erra!@#32.2339 254 | **:!@#32.3738 255 | An!@#32.5118 256 | American!@#32.6533 257 | sculpt!@#32.7928 258 | or!@#32.9342 259 | who!@#33.0802 260 | has!@#33.2155 261 | created!@#33.3658 262 | many!@#33.5115 263 | of!@#33.6634 264 | the!@#33.8030 265 | most!@#33.9475 266 | iconic!@#34.0965 267 | public!@#34.2502 268 | sculptures!@#34.4086 269 | of!@#34.5414 270 | the!@#34.6778 271 | past!@#34.8118 272 | century!@#34.9442 273 | ,!@#35.0773 274 | including!@#35.2099 275 | the!@#35.3420 276 | "!@#35.4783 277 | T!@#35.6104 278 | ilt!@#35.7445 279 | ed!@#35.8798 280 | Arc!@#36.0139 281 | "!@#36.1462 282 | in!@#36.2803 283 | New!@#36.4118 284 | York!@#36.5461 285 | City!@#36.6782 286 | . 
287 | !@#36.8135 288 | *!@#36.9487 289 | **!@#37.0810 290 | Y!@#37.2250 291 | ay!@#37.3674 292 | oi!@#37.5118 293 | K!@#37.6547 294 | us!@#37.7991 295 | ama!@#37.9404 296 | **:!@#38.0820 297 | A!@#38.2237 298 | Japanese!@#38.3635 299 | sculpt!@#38.5041 300 | or!@#38.6472 301 | known!@#38.7884 302 | for!@#38.9278 303 | her!@#39.0739 304 | immersive!@#39.2198 305 | installations!@#39.3672 306 | and!@#39.5015 307 | sculptures!@#39.6478 308 | that!@#39.7983 309 | explore!@#39.9441 310 | themes!@#40.0941 311 | of!@#40.2445 312 | infinity!@#40.3960 313 | and!@#40.5367 314 | self!@#40.6840 315 | -!@#40.8238 316 | obl!@#40.9735 317 | iteration!@#41.1250 318 | . 319 | !@#41.2744 320 | *!@#41.4277 321 | **!@#41.5806 322 | C!@#41.7402 323 | ai!@#41.8775 324 | Gu!@#42.0240 325 | o!@#42.1668 326 | -Q!@#42.3061 327 | iang!@#42.4487 328 | **:!@#42.5885 329 | A!@#42.7288 330 | Chinese!@#42.8691 331 | sculpt!@#43.0131 332 | or!@#43.1547 333 | who!@#43.2946 334 | has!@#43.4403 335 | created!@#43.5861 336 | many!@#43.7251 337 | innovative!@#43.8729 338 | and!@#44.0209 339 | interactive!@#44.1735 340 | works!@#44.3239 341 | using!@#44.4713 342 | fire!@#44.6138 343 | ,!@#44.7615 344 | oil!@#44.9123 345 | ,!@#45.0642 346 | and!@#45.2173 347 | other!@#45.3722 348 | materials!@#45.5095 349 | . 350 | 351 | !@#45.6437 352 | **!@#45.7766 353 | M!@#45.9128 354 | use!@#46.0471 355 | ums!@#46.1823 356 | and!@#46.3172 357 | Gall!@#46.4608 358 | eries!@#46.6032 359 | ** 360 | 361 | !@#46.7450 362 | Several!@#46.8844 363 | museums!@#47.0281 364 | and!@#47.1705 365 | galleries!@#47.3114 366 | showcase!@#47.4544 367 | contemporary!@#47.5988 368 | sculpture!@#47.7454 369 | from!@#47.8902 370 | around!@#48.0350 371 | the!@#48.1867 372 | world!@#48.3355 373 | .!@#48.4871 374 | Some!@#48.6313 375 | notable!@#48.7799 376 | institutions!@#48.9345 377 | include!@#49.0863 378 | : 379 | 380 | !@#49.2394 381 | *!@#49.3938 382 | **!@#49.5343 383 | The!@#49.6766 384 | Museum!@#49.8181 385 | of!@#49.9603 386 | Modern!@#50.1021 387 | Art!@#50.2454 388 | (!@#50.3880 389 | Mo!@#50.5310 390 | MA!@#50.6792 391 | )!@#50.8176 392 | -!@#50.9679 393 | New!@#51.1190 394 | York!@#51.2691 395 | City!@#51.4234 396 | ,!@#51.5667 397 | USA!@#51.7136 398 | ** 399 | !@#51.8644 400 | *!@#52.0164 401 | **!@#52.1699 402 | The!@#52.3167 403 | Tate!@#52.4514 404 | Modern!@#52.5861 405 | -!@#52.7210 406 | London!@#52.8571 407 | ,!@#52.9923 408 | UK!@#53.1277 409 | ** 410 | !@#53.2647 411 | *!@#53.4011 412 | **!@#53.5373 413 | The!@#53.6710 414 | Whitney!@#53.8092 415 | Museum!@#53.9437 416 | of!@#54.0809 417 | American!@#54.2188 418 | Art!@#54.3516 419 | -!@#54.4858 420 | New!@#54.6227 421 | York!@#54.7565 422 | City!@#54.8910 423 | ,!@#55.0285 424 | USA!@#55.1626 425 | ** 426 | !@#55.2999 427 | *!@#55.4347 428 | **!@#55.5705 429 | The!@#55.7071 430 | G!@#55.8412 431 | ugg!@#55.9770 432 | enheim!@#56.1141 433 | Museum!@#56.2491 434 | -!@#56.3840 435 | New!@#56.5201 436 | York!@#56.6551 437 | City!@#56.7900 438 | ,!@#56.9246 439 | USA!@#57.0604 440 | ** 441 | !@#57.1959 442 | *!@#57.3315 443 | **!@#57.4712 444 | The!@#57.6027 445 | Centre!@#57.7391 446 | Pom!@#57.8743 447 | pid!@#58.0085 448 | ou!@#58.1437 449 | -!@#58.2805 450 | Paris!@#58.4164 451 | ,!@#58.5513 452 | France!@#58.6879 453 | ** 454 | 455 | !@#58.8225 456 | **!@#58.9584 457 | Outdoor!@#59.0925 458 | Sculpt!@#59.2278 459 | ure!@#59.3642 460 | ** 461 | 462 | !@#59.5006 463 | Many!@#59.6365 464 | contemporary!@#59.7759 465 | sculpt!@#59.9112 466 | ors!@#60.0488 467 | have!@#60.1859 468 | 
also!@#60.3198 469 | been!@#60.4530 470 | exploring!@#60.5910 471 | the!@#60.7262 472 | relationship!@#60.8612 473 | between!@#60.9950 474 | art!@#61.1309 475 | and!@#61.2667 476 | the!@#61.4034 477 | natural!@#61.5384 478 | world!@#61.6753 479 | ,!@#61.8115 480 | creating!@#61.9472 481 | large!@#62.0843 482 | -scale!@#62.2215 483 | outdoor!@#62.3586 484 | sculptures!@#62.4908 485 | that!@#62.6285 486 | interact!@#62.7639 487 | with!@#62.9010 488 | their!@#63.0404 489 | surroundings!@#63.1746 490 | .!@#63.3122 491 | Some!@#63.4490 492 | notable!@#63.5847 493 | examples!@#63.7260 494 | include!@#63.8695 495 | : 496 | 497 | !@#64.0111 498 | *!@#64.1550 499 | **!@#64.2985 500 | An!@#64.4417 501 | ish!@#64.5855 502 | Kapoor!@#64.7323 503 | 's!@#64.8747 504 | "!@#65.0252 505 | Cloud!@#65.1754 506 | Gate!@#65.3307 507 | "!@#65.4791 508 | (!@#65.6280 509 | Chicago!@#65.7825 510 | ,!@#65.9369 511 | USA!@#66.0865 512 | )**!@#66.2242 513 | :!@#66.3616 514 | A!@#66.4975 515 | stainless!@#66.6360 516 | steel!@#66.7713 517 | sculpture!@#66.9128 518 | that!@#67.0503 519 | reflects!@#67.1859 520 | the!@#67.3216 521 | sky!@#67.4600 522 | and!@#67.5953 523 | the!@#67.7308 524 | city!@#67.8685 525 | ,!@#68.0034 526 | creating!@#68.1404 527 | a!@#68.2781 528 | dynamic!@#68.4122 529 | and!@#68.5475 530 | interactive!@#68.6851 531 | experience!@#68.8203 532 | . 533 | !@#68.9578 534 | *!@#69.0957 535 | **!@#69.2326 536 | Richard!@#69.3767 537 | S!@#69.5275 538 | erra!@#69.6715 539 | 's!@#69.8214 540 | "!@#69.9642 541 | T!@#70.1109 542 | ilt!@#70.2575 543 | ed!@#70.4032 544 | Arc!@#70.5552 545 | "!@#70.7009 546 | (!@#70.8488 547 | Washington!@#70.9965 548 | ,!@#71.1406 549 | D!@#71.2898 550 | .C!@#71.4363 551 | .,!@#71.5848 552 | USA!@#71.7333 553 | )**!@#71.8761 554 | :!@#72.0234 555 | A!@#72.1714 556 | large!@#72.3172 557 | -scale!@#72.4626 558 | steel!@#72.6113 559 | sculpture!@#72.7569 560 | that!@#72.9050 561 | explores!@#73.0512 562 | the!@#73.1978 563 | relationship!@#73.3465 564 | between!@#73.4904 565 | art!@#73.6389 566 | and!@#73.7908 567 | the!@#73.9402 568 | natural!@#74.0907 569 | world!@#74.2410 570 | . 571 | !@#74.3891 572 | *!@#74.5328 573 | **!@#74.6820 574 | Y!@#74.8354 575 | ay!@#74.9866 576 | oi!@#75.1384 577 | K!@#75.2882 578 | us!@#75.4411 579 | ama!@#75.5917 580 | 's!@#75.7447 581 | "!@#75.8970 582 | Infinity!@#76.0525 583 | Room!@#76.2165 584 | "!@#76.3657 585 | (!@#76.5154 586 | Tok!@#76.6622 587 | yo!@#76.8155 588 | ,!@#76.9586 589 | Japan!@#77.1104 590 | )**!@#77.2614 591 | :!@#77.4148 592 | An!@#77.5702 593 | immersive!@#77.7253 594 | installation!@#77.8788 595 | that!@#78.0346 596 | creates!@#78.1909 597 | a!@#78.3488 598 | sense!@#78.5043 599 | of!@#78.6621 600 | infinity!@#78.8225 601 | and!@#78.9936 602 | self!@#79.1372 603 | -!@#79.2866 604 | obl!@#79.4350 605 | iteration!@#79.5810 606 | . 
607 | 608 | !@#79.7277 609 | **!@#79.8714 610 | Conclusion!@#80.0149 611 | ** 612 | 613 | !@#80.1605 614 | Cont!@#80.3064 615 | emporary!@#80.4560 616 | sculpture!@#80.6023 617 | is!@#80.7513 618 | a!@#80.9028 619 | vibrant!@#81.0455 620 | and!@#81.1920 621 | diverse!@#81.3430 622 | field!@#81.4963 623 | ,!@#81.6491 624 | pushing!@#81.8071 625 | the!@#81.9552 626 | boundaries!@#82.1056 627 | of!@#82.2579 628 | what!@#82.4122 629 | art!@#82.5736 630 | can!@#82.7287 631 | be!@#82.8881 632 | .!@#83.0347 633 | From!@#83.1812 634 | abstract!@#83.3246 635 | expression!@#83.4747 636 | ist!@#83.6213 637 | to!@#83.7678 638 | environmental!@#83.9099 639 | ,!@#84.0592 640 | performance!@#84.2091 641 | ,!@#84.3536 642 | and!@#84.5056 643 | digital!@#84.6620 644 | sculpture!@#84.8152 645 | !@#84.8169 646 | -------------------------------------------------------------------------------- /tests/test_data/2.txt: -------------------------------------------------------------------------------- 1 | Here!@#0.0000 2 | 's!@#0.1291 3 | an!@#0.2670 4 | article!@#0.4081 5 | on!@#0.5451 6 | the!@#0.7283 7 | use!@#0.9064 8 | of!@#1.0996 9 | food!@#1.2352 10 | in!@#1.3745 11 | contemporary!@#1.5210 12 | art!@#1.6654 13 | : 14 | 15 | !@#1.8154 16 | **!@#1.9700 17 | Food!@#2.1085 18 | for!@#2.2481 19 | Thought!@#2.3884 20 | :!@#2.5320 21 | The!@#2.6773 22 | Rise!@#2.8268 23 | of!@#2.9765 24 | Ed!@#3.1370 25 | ible!@#3.2736 26 | Art!@#3.4119 27 | ** 28 | 29 | !@#3.5607 30 | Food!@#3.7067 31 | has!@#3.8558 32 | long!@#4.0117 33 | been!@#4.1498 34 | a!@#4.2885 35 | vital!@#4.4289 36 | part!@#4.5674 37 | of!@#4.7120 38 | human!@#4.8603 39 | culture!@#5.0081 40 | ,!@#5.1609 41 | but!@#5.3157 42 | in!@#5.4572 43 | contemporary!@#5.5956 44 | art!@#5.7346 45 | ,!@#5.8760 46 | it!@#6.0168 47 | 's!@#6.1559 48 | taking!@#6.2948 49 | center!@#6.4425 50 | stage!@#6.5885 51 | .!@#6.7329 52 | Ed!@#6.8783 53 | ible!@#7.0251 54 | art!@#7.1726 55 | ,!@#7.3175 56 | also!@#7.4698 57 | known!@#7.6227 58 | as!@#7.7795 59 | food!@#7.9321 60 | art!@#8.0834 61 | or!@#8.2418 62 | food!@#8.3976 63 | sculpture!@#8.5353 64 | ,!@#8.6745 65 | has!@#8.8140 66 | become!@#8.9518 67 | a!@#9.0910 68 | popular!@#9.2312 69 | medium!@#9.3706 70 | for!@#9.5106 71 | artists!@#9.6507 72 | to!@#9.7900 73 | express!@#9.9323 74 | themselves!@#10.0706 75 | .!@#10.2095 76 | From!@#10.3509 77 | delicate!@#10.4895 78 | past!@#10.6341 79 | ries!@#10.7826 80 | to!@#10.9294 81 | massive!@#11.0830 82 | installations!@#11.2403 83 | ,!@#11.3818 84 | food!@#11.5188 85 | is!@#11.6601 86 | being!@#11.7985 87 | used!@#11.9401 88 | in!@#12.0791 89 | innovative!@#12.2162 90 | and!@#12.3541 91 | thought!@#12.4953 92 | -pro!@#12.6354 93 | v!@#12.7769 94 | oking!@#12.9180 95 | ways!@#13.0643 96 | . 
97 | 98 | !@#13.2131 99 | **!@#13.3609 100 | Why!@#13.5054 101 | Food!@#13.6549 102 | in!@#13.8035 103 | Art!@#13.9531 104 | ?!@#14.1019 105 | ** 106 | 107 | !@#14.2548 108 | So!@#14.4013 109 | ,!@#14.5550 110 | why!@#14.7113 111 | are!@#14.8698 112 | artists!@#15.0281 113 | turning!@#15.1746 114 | to!@#15.3307 115 | food!@#15.4889 116 | as!@#15.6485 117 | a!@#15.8132 118 | medium!@#15.9617 119 | ?!@#16.1114 120 | One!@#16.2568 121 | reason!@#16.4044 122 | is!@#16.5538 123 | that!@#16.7010 124 | food!@#16.8499 125 | is!@#16.9999 126 | a!@#17.1485 127 | universal!@#17.3011 128 | language!@#17.4569 129 | ,!@#17.6079 130 | understood!@#17.7599 131 | by!@#17.9183 132 | everyone!@#18.0792 133 | .!@#18.2383 134 | It!@#18.3840 135 | 's!@#18.5319 136 | also!@#18.6825 137 | a!@#18.8407 138 | very!@#18.9869 139 | tactile!@#19.1376 140 | medium!@#19.2814 141 | ,!@#19.4333 142 | inviting!@#19.5854 143 | the!@#19.7420 144 | viewer!@#19.8919 145 | to!@#20.0541 146 | touch!@#20.1965 147 | and!@#20.3393 148 | explore!@#20.4800 149 | .!@#20.6195 150 | Additionally!@#20.7637 151 | ,!@#20.9042 152 | food!@#21.0461 153 | is!@#21.1952 154 | a!@#21.3444 155 | fleeting!@#21.4926 156 | and!@#21.6390 157 | ephem!@#21.7863 158 | eral!@#21.9357 159 | medium!@#22.0825 160 | ,!@#22.2302 161 | which!@#22.3859 162 | can!@#22.5435 163 | be!@#22.6963 164 | both!@#22.8544 165 | beautiful!@#23.0150 166 | and!@#23.1631 167 | imper!@#23.3053 168 | manent!@#23.4459 169 | . 170 | 171 | !@#23.5868 172 | **!@#23.7269 173 | Not!@#23.8668 174 | able!@#24.0074 175 | Examples!@#24.1482 176 | of!@#24.3003 177 | Ed!@#24.4497 178 | ible!@#24.5968 179 | Art!@#24.7459 180 | ** 181 | 182 | !@#24.8906 183 | Some!@#25.0386 184 | notable!@#25.1896 185 | examples!@#25.3372 186 | of!@#25.4905 187 | edible!@#25.6501 188 | art!@#25.8008 189 | include!@#25.9580 190 | : 191 | 192 | !@#26.1173 193 | *!@#26.2648 194 | **!@#26.4085 195 | Mar!@#26.5489 196 | ina!@#26.6924 197 | Abram!@#26.8362 198 | ovic!@#26.9776 199 | 's!@#27.1176 200 | "!@#27.2601 201 | The!@#27.4034 202 | Artist!@#27.5470 203 | is!@#27.6862 204 | Present!@#27.8277 205 | "!@#27.9694 206 | **:!@#28.1144 207 | A!@#28.2557 208 | performance!@#28.3975 209 | art!@#28.5364 210 | piece!@#28.6790 211 | where!@#28.8231 212 | Abram!@#28.9639 213 | ovic!@#29.1056 214 | sat!@#29.2488 215 | silently!@#29.3917 216 | for!@#29.5339 217 | !@#29.6784 218 | 736!@#29.8212 219 | hours!@#29.9615 220 | ,!@#30.1045 221 | with!@#30.2472 222 | visitors!@#30.3897 223 | seated!@#30.5325 224 | across!@#30.6736 225 | from!@#30.8151 226 | her!@#30.9557 227 | .!@#31.0981 228 | She!@#31.2409 229 | offered!@#31.3845 230 | them!@#31.5315 231 | a!@#31.6804 232 | cup!@#31.8266 233 | of!@#31.9751 234 | tea!@#32.1220 235 | ,!@#32.2683 236 | a!@#32.4190 237 | gesture!@#32.5714 238 | that!@#32.7222 239 | symbol!@#32.8796 240 | ized!@#33.0387 241 | the!@#33.1913 242 | connection!@#33.3490 243 | between!@#33.5093 244 | the!@#33.6725 245 | artist!@#33.8133 246 | and!@#33.9548 247 | the!@#34.0954 248 | viewer!@#34.2381 249 | . 
250 | !@#34.3812 251 | *!@#34.5229 252 | **!@#34.6660 253 | Y!@#34.8181 254 | ay!@#34.9694 255 | oi!@#35.1210 256 | K!@#35.2743 257 | us!@#35.4256 258 | ama!@#35.5779 259 | 's!@#35.7264 260 | "!@#35.8770 261 | Infinity!@#36.0282 262 | Room!@#36.1780 263 | "!@#36.3331 264 | **:!@#36.4841 265 | An!@#36.6342 266 | immersive!@#36.7865 267 | installation!@#36.9418 268 | that!@#37.0985 269 | features!@#37.2495 270 | a!@#37.3991 271 | room!@#37.5544 272 | filled!@#37.7117 273 | with!@#37.8663 274 | twink!@#38.0245 275 | ling!@#38.1842 276 | lights!@#38.3532 277 | and!@#38.5019 278 | a!@#38.6551 279 | mirror!@#38.8048 280 | ball!@#38.9632 281 | .!@#39.1201 282 | The!@#39.2852 283 | artist!@#39.4421 284 | has!@#39.6077 285 | also!@#39.7689 286 | created!@#39.9419 287 | edible!@#40.0867 288 | versions!@#40.2398 289 | of!@#40.3912 290 | this!@#40.5397 291 | installation!@#40.6887 292 | ,!@#40.8387 293 | using!@#40.9884 294 | food!@#41.1396 295 | to!@#41.2895 296 | create!@#41.4452 297 | a!@#41.6017 298 | sense!@#41.7629 299 | of!@#41.9177 300 | infinity!@#42.0786 301 | and!@#42.2391 302 | self!@#42.4013 303 | -!@#42.5431 304 | obl!@#42.6866 305 | iteration!@#42.8322 306 | . 307 | !@#42.9739 308 | *!@#43.1183 309 | **!@#43.2621 310 | El!@#43.4126 311 | An!@#43.5652 312 | ats!@#43.7149 313 | ui!@#43.8655 314 | 's!@#44.0153 315 | "!@#44.1675 316 | G!@#44.3176 317 | hana!@#44.4743 318 | ian!@#44.6219 319 | -G!@#44.7759 320 | erman!@#44.9327 321 | Sculpt!@#45.0920 322 | ure!@#45.2476 323 | "!@#45.3993 324 | **:!@#45.5616 325 | A!@#45.7217 326 | series!@#45.8843 327 | of!@#46.0410 328 | sculptures!@#46.1819 329 | made!@#46.3253 330 | from!@#46.4687 331 | discarded!@#46.6178 332 | materials!@#46.7589 333 | ,!@#46.9021 334 | including!@#47.0465 335 | bottle!@#47.1914 336 | caps!@#47.3350 337 | and!@#47.4791 338 | cans!@#47.6227 339 | .!@#47.7633 340 | The!@#47.9053 341 | artist!@#48.0505 342 | 's!@#48.2011 343 | use!@#48.3541 344 | of!@#48.5045 345 | food!@#48.6546 346 | waste!@#48.8051 347 | is!@#48.9562 348 | a!@#49.1074 349 | commentary!@#49.2596 350 | on!@#49.4145 351 | the!@#49.5682 352 | environment!@#49.7237 353 | and!@#49.8818 354 | the!@#50.0439 355 | value!@#50.2005 356 | of!@#50.3537 357 | waste!@#50.5167 358 | . 359 | 360 | !@#50.6776 361 | **!@#50.8431 362 | Food!@#51.0024 363 | as!@#51.1508 364 | a!@#51.3018 365 | Medium!@#51.4527 366 | ** 367 | 368 | !@#51.6042 369 | Food!@#51.7528 370 | can!@#51.9073 371 | be!@#52.0589 372 | used!@#52.2117 373 | in!@#52.3629 374 | a!@#52.5182 375 | variety!@#52.6658 376 | of!@#52.8260 377 | ways!@#52.9852 378 | in!@#53.1516 379 | art!@#53.3087 380 | ,!@#53.4581 381 | from!@#53.6210 382 | : 383 | 384 | !@#53.7843 385 | *!@#53.9525 386 | **!@#54.0992 387 | Food!@#54.2442 388 | sculpture!@#54.3892 389 | **:!@#54.5346 390 | Creating!@#54.6758 391 | intricate!@#54.8211 392 | and!@#54.9646 393 | beautiful!@#55.1096 394 | designs!@#55.2548 395 | using!@#55.3990 396 | food!@#55.5420 397 | ,!@#55.6861 398 | such!@#55.8309 399 | as!@#55.9743 400 | pastry!@#56.1183 401 | or!@#56.2625 402 | chocolate!@#56.4080 403 | . 
404 | !@#56.5511 405 | *!@#56.6956 406 | **!@#56.8412 407 | Food!@#56.9910 408 | installation!@#57.1421 409 | **:!@#57.2913 410 | Creating!@#57.4411 411 | large!@#57.5916 412 | -scale!@#57.7403 413 | installations!@#57.8966 414 | using!@#58.0448 415 | food!@#58.2051 416 | ,!@#58.3682 417 | such!@#58.5263 418 | as!@#58.6791 419 | a!@#58.8415 420 | massive!@#59.0074 421 | pile!@#59.1542 422 | of!@#59.2994 423 | fruit!@#59.4417 424 | or!@#59.5860 425 | a!@#59.7293 426 | giant!@#59.8727 427 | pastry!@#60.0159 428 | sculpture!@#60.1591 429 | . 430 | !@#60.3016 431 | *!@#60.4484 432 | **!@#60.5922 433 | Food!@#60.7452 434 | performance!@#60.8976 435 | **:!@#61.0511 436 | Using!@#61.2043 437 | food!@#61.3543 438 | as!@#61.5060 439 | a!@#61.6588 440 | medium!@#61.8100 441 | for!@#61.9650 442 | performance!@#62.1149 443 | art!@#62.2721 444 | ,!@#62.4291 445 | such!@#62.5763 446 | as!@#62.7383 447 | cooking!@#62.8969 448 | or!@#63.0562 449 | serving!@#63.2208 450 | meals!@#63.3757 451 | . 452 | !@#63.5320 453 | *!@#63.6891 454 | **!@#63.8514 455 | Food!@#64.0166 456 | photography!@#64.1791 457 | **:!@#64.3517 458 | Creating!@#64.5040 459 | photographs!@#64.6597 460 | that!@#64.8120 461 | showcase!@#64.9637 462 | food!@#65.1135 463 | as!@#65.2661 464 | a!@#65.4182 465 | medium!@#65.5717 466 | ,!@#65.7234 467 | such!@#65.8785 468 | as!@#66.0364 469 | still!@#66.1860 470 | -life!@#66.3477 471 | compositions!@#66.5084 472 | or!@#66.6728 473 | food!@#66.8267 474 | portraits!@#66.9800 475 | . 476 | 477 | !@#67.1441 478 | **!@#67.3057 479 | M!@#67.4717 480 | use!@#67.6287 481 | ums!@#67.7725 482 | and!@#67.9178 483 | Gall!@#68.0756 484 | eries!@#68.2276 485 | ** 486 | 487 | !@#68.3777 488 | Some!@#68.5285 489 | notable!@#68.6801 490 | museums!@#68.8306 491 | and!@#68.9859 492 | galleries!@#69.1395 493 | that!@#69.2955 494 | showcase!@#69.4555 495 | edible!@#69.6214 496 | art!@#69.7755 497 | include!@#69.9316 498 | : 499 | 500 | !@#70.0930 501 | *!@#70.2614 502 | **!@#70.4081 503 | The!@#70.5516 504 | Museum!@#70.6983 505 | of!@#70.8434 506 | Food!@#70.9891 507 | and!@#71.1355 508 | Drink!@#71.2797 509 | (!@#71.4256 510 | New!@#71.5736 511 | York!@#71.7186 512 | City!@#71.8663 513 | ,!@#72.0101 514 | USA!@#72.1561 515 | )!@#72.3011 516 | ** 517 | !@#72.4482 518 | *!@#72.5926 519 | **!@#72.7376 520 | The!@#72.8833 521 | National!@#73.0258 522 | Museum!@#73.1715 523 | of!@#73.3159 524 | Food!@#73.4595 525 | and!@#73.6060 526 | Drink!@#73.7515 527 | (!@#73.8944 528 | Washington!@#74.0395 529 | ,!@#74.1854 530 | D!@#74.3309 531 | .C!@#74.4773 532 | .,!@#74.6240 533 | USA!@#74.7709 534 | )!@#74.9159 535 | ** 536 | !@#75.0614 537 | *!@#75.2085 538 | **!@#75.3540 539 | The!@#75.4996 540 | Tate!@#75.6459 541 | Modern!@#75.7929 542 | (!@#75.9378 543 | London!@#76.0865 544 | ,!@#76.2306 545 | UK!@#76.3757 546 | )!@#76.5227 547 | ** 548 | !@#76.6677 549 | *!@#76.8127 550 | **!@#76.9582 551 | The!@#77.0993 552 | G!@#77.2485 553 | ugg!@#77.3940 554 | enheim!@#77.5403 555 | Museum!@#77.6866 556 | (!@#77.8332 557 | New!@#77.9781 558 | York!@#78.1218 559 | City!@#78.2663 560 | ,!@#78.4121 561 | USA!@#78.5587 562 | )**!@#78.7041 563 | 564 | 565 | !@#78.8521 566 | **!@#78.9974 567 | Conclusion!@#79.1419 568 | ** 569 | 570 | !@#79.2861 571 | Food!@#79.4337 572 | is!@#79.5812 573 | a!@#79.7267 574 | vital!@#79.8719 575 | part!@#80.0190 576 | of!@#80.1639 577 | human!@#80.3102 578 | culture!@#80.4574 579 | ,!@#80.6046 580 | and!@#80.7511 581 | in!@#80.8968 582 | contemporary!@#81.0454 583 | art!@#81.1931 584 | ,!@#81.3402 585 | 
it!@#81.4865 586 | 's!@#81.6336 587 | being!@#81.7784 588 | used!@#81.9241 589 | in!@#82.0702 590 | innovative!@#82.2160 591 | and!@#82.3620 592 | thought!@#82.5090 593 | -pro!@#82.6557 594 | v!@#82.8043 595 | oking!@#82.9499 596 | ways!@#83.0955 597 | .!@#83.2447 598 | From!@#83.3903 599 | delicate!@#83.5389 600 | past!@#83.6943 601 | ries!@#83.8487 602 | to!@#84.0024 603 | massive!@#84.1551 604 | installations!@#84.3086 605 | ,!@#84.4645 606 | food!@#84.6168 607 | is!@#84.7742 608 | a!@#84.9309 609 | medium!@#85.0896 610 | that!@#85.2493 611 | invites!@#85.4137 612 | the!@#85.5764 613 | viewer!@#85.7299 614 | to!@#85.8941 615 | touch!@#86.0596 616 | ,!@#86.2285 617 | taste!@#86.3925 618 | ,!@#86.5396 619 | and!@#86.6859 620 | explore!@#86.8322 621 | .!@#86.9794 622 | As!@#87.1264 623 | the!@#87.2733 624 | use!@#87.4292 625 | of!@#87.5839 626 | food!@#87.7418 627 | in!@#87.9005 628 | art!@#88.0579 629 | continues!@#88.2112 630 | to!@#88.3709 631 | evolve!@#88.5299 632 | ,!@#88.6858 633 | we!@#88.8419 634 | can!@#88.9980 635 | expect!@#89.1576 636 | to!@#89.3133 637 | see!@#89.4713 638 | even!@#89.6281 639 | !@#89.6298 640 | -------------------------------------------------------------------------------- /tests/test_data/3.txt: -------------------------------------------------------------------------------- 1 | even!@#0.0000 2 | more!@#0.1390 3 | creative!@#0.2878 4 | and!@#0.4349 5 | delicious!@#0.5823 6 | uses!@#0.7270 7 | of!@#0.9210 8 | food!@#1.1057 9 | in!@#1.2976 10 | art!@#1.4520 11 | .!@#1.6193 12 | Whether!@#1.7652 13 | it!@#1.9202 14 | 's!@#2.0769 15 | a!@#2.2316 16 | delicate!@#2.3931 17 | pastry!@#2.5600 18 | or!@#2.7080 19 | a!@#2.8537 20 | massive!@#3.0008 21 | installation!@#3.1482 22 | ,!@#3.2950 23 | food!@#3.4516 24 | has!@#3.6057 25 | the!@#3.7609 26 | power!@#3.9146 27 | to!@#4.0762 28 | inspire!@#4.2324 29 | and!@#4.3999 30 | provoke!@#4.5483 31 | .!@#4.6986 32 | So!@#4.8471 33 | ,!@#4.9943 34 | the!@#5.1399 35 | next!@#5.2875 36 | time!@#5.4368 37 | you!@#5.5831 38 | see!@#5.7284 39 | a!@#5.8771 40 | piece!@#6.0218 41 | of!@#6.1722 42 | edible!@#6.3298 43 | art!@#6.4825 44 | ,!@#6.6381 45 | remember!@#6.7923 46 | that!@#6.9520 47 | it!@#7.1181 48 | 's!@#7.2798 49 | not!@#7.4412 50 | just!@#7.6106 51 | a!@#7.7572 52 | tasty!@#7.9016 53 | treat!@#8.0501 54 | ,!@#8.1975 55 | but!@#8.3454 56 | also!@#8.4949 57 | a!@#8.6402 58 | work!@#8.7863 59 | of!@#8.9362 60 | art!@#9.0861 61 | in!@#9.2352 62 | its!@#9.3918 63 | own!@#9.5459 64 | right!@#9.6992 65 | .!@#9.8541 66 | !@#10.0080 67 | -------------------------------------------------------------------------------- /tests/test_data/4.txt: -------------------------------------------------------------------------------- 1 | Here!@#0.0000 2 | 's!@#0.1408 3 | an!@#0.2889 4 | article!@#0.4359 5 | on!@#0.5847 6 | the!@#0.7856 7 | relationship!@#0.9715 8 | between!@#1.1708 9 | nudity!@#1.3124 10 | and!@#1.4590 11 | artwork!@#1.6143 12 | : 13 | 14 | !@#1.7709 15 | **!@#1.9235 16 | The!@#2.0877 17 | Naked!@#2.2507 18 | Truth!@#2.4043 19 | :!@#2.5537 20 | Un!@#2.7089 21 | packing!@#2.8667 22 | the!@#3.0278 23 | Art!@#3.1937 24 | of!@#3.3412 25 | N!@#3.4893 26 | ud!@#3.6406 27 | ity!@#3.7889 28 | ** 29 | 30 | !@#3.9361 31 | N!@#4.0838 32 | ud!@#4.2376 33 | ity!@#4.3962 34 | has!@#4.5533 35 | long!@#4.7142 36 | been!@#4.8714 37 | a!@#5.0378 38 | subject!@#5.1840 39 | of!@#5.3335 40 | fascination!@#5.4838 41 | and!@#5.6331 42 | debate!@#5.7819 43 | in!@#5.9291 44 | the!@#6.0869 45 | art!@#6.2417 46 | world!@#6.4000 47 | .!@#6.5548 48 | 
From!@#6.7141 49 | ancient!@#6.8755 50 | Greek!@#7.0406 51 | sculptures!@#7.2023 52 | to!@#7.3748 53 | modern!@#7.5214 54 | -day!@#7.6706 55 | installations!@#7.8200 56 | ,!@#7.9683 57 | the!@#8.1190 58 | human!@#8.2691 59 | body!@#8.4196 60 | has!@#8.5687 61 | been!@#8.7177 62 | a!@#8.8667 63 | source!@#9.0162 64 | of!@#9.1668 65 | inspiration!@#9.3163 66 | for!@#9.4643 67 | artists!@#9.6206 68 | throughout!@#9.7770 69 | history!@#9.9374 70 | .!@#10.1029 71 | But!@#10.2523 72 | what!@#10.4050 73 | is!@#10.5536 74 | it!@#10.7001 75 | about!@#10.8504 76 | nudity!@#11.0002 77 | that!@#11.1484 78 | draws!@#11.2972 79 | artists!@#11.4546 80 | to!@#11.6155 81 | it!@#11.7744 82 | ,!@#11.9300 83 | and!@#12.0867 84 | how!@#12.2451 85 | does!@#12.4043 86 | it!@#12.5610 87 | impact!@#12.7233 88 | the!@#12.8771 89 | viewer!@#13.0407 90 | 's!@#13.2095 91 | experience!@#13.3776 92 | ? 93 | 94 | !@#13.5397 95 | **!@#13.7008 96 | The!@#13.8696 97 | Origins!@#14.0413 98 | of!@#14.2130 99 | N!@#14.3611 100 | ud!@#14.5086 101 | ity!@#14.6597 102 | in!@#14.8114 103 | Art!@#14.9622 104 | ** 105 | 106 | !@#15.1098 107 | N!@#15.2600 108 | ud!@#15.4092 109 | ity!@#15.5579 110 | has!@#15.7108 111 | its!@#15.8609 112 | roots!@#16.0129 113 | in!@#16.1661 114 | ancient!@#16.3134 115 | cultures!@#16.4633 116 | ,!@#16.6140 117 | where!@#16.7615 118 | the!@#16.9140 119 | human!@#17.0659 120 | body!@#17.2142 121 | was!@#17.3629 122 | seen!@#17.5114 123 | as!@#17.6636 124 | a!@#17.8128 125 | symbol!@#17.9627 126 | of!@#18.1123 127 | beauty!@#18.2619 128 | ,!@#18.4221 129 | strength!@#18.5781 130 | ,!@#18.7328 131 | and!@#18.8899 132 | fertility!@#19.0438 133 | .!@#19.2018 134 | In!@#19.3636 135 | ancient!@#19.5186 136 | Greece!@#19.6846 137 | ,!@#19.8552 138 | for!@#20.0145 139 | example!@#20.1835 140 | ,!@#20.3556 141 | the!@#20.5176 142 | nude!@#20.6698 143 | male!@#20.8191 144 | form!@#20.9696 145 | was!@#21.1194 146 | celebrated!@#21.2702 147 | as!@#21.4199 148 | a!@#21.5702 149 | symbol!@#21.7181 150 | of!@#21.8688 151 | masculinity!@#22.0175 152 | and!@#22.1697 153 | athletic!@#22.3188 154 | achievement!@#22.4700 155 | .!@#22.6218 156 | This!@#22.7732 157 | tradition!@#22.9243 158 | continued!@#23.0815 159 | into!@#23.2417 160 | the!@#23.4002 161 | Renaissance!@#23.5571 162 | ,!@#23.7149 163 | where!@#23.8748 164 | artists!@#24.0335 165 | such!@#24.1935 166 | as!@#24.3509 167 | Michel!@#24.5153 168 | angelo!@#24.6706 169 | and!@#24.8347 170 | Leonardo!@#25.0019 171 | da!@#25.1717 172 | Vinci!@#25.3367 173 | created!@#25.4958 174 | iconic!@#25.6630 175 | works!@#25.8319 176 | of!@#26.0051 177 | art!@#26.1823 178 | featuring!@#26.3368 179 | the!@#26.4871 180 | human!@#26.6399 181 | form!@#26.7955 182 | in!@#26.9434 183 | all!@#27.0951 184 | its!@#27.2481 185 | glory!@#27.3992 186 | . 
187 | 188 | !@#27.5505 189 | **!@#27.7015 190 | The!@#27.8528 191 | Art!@#28.0033 192 | of!@#28.1633 193 | the!@#28.3239 194 | Nude!@#28.4811 195 | :!@#28.6403 196 | A!@#28.8018 197 | Symbol!@#28.9601 198 | of!@#29.1199 199 | Truth!@#29.2865 200 | ** 201 | 202 | !@#29.4444 203 | So!@#29.6128 204 | ,!@#29.7821 205 | why!@#29.9576 206 | do!@#30.1163 207 | artists!@#30.2842 208 | choose!@#30.4572 209 | to!@#30.6282 210 | depict!@#30.7791 211 | the!@#30.9289 212 | human!@#31.0806 213 | form!@#31.2320 214 | in!@#31.3819 215 | the!@#31.5310 216 | nude!@#31.6836 217 | ?!@#31.8333 218 | One!@#31.9836 219 | reason!@#32.1351 220 | is!@#32.2919 221 | that!@#32.4529 222 | the!@#32.6130 223 | nude!@#32.7758 224 | body!@#32.9301 225 | is!@#33.0897 226 | a!@#33.2471 227 | symbol!@#33.4069 228 | of!@#33.5670 229 | truth!@#33.7263 230 | and!@#33.8901 231 | authenticity!@#34.0553 232 | .!@#34.2136 233 | When!@#34.3806 234 | an!@#34.5472 235 | artist!@#34.7211 236 | paints!@#34.8848 237 | or!@#35.0463 238 | sc!@#35.2091 239 | ul!@#35.3803 240 | pts!@#35.5498 241 | the!@#35.7295 242 | nude!@#35.8896 243 | form!@#36.0504 244 | ,!@#36.2074 245 | they!@#36.3663 246 | are!@#36.5241 247 | revealing!@#36.6825 248 | the!@#36.8469 249 | underlying!@#37.0132 250 | structure!@#37.1829 251 | and!@#37.3452 252 | beauty!@#37.5141 253 | of!@#37.6740 254 | the!@#37.8452 255 | human!@#38.0187 256 | body!@#38.1906 257 | ,!@#38.3438 258 | stripped!@#38.4940 259 | of!@#38.6473 260 | the!@#38.8009 261 | distractions!@#38.9534 262 | of!@#39.1246 263 | clothing!@#39.2792 264 | and!@#39.4521 265 | social!@#39.6596 266 | convention!@#39.8562 267 | . 268 | 269 | !@#40.0292 270 | **!@#40.2125 271 | The!@#40.3830 272 | Viewer!@#40.5408 273 | 's!@#40.7037 274 | Experience!@#40.8676 275 | ** 276 | 277 | !@#41.0302 278 | The!@#41.1946 279 | experience!@#41.3573 280 | of!@#41.5265 281 | viewing!@#41.7008 282 | a!@#41.8712 283 | nude!@#42.0470 284 | work!@#42.2306 285 | of!@#42.4021 286 | art!@#42.5775 287 | can!@#42.7657 288 | be!@#42.9419 289 | complex!@#43.1266 290 | and!@#43.3073 291 | multif!@#43.4966 292 | ac!@#43.6851 293 | eted!@#43.8749 294 | .!@#44.0592 295 | On!@#44.2471 296 | one!@#44.4405 297 | hand!@#44.6290 298 | ,!@#44.8193 299 | the!@#45.0012 300 | nude!@#45.1775 301 | body!@#45.3503 302 | can!@#45.5255 303 | evoke!@#45.7044 304 | feelings!@#45.8859 305 | of!@#46.0576 306 | comfort!@#46.2263 307 | and!@#46.3991 308 | familiarity!@#46.5659 309 | ,!@#46.7446 310 | as!@#46.9398 311 | it!@#47.1200 312 | is!@#47.3143 313 | a!@#47.5048 314 | reminder!@#47.6909 315 | of!@#47.8653 316 | our!@#48.0415 317 | own!@#48.2224 318 | physical!@#48.4112 319 | ity!@#48.6000 320 | and!@#48.7674 321 | vulnerability!@#48.9562 322 | .!@#49.1325 323 | On!@#49.3433 324 | the!@#49.5232 325 | other!@#49.7296 326 | hand!@#49.9098 327 | ,!@#50.1050 328 | the!@#50.3006 329 | nude!@#50.4939 330 | body!@#50.7042 331 | can!@#50.8902 332 | also!@#51.1030 333 | be!@#51.2906 334 | a!@#51.4791 335 | source!@#51.6754 336 | of!@#51.8503 337 | discomfort!@#52.0351 338 | and!@#52.2109 339 | self!@#52.9610 340 | -conscious!@#53.1345 341 | ness!@#53.3259 342 | ,!@#53.5074 343 | as!@#53.6989 344 | it!@#53.8765 345 | challenges!@#54.0603 346 | our!@#54.2400 347 | social!@#54.4234 348 | norms!@#54.6132 349 | and!@#54.8218 350 | expectations!@#55.0390 351 | around!@#55.2377 352 | nudity!@#55.3969 353 | . 
354 | 355 | !@#55.5609 356 | **!@#55.7210 357 | Examples!@#55.8811 358 | of!@#56.0452 359 | Not!@#56.2101 360 | able!@#56.3771 361 | N!@#56.5450 362 | udes!@#56.7120 363 | ** 364 | 365 | !@#56.8774 366 | Some!@#57.0447 367 | notable!@#57.2145 368 | examples!@#57.3815 369 | of!@#57.5494 370 | nude!@#57.7329 371 | artwork!@#57.9322 372 | include!@#58.1209 373 | : 374 | 375 | !@#58.3163 376 | *!@#58.5166 377 | **!@#58.7115 378 | Mich!@#58.9083 379 | el!@#59.0998 380 | angelo!@#59.3117 381 | 's!@#59.4892 382 | "!@#59.6862 383 | David!@#59.8816 384 | "!@#60.0637 385 | **:!@#60.2921 386 | A!@#60.4699 387 | iconic!@#60.6621 388 | sculpture!@#60.8525 389 | of!@#61.0525 390 | the!@#61.2401 391 | biblical!@#61.4277 392 | hero!@#61.6294 393 | David!@#61.8199 394 | ,!@#62.0096 395 | which!@#62.1895 396 | has!@#62.3609 397 | become!@#62.5408 398 | one!@#62.7320 399 | of!@#62.9188 400 | the!@#63.1140 401 | most!@#63.3052 402 | famous!@#63.4903 403 | works!@#63.6859 404 | of!@#63.8528 405 | art!@#64.0106 406 | in!@#64.1686 407 | history!@#64.3281 408 | . 409 | !@#64.5123 410 | *!@#64.6932 411 | **!@#64.8782 412 | Leon!@#65.0605 413 | ardo!@#65.2439 414 | da!@#65.4300 415 | Vinci!@#65.6254 416 | 's!@#65.8193 417 | "!@#66.0029 418 | V!@#66.1753 419 | it!@#66.3494 420 | ru!@#66.5277 421 | v!@#66.7018 422 | ian!@#66.8887 423 | Man!@#67.0585 424 | "!@#67.2364 425 | **:!@#67.9768 426 | A!@#68.1536 427 | drawing!@#68.3440 428 | that!@#68.5213 429 | depicts!@#68.7118 430 | the!@#68.9007 431 | human!@#69.0814 432 | form!@#69.2572 433 | ins!@#69.4431 434 | cribed!@#69.6231 435 | within!@#69.8097 436 | a!@#70.0097 437 | circle!@#70.1910 438 | and!@#70.3723 439 | square!@#70.5254 440 | ,!@#70.6826 441 | highlighting!@#70.8347 442 | the!@#70.9875 443 | body!@#71.1439 444 | 's!@#71.3004 445 | relationship!@#71.4530 446 | to!@#71.6079 447 | geometry!@#71.7595 448 | and!@#71.9140 449 | proportion!@#72.0679 450 | . 451 | !@#72.2264 452 | *!@#72.3794 453 | **!@#72.5350 454 | Mar!@#72.6993 455 | ina!@#72.8658 456 | Abram!@#73.0403 457 | ovic!@#73.2097 458 | 's!@#73.3746 459 | "!@#73.5370 460 | The!@#73.7023 461 | Artist!@#73.8649 462 | is!@#74.0287 463 | Present!@#74.1920 464 | "!@#74.3540 465 | **:!@#74.5238 466 | A!@#74.6920 467 | performance!@#74.8583 468 | art!@#75.0164 469 | piece!@#75.1905 470 | in!@#75.3609 471 | which!@#75.5343 472 | Abram!@#75.7150 473 | ovic!@#75.8826 474 | sat!@#76.0501 475 | silently!@#76.2136 476 | for!@#76.3880 477 | !@#76.5567 478 | 736!@#76.7326 479 | hours!@#76.9097 480 | ,!@#77.0839 481 | inviting!@#77.2401 482 | visitors!@#77.3944 483 | to!@#77.5509 484 | sit!@#77.7059 485 | across!@#77.8604 486 | from!@#78.0165 487 | her!@#78.1699 488 | and!@#78.3240 489 | engage!@#78.4775 490 | in!@#78.6323 491 | a!@#78.7868 492 | silent!@#78.9427 493 | conversation!@#79.1002 494 | . 
495 | 496 | !@#79.2566 497 | **!@#79.4117 498 | The!@#79.5675 499 | Future!@#79.7237 500 | of!@#79.8837 501 | N!@#80.0517 502 | ud!@#80.2178 503 | ity!@#80.3852 504 | in!@#80.5491 505 | Art!@#80.7135 506 | ** 507 | 508 | !@#80.8768 509 | As!@#81.0406 510 | we!@#81.2049 511 | move!@#81.3686 512 | forward!@#81.5319 513 | into!@#81.7021 514 | a!@#81.8696 515 | more!@#82.0299 516 | open!@#82.2006 517 | and!@#82.3723 518 | accepting!@#82.5465 519 | society!@#82.7180 520 | ,!@#82.8878 521 | it!@#83.0581 522 | 's!@#83.2342 523 | likely!@#83.4102 524 | that!@#83.5892 525 | we!@#83.7634 526 | 'll!@#83.9191 527 | see!@#84.0749 528 | more!@#84.2324 529 | and!@#84.3873 530 | more!@#84.5441 531 | artists!@#84.7019 532 | exploring!@#84.8600 533 | the!@#85.0151 534 | theme!@#85.1687 535 | of!@#85.3257 536 | nudity!@#85.4796 537 | in!@#85.6365 538 | their!@#85.7903 539 | work!@#85.9453 540 | .!@#86.1022 541 | With!@#86.2564 542 | the!@#86.4121 543 | rise!@#86.5775 544 | of!@#86.7414 545 | social!@#86.9072 546 | media!@#87.0741 547 | and!@#87.2414 548 | the!@#87.4039 549 | increasing!@#87.5683 550 | visibility!@#87.7325 551 | of!@#87.8960 552 | the!@#88.0611 553 | human!@#88.2224 554 | body!@#88.3935 555 | ,!@#88.5628 556 | artists!@#88.7284 557 | are!@#88.8931 558 | finding!@#89.0657 559 | new!@#89.2398 560 | and!@#89.4158 561 | innovative!@#89.5962 562 | ways!@#89.7638 563 | to!@#89.9253 564 | depict!@#90.0993 565 | the!@#90.2757 566 | nude!@#90.4495 567 | form!@#90.6278 568 | . 569 | 570 | !@#90.8158 571 | **!@#90.9735 572 | Conclusion!@#91.1285 573 | ** 574 | 575 | !@#91.2864 576 | N!@#91.4456 577 | ud!@#91.6100 578 | ity!@#91.7739 579 | is!@#91.9381 580 | a!@#92.1040 581 | powerful!@#92.2652 582 | symbol!@#92.4313 583 | in!@#92.5965 584 | the!@#92.7647 585 | world!@#92.9293 586 | of!@#93.0936 587 | art!@#93.2648 588 | ,!@#93.4321 589 | ev!@#93.5966 590 | oking!@#93.7677 591 | feelings!@#93.9396 592 | of!@#94.1158 593 | truth!@#94.2905 594 | ,!@#94.4633 595 | authenticity!@#94.6285 596 | ,!@#94.8053 597 | and!@#94.9809 598 | vulnerability!@#95.1617 599 | .!@#95.3493 600 | Whether!@#95.5071 601 | it!@#95.6623 602 | 's!@#95.8301 603 | through!@#95.9957 604 | sculpture!@#96.1616 605 | ,!@#96.3242 606 | painting!@#96.4884 607 | ,!@#96.6526 608 | or!@#96.8207 609 | performance!@#96.9831 610 | ,!@#97.1512 611 | the!@#97.3169 612 | human!@#97.4838 613 | body!@#97.6578 614 | continues!@#97.8319 615 | to!@#98.0087 616 | inspire!@#98.1755 617 | and!@#98.3441 618 | challenge!@#98.5205 619 | artists!@#98.6975 620 | and!@#98.8913 621 | viewers!@#99.0492 622 | alike!@#99.2067 623 | .!@#99.3658 624 | As!@#99.5284 625 | we!@#99.6863 626 | move!@#99.8507 627 | forward!@#100.0180 628 | into!@#100.1839 629 | a!@#100.3492 630 | more!@#100.5172 631 | open!@#100.6820 632 | and!@#100.8487 633 | !@#100.8504 634 | -------------------------------------------------------------------------------- /tests/test_data/debug.py: -------------------------------------------------------------------------------- 1 | 2 | ''' 3 | Run this script to replay actual tokens on a configuration of your choice 4 | ''' 5 | 6 | from stream2sentence.stream2sentence_time_based import generate_sentences 7 | import time 8 | 9 | records = [] 10 | buffer = "" 11 | with open("1.txt", "r") as f: 12 | for line in f: 13 | #include newlines in split 14 | if "!@#" in line: 15 | buffer += line 16 | records.append(buffer) 17 | buffer = "" 18 | else: 19 | buffer += line 20 | if buffer: 21 | records.append(buffer) 22 | 23 | token_times = [tuple(record.split("!@#", 
1)) for record in records] 24 | 25 | 26 | 27 | def get_llm_output_simulation(): 28 | start = time.time() 29 | def llm_output_simulation(): 30 | for tt in token_times: 31 | # print(tt) 32 | while (time.time() - start) < float(tt[1]): 33 | time.sleep(0.0001) 34 | yield tt[0] 35 | 36 | return llm_output_simulation() 37 | 38 | 39 | def run_test(): 40 | time_to_sentences = [] 41 | start_time = time.time() 42 | for i, sentence in enumerate( 43 | generate_sentences( 44 | get_llm_output_simulation(), 45 | lead_time = 0.3, 46 | max_wait_for_fragments = [1, 0.8, 1, 1.1, 1.5], 47 | target_tps = 3.6, 48 | min_output_lengths = [2, 3], 49 | deadline_offsets_dynamic=[.1] 50 | )): 51 | t = time.time() - start_time 52 | print(f"Sentence {i}: t={t:.1f} {sentence}") 53 | time_to_sentences.append([sentence, f"{t:.1f}"]) 54 | return time_to_sentences 55 | 56 | 57 | run_test() -------------------------------------------------------------------------------- /tests/test_stream2sentence.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from stream2sentence import generate_sentences, generate_sentences_async 3 | 4 | class TestSentenceGenerator(unittest.TestCase): 5 | 6 | def test_chinese(self): 7 | text = "我喜欢读书。天气很好。我们去公园吧。今天是星期五。早上好。这是我的朋友。请帮我。吃饭了吗?我在学中文。晚安。" 8 | #expected = ["我喜欢读书。", "天气很好。", "我们去公园吧。", "今天是星期五。", "早上好。", "这是我的朋友。", "请帮我。吃饭了吗?", "我在学中文。", "晚安。"] 9 | #expected = ["我喜欢读书。", "天气很好。", "我们去公园吧。", "今天是星期五。", "早上好。", "这是我的朋友。", "请帮我。", "吃饭了吗?", "我在学中文。", "晚安。"] 10 | expected = ["我喜欢读书。", "天气很好。", "我们去公园吧。", "今天是星期五。", "早上好。", "这是我的朋友。", "请帮我。吃饭了吗?我在学中文。", "晚安。"] # this changed with new stanza version 11 | sentences = list(generate_sentences(text, minimum_sentence_length=2, context_size=2, tokenizer="stanza", language="zh")) 12 | self.assertEqual(sentences, expected) 13 | 14 | def test_chinese2(self): 15 | text = """ 16 | 胡/爷/爷,我/来/给/您/讲/一下/下/周/每/天/的/安/排。 17 | 周/一/:/9:00-10:00:晨/练/太/极/拳/,/地点/:/活/动/室/。 18 | 10:30-11:30:园/艺/活/动/菠菜/种/植/,/地点/:/花/园/。 19 | 14:00-15:00:手/工/制/作/睡/眠/香/囊/,/地点/:/手/工/室/。 20 | 15:30-16:30:观/看/老/电/影/,/地点/:/影/音/室/。 21 | 22 | 周/二/:/9:00-10:00:八/段/锦/简/化/版/,/地点/:/大/厅/。 23 | 10:30-11:30:书/法/练/习/,/地点/:/书/画/室/。 24 | 14:00-15:00:棋/牌/娱/乐/象/棋/、/围/棋/等/,/地点/:/棋/牌/室/。 25 | 15:30-16:30:养/生/讲/座/春/天/养/生/1/,/地点/:/会/议/室/。 26 | 大/厅/""" 27 | expected = [ 28 | "胡/爷/爷,我/来/给/您/讲/一下/下/周/每/天/的/安/排。", 29 | "周/一/:/9:00-10:00:晨/练/太/极/拳/,/地点/:/活/动/室/。", 30 | "10:30-11:30:园/艺/活/动/菠菜/种/植/,/地点/:/花/园/。", 31 | "14:00-15:00:手/工/制/作/睡/眠/香/囊/,/地点/:/手/工/室/。", 32 | "15:30-16:30:观/看/老/电/影/,/地点/:/影/音/室/。", 33 | "周/二/:/9:00-10:00:八/段/锦/简/化/版/,/地点/:/大/厅/。", 34 | "10:30-11:30:书/法/练/习/,/地点/:/书/画/室/。", 35 | "14:00-15:00:棋/牌/娱/乐/象/棋/、/围/棋/等/,/地点/:/棋/牌/室/。", 36 | "15:30-16:30:养/生/讲/座/春/天/养/生/1/,/地点/:/会/议/室/。", 37 | "大/厅/", 38 | ] 39 | sentences = list(generate_sentences(text, minimum_sentence_length=2, context_size=2, tokenizer="stanza", language="zh")) 40 | self.assertEqual(sentences, expected) 41 | 42 | def test_generator(self): 43 | def generator(): 44 | yield "Hallo, " 45 | yield "wie geht es dir? " 46 | yield "Mir geht es gut." 47 | expected = ["Hallo,", "wie geht es dir?", "Mir geht es gut."] 48 | sentences = list(generate_sentences(generator(), minimum_sentence_length = 3, context_size=5, minimum_first_fragment_length = 3, quick_yield_single_sentence_fragment=True)) 49 | self.assertEqual(sentences, expected) 50 | 51 | def test_return_incomplete_last(self): 52 | text = "How I feel? 
I feel fine" 53 | expected = ["How I feel?", "I feel fine"] 54 | sentences = list(generate_sentences(text)) 55 | self.assertEqual(sentences, expected) 56 | 57 | def test_hello_world(self): 58 | text = "Hello, world." 59 | expected = ["Hello,", "world."] 60 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=3, minimum_first_fragment_length=3)) 61 | self.assertEqual(sentences, expected) 62 | 63 | def test_hello_world2(self): 64 | text = "Hello, world! Hello all, my dear friends of realtime apps." 65 | expected = ["Hello, world!", "Hello all, my dear friends of realtime apps."] 66 | sentences = list(generate_sentences(text, minimum_sentence_length=3)) 67 | self.assertEqual(sentences, expected) 68 | 69 | def test_basic(self): 70 | text = "This is a test. This is another test sentence. Just testing out the module." 71 | expected = ["This is a test.", "This is another test sentence.", "Just testing out the module."] 72 | sentences = list(generate_sentences(text)) 73 | self.assertEqual(sentences, expected) 74 | 75 | def test_tricky_sentence1(self): 76 | text = "Good muffins cost $3.88 in New York. Please buy me two of them." 77 | expected = ["Good muffins cost $3.88 in New York.", "Please buy me two of them."] 78 | sentences = list(generate_sentences(text)) 79 | self.assertEqual(sentences, expected) 80 | 81 | def test_tricky_sentence2(self): 82 | text = "I called Dr. Jones. I called Dr. Jones." 83 | expected = ["I called Dr. Jones.", "I called Dr. Jones."] 84 | sentences = list(generate_sentences(text)) 85 | self.assertEqual(sentences, expected) 86 | 87 | def test_quick_yield(self): 88 | text = "First, this. Second, this." 89 | expected = ["First,", "this.", "Second, this."] 90 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=3, minimum_first_fragment_length=3)) 91 | self.assertEqual(sentences, expected) 92 | 93 | def test_quick_yield2(self): 94 | text = "First, this. Second, this." 95 | expected = ["First,", "this. Second, this."] 96 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=6, minimum_first_fragment_length=3)) 97 | self.assertEqual(sentences, expected) 98 | 99 | def test_quick_yield3(self): 100 | text = "First, this. Second, this." 101 | expected = ["First, this.", "Second, this."] 102 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=3, minimum_first_fragment_length=6)) 103 | self.assertEqual(sentences, expected) 104 | 105 | def test_quick_yield4(self): 106 | text = "First, this. Second, this." 107 | expected = ["First, this.", "Second, this."] 108 | sentences = list(generate_sentences(text, quick_yield_single_sentence_fragment=True, minimum_sentence_length=6, minimum_first_fragment_length=6)) 109 | self.assertEqual(sentences, expected) 110 | 111 | def test_minimum_length1(self): 112 | text = "Short. Longer sentence." 113 | expected = ["Short.", "Longer sentence."] 114 | sentences = list(generate_sentences(text, minimum_sentence_length=6)) # two sentences, len("Short.") == 6 115 | self.assertEqual(sentences, expected) 116 | 117 | def test_minimum_length2(self): 118 | text = "Short. Longer sentence." 119 | expected = ["Short. 
Longer sentence."] 120 | sentences = list(generate_sentences(text, minimum_sentence_length=7)) # one sentences, len("Short.") == 6 121 | self.assertEqual(sentences, expected) 122 | 123 | def test_cleanup(self): 124 | text = "Text with link: https://www.example.com and emoji 😀" 125 | expected = ["Text with link: and emoji"] 126 | sentences = list(generate_sentences(text, cleanup_text_links=True, cleanup_text_emojis=True)) 127 | self.assertEqual(sentences, expected) 128 | 129 | def test_check1(self): 130 | text = "I'll go with a glass of red wine. Thank you." 131 | expected = ["I'll go with a glass of red wine.", "Thank you."] 132 | sentences = list(generate_sentences(text, minimum_sentence_length=10, minimum_first_fragment_length=10, quick_yield_single_sentence_fragment=True, cleanup_text_links=True, cleanup_text_emojis=True)) 133 | self.assertEqual(sentences, expected) 134 | 135 | def test_very_short(self): 136 | text = "Excuse me?" 137 | expected = ["Excuse me?"] 138 | sentences = list(generate_sentences(text, minimum_sentence_length=18, minimum_first_fragment_length=10, quick_yield_single_sentence_fragment=True, cleanup_text_links=True, cleanup_text_emojis=True)) 139 | self.assertEqual(sentences, expected) 140 | 141 | def test_log_characters(self): 142 | text = "Hello world" 143 | print () 144 | sentences = list(generate_sentences(text, log_characters=True)) 145 | print () 146 | print () 147 | print (f"test_log_characters succeeded, if {text} was printed above.") 148 | print () 149 | # Check characters were printed 150 | self.assertTrue(sentences) 151 | 152 | def test_not_log_characters(self): 153 | text = "Do not show these characters." 154 | expected = ["Do not show these characters."] 155 | sentences = list(generate_sentences(text, log_characters=False)) 156 | print(f"\ntest_not_log_characters succeeded, if \"{text}\" was not printed above.") 157 | self.assertEqual(sentences, expected) 158 | 159 | if __name__ == '__main__': 160 | unittest.main() -------------------------------------------------------------------------------- /tests/test_stream_from_llm.py: -------------------------------------------------------------------------------- 1 | from stream2sentence import generate_sentences 2 | from openai import OpenAI # pip install openai 3 | 4 | client = OpenAI() 5 | 6 | def write(prompt: str): 7 | stream = client.chat.completions.create( 8 | model="gpt-4", 9 | messages=[{"role": "user", "content": prompt}], 10 | stream=True, 11 | ) 12 | for chunk in stream: 13 | if (text_chunk := chunk.choices[0].delta.content): 14 | yield text_chunk 15 | 16 | 17 | text_stream = write("A three-sentence relaxing speech.") 18 | 19 | for idx, sentence in enumerate(generate_sentences(text_stream, minimum_sentence_length=5), start=1): 20 | print(f"Sentence {idx}: {sentence}") 21 | -------------------------------------------------------------------------------- /tests/test_stream_from_llm_old_api.py: -------------------------------------------------------------------------------- 1 | from stream2sentence import generate_sentences 2 | import openai # pip install openai 3 | import os 4 | 5 | openai.api_key = os.environ.get("OPENAI_API_KEY") 6 | 7 | def write(prompt: str): 8 | for chunk in openai.ChatCompletion.create( 9 | model="gpt-3.5-turbo", 10 | messages=[{"role": "user", "content" : prompt}], 11 | stream=True 12 | ): 13 | if (text_chunk := chunk["choices"][0]["delta"].get("content")) is not None: 14 | yield text_chunk 15 | 16 | text_stream = write("A three-sentence relaxing speech.") 17 | 18 | for idx, 
sentence in enumerate(generate_sentences(text_stream), start=1): 19 | print(f"Sentence {idx}: {sentence}") 20 | -------------------------------------------------------------------------------- /tests/test_time_based.py: -------------------------------------------------------------------------------- 1 | 2 | from stream2sentence.stream2sentence_time_based import generate_sentences_time_based as generate_sentences 3 | import time 4 | 5 | input_stewart_wiki = ''' 6 | In 1996 Mr. Stewart hosted a short-lived talk show entitled, Where's Elvis This Week?, which was a half-hour, weekly comedy television program. 7 | It aired on Sunday nights in the United Kingdom on BBC Two. 8 | It was filmed at the CBS Broadcast Center in New York City and featured a set of panelists, two from the UK and two from the United States, who discussed news items and cultural issues. 9 | The show premiered in the UK on October 6, 1996; five episodes aired in total. 10 | Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown. In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy. 11 | Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling). 12 | Stewart also headlined the 1997 White House Correspondents' dinner. 13 | ''' 14 | 15 | input_problematic = ''' 16 | First sentence is short. 17 | Second sentence is very long, and totally a run on, and would definitely cause problems if this is what the output of the llm was and we only had a quick yield value of one this needs to be broken up thanks. 18 | Third sentence also very long that lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 19 | Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 20 | Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
21 | ''' 22 | 23 | WORDS_PER_TOKEN = .75 24 | 25 | def get_words(current_input): 26 | return list(map(lambda word: word + ' ', current_input.split())) 27 | 28 | 29 | def print_word_targets(current_input, tps_target): 30 | target_delay_between_words = (1 / (WORDS_PER_TOKEN * tps_target)) 31 | word_targets = [] 32 | for i, word in enumerate(get_words(current_input)): 33 | t = ((i + 1) * target_delay_between_words) + target_delay_between_words 34 | word_targets.append([ word, f"{t:.1f}" ]) 35 | print(word_targets) 36 | 37 | 38 | def get_llm_output_simulation(current_input, tts): 39 | def llm_output_simulation(): 40 | for word in get_words(current_input): 41 | time.sleep(1 / (tts * WORDS_PER_TOKEN)) 42 | yield word 43 | return llm_output_simulation() 44 | 45 | 46 | def run_test(input, simulated_tts, dynamic_offset=False): 47 | time_to_sentences = [] 48 | start_time = time.time() 49 | for i, sentence in enumerate( 50 | generate_sentences( 51 | get_llm_output_simulation(input, simulated_tts), 52 | deadline_offsets_dynamic=[.5, .3, .1] if dynamic_offset else [0] 53 | )): 54 | t = time.time() - start_time 55 | print(f"Sentence {i}: t={t:.1f} {sentence}") 56 | time_to_sentences.append([sentence, f"{t:.1f}"]) 57 | print("\n\n RESULT ") 58 | print(time_to_sentences) 59 | print("\n\n") 60 | return time_to_sentences 61 | 62 | def is_within_tolerance(num1, num2, tolerance): 63 | return abs(num1 - num2) <= tolerance 64 | 65 | def compare_results(result, expected_result): 66 | for i in range(len(result)): 67 | if expected_result[i][0] != result[i][0] or not is_within_tolerance(float(expected_result[i][1]), float(result[i][1]), 0.25): 68 | raise ValueError(f"RESULT MISMATCH - expected={expected_result[i]} - actual={result[i]}") 69 | 70 | 71 | result_1 = run_test(input_stewart_wiki, 9, True) 72 | expected_result_1 = [ 73 | ['In 1996 Mr. Stewart hosted a short-lived talk show entitled,', '1.8'], 74 | ["Where's Elvis This Week?, which was a half-hour,", '2.8'], 75 | ['weekly comedy television program.', '5.2'], 76 | ['It aired on Sunday nights in the United Kingdom on BBC Two.', '6.8'], 77 | ['It was filmed at the CBS Broadcast Center in New York City and featured a set of panelists, two from the UK and two from the United States,', '9.3'], 78 | ['who discussed news items and cultural issues.', '12.6'], 79 | ['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '16.1'], 80 | ['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '19.3'], 81 | ["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '23.8'], 82 | ['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '24.9'], 83 | ["Stewart also headlined the 1997 White House Correspondents' dinner.", '24.9'] 84 | ] 85 | compare_results(result_1, expected_result_1) 86 | 87 | result_2 = run_test(input_stewart_wiki, 5) 88 | expected_result_2 = [ 89 | ['In 1996 Mr. 
Stewart hosted a short-lived talk show entitled,', '3.0'], 90 | ["Where's Elvis This Week?, which was a half-hour,", '5.4'], 91 | ['weekly comedy television program.', '8.0'], 92 | ['It aired on Sunday nights in the United Kingdom on BBC Two.', '9.4'], 93 | ['It was filmed at the CBS Broadcast Center in New York City and featured a set', '13.4'], 94 | ['of panelists, two from the UK and two from the United States, who discussed news items and cultural issues.', '18.8'], 95 | ['The show premiered in the UK on October 6, 1996; five episodes aired in total.', '25.1'], 96 | ['Notable panelists included Dave Chappelle, Eddie Izzard, Phill Jupitus, Nora Ephron, Craig Kilborn, Christopher Hitchens, Armando Iannucci, Norm Macdonald, and Helen Gurley Brown.', '30.0'], 97 | ["In 1997, Stewart was chosen as the host and interviewer for George Carlin's tenth HBO special, George Carlin: 40 Years of Comedy.", '37.7'], 98 | ['Stewart had a recurring role in The Larry Sanders Show, playing himself as an occasional substitute and possible successor to late-night talk show host Larry Sanders (played by Garry Shandling).', '44.9'], 99 | ["Stewart also headlined the 1997 White House Correspondents' dinner.", '44.9'] 100 | ] 101 | compare_results(result_2, expected_result_2) 102 | 103 | result_3 = run_test(input_problematic, 9) 104 | expected_result_3 = [ 105 | ['First sentence is short.', '1.0'], 106 | ['Second sentence is very long,', '1.5'], 107 | ['and totally a run on,', '3.1'], 108 | ['and would definitely cause problems if this is what the output of the llm was and we only had a quick', '5.2'], 109 | ['yield value of one this needs to be broken up thanks.', '11.7'], 110 | ['Third sentence also very long that lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '15.3'], 111 | ['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '25.0'], 112 | ['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '25.0'] 113 | ] 114 | compare_results(result_3, expected_result_3) 115 | 116 | result_4 = run_test(input_problematic, 5, True) 117 | expected_result_4 = [ 118 | ['First sentence is short.', '1.3'], 119 | ['Second sentence is very long,', '2.7'], 120 | ['and totally a run on,', '4.0'], 121 | ['and would definitely cause problems if this is what the output', '6.7'], 122 | ['of the llm was and we only had a quick', '9.4'], 123 | ['yield value of one this needs to be broken', '11.8'], 124 | ['up thanks. 
Third sentence also very long that lorem', '14.2'], 125 | ['ipsum dolor sit amet, consectetur adipiscing elit,', '17.4'], 126 | ['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim', '19.5'], 127 | ['ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '23.5'], 128 | ['Fourth sentence also very long fourth sentence that Lorem ipsum dolor sit amet, consectetur adipiscing elit,', '28.3'], 129 | ['sed do eiusmod tempor incididunt ut labore et dolore magna aliqua ut enim ad minim veniam,', '33.6'], 130 | ['quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.', '39.3'], 131 | ['Fifth sentence also long Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur excepteur sint occaecat cupidatat non', '42.0'], 132 | ['proident, sunt in culpa qui officia deserunt mollit anim id est laborum.', '45.2'] 133 | ] 134 | compare_results(result_4, expected_result_4) 135 | --------------------------------------------------------------------------------