├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── examples └── demo.ipynb ├── images └── spacy_chunks.png ├── setup.py └── spacy_chunks └── __init__.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .nox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | *.py,cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | db.sqlite3-journal 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule files 85 | celerybeat-schedule 86 | celerybeat.pid 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | 113 | # PyCharm 114 | .idea/ 115 | 116 | # VSCode 117 | .vscode/ 118 | .DS_Store 119 | src/.DS_Store 120 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 William J.B. 
Mattingly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![logo](images/spacy_chunks.png) 2 | 3 | spaCy Chunks is a custom pipeline component for spaCy that allows you to generate overlapping chunks of sentences or tokens from a document. This component is useful for various NLP tasks that require processing text in smaller, potentially overlapping segments. 4 | 5 | ## Features 6 | 7 | - Chunk by sentences or tokens 8 | - Configurable chunk size 9 | - Adjustable overlap between chunks 10 | - Option to truncate incomplete chunks 11 | 12 | ## Installation 13 | 14 | To use spaCy Chunks, you need to have spaCy installed. 
You can install spaCy using pip: 15 | 16 | ```bash 17 | pip install spacy 18 | pip install spacy_chunks 19 | ``` 20 | 21 | Download a spaCy model: 22 | 23 | ```bash 24 | python -m spacy download en_core_web_sm 25 | ``` 26 | 27 | ## Usage 28 | 29 | Here's how to use the spaCy Chunks component: 30 | 31 | ```python 32 | import spacy 33 | 34 | # Load a spaCy model 35 | nlp = spacy.load("en_core_web_sm") 36 | 37 | # Add the chunking component to the pipeline 38 | nlp.add_pipe("spacy_chunks", last=True, config={ 39 | "chunking_method": "sentence", 40 | "chunk_size": 2, 41 | "overlap": 1, 42 | "truncate": True 43 | }) 44 | 45 | # Process a text 46 | text = "This is the first sentence. This is the second one. And here's the third. The fourth is here. And a fifth." 47 | doc = nlp(text) 48 | 49 | # Print the chunks 50 | print("Chunks:") 51 | for i, chunk in enumerate(doc._.chunks, 1): 52 | print(f"Chunk {i}: {[sent.text for sent in chunk]}") 53 | ``` 54 | 55 | Output: 56 | ``` 57 | Chunks: 58 | Chunk 1: ['This is the first sentence.', 'This is the second one.'] 59 | Chunk 2: ['This is the second one.', "And here's the third."] 60 | Chunk 3: ["And here's the third.", 'The fourth is here.'] 61 | Chunk 4: ['The fourth is here.', 'And a fifth.'] 62 | ``` 63 | 64 | ## Configuration 65 | 66 | When adding the chunking component to your pipeline, you can configure the following parameters: 67 | 68 | - `chunking_method`: "sentence" or "token" (default: "sentence") 69 | - `chunk_size`: Number of sentences or tokens per chunk (default: 3) 70 | - `overlap`: Number of overlapping sentences or tokens between chunks (default: 0) 71 | - `truncate`: Whether to remove incomplete chunks at the end (default: True) 72 | 73 | ## Changing Configuration Dynamically 74 | 75 | You can change the configuration of the chunking component dynamically: 76 | 77 | ```python 78 | # Change chunk size 79 | nlp.get_pipe("spacy_chunks").chunk_size = 3 80 | 81 | # Disable truncation 82 | 
nlp.get_pipe("spacy_chunks").truncate = False 83 | 84 | # Process the text again with new settings 85 | doc = nlp(text) 86 | ``` 87 | 88 | ## Contributing 89 | 90 | Contributions to spaCy Chunks are welcome! Please feel free to submit a Pull Request. 91 | 92 | ## License 93 | 94 | This project is licensed under the MIT License. -------------------------------------------------------------------------------- /examples/demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import spacy\n", 10 | "import spacy_chunks" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 4, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "Chunks:\n", 23 | "Chunk 1: ['This is the first sentence.', 'This is the second one.']\n", 24 | "Chunk 2: ['This is the second one.', \"And here's the third.\"]\n", 25 | "Chunk 3: [\"And here's the third.\", 'The fourth is here.']\n", 26 | "Chunk 4: ['The fourth is here.', 'And a fifth.']\n", 27 | "\n", 28 | "Chunks (without truncation):\n", 29 | "Chunk 1: ['This is the first sentence.', 'This is the second one.']\n", 30 | "Chunk 2: ['This is the second one.', \"And here's the third.\"]\n", 31 | "Chunk 3: [\"And here's the third.\", 'The fourth is here.']\n", 32 | "Chunk 4: ['The fourth is here.', 'And a fifth.']\n", 33 | "Chunk 5: ['And a fifth.']\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "# Usage example\n", 39 | "nlp = spacy.load(\"en_core_web_sm\")\n", 40 | "nlp.add_pipe(\"spacy_chunks\", last=True, config={\"chunking_method\": \"sentence\", \"chunk_size\": 2, \"overlap\": 1, \"truncate\": True})\n", 41 | "\n", 42 | "text = \"This is the first sentence. This is the second one. And here's the third. The fourth is here. 
And a fifth.\"\n", 43 | "doc = nlp(text)\n", 44 | "\n", 45 | "print(\"Chunks:\")\n", 46 | "for i, chunk in enumerate(doc._.chunks, 1):\n", 47 | " print(f\"Chunk {i}: {[sent.text for sent in chunk]}\")\n", 48 | "\n", 49 | "# Example with truncate set to False\n", 50 | "nlp.get_pipe(\"spacy_chunks\").truncate = False\n", 51 | "doc = nlp(text)\n", 52 | "\n", 53 | "print(\"\\nChunks (without truncation):\")\n", 54 | "for i, chunk in enumerate(doc._.chunks, 1):\n", 55 | " print(f\"Chunk {i}: {[sent.text for sent in chunk]}\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "bow", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.10.14" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /images/spacy_chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjbmattingly/spacy-chunks/bcb74098613329fc771cb60be22d6ea13af1c082/images/spacy_chunks.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | # Read the contents of your README file 5 | with open(os.path.join(os.path.dirname(__file__), 'README.md'), encoding='utf-8') as f: 6 | long_description = f.read() 7 | 8 | # Import the version 9 | directory = os.path.dirname(__file__) 10 | 11 | setup( 12 | name='spacy-chunks', 13 
import spacy
from spacy.tokens import Doc
from spacy.language import Language
from typing import List, Union


@Language.factory("spacy_chunks")
class Chunking:
    """spaCy pipeline component that attaches overlapping chunks of
    sentences or tokens to ``doc._.chunks``.

    Parameters
    ----------
    nlp : Language
        The pipeline the component is added to (stored, unused directly).
    name : str
        Component name assigned by spaCy.
    chunking_method : str
        Unit to chunk by: ``"sentence"`` or ``"token"`` (default ``"sentence"``).
    chunk_size : int
        Number of units per chunk; must be >= 1 (default 3).
    overlap : int
        Units shared between consecutive chunks; must satisfy
        ``0 <= overlap < chunk_size`` (default 0).
    truncate : bool
        If True, drop trailing chunks shorter than ``chunk_size`` (default True).
    """

    def __init__(self, nlp: Language, name: str, chunking_method: str = "sentence", chunk_size: int = 3, overlap: int = 0, truncate: bool = True):
        self.nlp = nlp
        self.chunking_method = chunking_method
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.truncate = truncate
        # Register the custom attribute once at construction time rather
        # than on every __call__; default None so unprocessed docs are safe.
        if not Doc.has_extension("chunks"):
            Doc.set_extension("chunks", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """Compute chunks for *doc*, store them on ``doc._.chunks``, return *doc*.

        Raises
        ------
        ValueError
            If ``chunking_method`` is not ``"sentence"``/``"token"``, or the
            current ``chunk_size``/``overlap`` configuration is invalid.
        """
        # Generate chunks based on the chunking method
        if self.chunking_method == "sentence":
            chunks = self._generate_sentence_chunks(doc, self.chunk_size, self.overlap, self.truncate)
        elif self.chunking_method == "token":
            chunks = self._generate_token_chunks(doc, self.chunk_size, self.overlap, self.truncate)
        else:
            raise ValueError(f"Invalid chunking method: {self.chunking_method}")

        # Extension is registered in __init__; re-check defensively in case
        # the instance was created by other means (e.g. unpickling).
        if not Doc.has_extension("chunks"):
            Doc.set_extension("chunks", default=None)
        doc._.chunks = chunks

        return doc

    def _generate_sentence_chunks(self, doc: Doc, chunk_size: int, overlap: int, truncate: bool) -> List[List[spacy.tokens.Span]]:
        # Sentence chunking requires a component that sets sentence
        # boundaries (e.g. the parser or senter) earlier in the pipeline.
        sentences = list(doc.sents)
        return self._generate_chunks(sentences, chunk_size, overlap, truncate)

    def _generate_token_chunks(self, doc: Doc, chunk_size: int, overlap: int, truncate: bool) -> List[List[spacy.tokens.Token]]:
        tokens = list(doc)
        return self._generate_chunks(tokens, chunk_size, overlap, truncate)

    def _generate_chunks(self, items: Union[List[spacy.tokens.Span], List[spacy.tokens.Token]], chunk_size: int, overlap: int, truncate: bool) -> List[List[Union[spacy.tokens.Span, spacy.tokens.Token]]]:
        """Slice *items* into windows of ``chunk_size`` advancing by
        ``chunk_size - overlap`` each step."""
        # BUG FIX: the original used stride = chunk_size - overlap with no
        # validation.  overlap == chunk_size made the range() step 0 and
        # raised "range() arg 3 must not be zero"; overlap > chunk_size made
        # the step negative and silently produced NO chunks.  Fail loudly
        # with a clear message instead.
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size "
                f"({chunk_size}), got {overlap}"
            )
        chunks = []
        stride = chunk_size - overlap
        for i in range(0, len(items), stride):
            chunk = items[i:i + chunk_size]
            if not truncate or len(chunk) == chunk_size:
                chunks.append(chunk)
        return chunks