├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── examples └── demo.ipynb ├── images └── spacy_chunks.png ├── setup.py └── spacy_chunks └── __init__.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .nox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | *.py,cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | db.sqlite3-journal 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule files 85 | celerybeat-schedule 86 | celerybeat.pid 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | 113 | # PyCharm 114 | .idea/ 115 | 116 | # VSCode 117 | .vscode/ 118 | .DS_Store 119 | src/.DS_Store 120 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 William J.B. 
Mattingly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![logo](images/spacy_chunks.png) 2 | 3 | spaCy Chunks is a custom pipeline component for spaCy that allows you to generate overlapping chunks of sentences or tokens from a document. This component is useful for various NLP tasks that require processing text in smaller, potentially overlapping segments. 4 | 5 | ## Features 6 | 7 | - Chunk by sentences or tokens 8 | - Configurable chunk size 9 | - Adjustable overlap between chunks 10 | - Option to truncate incomplete chunks 11 | 12 | ## Installation 13 | 14 | To use spaCy Chunks, you need to have spaCy installed. 
You can install spaCy using pip: 15 | 16 | ```bash 17 | pip install spacy 18 | pip install spacy_chunks 19 | ``` 20 | 21 | Download a spaCy model: 22 | 23 | ```bash 24 | python -m spacy download en_core_web_sm 25 | ``` 26 | 27 | ## Usage 28 | 29 | Here's how to use the spaCy Chunks component: 30 | 31 | ```python 32 | import spacy 33 | 34 | # Load a spaCy model 35 | nlp = spacy.load("en_core_web_sm") 36 | 37 | # Add the chunking component to the pipeline 38 | nlp.add_pipe("spacy_chunks", last=True, config={ 39 | "chunking_method": "sentence", 40 | "chunk_size": 2, 41 | "overlap": 1, 42 | "truncate": True 43 | }) 44 | 45 | # Process a text 46 | text = "This is the first sentence. This is the second one. And here's the third. The fourth is here. And a fifth." 47 | doc = nlp(text) 48 | 49 | # Print the chunks 50 | print("Chunks:") 51 | for i, chunk in enumerate(doc._.chunks, 1): 52 | print(f"Chunk {i}: {[sent.text for sent in chunk]}") 53 | ``` 54 | 55 | Output: 56 | ``` 57 | Chunks: 58 | Chunk 1: ['This is the first sentence.', 'This is the second one.'] 59 | Chunk 2: ['This is the second one.', "And here's the third."] 60 | Chunk 3: ["And here's the third.", 'The fourth is here.'] 61 | Chunk 4: ['The fourth is here.', 'And a fifth.'] 62 | ``` 63 | 64 | ## Configuration 65 | 66 | When adding the chunking component to your pipeline, you can configure the following parameters: 67 | 68 | - `chunking_method`: "sentence" or "token" (default: "sentence") 69 | - `chunk_size`: Number of sentences or tokens per chunk (default: 3) 70 | - `overlap`: Number of overlapping sentences or tokens between chunks (default: 0) 71 | - `truncate`: Whether to remove incomplete chunks at the end (default: True) 72 | 73 | ## Changing Configuration Dynamically 74 | 75 | You can change the configuration of the chunking component dynamically: 76 | 77 | ```python 78 | # Change chunk size 79 | nlp.get_pipe("spacy_chunks").chunk_size = 3 80 | 81 | # Disable truncation 82 | 
nlp.get_pipe("spacy_chunks").truncate = False 83 | 84 | # Process the text again with new settings 85 | doc = nlp(text) 86 | ``` 87 | 88 | ## Contributing 89 | 90 | Contributions to spaCy Chunks are welcome! Please feel free to submit a Pull Request. 91 | 92 | ## License 93 | 94 | This project is licensed under the MIT License. -------------------------------------------------------------------------------- /examples/demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import spacy\n", 10 | "import spacy_chunks" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 4, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "Chunks:\n", 23 | "Chunk 1: ['This is the first sentence.', 'This is the second one.']\n", 24 | "Chunk 2: ['This is the second one.', \"And here's the third.\"]\n", 25 | "Chunk 3: [\"And here's the third.\", 'The fourth is here.']\n", 26 | "Chunk 4: ['The fourth is here.', 'And a fifth.']\n", 27 | "\n", 28 | "Chunks (without truncation):\n", 29 | "Chunk 1: ['This is the first sentence.', 'This is the second one.']\n", 30 | "Chunk 2: ['This is the second one.', \"And here's the third.\"]\n", 31 | "Chunk 3: [\"And here's the third.\", 'The fourth is here.']\n", 32 | "Chunk 4: ['The fourth is here.', 'And a fifth.']\n", 33 | "Chunk 5: ['And a fifth.']\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "# Usage example\n", 39 | "nlp = spacy.load(\"en_core_web_sm\")\n", 40 | "nlp.add_pipe(\"spacy_chunks\", last=True, config={\"chunking_method\": \"sentence\", \"chunk_size\": 2, \"overlap\": 1, \"truncate\": True})\n", 41 | "\n", 42 | "text = \"This is the first sentence. This is the second one. And here's the third. The fourth is here. 
And a fifth.\"\n", 43 | "doc = nlp(text)\n", 44 | "\n", 45 | "print(\"Chunks:\")\n", 46 | "for i, chunk in enumerate(doc._.chunks, 1):\n", 47 | " print(f\"Chunk {i}: {[sent.text for sent in chunk]}\")\n", 48 | "\n", 49 | "# Example with truncate set to False\n", 50 | "nlp.get_pipe(\"spacy_chunks\").truncate = False\n", 51 | "doc = nlp(text)\n", 52 | "\n", 53 | "print(\"\\nChunks (without truncation):\")\n", 54 | "for i, chunk in enumerate(doc._.chunks, 1):\n", 55 | " print(f\"Chunk {i}: {[sent.text for sent in chunk]}\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "bow", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.10.14" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /images/spacy_chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjbmattingly/spacy-chunks/bcb74098613329fc771cb60be22d6ea13af1c082/images/spacy_chunks.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | # Read the contents of your README file 5 | with open(os.path.join(os.path.dirname(__file__), 'README.md'), encoding='utf-8') as f: 6 | long_description = f.read() 7 | 8 | # Import the version 9 | directory = os.path.dirname(__file__) 10 | 11 | setup( 12 | name='spacy-chunks', 13 
import spacy
from spacy.tokens import Doc
from spacy.language import Language
from typing import List, Union


@Language.factory("spacy_chunks")
class Chunking:
    """spaCy pipeline component that attaches overlapping chunks of
    sentences or tokens to ``doc._.chunks``.

    Parameters
    ----------
    nlp : Language
        The pipeline the component is added to (stored, unused directly).
    name : str
        Component name assigned by spaCy.
    chunking_method : str
        Unit to chunk by: ``"sentence"`` or ``"token"`` (default ``"sentence"``).
    chunk_size : int
        Number of units per chunk; must be >= 1 (default 3).
    overlap : int
        Units shared between consecutive chunks; must satisfy
        ``0 <= overlap < chunk_size`` (default 0).
    truncate : bool
        If True, drop trailing chunks shorter than ``chunk_size`` (default True).
    """

    def __init__(self, nlp: Language, name: str, chunking_method: str = "sentence", chunk_size: int = 3, overlap: int = 0, truncate: bool = True):
        self.nlp = nlp
        self.chunking_method = chunking_method
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.truncate = truncate
        # Register the custom attribute once at construction time rather
        # than on every __call__; default None so unprocessed docs are safe.
        if not Doc.has_extension("chunks"):
            Doc.set_extension("chunks", default=None)

    def __call__(self, doc: Doc) -> Doc:
        """Compute chunks for *doc*, store them on ``doc._.chunks``, return *doc*.

        Raises
        ------
        ValueError
            If ``chunking_method`` is not ``"sentence"``/``"token"``, or the
            current ``chunk_size``/``overlap`` configuration is invalid.
        """
        # Generate chunks based on the chunking method
        if self.chunking_method == "sentence":
            chunks = self._generate_sentence_chunks(doc, self.chunk_size, self.overlap, self.truncate)
        elif self.chunking_method == "token":
            chunks = self._generate_token_chunks(doc, self.chunk_size, self.overlap, self.truncate)
        else:
            raise ValueError(f"Invalid chunking method: {self.chunking_method}")

        # Extension is registered in __init__; re-check defensively in case
        # the instance was created by other means (e.g. unpickling).
        if not Doc.has_extension("chunks"):
            Doc.set_extension("chunks", default=None)
        doc._.chunks = chunks

        return doc

    def _generate_sentence_chunks(self, doc: Doc, chunk_size: int, overlap: int, truncate: bool) -> List[List[spacy.tokens.Span]]:
        # Sentence chunking requires a component that sets sentence
        # boundaries (e.g. the parser or senter) earlier in the pipeline.
        sentences = list(doc.sents)
        return self._generate_chunks(sentences, chunk_size, overlap, truncate)

    def _generate_token_chunks(self, doc: Doc, chunk_size: int, overlap: int, truncate: bool) -> List[List[spacy.tokens.Token]]:
        tokens = list(doc)
        return self._generate_chunks(tokens, chunk_size, overlap, truncate)

    def _generate_chunks(self, items: Union[List[spacy.tokens.Span], List[spacy.tokens.Token]], chunk_size: int, overlap: int, truncate: bool) -> List[List[Union[spacy.tokens.Span, spacy.tokens.Token]]]:
        """Slice *items* into windows of ``chunk_size`` advancing by
        ``chunk_size - overlap`` each step."""
        # BUG FIX: the original used stride = chunk_size - overlap with no
        # validation.  overlap == chunk_size made the range() step 0 and
        # raised "range() arg 3 must not be zero"; overlap > chunk_size made
        # the step negative and silently produced NO chunks.  Fail loudly
        # with a clear message instead.
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size "
                f"({chunk_size}), got {overlap}"
            )
        chunks = []
        stride = chunk_size - overlap
        for i in range(0, len(items), stride):
            chunk = items[i:i + chunk_size]
            if not truncate or len(chunk) == chunk_size:
                chunks.append(chunk)
        return chunks