├── tests ├── __init__.py ├── requirements.txt ├── test_vectorizers.py └── utils.py ├── keyphrase_vectorizers ├── _version.py ├── __init__.py ├── keyphrase_tfidf_vectorizer.py ├── keyphrase_vectorizer_mixin.py └── keyphrase_count_vectorizer.py ├── pyproject.toml ├── requirements.txt ├── docs ├── api.rst ├── index.rst ├── requirements.txt ├── conf.py └── KeyphraseVectorizers.md ├── .readthedocs.yaml ├── .github └── workflows │ └── testing.yml ├── LICENSE ├── setup.py ├── .gitignore └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /keyphrase_vectorizers/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.13' 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest>=7.0.1 2 | keybert>=0.5.0 3 | flair==0.11.3 4 | scipy==1.7.3 5 | bertopic>=0.16.1 6 | scikit-learn>=1.0.1 7 | umap-learn==0.5.4 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.18.5 2 | spacy>=3.0.1 3 | spacy-transformers>=1.1.6 4 | spacy-curated-transformers>=0.2.2 5 | nltk>=3.6.1 6 | scikit-learn>=1.0 7 | scipy>=1.7.3 8 | psutil>=5.8.0 -------------------------------------------------------------------------------- /keyphrase_vectorizers/__init__.py: -------------------------------------------------------------------------------- 1 | from keyphrase_vectorizers._version import __version__ 2 | from keyphrase_vectorizers.keyphrase_count_vectorizer import KeyphraseCountVectorizer 3 | from keyphrase_vectorizers.keyphrase_tfidf_vectorizer import KeyphraseTfidfVectorizer 4 | from keyphrase_vectorizers.keyphrase_vectorizer_mixin import _KeyphraseVectorizerMixin 5 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | :mod:`KeyphraseCountVectorizer` 2 | =============================== 3 | 4 | .. automodule:: keyphrase_vectorizers.keyphrase_count_vectorizer 5 | :members: 6 | :inherited-members: 7 | 8 | :mod:`KeyphraseTfidfVectorizer` 9 | =============================== 10 | 11 | .. automodule:: keyphrase_vectorizers.keyphrase_tfidf_vectorizer 12 | :members: 13 | :inherited-members: 14 | 15 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to the KeyphraseVectorizers documentation! 2 | =================================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: User Guide: 7 | 8 | KeyphraseVectorizers 9 | 10 | .. 
toctree:: 11 | :caption: API Guide: 12 | 13 | api 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=3.2.1 2 | sphinx-rtd-theme>=0.5.2 3 | sphinxcontrib-applehelp>=1.0.2 4 | sphinxcontrib-devhelp>=1.0.2 5 | sphinxcontrib-htmlhelp>=1.0.3 6 | sphinxcontrib-jsmath>=1.0.1 7 | sphinxcontrib-qthelp>=1.0.3 8 | sphinxcontrib-serializinghtml>=1.1.4 9 | sphinxcontrib-websupport>=1.2.4 10 | readthedocs-sphinx-search>=0.1.0 11 | sphinx-markdown-tables>=0.0.15 12 | recommonmark>=0.7.1 13 | docutils>=0.16 14 | numpy>=1.18.5 15 | spacy>=3.0.1 16 | spacy-transformers>=1.1.6 17 | spacy-curated-transformers>=0.2.2 18 | nltk>=3.6.1 19 | scikit-learn>=1.0 20 | scipy>=1.7.3 21 | psutil>=5.8.0 22 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | builder: html 12 | fail_on_warning: false 13 | 14 | # Optionally build your docs in additional formats such as PDF 15 | formats: all 16 | 17 | # Optionally set the version of Python and requirements required to build your docs 18 | python: 19 | install: 20 | - requirements: docs/requirements.txt 21 | - method: pip 22 | path: . 23 | extra_requirements: 24 | - docs 25 | 26 | build: 27 | os: ubuntu-22.04 28 | tools: 29 | python: "3.7" 30 | 31 | submodules: 32 | include: all 33 | -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Code tests 5 | 6 | on: 7 | push: 8 | branches: 9 | - master 10 | pull_request: 11 | branches: 12 | - master 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: [ 3.7, 3.8, 3.9 ] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v1 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install -r "requirements.txt" 31 | pip install -r "tests/requirements.txt" 32 | - name: Run tests 33 | run: pytest 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Tim Schopf 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. 
Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.util import convert_path 2 | 3 | import setuptools 4 | 5 | with open("README.md", "r", encoding="utf-8") as fh: 6 | long_description = fh.read() 7 | 8 | main_ns = {} 9 | ver_path = convert_path('keyphrase_vectorizers/_version.py') 10 | with open(ver_path) as ver_file: 11 | exec(ver_file.read(), main_ns) 12 | 13 | ver_path = convert_path('requirements.txt') 14 | with open(ver_path) as ver_file: 15 | base_packages = ver_file.read().splitlines() 16 | 17 | setuptools.setup( 18 | name='keyphrase-vectorizers', 19 | version=main_ns['__version__'], 20 | url='https://github.com/TimSchopf/KeyphraseVectorizers', 21 | license='BSD 3-Clause "New" or "Revised" License', 22 | author='Tim Schopf', 23 | author_email='tim.schopf@t-online.de.de', 24 | description='Set of vectorizers that extract keyphrases with part-of-speech patterns from a collection of text documents and convert them into a document-keyphrase matrix.', 25 | long_description=long_description, 26 | long_description_content_type='text/markdown', 27 | classifiers=[ 28 | "Development Status :: 3 - Alpha", 29 | "Programming Language :: Python :: 3", 30 | "Intended Audience :: Science/Research", 31 | "Intended Audience :: Developers", 32 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 33 | "Topic :: Scientific/Engineering :: Information Analysis", 34 | "License :: OSI Approved :: BSD License", 35 | "Operating System :: OS Independent", 36 | ], 37 | install_requires=base_packages, 38 | package_dir={"": "."}, 39 | packages=setuptools.find_packages(where="."), 40 | python_requires='>=3.7', 41 | data_files=[('requirements', ['requirements.txt'])], 42 | ) 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | 
.Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit tests / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | #PyCharm stuff 59 | .idea/ 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | 13 | import os 14 | import sys 15 | from distutils.util import convert_path 16 | 17 | sys.path.insert(0, os.path.abspath('..')) 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'KeyphraseVectorizers' 22 | copyright = '2022, Tim Schopf' 23 | author = 'Tim Schopf' 24 | 25 | main_ns = {} 26 | ver_path = convert_path('../keyphrase_vectorizers/_version.py') 27 | with open(ver_path) as ver_file: 28 | exec(ver_file.read(), main_ns) 29 | release = main_ns['__version__'] 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx_rtd_theme', 38 | 'recommonmark', 39 | 'sphinx.ext.autodoc', 40 | 'sphinx.ext.napoleon', 41 | 'sphinx_markdown_tables', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 51 | 52 | # -- Options for HTML output ------------------------------------------------- 53 | 54 | # The theme to use for HTML and HTML Help pages. See the documentation for 55 | # a list of builtin themes. 56 | # 57 | html_theme = 'sphinx_rtd_theme' 58 | 59 | # Add any paths that contain custom static files (such as style sheets) here, 60 | # relative to this directory. They are copied after the builtin static files, 61 | # so a file named "default.css" will overwrite the builtin "default.css". 
62 | html_static_path = [] 63 | 64 | master_doc = 'index' 65 | 66 | source_parsers = { 67 | '.md': 'recommonmark.parser.CommonMarkParser', 68 | } 69 | 70 | source_suffix = ['.rst', '.md'] 71 | -------------------------------------------------------------------------------- /tests/test_vectorizers.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import flair 4 | import spacy 5 | from bertopic import BERTopic 6 | from flair.models import SequenceTagger 7 | from flair.tokenization import SegtokSentenceSplitter 8 | from keybert import KeyBERT 9 | from sklearn.datasets import fetch_20newsgroups 10 | 11 | import tests.utils as utils 12 | from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer 13 | 14 | english_docs = utils.get_english_test_docs() 15 | german_docs = utils.get_german_test_docs() 16 | french_docs = utils.get_french_docs() 17 | 18 | 19 | def test_default_count_vectorizer(): 20 | sorted_english_test_keyphrases = utils.get_english_test_keyphrases() 21 | sorted_count_matrix = utils.get_sorted_english_count_matrix() 22 | 23 | vectorizer = KeyphraseCountVectorizer() 24 | vectorizer.fit(english_docs) 25 | keyphrases = vectorizer.get_feature_names_out() 26 | document_keyphrase_matrix = vectorizer.transform(english_docs).toarray() 27 | 28 | assert [sorted(count_list) for count_list in 29 | KeyphraseCountVectorizer().fit_transform(english_docs).toarray()] == sorted_count_matrix 30 | assert [sorted(count_list) for count_list in document_keyphrase_matrix] == sorted_count_matrix 31 | assert sorted(keyphrases) == sorted_english_test_keyphrases 32 | 33 | 34 | def test_spacy_language_argument(): 35 | sorted_english_test_keyphrases = utils.get_english_test_keyphrases() 36 | sorted_count_matrix = utils.get_sorted_english_count_matrix() 37 | 38 | nlp = spacy.load("en_core_web_sm") 39 | 40 | vectorizer = KeyphraseCountVectorizer(spacy_pipeline=nlp) 41 | vectorizer.fit(english_docs) 42 | keyphrases = vectorizer.get_feature_names_out() 43 | document_keyphrase_matrix = vectorizer.transform(english_docs).toarray() 44 | 45 | assert [sorted(count_list) for count_list in 46 | KeyphraseCountVectorizer().fit_transform(english_docs).toarray()] == sorted_count_matrix 47 | assert [sorted(count_list) for count_list in document_keyphrase_matrix] == sorted_count_matrix 48 | assert sorted(keyphrases) == sorted_english_test_keyphrases 49 | 50 | 51 | def test_german_count_vectorizer(): 52 | sorted_german_test_keyphrases = utils.get_german_test_keyphrases() 53 | 54 | vectorizer = KeyphraseCountVectorizer(spacy_pipeline='de_core_news_sm', pos_pattern='*+', 55 | stop_words='german') 56 | keyphrases = vectorizer.fit(german_docs).get_feature_names_out() 57 | assert sorted(keyphrases) == sorted_german_test_keyphrases 58 | 59 | 60 | def test_default_tfidf_vectorizer(): 61 | sorted_english_test_keyphrases = utils.get_english_test_keyphrases() 62 | sorted_english_tfidf_matrix = utils.get_sorted_english_tfidf_matrix() 63 | 64 | vectorizer = KeyphraseTfidfVectorizer() 65 | vectorizer.fit(english_docs) 66 | keyphrases = vectorizer.get_feature_names_out() 67 | document_keyphrase_matrix = vectorizer.transform(english_docs).toarray() 68 | document_keyphrase_matrix = [[round(element, 10) for element in tfidf_list] for tfidf_list in 69 | document_keyphrase_matrix] 70 | 71 | assert [sorted(tfidf_list) for tfidf_list in document_keyphrase_matrix] == sorted_english_tfidf_matrix 72 | assert sorted(keyphrases) == 
sorted_english_test_keyphrases 73 | 74 | 75 | def test_keybert_integration(): 76 | english_keybert_keyphrases = utils.get_english_keybert_keyphrases() 77 | kw_model = KeyBERT(model="all-MiniLM-L6-v2") 78 | keyphrases = kw_model.extract_keywords(docs=english_docs, vectorizer=KeyphraseCountVectorizer()) 79 | keyphrases = [[element[0] for element in keyphrases_list] for keyphrases_list in keyphrases] 80 | 81 | assert keyphrases == english_keybert_keyphrases 82 | 83 | 84 | def test_french_trf_spacy_pipeline(): 85 | sorted_french_test_keyphrases = utils.get_french_test_keyphrases() 86 | sorted_french_count_matrix = utils.get_sorted_french_count_matrix() 87 | 88 | vectorizer = KeyphraseCountVectorizer(spacy_pipeline='fr_dep_news_trf', spacy_exclude=[]) 89 | vectorizer.fit(french_docs) 90 | keyphrases = vectorizer.get_feature_names_out() 91 | document_keyphrase_matrix = vectorizer.transform(french_docs).toarray() 92 | 93 | assert [sorted(count_list) for count_list in 94 | KeyphraseCountVectorizer(spacy_pipeline='fr_dep_news_trf', spacy_exclude=[]).fit_transform( 95 | french_docs).toarray()] == sorted_french_count_matrix 96 | assert [sorted(count_list) for count_list in document_keyphrase_matrix] == sorted_french_count_matrix 97 | assert sorted(keyphrases) == sorted_french_test_keyphrases 98 | 99 | 100 | def test_custom_tagger(): 101 | sorted_english_test_keyphrases = utils.get_sorted_english_keyphrases_custom_flair_tagger() 102 | 103 | tagger = SequenceTagger.load('pos') 104 | splitter = SegtokSentenceSplitter() 105 | 106 | # define custom pos tagger function using flair 107 | def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTagger = tagger, 108 | splitter: flair.tokenization.SegtokSentenceSplitter = splitter) -> List[tuple]: 109 | """ 110 | Important: 111 | 112 | The mandatory 'raw_documents' parameter can NOT be named differently and has to expect a list of strings. 113 | Furthermore the function has to return a list of (word token, POS-tag) tuples. 
114 | """ 115 | # split texts into sentences 116 | sentences = [] 117 | for doc in raw_documents: 118 | sentences.extend(splitter.split(doc)) 119 | 120 | # predict POS tags 121 | tagger.predict(sentences) 122 | 123 | # iterate through sentences to get word tokens and predicted POS-tags 124 | pos_tags = [] 125 | words = [] 126 | for sentence in sentences: 127 | pos_tags.extend([label.value for label in sentence.get_labels('pos')]) 128 | words.extend([word.text for word in sentence]) 129 | 130 | return list(zip(words, pos_tags)) 131 | 132 | vectorizer = KeyphraseCountVectorizer(custom_pos_tagger=custom_pos_tagger) 133 | vectorizer.fit(english_docs) 134 | keyphrases = vectorizer.get_feature_names_out() 135 | 136 | assert sorted(keyphrases) == sorted_english_test_keyphrases 137 | 138 | 139 | def test_online_vectorizer(): 140 | first_doc_count_matrix = utils.get_sorted_english_first_doc_count_matrix() 141 | second_doc_count_matrix = utils.get_sorted_english_second_doc_count_matrix() 142 | first_doc_test_keyphrases = utils.get_english_first_doc_test_keyphrases() 143 | english_keyphrases = utils.get_english_test_keyphrases() 144 | frequencies_after_min_df = utils.get_frequencies_after_min_df() 145 | frequent_keyphrases_after_min_df = utils.get_frequent_keyphrases_after_min_df() 146 | frequencies_after_bow = utils.get_frequencies_after_bow() 147 | 148 | # intitial vectorizer fit 149 | vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3) 150 | 151 | assert [sorted(count_list) for count_list in 152 | vectorizer.fit_transform([english_docs[0]]).toarray()] == first_doc_count_matrix 153 | assert sorted(vectorizer.get_feature_names_out()) == first_doc_test_keyphrases 154 | 155 | # learn additional keyphrases from new documents with partial fit 156 | vectorizer.partial_fit([english_docs[1]]) 157 | 158 | assert [sorted(count_list) for count_list in 159 | vectorizer.transform([english_docs[1]]).toarray()] == second_doc_count_matrix 160 | assert sorted(vectorizer.get_feature_names_out()) == english_keyphrases 161 | 162 | # update list of learned keyphrases according to 'delete_min_df' 163 | vectorizer.update_bow([english_docs[1]]) 164 | assert (vectorizer.transform([english_docs[1]]).toarray() == frequencies_after_min_df).all() 165 | 166 | # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain) 167 | assert sorted(vectorizer.get_feature_names_out()) == frequent_keyphrases_after_min_df 168 | 169 | # update again and check the impact of 'decay' on the learned document-keyphrase matrix 170 | vectorizer.update_bow([english_docs[1]]) 171 | assert (vectorizer.X_.toarray() == frequencies_after_bow).all() 172 | 173 | 174 | def test_bertopic(): 175 | data = fetch_20newsgroups(subset='train') 176 | texts = data.data[:100] 177 | topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer()) 178 | topics, probs = topic_model.fit_transform(documents=texts) 179 | new_topics = topic_model.reduce_outliers(texts, topics) 180 | topic_model.update_topics(texts, topics=new_topics) 181 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def get_english_test_docs(): 3 | english_docs = ["""Supervised learning is the machine learning task of learning a function that 4 | maps an input to an output based on example input-output pairs. 
It infers a 5 | function from labeled training data consisting of a set of training examples. 6 | In supervised learning, each example is a pair consisting of an input object 7 | (typically a vector) and a desired output value (also called the supervisory signal). 8 | A supervised learning algorithm analyzes the training data and produces an inferred function, 9 | which can be used for mapping new examples. An optimal scenario will allow for the 10 | algorithm to correctly determine the class labels for unseen instances. This requires 11 | the learning algorithm to generalize from the training data to unseen situations in a 12 | 'reasonable' way (see inductive bias).""", 13 | 14 | """Keywords are defined as phrases that capture the main topics discussed in a document. 15 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 16 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 17 | of keywords can quickly help to determine whether a given document is relevant to their interest. 18 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 19 | by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 20 | in information retrieval."""] 21 | 22 | return english_docs 23 | 24 | 25 | def get_german_test_docs(): 26 | german_docs = ["""Goethe stammte aus einer angesehenen bürgerlichen Familie. 27 | Sein Großvater mütterlicherseits war als Stadtschultheiß höchster Justizbeamter der Stadt Frankfurt, 28 | sein Vater Doktor der Rechte und Kaiserlicher Rat. Er und seine Schwester Cornelia erfuhren eine aufwendige 29 | Ausbildung durch Hauslehrer. Dem Wunsch seines Vaters folgend, studierte Goethe in Leipzig und Straßburg 30 | Rechtswissenschaft und war danach als Advokat in Wetzlar und Frankfurt tätig. 31 | Gleichzeitig folgte er seiner Neigung zur Dichtkunst.""", 32 | 33 | """Friedrich Schiller wurde als zweites Kind des Offiziers, Wundarztes und Leiters der Hofgärtnerei in 34 | Marbach am Neckar Johann Kaspar Schiller und dessen Ehefrau Elisabetha Dorothea Schiller, geb. 
Kodweiß, 35 | die Tochter eines Wirtes und Bäckers war, 1759 in Marbach am Neckar geboren 36 | """] 37 | return german_docs 38 | 39 | 40 | def get_french_docs(): 41 | french_docs = ["Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs"] 42 | 43 | return french_docs 44 | 45 | 46 | def get_english_test_keyphrases(): 47 | sorted_english_test_keyphrases = ['algorithm', 'class labels', 'document', 'document content', 'document relevance', 48 | 'documents', 'example', 'function', 'groups', 'indication', 'inductive bias', 49 | 'information retrieval', 'information retrieval environment', 'input', 50 | 'input object', 'interest', 'keywords', 'list', 'machine', 'main topics', 51 | 'new examples', 'optimal scenario', 'output', 'output pairs', 'output value', 52 | 'overlap', 'pair', 'phrases', 'precise summary', 'set', 'supervised learning', 53 | 'supervised learning algorithm', 'supervisory signal', 'task', 'training data', 54 | 'training examples', 'unseen instances', 'unseen situations', 'users', 55 | 'various applications', 'vector', 'way'] 56 | 57 | return sorted_english_test_keyphrases 58 | 59 | 60 | def get_english_first_doc_test_keyphrases(): 61 | sorted_english_first_doc_test_keyphrases = ['algorithm', 'class labels', 'example', 'function', 'inductive bias', 62 | 'input', 'input object', 'machine', 'new examples', 'optimal scenario', 63 | 'output', 'output pairs', 'output value', 'pair', 'set', 64 | 'supervised learning', 'supervised learning algorithm', 65 | 'supervisory signal', 'task', 'training data', 'training examples', 66 | 'unseen instances', 'unseen situations', 'vector', 'way'] 67 | 68 | return sorted_english_first_doc_test_keyphrases 69 | 70 | 71 | def get_sorted_english_keyphrases_custom_flair_tagger(): 72 | sorted_english_custom_tagger_keyphrases = ['algorithm', 'class labels', 'document', 'document content', 73 | 'document relevance', 74 | 'documents', 'example', 'example input-output pairs', 'function', 75 | 'groups', 76 | 'indication', 'inductive bias', 'inferred function', 77 | 'information retrieval', 'information retrieval environment', 'input', 78 | 'input object', 'interest', 'keywords', 'learning', 'learning algorithm', 79 | 'list', 'machine', 'main topics', 'new examples', 80 | 'optimal scenario', 'output', 'output value', 'overlap', 'pair', 81 | 'phrases', 'precise summary', 'set', 'supervised learning', 82 | 'supervised learning algorithm', 'supervisory signal', 'task', 83 | 'training data', 'training examples', 'unseen instances', 84 | 'unseen situations', 'users', 'various applications', 'vector', 'way'] 85 | 86 | return sorted_english_custom_tagger_keyphrases 87 | 88 | 89 | def get_german_test_keyphrases(): 90 | sorted_german_test_keyphrases = ['advokat', 'angesehenen bürgerlichen familie', 'ausbildung', 'bäckers', 91 | 'dichtkunst', 'ehefrau elisabetha dorothea schiller', 'frankfurt', 92 | 'friedrich schiller', 'geb. 
kodweiß', 'goethe', 'großvater', 'hauslehrer', 93 | 'hofgärtnerei', 'höchster justizbeamter', 'kaiserlicher rat', 'leipzig', 'leiters', 94 | 'marbach', 'neckar', 'neckar johann kaspar schiller', 'neigung', 'offiziers', 95 | 'rechte', 'rechtswissenschaft', 'schwester cornelia', 'stadt frankfurt', 96 | 'stadtschultheiß', 'straßburg', 'tochter', 'vater doktor', 'vaters', 'wetzlar', 97 | 'wirtes', 'wundarztes', 'wunsch', 'zweites kind'] 98 | return sorted_german_test_keyphrases 99 | 100 | 101 | def get_french_test_keyphrases(): 102 | sorted_french_test_keyphrases = ['assurance', 'constructeurs', 'responsabilité', 'voitures'] 103 | 104 | return sorted_french_test_keyphrases 105 | 106 | 107 | def get_sorted_english_count_matrix(): 108 | sorted_english_count_matrix = [ 109 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 110 | 3, 3, 3, 3, 3], 111 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 112 | 1, 2, 2, 5, 5]] 113 | 114 | return sorted_english_count_matrix 115 | 116 | 117 | def get_sorted_english_first_doc_count_matrix(): 118 | sorted_english_first_doc_count_matrix = [ 119 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3]] 120 | 121 | return sorted_english_first_doc_count_matrix 122 | 123 | 124 | def get_sorted_english_second_doc_count_matrix(): 125 | sorted_english_second_doc_count_matrix = [ 126 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 127 | 1, 2, 2, 5, 5]] 128 | 129 | return sorted_english_second_doc_count_matrix 130 | 131 | 132 | def get_sorted_french_count_matrix(): 133 | sorted_french_count_matrix = [[1, 1, 1, 1]] 134 | 135 | return sorted_french_count_matrix 136 | 137 | 138 | def get_sorted_english_tfidf_matrix(): 139 | sorted_english_tfidf_matrix = [ 140 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1147078669, 141 | 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 142 | 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 143 | 0.1147078669, 0.2294157339, 0.3441236008, 0.3441236008, 0.3441236008, 0.3441236008, 0.3441236008, 144 | 0.3441236008], 145 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 146 | 0.0, 0.0, 0.0, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 147 | 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.2373563316, 148 | 0.2373563316, 0.5933908291, 0.5933908291]] 149 | 150 | return sorted_english_tfidf_matrix 151 | 152 | 153 | def get_english_keybert_keyphrases(): 154 | english_keybert_keyphrases = [ 155 | ['supervised learning algorithm', 'supervised learning', 'training data', 'training examples', 'class labels'], 156 | ['document relevance', 'keywords', 'information retrieval', 'information retrieval environment', 157 | 'document content']] 158 | 159 | return english_keybert_keyphrases 160 | 161 | 162 | def get_frequencies_after_min_df(): 163 | frequency_array = np.array([[5, 5]]) 164 | 165 | return frequency_array 166 | 167 | 168 | def get_frequencies_after_bow(): 169 | frequency_array = np.array([[7.5, 7.5]]) 170 | 171 | return frequency_array 172 | 173 | 174 | def get_frequent_keyphrases_after_min_df(): 175 | keyphrases = 
['document', 'keywords'] 176 | 177 | return keyphrases 178 | -------------------------------------------------------------------------------- /keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. _spaCy pipeline: https://spacy.io/models 3 | .. _stopwords available in NLTK: https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip 4 | .. _POS-tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py 5 | .. _regex pattern: https://docs.python.org/3/library/re.html#regular-expression-syntax 6 | .. _spaCy part-of-speech tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py 7 | .. _spaCy pipeline components: https://spacy.io/usage/processing-pipelines#built-in 8 | """ 9 | 10 | import warnings 11 | from typing import List, Union 12 | 13 | import numpy as np 14 | import psutil 15 | import spacy 16 | from sklearn.exceptions import NotFittedError 17 | from sklearn.feature_extraction.text import TfidfTransformer 18 | from sklearn.utils.validation import FLOAT_DTYPES 19 | 20 | from keyphrase_vectorizers.keyphrase_count_vectorizer import KeyphraseCountVectorizer 21 | 22 | 23 | class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer): 24 | """ 25 | KeyphraseTfidfVectorizer 26 | 27 | KeyphraseTfidfVectorizer converts a collection of text documents to a normalized tf or tf-idf document-token matrix. 28 | The tokens are keyphrases that are extracted from the text documents based on their part-of-speech tags. 29 | The matrix rows indicate the documents and columns indicate the unique keyphrases. 30 | Each cell represents the tf or tf-idf value, depending on the parameter settings. 31 | The part-of-speech pattern of keyphrases can be defined by the ``pos_pattern`` parameter. 32 | By default, keyphrases are extracted, that have 0 or more adjectives, followed by 1 or more nouns. 33 | A list of extracted keyphrases matching the defined part-of-speech pattern can be returned after fitting via :class:`get_feature_names_out()`. 34 | 35 | Attention: 36 | If the vectorizer is used for languages other than English, the ``spacy_pipeline`` and ``stop_words`` parameters 37 | must be customized accordingly. 38 | Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_ differ between languages. 39 | Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered. 40 | In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly. 41 | 42 | Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency. 43 | This is a common term weighting scheme in information retrieval, 44 | that has also found good use in document classification. 45 | 46 | The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document 47 | is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less 48 | informative than features that occur in a small fraction of the training corpus. 
49 | 50 | The formula that is used to compute the tf-idf for a term t of a document d in a document set is 51 | tf-idf(t, d) = tf(t, d) * idf(t), and the idf is computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), 52 | where n is the total number of documents in the document set and df(t) is the document frequency of t; 53 | the document frequency is the number of documents in the document set that contain the term t. 54 | The effect of adding "1" to the idf in the equation above is that terms with zero idf, i.e., terms 55 | that occur in all documents in a training set, will not be entirely ignored. 56 | (Note that the idf formula above differs from the standard textbook 57 | notation that defines the idf as idf(t) = log [ n / (df(t) + 1) ]). 58 | 59 | If ``smooth_idf=True`` (the default), the constant "1" is added to the numerator and denominator of the idf as 60 | if an extra document was seen containing every term in the collection exactly once, which prevents 61 | zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1. 62 | 63 | Furthermore, the formulas used to compute tf and idf depend on parameter settings that correspond to 64 | the SMART notation used in IR as follows: 65 | 66 | Tf is "n" (natural) by default, "l" (logarithmic) when ``sublinear_tf=True``. 67 | Idf is "t" when use_idf is given, "n" (none) otherwise. 68 | Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) when ``norm=None``. 69 | 70 | Parameters 71 | ---------- 72 | spacy_pipeline : Union[str, spacy.Language], default='en_core_web_sm' 73 | A spacy.Language object or the name of the `spaCy pipeline`_, used to tag the parts-of-speech in the text. Standard is the 'en' pipeline. 74 | 75 | pos_pattern : str, default='*+' 76 | The `regex pattern`_ of `POS-tags`_ used to extract a sequence of POS-tagged tokens from the text. 77 | Standard is to only select keyphrases that have 0 or more adjectives, followed by 1 or more nouns. 78 | 79 | stop_words : Union[str, List[str]], default='english' 80 | Language of stopwords to remove from the document, e.g. 'english'. 81 | Supported options are `stopwords available in NLTK`_. 82 | Removes unwanted stopwords from keyphrases if 'stop_words' is not None. 83 | If given a list of custom stopwords, removes them instead. 84 | 85 | lowercase : bool, default=True 86 | Whether the returned keyphrases should be converted to lowercase. 87 | 88 | workers :int, default=1 89 | How many workers to use for spaCy part-of-speech tagging. 90 | If set to -1, use all available worker threads of the machine. 91 | SpaCy uses the specified number of cores to tag documents with part-of-speech. 92 | Depending on the platform, starting many processes with multiprocessing can add a lot of overhead. 93 | In particular, the default start method spawn used in macOS/OS X (as of Python 3.8) and in Windows can be slow. 94 | Therefore, carefully consider whether this option is really necessary. 95 | 96 | spacy_exclude : List[str], default=['parser', 'attribute_ruler', 'lemmatizer', 'ner'] 97 | A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging. 98 | Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed. 99 | 100 | custom_pos_tagger: callable, default=None 101 | A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples. 
102 | If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored. 103 | 104 | max_df : int, default=None 105 | During fitting ignore keyphrases that have a document frequency strictly higher than the given threshold. 106 | 107 | min_df : int, default=None 108 | During fitting ignore keyphrases that have a document frequency strictly lower than the given threshold. 109 | This value is also called cut-off in the literature. 110 | 111 | binary : bool, default=False 112 | If True, all non-zero counts are set to 1. 113 | This is useful for discrete probabilistic models that model binary events rather than integer counts. 114 | 115 | dtype : type, default=np.int64 116 | Type of the matrix returned by fit_transform() or transform(). 117 | 118 | decay : float, default=None 119 | A value between [0, 1] to weight the percentage of frequencies 120 | the previous bag-of-words should be decreased. For example, 121 | a value of `.1` will decrease the frequencies in the bag-of-words 122 | matrix with 10% at each iteration. 123 | 124 | delete_min_df : float, default=None 125 | Delete words at each iteration from its vocabulary 126 | that are below a minimum frequency. 127 | This will keep the resulting bag-of-words matrix small 128 | such that it does not explode in size with increasing 129 | vocabulary. If `decay` is None then this equals `min_df`. 130 | 131 | norm : {'l1', 'l2'}, default='l2' 132 | Each output row will have unit norm, either: 133 | - 'l2': Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product when l2 norm has been applied. 134 | - 'l1': Sum of absolute values of vector elements is 1. 135 | 136 | use_idf : bool, default=True 137 | Enable inverse-document-frequency reweighting. If False, idf(t) = 1. 138 | 139 | smooth_idf : bool, default=True 140 | Smooth idf weights by adding one to document frequencies, as if an 141 | extra document was seen containing every term in the collection 142 | exactly once. Prevents zero divisions. 143 | 144 | sublinear_tf : bool, default=False 145 | Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 
146 | 147 | """ 148 | 149 | def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm', pos_pattern: str = '*+', 150 | stop_words: Union[str, List[str]] = 'english', 151 | lowercase: bool = True, workers: int = 1, 152 | spacy_exclude: List[str] = ['parser', 'attribute_ruler', 'lemmatizer', 'ner'], 153 | custom_pos_tagger: callable = None, max_df: int = None, min_df: int = None, 154 | binary: bool = False, dtype: np.dtype = np.float64, decay: float = None, 155 | delete_min_df: float = None, norm: str = "l2", 156 | use_idf: bool = True, smooth_idf: bool = True, 157 | sublinear_tf: bool = False): 158 | 159 | # triggers a parameter validation 160 | if not isinstance(workers, int): 161 | raise ValueError( 162 | "'workers' parameter must be of type int" 163 | ) 164 | 165 | if (workers < -1) or (workers > psutil.cpu_count(logical=True)) or (workers == 0): 166 | raise ValueError( 167 | "'workers' parameter value cannot be 0 and must be between -1 and " + str( 168 | psutil.cpu_count(logical=True)) 169 | ) 170 | 171 | self.spacy_pipeline = spacy_pipeline 172 | self.pos_pattern = pos_pattern 173 | self.stop_words = stop_words 174 | self.lowercase = lowercase 175 | self.workers = workers 176 | self.spacy_exclude = spacy_exclude 177 | self.custom_pos_tagger = custom_pos_tagger 178 | self.max_df = max_df 179 | self.min_df = min_df 180 | self.binary = binary 181 | self.dtype = dtype 182 | self.decay = decay 183 | self.delete_min_df = delete_min_df 184 | self.norm = norm 185 | self.use_idf = use_idf 186 | self.smooth_idf = smooth_idf 187 | self.sublinear_tf = sublinear_tf 188 | 189 | self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf, smooth_idf=self.smooth_idf, 190 | sublinear_tf=self.sublinear_tf) 191 | 192 | super().__init__(spacy_pipeline=self.spacy_pipeline, pos_pattern=self.pos_pattern, stop_words=self.stop_words, 193 | lowercase=self.lowercase, workers=self.workers, spacy_exclude=self.spacy_exclude, 194 | custom_pos_tagger=self.custom_pos_tagger, max_df=self.max_df, min_df=self.min_df, 195 | binary=self.binary, dtype=self.dtype, decay=self.decay, delete_min_df=self.delete_min_df) 196 | 197 | def _check_params(self): 198 | """ 199 | Validate dtype parameter. 200 | """ 201 | 202 | if self.dtype not in FLOAT_DTYPES: 203 | warnings.warn( 204 | "Only {} 'dtype' should be used. {} 'dtype' will " 205 | "be converted to np.float64.".format(FLOAT_DTYPES, self.dtype), 206 | UserWarning, 207 | ) 208 | 209 | def fit(self, raw_documents: List[str]) -> object: 210 | """Learn the keyphrases that match the defined part-of-speech pattern and idf from the list of raw documents. 211 | 212 | Parameters 213 | ---------- 214 | raw_documents : iterable 215 | An iterable of strings. 216 | 217 | Returns 218 | ------- 219 | self : object 220 | Fitted vectorizer. 221 | """ 222 | 223 | self._check_params() 224 | X = super().fit_transform(raw_documents) 225 | self._tfidf.fit(X) 226 | return self 227 | 228 | def fit_transform(self, raw_documents: List[str]) -> List[List[float]]: 229 | """ 230 | Learn the keyphrases that match the defined part-of-speech pattern and idf from the list of raw documents. 231 | Then return document-keyphrase matrix. 232 | This is equivalent to fit followed by transform, but more efficiently implemented. 233 | 234 | Parameters 235 | ---------- 236 | raw_documents : iterable 237 | An iterable of strings. 238 | 239 | Returns 240 | ------- 241 | X : sparse matrix of (n_samples, n_features) 242 | Tf-idf-weighted document-keyphrase matrix. 
243 | """ 244 | 245 | self._check_params() 246 | X = super().fit_transform(raw_documents) 247 | self._tfidf.fit(X) 248 | # X is already a transformed view of raw_documents so 249 | # we set copy to False 250 | return self._tfidf.transform(X, copy=False) 251 | 252 | def transform(self, raw_documents: List[str]) -> List[List[float]]: 253 | """ 254 | Transform documents to document-keyphrase matrix. 255 | Uses the keyphrases and document frequencies (df) learned by fit (or fit_transform). 256 | 257 | Parameters 258 | ---------- 259 | raw_documents : iterable 260 | An iterable of strings. 261 | 262 | Returns 263 | ------- 264 | X : sparse matrix of (n_samples, n_features) 265 | Tf-idf-weighted document-keyphrase matrix. 266 | """ 267 | 268 | # triggers a parameter validation 269 | if not hasattr(self, 'keyphrases'): 270 | raise NotFittedError("Keyphrases not fitted.") 271 | 272 | X = super().transform(raw_documents) 273 | return self._tfidf.transform(X, copy=False) -------------------------------------------------------------------------------- /keyphrase_vectorizers/keyphrase_vectorizer_mixin.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. _spaCy pipeline: https://spacy.io/models 3 | .. _stopwords available in NLTK: https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip 4 | .. _POS-tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py 5 | .. _regex pattern: https://docs.python.org/3/library/re.html#regular-expression-syntax 6 | .. _spaCy pipeline components: https://spacy.io/usage/processing-pipelines#built-in 7 | """ 8 | 9 | import logging 10 | import os 11 | from typing import List, Union 12 | 13 | import nltk 14 | import numpy as np 15 | import psutil 16 | import scipy.sparse as sp 17 | import spacy 18 | 19 | 20 | class _KeyphraseVectorizerMixin(): 21 | """ 22 | _KeyphraseVectorizerMixin 23 | 24 | Provides common code for text vectorizers. 25 | """ 26 | 27 | def build_tokenizer(self) -> callable: 28 | """ 29 | Return a function that splits a string into a sequence of tokens. 30 | 31 | Returns 32 | ------- 33 | tokenizer: callable 34 | A function to split a string into a sequence of tokens. 35 | """ 36 | 37 | return self._tokenize 38 | 39 | def _tokenize_simple(self, text: str) -> List[str]: 40 | """ 41 | Simple tokenizer that just splits strings by whitespace. 42 | 43 | Parameters 44 | ---------- 45 | text : str 46 | The text to tokenize. 47 | 48 | Returns 49 | ------- 50 | tokens: List[str] 51 | A list of tokens. 52 | """ 53 | 54 | tokens = text.split() 55 | return tokens 56 | 57 | def _tokenize(self, text: str) -> List[str]: 58 | """ 59 | Custom word tokenizer for sklearn vectorizer that uses a spaCy pipeline for tokenization. 60 | 61 | Parameters 62 | ---------- 63 | text : str 64 | The text to tokenize. 65 | 66 | Returns 67 | ------- 68 | tokens: List[str] 69 | A list of tokens. 
70 | """ 71 | 72 | processed_documents, _ = self._get_pos_keyphrases(document_list=[text], 73 | stop_words=self.stop_words, 74 | spacy_pipeline=self.spacy_pipeline, 75 | pos_pattern=self.pos_pattern, 76 | lowercase=self.lowercase, workers=self.workers, 77 | spacy_exclude=['tok2vec', 'tagger', 'parser', 78 | 'attribute_ruler', 'lemmatizer', 'ner', 79 | 'textcat'], 80 | custom_pos_tagger=self.custom_pos_tagger, 81 | extract_keyphrases=False) 82 | 83 | return self._tokenize_simple(processed_documents[0]) 84 | 85 | def _document_frequency(self, document_keyphrase_count_matrix: List[List[int]]) -> np.array: 86 | """ 87 | Count the number of non-zero values for each feature in sparse a matrix. 88 | 89 | Parameters 90 | ---------- 91 | document_keyphrase_count_matrix : List[List[int]] 92 | The document-keyphrase count matrix to transform to document frequencies. 93 | 94 | Returns 95 | ------- 96 | document_frequencies : np.array 97 | Numpy array of document frequencies for keyphrases. 98 | """ 99 | 100 | document_keyphrase_count_matrix = sp.csr_matrix(document_keyphrase_count_matrix) 101 | document_frequencies = np.bincount(document_keyphrase_count_matrix.indices, 102 | minlength=document_keyphrase_count_matrix.shape[1]) 103 | 104 | return document_frequencies 105 | 106 | def _remove_suffixes(self, text: str, suffixes: List[str]) -> str: 107 | """ 108 | Removes pre-defined suffixes from a given text string. 109 | 110 | Parameters 111 | ---------- 112 | text : str 113 | Text string where suffixes should be removed. 114 | 115 | suffixes : list 116 | List of strings that should be removed from the end of the text. 117 | 118 | Returns 119 | ------- 120 | text : Text string with removed suffixes. 121 | """ 122 | 123 | for suffix in suffixes: 124 | if text.lower().endswith(suffix.lower()): 125 | return text[:-len(suffix)].strip() 126 | return text 127 | 128 | def _remove_prefixes(self, text: str, prefixes: List[str]) -> str: 129 | """ 130 | Removes pre-defined prefixes from a given text string. 131 | 132 | Parameters 133 | ---------- 134 | text : str 135 | Text string where prefixes should be removed. 136 | 137 | prefixes : list 138 | List of strings that should be removed from the beginning of the text. 139 | 140 | Returns 141 | ------- 142 | text : Text string with removed prefixes. 143 | """ 144 | 145 | for prefix in prefixes: 146 | if text.lower().startswith(prefix.lower()): 147 | return text[len(prefix):].strip() 148 | return text 149 | 150 | def _cumulative_length_joiner(self, text_list: List[str], max_text_length: int) -> List[str]: 151 | """ 152 | Joins strings from list of strings to single string until maximum char length is reached. 153 | Then join the next strings from list to a single string and so on. 154 | 155 | Parameters 156 | ---------- 157 | text_list : list of strings 158 | List of strings to join. 159 | 160 | max_text_length : int 161 | Maximum character length of the joined strings. 162 | 163 | Returns 164 | ------- 165 | list_of_joined_strings_with_max_length : List of joined text strings with max char length of 'max_text_length'. 
166 | """ 167 | 168 | if isinstance(text_list, str): 169 | raise ValueError("Iterable over raw texts expected, string object received.") 170 | 171 | if not isinstance(max_text_length, int) or max_text_length <= 0: 172 | raise ValueError("max_text_length must be a positive integer.") 173 | 174 | joined_strings = [] 175 | current_string = "" 176 | 177 | for text in text_list: 178 | if not text: 179 | continue 180 | 181 | # If the next text exceeds the max length, start a new string 182 | if len(current_string) + len(text) + 1 > max_text_length: # +1 for space character 183 | # Append the current string to the result list 184 | if current_string: 185 | joined_strings.append(current_string.strip()) 186 | # Start a new string with the current text 187 | current_string = text 188 | else: 189 | # Add the text to the current string 190 | if current_string: 191 | current_string += ' ' + text 192 | else: 193 | current_string = text 194 | 195 | # Append the last string to the result list 196 | if current_string: 197 | joined_strings.append(current_string.strip()) 198 | 199 | return joined_strings 200 | 201 | def _split_long_document(self, text: str, max_text_length: int) -> List[str]: 202 | """ 203 | Split single string in list of strings with a maximum character length. 204 | 205 | Parameters 206 | ---------- 207 | text : str 208 | Text string that should be split. 209 | 210 | max_text_length : int 211 | Maximum character length of the strings. 212 | 213 | Returns 214 | ------- 215 | splitted_document : List of text strings. 216 | """ 217 | # triggers a parameter validation 218 | if not isinstance(text, str): 219 | raise ValueError( 220 | "'text' parameter needs to be a string." 221 | ) 222 | 223 | # triggers a parameter validation 224 | if not isinstance(max_text_length, int): 225 | raise ValueError( 226 | "'max_text_length' parameter needs to be a int" 227 | ) 228 | 229 | text = text.replace("? ", "?") 230 | text = text.replace("! ", "!") 231 | 232 | if "" in text: 233 | splitted_document = text.split("") 234 | splitted_document = [s.strip() for s in splitted_document if s.strip()] # Filter out empty strings 235 | splitted_document = [ 236 | self._cumulative_length_joiner(text_list=doc.split(" "), max_text_length=max_text_length) if len( 237 | doc) > max_text_length else [doc] for doc in splitted_document] 238 | return [text for doc in splitted_document for text in doc] 239 | else: 240 | # No punctuation marks found, process the entire text 241 | splitted_document = text.split(" ") 242 | splitted_document = self._cumulative_length_joiner(text_list=splitted_document, 243 | max_text_length=max_text_length) 244 | return splitted_document 245 | 246 | def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, List[str]], spacy_pipeline: Union[str, spacy.Language], 247 | pos_pattern: str, spacy_exclude: List[str], custom_pos_tagger: callable, 248 | lowercase: bool = True, workers: int = 1, extract_keyphrases: bool = True) -> List[str]: 249 | """ 250 | Select keyphrases with part-of-speech tagging from a text document. 251 | Parameters 252 | ---------- 253 | document_list : list of str 254 | List of text documents from which to extract the keyphrases. 255 | 256 | stop_words : Union[str, List[str]] 257 | Language of stopwords to remove from the document, e.g. 'english'. 258 | Supported options are `stopwords available in NLTK`_. 259 | Removes unwanted stopwords from keyphrases if 'stop_words' is not None. 260 | If given a list of custom stopwords, removes them instead. 
261 | 262 | spacy_pipeline : Union[str, spacy.Language] 263 | A spacy.Language object or the name of the `spaCy pipeline`_, used to tag the parts-of-speech in the text. 264 | 265 | pos_pattern : str 266 | The `regex pattern`_ of `POS-tags`_ used to extract a sequence of POS-tagged tokens from the text. 267 | 268 | spacy_exclude : List[str] 269 | A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging. 270 | Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed. 271 | 272 | custom_pos_tagger : callable 273 | A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples. 274 | If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored. 275 | 276 | lowercase : bool, default=True 277 | Whether the returned keyphrases should be converted to lowercase. 278 | 279 | workers : int, default=1 280 | How many workers to use for spaCy part-of-speech tagging. 281 | If set to -1, use all available worker threads of the machine. 282 | spaCy uses the specified number of cores to tag documents with part-of-speech. 283 | Depending on the platform, starting many processes with multiprocessing can add a lot of overhead. 284 | In particular, the default start method spawn used in macOS/OS X (as of Python 3.8) and in Windows can be slow. 285 | Therefore, carefully consider whether this option is really necessary. 286 | 287 | extract_keyphrases : bool, default=True 288 | Whether to run the keyphrase extraction step or just return an empty list. 289 | 290 | Returns 291 | ------- 292 | keyphrases : List of unique keyphrases of varying length, extracted from the text document with the defined 'pos_pattern'. 293 | """ 294 | 295 | # triggers a parameter validation 296 | if isinstance(document_list, str): 297 | raise ValueError( 298 | "Iterable over raw text documents expected, string object received." 299 | ) 300 | 301 | # triggers a parameter validation 302 | if not hasattr(document_list, '__iter__'): 303 | raise ValueError( 304 | "Iterable over raw text documents expected." 305 | ) 306 | 307 | # triggers a parameter validation 308 | if not isinstance(stop_words, str) and (stop_words is not None) and (not hasattr(stop_words, '__iter__')): 309 | raise ValueError( 310 | "'stop_words' parameter needs to be a string, e.g. 'english' or 'None' or a list of strings." 311 | ) 312 | 313 | # triggers a parameter validation 314 | if not isinstance(spacy_pipeline, (str, spacy.Language)): 315 | raise ValueError( 316 | "'spacy_pipeline' parameter needs to be a spacy.Language object or a spaCy pipeline string. E.g. 'en_core_web_sm'" 317 | ) 318 | 319 | # triggers a parameter validation 320 | if not isinstance(pos_pattern, str): 321 | raise ValueError( 322 | "'pos_pattern' parameter needs to be a regex string. E.g. '*+'" 323 | ) 324 | 325 | # triggers a parameter validation 326 | if ((not hasattr(spacy_exclude, '__iter__')) and (spacy_exclude is not None)) or ( 327 | isinstance(spacy_exclude, str)): 328 | raise ValueError( 329 | "'spacy_exclude' parameter needs to be a list of 'spaCy pipeline components' strings." 
330 | ) 331 | 332 | # triggers a parameter validation 333 | if not callable(custom_pos_tagger) and (custom_pos_tagger is not None): 334 | raise ValueError( 335 | "'custom_pos_tagger' must be a callable function that gets a list of strings in a 'raw_documents' parameter and returns a list of (word, POS-tag) tuples." 336 | ) 337 | 338 | # triggers a parameter validation 339 | if not isinstance(workers, int): 340 | raise ValueError( 341 | "'workers' parameter must be of type int." 342 | ) 343 | 344 | if (workers < -1) or (workers > psutil.cpu_count(logical=True)) or (workers == 0): 345 | raise ValueError( 346 | "'workers' parameter value cannot be 0 and must be between -1 and " + str( 347 | psutil.cpu_count(logical=True)) 348 | ) 349 | 350 | 351 | stop_words_list = set() 352 | if isinstance(stop_words, str): 353 | try: 354 | stop_words_list = set(nltk.corpus.stopwords.words(stop_words)) 355 | except LookupError: 356 | logger = logging.getLogger('KeyphraseVectorizer') 357 | logger.setLevel(logging.WARNING) 358 | sh = logging.StreamHandler() 359 | sh.setFormatter(logging.Formatter( 360 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s')) 361 | logger.addHandler(sh) 362 | logger.setLevel(logging.DEBUG) 363 | logger.info( 364 | 'It looks like you do not have downloaded a list of stopwords yet. It is attempted to download the stopwords now.') 365 | nltk.download('stopwords') 366 | stop_words_list = set(nltk.corpus.stopwords.words(stop_words)) 367 | 368 | elif hasattr(stop_words, '__iter__'): 369 | stop_words_list = set(stop_words) 370 | 371 | # add spaCy PoS tags for documents 372 | if not custom_pos_tagger: 373 | if isinstance(spacy_pipeline, spacy.Language): 374 | nlp = spacy_pipeline 375 | else: 376 | if not spacy_exclude: 377 | spacy_exclude = [] 378 | try: 379 | if extract_keyphrases: 380 | nlp = spacy.load(spacy_pipeline, exclude=spacy_exclude) 381 | else: 382 | # only use tokenizer if no keywords are extracted 383 | nlp = spacy.blank(spacy_pipeline.split("_")[0]) 384 | 385 | except OSError: 386 | # set logger 387 | logger = logging.getLogger('KeyphraseVectorizer') 388 | logger.setLevel(logging.WARNING) 389 | sh = logging.StreamHandler() 390 | sh.setFormatter(logging.Formatter( 391 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s')) 392 | logger.addHandler(sh) 393 | logger.setLevel(logging.DEBUG) 394 | logger.info( 395 | 'It looks like the selected spaCy pipeline is not downloaded yet. 
It is attempted to download the spaCy pipeline now.') 396 | spacy.cli.download(spacy_pipeline) 397 | nlp = spacy.load(spacy_pipeline, 398 | exclude=spacy_exclude) 399 | 400 | if workers != 1: 401 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 402 | 403 | # add document delimiter, so we can identify the original document split later 404 | doc_delimiter = "thisisadocumentdelimiternotakeyphrasepleaseignore" 405 | document_list = [doc_delimiter + " " + doc for doc in document_list] 406 | 407 | # split large documents in smaller chunks, so that spacy can process them without memory issues 408 | docs_list = [] 409 | # set maximal character length of documents for spaCy processing 410 | max_doc_length = 500 411 | for document in document_list: 412 | if len(document) > max_doc_length: 413 | docs_list.extend(self._split_long_document(text=document, max_text_length=max_doc_length)) 414 | else: 415 | docs_list.append(document) 416 | document_list = docs_list 417 | del docs_list 418 | 419 | # increase max length of documents that spaCy can parse 420 | # (should only be done if parser and ner are not used due to memory issues) 421 | if not custom_pos_tagger: 422 | nlp.max_length = max([len(doc) for doc in document_list]) + 100 423 | 424 | if not custom_pos_tagger: 425 | pos_tuples = [] 426 | for tagged_doc in nlp.pipe(document_list, n_process=workers): 427 | pos_tuples.extend([(word.text, word.tag_) for word in tagged_doc if word.text]) 428 | else: 429 | pos_tuples = custom_pos_tagger(raw_documents=document_list) 430 | 431 | # get the original documents after they were processed by a tokenizer and a POS tagger 432 | processed_docs = [] 433 | for tup in pos_tuples: 434 | token = tup[0] 435 | if lowercase: 436 | token = token.lower() 437 | if token not in stop_words_list: 438 | processed_docs.append(token) 439 | processed_docs = ' '.join(processed_docs) 440 | 441 | # add delimiter to stop_words_list to ignore it during keyphrase extraction 442 | stop_words_list.add(doc_delimiter) 443 | 444 | # split processed documents by delimiter 445 | processed_docs = [doc.strip() for doc in processed_docs.split(doc_delimiter)][1:] 446 | 447 | if extract_keyphrases: 448 | # extract keyphrases that match the NLTK RegexpParser filter 449 | keyphrases = [] 450 | # prefix_list = [stop_word + ' ' for stop_word in stop_words_list] 451 | # suffix_list = [' ' + stop_word for stop_word in stop_words_list] 452 | cp = nltk.RegexpParser('CHUNK: {(' + pos_pattern + ')}') 453 | tree = cp.parse(pos_tuples) 454 | for subtree in tree.subtrees(filter=lambda tuple: tuple.label() == 'CHUNK'): 455 | # join candidate keyphrase from single words 456 | keyphrase = ' '.join([i[0] for i in subtree.leaves() if i[0] not in stop_words_list]) 457 | 458 | # convert keyphrase to lowercase 459 | if lowercase: 460 | keyphrase = keyphrase.lower() 461 | 462 | # remove stopword suffixes 463 | # keyphrase = self._remove_suffixes(keyphrase, suffix_list) 464 | 465 | # remove stopword prefixes 466 | # keyphrase = self._remove_prefixes(keyphrase, prefix_list) 467 | 468 | # remove whitespace from the beginning and end of keyphrases 469 | keyphrase = keyphrase.strip() 470 | 471 | # do not include single keywords that are actually stopwords 472 | if keyphrase.lower() not in stop_words_list: 473 | keyphrases.append(keyphrase) 474 | 475 | # remove potential empty keyphrases 476 | keyphrases = [keyphrase for keyphrase in keyphrases if keyphrase != ''] 477 | 478 | else: 479 | keyphrases = [] 480 | 481 | return processed_docs, list(set(keyphrases)) 
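For reference, a minimal sketch of a callable that satisfies the `custom_pos_tagger` contract described in the docstring above. The function name and the use of NLTK's default English tagger are illustrative assumptions; any callable that accepts a list of strings via a `raw_documents` parameter and returns one flat list of (word token, POS-tag) tuples works.

```python
import nltk


def nltk_pos_tagger(raw_documents):
    # make sure the required NLTK resources are available (no-op if already downloaded)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)

    # tokenize and POS-tag each document and return one flat list of (token, POS-tag) tuples,
    # which is the format the vectorizers expect from a custom tagger
    pos_tuples = []
    for doc in raw_documents:
        pos_tuples.extend(nltk.pos_tag(nltk.word_tokenize(doc)))
    return pos_tuples
```

Such a callable can then be passed to the vectorizers via `custom_pos_tagger=nltk_pos_tagger`; the spaCy pipeline is ignored in that case.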
-------------------------------------------------------------------------------- /keyphrase_vectorizers/keyphrase_count_vectorizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. _spaCy pipeline: https://spacy.io/models 3 | .. _stopwords available in NLTK: https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip 4 | .. _POS-tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py 5 | .. _regex pattern: https://docs.python.org/3/library/re.html#regular-expression-syntax 6 | .. _spaCy part-of-speech tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py 7 | .. _spaCy pipeline components: https://spacy.io/usage/processing-pipelines#built-in 8 | """ 9 | 10 | import warnings 11 | from typing import List, Union 12 | 13 | import numpy as np 14 | import psutil 15 | import spacy 16 | from scipy import sparse 17 | from scipy.sparse import csr_matrix 18 | from sklearn.base import BaseEstimator 19 | from sklearn.exceptions import NotFittedError 20 | from sklearn.feature_extraction.text import CountVectorizer 21 | from sklearn.utils.deprecation import deprecated 22 | 23 | from keyphrase_vectorizers.keyphrase_vectorizer_mixin import _KeyphraseVectorizerMixin 24 | 25 | 26 | class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator): 27 | """ 28 | KeyphraseCountVectorizer 29 | 30 | KeyphraseCountVectorizer converts a collection of text documents to a matrix of document-token counts. 31 | The tokens are keyphrases that are extracted from the text documents based on their part-of-speech tags. 32 | The matrix rows indicate the documents and columns indicate the unique keyphrases. Each cell represents the count. 33 | The part-of-speech pattern of keyphrases can be defined by the ``pos_pattern`` parameter. 34 | By default, keyphrases that have 0 or more adjectives, followed by 1 or more nouns, are extracted. 35 | A list of extracted keyphrases matching the defined part-of-speech pattern can be returned after fitting via :meth:`get_feature_names_out()`. 36 | 37 | Attention: 38 | If the vectorizer is used for languages other than English, the ``spacy_pipeline`` and ``stop_words`` parameters 39 | must be customized accordingly. 40 | Additionally, the ``pos_pattern`` parameter has to be customized, as the `spaCy part-of-speech tags`_ differ between languages. 41 | Without customization, the words will be tagged with the wrong part-of-speech tags and no stopwords will be removed. 42 | In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly. 43 | 44 | Parameters 45 | ---------- 46 | spacy_pipeline : Union[str, spacy.Language], default='en_core_web_sm' 47 | A spacy.Language object or the name of the `spaCy pipeline`_, used to tag the parts-of-speech in the text. The default is the English 'en_core_web_sm' pipeline. 48 | 49 | pos_pattern : str, default='<J.*>*<N.*>+' 50 | The `regex pattern`_ of `POS-tags`_ used to extract a sequence of POS-tagged tokens from the text. 51 | The default pattern only selects keyphrases that have 0 or more adjectives, followed by 1 or more nouns. 52 | 53 | stop_words : Union[str, List[str]], default='english' 54 | Language of stopwords to remove from the document, e.g. 'english'. 55 | Supported options are `stopwords available in NLTK`_. 56 | Removes unwanted stopwords from keyphrases if 'stop_words' is not None. 57 | If given a list of custom stopwords, removes them instead.
58 | 59 | lowercase : bool, default=True 60 | Whether the returned keyphrases should be converted to lowercase. 61 | 62 | workers : int, default=1 63 | How many workers to use for spaCy part-of-speech tagging. 64 | If set to -1, use all available worker threads of the machine. 65 | SpaCy uses the specified number of cores to tag documents with part-of-speech. 66 | Depending on the platform, starting many processes with multiprocessing can add a lot of overhead. 67 | In particular, the default start method spawn used in macOS/OS X (as of Python 3.8) and in Windows can be slow. 68 | Therefore, carefully consider whether this option is really necessary. 69 | 70 | spacy_exclude : List[str], default=['parser', 'attribute_ruler', 'lemmatizer', 'ner'] 71 | A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging. 72 | Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed. 73 | 74 | custom_pos_tagger: callable, default=None 75 | A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples. 76 | If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored. 77 | 78 | max_df : int, default=None 79 | During fitting ignore keyphrases that have a document frequency strictly higher than the given threshold. 80 | 81 | min_df : int, default=None 82 | During fitting ignore keyphrases that have a document frequency strictly lower than the given threshold. 83 | This value is also called cut-off in the literature. 84 | 85 | binary : bool, default=False 86 | If True, all non zero counts are set to 1. 87 | This is useful for discrete probabilistic models that model binary events rather than integer counts. 88 | 89 | dtype : type, default=np.int64 90 | Type of the matrix returned by fit_transform() or transform(). 91 | 92 | decay : float, default=None 93 | A value between [0, 1] to weight the percentage of frequencies 94 | the previous bag-of-words should be decreased. For example, 95 | a value of `.1` will decrease the frequencies in the bag-of-words 96 | matrix with 10% at each iteration. 97 | 98 | delete_min_df : float, default=None 99 | Delete words at each iteration from its vocabulary 100 | that are below a minimum frequency. 101 | This will keep the resulting bag-of-words matrix small 102 | such that it does not explode in size with increasing 103 | vocabulary. If `decay` is None then this equals `min_df`. 
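Examples
--------
A minimal usage sketch (illustrative only; the example texts and the extracted keyphrases are placeholders and depend on the fitted documents):

>>> from keyphrase_vectorizers import KeyphraseCountVectorizer
>>> docs = ['Supervised learning is the machine learning task of learning a function.',
...         'Keywords are defined as phrases that capture the main topics discussed in a document.']
>>> vectorizer = KeyphraseCountVectorizer()
>>> document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray()
>>> keyphrases = vectorizer.get_feature_names_out()

For online usage, ``partial_fit`` and ``update_bow`` can be combined with the ``decay`` and ``delete_min_df`` parameters (sketch with illustrative values):

>>> online_vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=1.0)
>>> _ = online_vectorizer.partial_fit(docs[:1])
>>> bag_of_keywords = online_vectorizer.update_bow(docs[:1])
>>> _ = online_vectorizer.partial_fit(docs[1:])
>>> bag_of_keywords = online_vectorizer.update_bow(docs[1:])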
104 | """ 105 | 106 | def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm', pos_pattern: str = '*+', 107 | stop_words: Union[str, List[str]] = 'english', lowercase: bool = True, workers: int = 1, 108 | spacy_exclude: List[str] = ['parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat'], 109 | custom_pos_tagger: callable = None, 110 | max_df: int = None, min_df: int = None, binary: bool = False, dtype: np.dtype = np.int64, 111 | decay: float = None, delete_min_df: float = None): 112 | 113 | # triggers a parameter validation 114 | if not isinstance(min_df, int) and min_df is not None: 115 | raise ValueError( 116 | "'min_df' parameter must be of type int" 117 | ) 118 | 119 | # triggers a parameter validation 120 | if not isinstance(decay, float) and min_df is not None: 121 | raise ValueError( 122 | "'decay' parameter must be of type int" 123 | ) 124 | 125 | # triggers a parameter validation 126 | if not isinstance(delete_min_df, float) and min_df is not None: 127 | raise ValueError( 128 | "'delete_min_df' parameter must be of type int" 129 | ) 130 | 131 | # triggers a parameter validation 132 | if min_df == 0: 133 | raise ValueError( 134 | "'min_df' parameter must be > 0" 135 | ) 136 | 137 | # triggers a parameter validation 138 | if not isinstance(max_df, int) and max_df is not None: 139 | raise ValueError( 140 | "'max_df' parameter must be of type int" 141 | ) 142 | 143 | # triggers a parameter validation 144 | if max_df == 0: 145 | raise ValueError( 146 | "'max_df' parameter must be > 0" 147 | ) 148 | 149 | # triggers a parameter validation 150 | if max_df and min_df and max_df <= min_df: 151 | raise ValueError( 152 | "'max_df' must be > 'min_df'" 153 | ) 154 | 155 | # triggers a parameter validation 156 | if not isinstance(workers, int): 157 | raise ValueError( 158 | "'workers' parameter must be of type int" 159 | ) 160 | 161 | if (workers < -1) or (workers > psutil.cpu_count(logical=True)) or (workers == 0): 162 | raise ValueError( 163 | "'workers' parameter value cannot be 0 and must be between -1 and " + str( 164 | psutil.cpu_count(logical=True)) 165 | ) 166 | 167 | self.spacy_pipeline = spacy_pipeline 168 | self.pos_pattern = pos_pattern 169 | self.stop_words = stop_words 170 | self.lowercase = lowercase 171 | self.workers = workers 172 | self.spacy_exclude = spacy_exclude 173 | self.custom_pos_tagger = custom_pos_tagger 174 | self.max_df = max_df 175 | self.min_df = min_df 176 | self.binary = binary 177 | self.dtype = dtype 178 | self.decay = decay 179 | self.delete_min_df = delete_min_df 180 | self.running_fit_transform = False 181 | 182 | def fit(self, raw_documents: List[str]) -> object: 183 | """ 184 | Learn the keyphrases that match the defined part-of-speech pattern from the list of raw documents. 185 | 186 | Parameters 187 | ---------- 188 | raw_documents : iterable 189 | An iterable of strings. 190 | 191 | Returns 192 | ------- 193 | self : object 194 | Fitted vectorizer. 
195 | """ 196 | 197 | processed_documents, self.keyphrases = self._get_pos_keyphrases(document_list=raw_documents, 198 | stop_words=self.stop_words, 199 | spacy_pipeline=self.spacy_pipeline, 200 | pos_pattern=self.pos_pattern, 201 | lowercase=self.lowercase, workers=self.workers, 202 | spacy_exclude=self.spacy_exclude, 203 | custom_pos_tagger=self.custom_pos_tagger, 204 | extract_keyphrases=True) 205 | 206 | # if the fit_transform process is currently running, pass the processed documents, so they do not need to be tokenized again 207 | if self.running_fit_transform: 208 | self.processed_documents = processed_documents 209 | 210 | # remove keyphrases that have more than 8 words, as they are probably no real keyphrases 211 | # additionally this prevents memory issues during transformation to a document-keyphrase matrix 212 | self.keyphrases = [keyphrase for keyphrase in self.keyphrases if len(keyphrase.split()) <= 8] 213 | 214 | # compute document frequencies of keyphrases 215 | if self.max_df or self.min_df: 216 | document_keyphrase_counts = CountVectorizer(vocabulary=self.keyphrases, ngram_range=( 217 | min([len(keyphrase.split()) for keyphrase in self.keyphrases]), 218 | max([len(keyphrase.split()) for keyphrase in self.keyphrases])), 219 | lowercase=self.lowercase, binary=self.binary, 220 | dtype=self.dtype, tokenizer=self._tokenize_simple).transform( 221 | raw_documents=processed_documents).toarray() 222 | 223 | document_frequencies = self._document_frequency(document_keyphrase_counts) 224 | 225 | # remove keyphrases with document frequencies < min_df and document frequencies > max_df 226 | if self.max_df: 227 | self.keyphrases = [keyphrase for index, keyphrase in enumerate(self.keyphrases) if 228 | (document_frequencies[index] <= self.max_df)] 229 | if self.min_df: 230 | self.keyphrases = [keyphrase for index, keyphrase in enumerate(self.keyphrases) if 231 | (document_frequencies[index] >= self.min_df)] 232 | 233 | # set n-gram range to zero if no keyphrases could be extracted 234 | if self.keyphrases: 235 | self.max_n_gram_length = max([len(keyphrase.split()) for keyphrase in self.keyphrases]) 236 | self.min_n_gram_length = min([len(keyphrase.split()) for keyphrase in self.keyphrases]) 237 | else: 238 | raise ValueError( 239 | "Empty keyphrases. Perhaps the documents do not contain keyphrases that match the 'pos_pattern' argument, only contain stop words, or you set the 'min_df'/'max_df/delete_min_df' arguments too strict.") 240 | 241 | return self 242 | 243 | def fit_transform(self, raw_documents: List[str]) -> List[List[int]]: 244 | """ 245 | Learn the keyphrases that match the defined part-of-speech pattern from the list of raw documents 246 | and return the document-keyphrase matrix. 247 | This is equivalent to fit followed by transform, but more efficiently implemented. 248 | 249 | Parameters 250 | ---------- 251 | raw_documents : iterable 252 | An iterable of strings. 253 | 254 | Returns 255 | ------- 256 | X : array of shape (n_samples, n_features) 257 | Document-keyphrase matrix. 
258 | """ 259 | 260 | # indicate if the fit_trasnform process is currently running 261 | self.running_fit_transform = True 262 | 263 | # fit 264 | KeyphraseCountVectorizer.fit(self=self, raw_documents=raw_documents) 265 | 266 | # transform 267 | count_matrix = CountVectorizer(vocabulary=self.keyphrases, 268 | ngram_range=(self.min_n_gram_length, self.max_n_gram_length), 269 | lowercase=self.lowercase, binary=self.binary, dtype=self.dtype, 270 | tokenizer=self._tokenize_simple).fit_transform( 271 | raw_documents=self.processed_documents) 272 | 273 | del self.processed_documents 274 | self.running_fit_transform = False 275 | 276 | return count_matrix 277 | 278 | def transform(self, raw_documents: List[str]) -> List[List[int]]: 279 | """ 280 | Transform documents to document-keyphrase matrix. 281 | Extract token counts out of raw text documents using the keyphrases 282 | fitted with fit. 283 | 284 | Parameters 285 | ---------- 286 | raw_documents : iterable 287 | An iterable of strings. 288 | 289 | Returns 290 | ------- 291 | X : sparse matrix of shape (n_samples, n_features) 292 | Document-keyphrase matrix. 293 | """ 294 | 295 | # triggers a parameter validation 296 | if not hasattr(self, 'keyphrases'): 297 | raise NotFittedError("Keyphrases not fitted.") 298 | 299 | # triggers a parameter validation 300 | if self.keyphrases == []: 301 | raise ValueError( 302 | "Empty keyphrases. Perhaps the documents used to fit did not contain any keyphrases or you set the 'min_df'/'max_df/delete_min_df' arguments too strict." 303 | ) 304 | 305 | processed_documents, _ = self._get_pos_keyphrases(document_list=raw_documents, 306 | stop_words=self.stop_words, 307 | spacy_pipeline=self.spacy_pipeline, 308 | pos_pattern=self.pos_pattern, 309 | lowercase=self.lowercase, workers=self.workers, 310 | spacy_exclude=['tok2vec', 'tagger', 'parser', 311 | 'attribute_ruler', 'lemmatizer', 'ner', 312 | 'textcat'], 313 | custom_pos_tagger=self.custom_pos_tagger, 314 | extract_keyphrases=False) 315 | 316 | return CountVectorizer(vocabulary=self.keyphrases, ngram_range=(self.min_n_gram_length, self.max_n_gram_length), 317 | lowercase=self.lowercase, binary=self.binary, dtype=self.dtype, 318 | tokenizer=self._tokenize_simple).transform( 319 | raw_documents=processed_documents) 320 | 321 | def inverse_transform(self, X: List[List[int]]) -> List[List[str]]: 322 | """ 323 | Return keyphrases per document with nonzero entries in X. 324 | 325 | Parameters 326 | ---------- 327 | X : {array-like, sparse matrix} of shape (n_samples, n_features) 328 | Document-keyphrase matrix. 329 | 330 | Returns 331 | ------- 332 | X_inv : list of arrays of shape (n_samples,) 333 | List of arrays of keyphrase. 334 | """ 335 | 336 | # triggers a parameter validation 337 | if not hasattr(self, 'keyphrases'): 338 | raise NotFittedError("Keyphrases not fitted.") 339 | 340 | return CountVectorizer(vocabulary=self.keyphrases, ngram_range=(self.min_n_gram_length, self.max_n_gram_length), 341 | lowercase=self.lowercase, binary=self.binary, dtype=self.dtype).inverse_transform(X=X) 342 | 343 | @deprecated( 344 | "get_feature_names() is deprecated in scikit-learn 1.0 and will be removed " 345 | "with scikit-learn 1.2. Please use get_feature_names_out() instead." 346 | ) 347 | def get_feature_names(self) -> List[str]: 348 | """ 349 | Array mapping from feature integer indices to feature name. 350 | 351 | Returns 352 | ------- 353 | feature_names : list 354 | A list of fitted keyphrases. 
355 | """ 356 | 357 | # triggers a parameter validation 358 | if not hasattr(self, 'keyphrases'): 359 | raise NotFittedError("Keyphrases not fitted.") 360 | 361 | # raise DeprecationWarning when function is removed from scikit-learn 362 | try: 363 | with warnings.catch_warnings(): 364 | warnings.simplefilter("ignore") 365 | return CountVectorizer(vocabulary=self.keyphrases, 366 | ngram_range=(self.min_n_gram_length, self.max_n_gram_length), 367 | lowercase=self.lowercase, binary=self.binary, 368 | dtype=self.dtype).get_feature_names() 369 | except AttributeError: 370 | raise DeprecationWarning("get_feature_names() is deprecated. Please use 'get_feature_names_out()' instead.") 371 | 372 | def get_feature_names_out(self) -> np.array(str): 373 | """ 374 | Get fitted keyphrases for transformation. 375 | 376 | Returns 377 | ------- 378 | feature_names_out : ndarray of str objects 379 | Transformed keyphrases. 380 | """ 381 | 382 | # triggers a parameter validation 383 | if not hasattr(self, 'keyphrases'): 384 | raise NotFittedError("Keyphrases not fitted.") 385 | 386 | return CountVectorizer(vocabulary=self.keyphrases, ngram_range=(self.min_n_gram_length, self.max_n_gram_length), 387 | lowercase=self.lowercase, binary=self.binary, dtype=self.dtype).get_feature_names_out() 388 | 389 | def partial_fit(self, raw_documents: List[str]) -> None: 390 | """ 391 | Perform a partial fit and update internal list of keyphrases with OOV keyphrases 392 | 393 | Parameters 394 | ---------- 395 | raw_documents : iterable 396 | An iterable of strings. 397 | 398 | Returns 399 | ------- 400 | self : object 401 | Partial fitted vectorizer. 402 | """ 403 | 404 | if not hasattr(self, 'keyphrases'): 405 | return self.fit(raw_documents) 406 | 407 | processed_documents, new_keyphrases = self._get_pos_keyphrases(document_list=raw_documents, 408 | stop_words=self.stop_words, 409 | spacy_pipeline=self.spacy_pipeline, 410 | pos_pattern=self.pos_pattern, 411 | lowercase=self.lowercase, workers=self.workers, 412 | spacy_exclude=self.spacy_exclude, 413 | custom_pos_tagger=self.custom_pos_tagger, 414 | extract_keyphrases=True) 415 | 416 | oov_keyphrases = list(set(new_keyphrases).difference(set(self.keyphrases))) 417 | 418 | # remove keyphrases that have more than 8 words, as they are probably no real keyphrases 419 | # additionally this prevents memory issues during transformation to a document-keyphrase matrix 420 | oov_keyphrases = [keyphrase for keyphrase in oov_keyphrases if len(keyphrase.split()) <= 8] 421 | 422 | # compute document frequencies of keyphrases 423 | if self.max_df or self.min_df: 424 | document_keyphrase_counts = CountVectorizer(vocabulary=oov_keyphrases, ngram_range=( 425 | min([len(keyphrase.split()) for keyphrase in oov_keyphrases]), 426 | max([len(keyphrase.split()) for keyphrase in oov_keyphrases])), 427 | lowercase=self.lowercase, binary=self.binary, 428 | dtype=self.dtype, tokenizer=self._tokenize_simple).transform( 429 | raw_documents=processed_documents).toarray() 430 | 431 | document_frequencies = self._document_frequency(document_keyphrase_counts) 432 | 433 | # remove keyphrases with document frequencies < min_df and document frequencies > max_df 434 | if self.max_df: 435 | oov_keyphrases = [keyphrase for index, keyphrase in enumerate(oov_keyphrases) if 436 | (document_frequencies[index] <= self.max_df)] 437 | if self.min_df: 438 | oov_keyphrases = [keyphrase for index, keyphrase in enumerate(oov_keyphrases) if 439 | (document_frequencies[index] >= self.min_df)] 440 | 441 | if oov_keyphrases: 442 
| self.keyphrases = self.keyphrases + oov_keyphrases 443 | self.max_n_gram_length = max([len(keyphrase.split()) for keyphrase in self.keyphrases]) 444 | self.min_n_gram_length = min([len(keyphrase.split()) for keyphrase in self.keyphrases]) 445 | 446 | return self 447 | 448 | def update_bow(self, raw_documents: List[str]) -> csr_matrix: 449 | """ 450 | Create or update the bag-of-keywords matrix 451 | 452 | Update the bag-of-keywords matrix by adding the newly transformed 453 | documents. This may add empty columns if new words are found and/or 454 | add empty rows if new topics are found. 455 | 456 | During this process, the previous bag-of-keywords matrix might be 457 | decayed if `self.decay` has been set during init. Similarly, words 458 | that do not exceed `self.delete_min_df` are removed from its 459 | vocabulary and bag-of-keywords matrix. 460 | 461 | Parameters 462 | ---------- 463 | raw_documents : iterable 464 | An iterable of strings. 465 | 466 | Returns 467 | ------- 468 | X_ : scipy.sparse.csr_matrix 469 | Bag-of-keywords matrix 470 | """ 471 | 472 | if hasattr(self, "X_"): 473 | X = self.transform(raw_documents) 474 | 475 | # Add empty columns if new words are found 476 | columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int) 477 | self.X_ = sparse.hstack([self.X_, columns]) 478 | 479 | # Add empty rows if new topics are found 480 | rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int) 481 | self.X_ = sparse.vstack([self.X_, rows]) 482 | 483 | # Decay of BoW matrix 484 | if self.decay is not None: 485 | self.X_ = self.X_ * (1 - self.decay) 486 | 487 | self.X_ += X 488 | else: 489 | self.X_ = self.transform(raw_documents) 490 | 491 | if self.delete_min_df is not None: 492 | self._clean_bow() 493 | 494 | return self.X_ 495 | 496 | def _clean_bow(self) -> None: 497 | """ 498 | Remove words that do not exceed `delete_min_df` 499 | """ 500 | 501 | # Only keep words with a minimum frequency 502 | indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1] 503 | self.X_ = self.X_[:, indices] 504 | 505 | x = np.array(self.keyphrases) 506 | mask = np.full(len(self.keyphrases), True, dtype=bool) 507 | mask[indices] = False 508 | self.keyphrases = list(x[~mask]) -------------------------------------------------------------------------------- /docs/KeyphraseVectorizers.md: -------------------------------------------------------------------------------- 1 | [![PyPI - Python](https://img.shields.io/badge/python-%3E%3D3.7-blue)](https://pypi.org/project/keyphrase-vectorizers/) 2 | [![License](https://img.shields.io/badge/License-BSD_3--Clause-green.svg)](https://github.com/TimSchopf/Keyphrase_Vectorizers/blob/master/LICENSE) 3 | [![PyPI - PyPi](https://img.shields.io/pypi/v/keyphrase-vectorizers.svg)](https://pypi.org/project/keyphrase-vectorizers/) 4 | [![Build](https://img.shields.io/github/workflow/status/TimSchopf/KeyphraseVectorizers/Code%20tests/master)](https://pypi.org/project/keyphrase-vectorizers/) 5 | [![Documentation Status](https://readthedocs.org/projects/keyphrase-vectorizers/badge/?version=latest)](https://keyphrase-vectorizers.readthedocs.io/en/latest/?badge=latest) 6 | 7 | KeyphraseVectorizers 8 | ===================== 9 | 10 | Set of vectorizers that extract keyphrases with part-of-speech patterns from a collection of text documents and convert 11 | them into a document-keyphrase matrix. 
A document-keyphrase matrix is a mathematical matrix that describes the frequency 12 | of keyphrases that occur in a collection of documents. The matrix rows indicate the text documents and columns indicate 13 | the unique keyphrases. 14 | 15 | The package contains wrappers of the 16 | [sklearn.feature_extraction.text.CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=countvectorizer#sklearn.feature_extraction.text.CountVectorizer "scikit-learn CountVectorizer") 17 | and 18 | [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer "scikit-learn TfidfVectorizer") 19 | classes. Instead of using n-gram tokens of a pre-defined range, these classes extract keyphrases from text documents 20 | using part-of-speech tags to compute document-keyphrase matrices. 21 | 22 | Benefits 23 | -------- 24 | 25 | * Extract grammatically accurate keyphases based on their part-of-speech tags. 26 | * No need to specify n-gram ranges. 27 | * Get document-keyphrase matrices. 28 | * Multiple language support. 29 | * User-defined part-of-speech patterns for keyphrase extraction possible. 30 | 31 | 32 | 33 | Table of Contents 34 | ----------------- 35 | 36 | 37 | 38 | 1. [How does it work?](#how-does-it-work) 39 | 2. [Installation](#installation) 40 | 3. [Usage](#usage) 41 | 1. [KeyphraseCountVectorizer](#KeyphraseCountVectorizer) 42 | 1. [English language](#english-language) 43 | 2. [Other languages](#other-languages) 44 | 2. [KeyphraseTfidfVectorizer](#KeyphraseTfidfVectorizer) 45 | 3. [Keyphrase extraction with KeyBERT](#keyphrase-extraction-with-keybert) 46 | 4. [Topic modeling with BERTopic and KeyphraseVectorizers](#topic-modeling-with-bertopic-and-keyphrasevectorizers) 47 | 48 | 49 | 50 | 51 | 52 | How does it work? 53 | ----------------- 54 | 55 | First, the document texts are annotated with [spaCy](https://spacy.io "spaCy homepage") part-of-speech tags. A list of 56 | all possible spaCy part-of-speech tags for different languages is 57 | linked [here](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py "spaCy POS tags"). The annotation 58 | requires passing the [spaCy pipeline](https://spacy.io/models "available spaCy pipelines") of the corresponding language 59 | to the vectorizer with the `spacy_pipeline` parameter. 60 | 61 | Second, words are extracted from the document texts whose part-of-speech tags match the regex pattern defined in 62 | the `pos_pattern` 63 | parameter. The keyphrases are a list of unique words extracted from text documents by this method. 64 | 65 | Finally, the vectorizers calculate document-keyphrase matrices. 66 | 67 | 68 | 69 | Installation 70 | ------------ 71 | 72 | ``` 73 | pip install keyphrase-vectorizers 74 | ``` 75 | 76 | 77 | 78 | Usage 79 | ----- 80 | For detailed information visit 81 | the [API Guide](https://keyphrase-vectorizers.readthedocs.io/en/latest/index.html "Keyphrase_Vectorizers API Guide"). 82 | 83 | 84 | 85 | ### KeyphraseCountVectorizer 86 | 87 | [Back to Table of Contents](#toc) 88 | 89 | 90 | 91 | #### English language 92 | 93 | ```python 94 | from keyphrase_vectorizers import KeyphraseCountVectorizer 95 | 96 | docs = ["""Supervised learning is the machine learning task of learning a function that 97 | maps an input to an output based on example input-output pairs. 
It infers a 98 | function from labeled training data consisting of a set of training examples. 99 | In supervised learning, each example is a pair consisting of an input object 100 | (typically a vector) and a desired output value (also called the supervisory signal). 101 | A supervised learning algorithm analyzes the training data and produces an inferred function, 102 | which can be used for mapping new examples. An optimal scenario will allow for the 103 | algorithm to correctly determine the class labels for unseen instances. This requires 104 | the learning algorithm to generalize from the training data to unseen situations in a 105 | 'reasonable' way (see inductive bias).""", 106 | 107 | """Keywords are defined as phrases that capture the main topics discussed in a document. 108 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 109 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 110 | of keywords can quickly help to determine whether a given document is relevant to their interest. 111 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 112 | by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 113 | in information retrieval."""] 114 | 115 | # Init default vectorizer. 116 | vectorizer = KeyphraseCountVectorizer() 117 | 118 | # Print parameters 119 | print(vectorizer.get_params()) 120 | >>> {'binary': False, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '*+', 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} 121 | ``` 122 | 123 | By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is 124 | specified, English `stop_words` are removed, and the `pos_pattern` extracts keywords that have 0 or more adjectives, 125 | followed by 1 or more nouns using the English spaCy part-of-speech tags. 126 | 127 | ```python 128 | # After initializing the vectorizer, it can be fitted 129 | # to learn the keyphrases from the text documents. 130 | vectorizer.fit(docs) 131 | ``` 132 | 133 | ```python 134 | # After learning the keyphrases, they can be returned. 135 | keyphrases = vectorizer.get_feature_names_out() 136 | 137 | print(keyphrases) 138 | >>> ['output' 'training data' 'task' 'way' 'input object' 'documents' 139 | 'unseen instances' 'vector' 'interest' 'learning algorithm' 140 | 'unseen situations' 'training examples' 'machine' 'given document' 141 | 'document' 'document relevance' 'output pairs' 'document content' 142 | 'class labels' 'new examples' 'pair' 'main topics' 'phrases' 'overlap' 143 | 'algorithm' 'various applications' 'information retrieval' 'users' 'list' 144 | 'example input' 'supervised learning' 'optimal scenario' 145 | 'precise summary' 'keywords' 'input' 'supervised learning algorithm' 146 | 'example' 'supervisory signal' 'indication' 'set' 147 | 'information retrieval environment' 'output value' 'inductive bias' 148 | 'groups' 'function'] 149 | ``` 150 | 151 | ```python 152 | # After fitting, the vectorizer can transform the documents 153 | # to a document-keyphrase matrix. 154 | # Matrix rows indicate the documents and columns indicate the unique keyphrases. 155 | # Each cell represents the count. 
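# (transform returns a scipy sparse matrix; .toarray() is only used here to get a dense numpy array for printing)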
156 | document_keyphrase_matrix = vectorizer.transform(docs).toarray() 157 | 158 | print(document_keyphrase_matrix) 159 | >>> [[3 3 1 1 1 0 1 1 0 2 1 1 1 0 0 0 1 0 1 1 1 0 0 0 3 0 0 0 0 1 3 1 0 0 3 1 160 | 2 1 0 1 0 1 1 0 3] 161 | [0 0 0 0 0 1 0 0 1 0 0 0 0 1 5 1 0 1 0 0 0 2 1 1 0 1 2 1 1 0 0 0 1 5 0 0 162 | 0 0 1 0 1 0 0 1 0]] 163 | ``` 164 | 165 | ```python 166 | # Fit and transform can also be executed in one step, 167 | # which is more efficient. 168 | document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray() 169 | 170 | print(document_keyphrase_matrix) 171 | >>> [[3 3 1 1 1 0 1 1 0 2 1 1 1 0 0 0 1 0 1 1 1 0 0 0 3 0 0 0 0 1 3 1 0 0 3 1 172 | 2 1 0 1 0 1 1 0 3] 173 | [0 0 0 0 0 1 0 0 1 0 0 0 0 1 5 1 0 1 0 0 0 2 1 1 0 1 2 1 1 0 0 0 1 5 0 0 174 | 0 0 1 0 1 0 0 1 0]] 175 | ``` 176 | 177 | 178 | 179 | #### Other languages 180 | 181 | [Back to Table of Contents](#toc) 182 | 183 | ```python 184 | german_docs = ["""Goethe stammte aus einer angesehenen bürgerlichen Familie. 185 | Sein Großvater mütterlicherseits war als Stadtschultheiß höchster Justizbeamter der Stadt Frankfurt, 186 | sein Vater Doktor der Rechte und Kaiserlicher Rat. Er und seine Schwester Cornelia erfuhren eine aufwendige 187 | Ausbildung durch Hauslehrer. Dem Wunsch seines Vaters folgend, studierte Goethe in Leipzig und Straßburg 188 | Rechtswissenschaft und war danach als Advokat in Wetzlar und Frankfurt tätig. 189 | Gleichzeitig folgte er seiner Neigung zur Dichtkunst.""", 190 | 191 | """Friedrich Schiller wurde als zweites Kind des Offiziers, Wundarztes und Leiters der Hofgärtnerei in 192 | Marbach am Neckar Johann Kaspar Schiller und dessen Ehefrau Elisabetha Dorothea Schiller, geb. Kodweiß, 193 | die Tochter eines Wirtes und Bäckers war, 1759 in Marbach am Neckar geboren 194 | """] 195 | # Init vectorizer for the german language 196 | vectorizer = KeyphraseCountVectorizer(spacy_pipeline='de_core_news_sm', pos_pattern='*+', stop_words='german') 197 | ``` 198 | 199 | The German `spacy_pipeline` is specified and German `stop_words` are removed. Because the German spaCy part-of-speech 200 | tags differ from the English ones, the `pos_pattern` parameter is also customized. The regex pattern `*+` 201 | extracts keywords that have 0 or more adjectives, followed by 1 or more nouns using the German spaCy part-of-speech 202 | tags. 203 | 204 | 205 | 206 | ### KeyphraseTfidfVectorizer 207 | 208 | [Back to Table of Contents](#toc) 209 | 210 | The `KeyphraseTfidfVectorizer` has the same function calls and features as the `KeyphraseCountVectorizer`. The only 211 | difference is, that document-keyphrase matrix cells represent tf or tf-idf values, depending on the parameter settings, 212 | instead of counts. 213 | 214 | ```python 215 | from keyphrase_vectorizers import KeyphraseTfidfVectorizer 216 | 217 | docs = ["""Supervised learning is the machine learning task of learning a function that 218 | maps an input to an output based on example input-output pairs. It infers a 219 | function from labeled training data consisting of a set of training examples. 220 | In supervised learning, each example is a pair consisting of an input object 221 | (typically a vector) and a desired output value (also called the supervisory signal). 222 | A supervised learning algorithm analyzes the training data and produces an inferred function, 223 | which can be used for mapping new examples. An optimal scenario will allow for the 224 | algorithm to correctly determine the class labels for unseen instances. 
This requires 225 | the learning algorithm to generalize from the training data to unseen situations in a 226 | 'reasonable' way (see inductive bias).""", 227 | 228 | """Keywords are defined as phrases that capture the main topics discussed in a document. 229 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 230 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 231 | of keywords can quickly help to determine whether a given document is relevant to their interest. 232 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 233 | by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 234 | in information retrieval."""] 235 | 236 | # Init default vectorizer for the English language that computes tf-idf values 237 | vectorizer = KeyphraseTfidfVectorizer() 238 | 239 | # Print parameters 240 | print(vectorizer.get_params()) 241 | >>> {'binary': False, 'dtype': , 'lowercase': True, 'max_df': None, 'min_df': None, 'norm': 'l2', 'pos_pattern': '*+', 'smooth_idf': True, 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'sublinear_tf': False, 'use_idf': True, 'workers': 1} 242 | ``` 243 | 244 | To calculate tf values instead, set `use_idf=False`. 245 | 246 | ```python 247 | # Fit and transform to document-keyphrase matrix. 248 | document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray() 249 | 250 | print(document_keyphrase_matrix) 251 | >>> [[0.11111111 0.22222222 0.11111111 0. 0. 0. 252 | 0.11111111 0. 0.11111111 0.11111111 0.33333333 0. 253 | 0. 0. 0.11111111 0. 0. 0.11111111 254 | 0. 0.33333333 0. 0.22222222 0. 0.11111111 255 | 0.11111111 0.11111111 0.11111111 0.11111111 0.33333333 0.11111111 256 | 0.11111111 0.33333333 0.11111111 0. 0.33333333 0. 257 | 0. 0. 0.11111111 0. 0.11111111 0.11111111 258 | 0. 0.33333333 0.11111111] 259 | [0. 0. 0. 0.11785113 0.11785113 0.11785113 260 | 0. 0.11785113 0. 0. 0. 0.11785113 261 | 0.11785113 0.11785113 0. 0.11785113 0.23570226 0. 262 | 0.23570226 0. 0.58925565 0. 0.11785113 0. 263 | 0. 0. 0. 0. 0. 0. 264 | 0. 0. 0. 0.58925565 0. 0.11785113 265 | 0.11785113 0.11785113 0. 0.11785113 0. 0. 266 | 0.11785113 0. 0. ]] 267 | ``` 268 | 269 | ```python 270 | # Return keyphrases 271 | keyphrases = vectorizer.get_feature_names_out() 272 | 273 | print(keyphrases) 274 | >>> ['optimal scenario' 'example' 'input object' 'groups' 'list' 275 | 'precise summary' 'inductive bias' 'phrases' 'training examples' 276 | 'output value' 'function' 'given document' 'documents' 277 | 'information retrieval environment' 'new examples' 'interest' 278 | 'main topics' 'unseen situations' 'information retrieval' 'input' 279 | 'keywords' 'learning algorithm' 'indication' 'set' 'example input' 280 | 'vector' 'machine' 'supervised learning algorithm' 'algorithm' 'pair' 281 | 'task' 'training data' 'way' 'document' 'supervised learning' 'users' 282 | 'document relevance' 'document content' 'supervisory signal' 'overlap' 283 | 'class labels' 'unseen instances' 'various applications' 'output' 284 | 'output pairs'] 285 | ``` 286 | 287 | 288 | 289 | ### Keyphrase extraction with [KeyBERT](https://github.com/MaartenGr/KeyBERT "KeyBERT repository") 290 | 291 | [Back to Table of Contents](#toc) 292 | 293 | The keyphrase vectorizers can be used together with KeyBERT to extract grammatically correct keyphrases that are most 294 | similar to a document. 
Thereby, the vectorizer first extracts candidate keyphrases from the text documents, which are 295 | subsequently ranked by KeyBERT based on their document similarity. The top-n most similar keyphrases can then be 296 | considered as document keywords. 297 | 298 | The advantage of using KeyphraseVectorizers in addition to KeyBERT is that it allows users to get grammatically correct 299 | keyphrases instead of simple n-grams of pre-defined lengths. In KeyBERT, users can specify the `keyphrase_ngram_range` 300 | to define the length of the retrieved keyphrases. However, this raises two issues. First, users usually do not know the 301 | optimal n-gram range and therefore have to spend some time experimenting until they find a suitable n-gram range. 302 | Second, even after finding a good n-gram range, the returned keyphrases are sometimes still grammatically not quite 303 | correct or are slightly off-key. Unfortunately, this limits the quality of the returned keyphrases. 304 | 305 | To adress this issue, we can use the vectorizers of this package to first extract candidate keyphrases that consist of 306 | zero or more adjectives, followed by one or multiple nouns in a pre-processing step instead of simple n-grams. 307 | [Wan and Xiao](https://www.aaai.org/Papers/AAAI/2008/AAAI08-136.pdf) successfully used this noun phrase approach for 308 | keyphrase extraction during their research in 2008. The extracted candidate keyphrases are subsequently passed to 309 | KeyBERT for embedding generation and similarity calculation. To use both packages for keyphrase extraction, we need to 310 | pass KeyBERT a keyphrase vectorizer with the `vectorizer` parameter. Since the length of keyphrases now depends on 311 | part-of-speech tags, there is no need to define an n-gram length anymore. 312 | 313 | #### Example: 314 | 315 | KeyBERT can be installed via `pip install keybert`. 316 | 317 | ```python 318 | from keyphrase_vectorizers import KeyphraseCountVectorizer 319 | from keybert import KeyBERT 320 | 321 | docs = ["""Supervised learning is the machine learning task of learning a function that 322 | maps an input to an output based on example input-output pairs. It infers a 323 | function from labeled training data consisting of a set of training examples. 324 | In supervised learning, each example is a pair consisting of an input object 325 | (typically a vector) and a desired output value (also called the supervisory signal). 326 | A supervised learning algorithm analyzes the training data and produces an inferred function, 327 | which can be used for mapping new examples. An optimal scenario will allow for the 328 | algorithm to correctly determine the class labels for unseen instances. This requires 329 | the learning algorithm to generalize from the training data to unseen situations in a 330 | 'reasonable' way (see inductive bias).""", 331 | 332 | """Keywords are defined as phrases that capture the main topics discussed in a document. 333 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 334 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 335 | of keywords can quickly help to determine whether a given document is relevant to their interest. 336 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 337 | by measuring the overlap between the keywords assigned to them. 
Keywords are also used proactively 338 | in information retrieval."""] 339 | 340 | kw_model = KeyBERT() 341 | ``` 342 | 343 | Instead of deciding on a suitable n-gram range which could be e.g.(1,2)... 344 | 345 | ```python 346 | >>> kw_model.extract_keywords(docs=docs, keyphrase_ngram_range=(1,2)) 347 | [[('labeled training', 0.6013), 348 | ('examples supervised', 0.6112), 349 | ('signal supervised', 0.6152), 350 | ('supervised', 0.6676), 351 | ('supervised learning', 0.6779)], 352 | [('keywords assigned', 0.6354), 353 | ('keywords used', 0.6373), 354 | ('list keywords', 0.6375), 355 | ('keywords quickly', 0.6376), 356 | ('keywords defined', 0.6997)]] 357 | ``` 358 | 359 | we can now just let the keyphrase vectorizer decide on suitable keyphrases, without limitations to a maximum or minimum 360 | n-gram range. We only have to pass a keyphrase vectorizer as parameter to KeyBERT: 361 | 362 | ```python 363 | >>> kw_model.extract_keywords(docs=docs, vectorizer=KeyphraseCountVectorizer()) 364 | [[('training examples', 0.4668), 365 | ('training data', 0.5271), 366 | ('learning algorithm', 0.5632), 367 | ('supervised learning', 0.6779), 368 | ('supervised learning algorithm', 0.6992)], 369 | [('given document', 0.4143), 370 | ('information retrieval environment', 0.5166), 371 | ('information retrieval', 0.5792), 372 | ('keywords', 0.6046), 373 | ('document relevance', 0.633)]] 374 | ``` 375 | 376 | This allows us to make sure that we do not cut off important words caused by defining our n-gram range too short. For 377 | example, we would not have found the keyphrase "supervised learning algorithm" with keyphrase_ngram_range=(1,2). 378 | Furthermore, we avoid to get keyphrases that are slightly off-key like "labeled training", "signal supervised" or 379 | "keywords quickly". 380 | 381 | 382 | 383 | ### Topic modeling with [BERTopic](https://github.com/MaartenGr/BERTopic "BERTopic repository") and KeyphraseVectorizers 384 | 385 | [Back to Table of Contents](#toc) 386 | 387 | Similar to the application with KeyBERT, the keyphrase vectorizers can be used to obtain grammatically correct 388 | keyphrases as 389 | descriptions for topics instead of simple n-grams. This allows us to make sure that we do not cut off important topic 390 | description keyphrases by defining our n-gram range too short. Moreover, we don't need to clean stopwords upfront, can 391 | get more precise topic models and avoid to get topic description keyphrases that are slightly off-key. 392 | 393 | #### Example: 394 | 395 | BERTopic can be installed via `pip install bertopic`. 
396 | 397 | ```python 398 | from keyphrase_vectorizers import KeyphraseCountVectorizer 399 | from bertopic import BERTopic 400 | from sklearn.datasets import fetch_20newsgroups 401 | 402 | # load text documents 403 | docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] 404 | # only use subset of the data 405 | docs = docs[:5000] 406 | 407 | # train topic model with KeyphraseCountVectorizer 408 | keyphrase_topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer()) 409 | keyphrase_topics, keyphrase_probs = keyphrase_topic_model.fit_transform(docs) 410 | 411 | # get topics 412 | >>> keyphrase_topic_model.topics 413 | {-1: [('file', 0.007265527630674131), 414 | ('one', 0.007055454904474792), 415 | ('use', 0.00633563957153475), 416 | ('program', 0.006053271092949018), 417 | ('get', 0.006011060091056076), 418 | ('people', 0.005729309058970368), 419 | ('know', 0.005635951168273583), 420 | ('like', 0.0055692449802916015), 421 | ('time', 0.00527028825803415), 422 | ('us', 0.00525564504880084)], 423 | 0: [('game', 0.024134589719090525), 424 | ('team', 0.021852806383170772), 425 | ('players', 0.01749406934044139), 426 | ('games', 0.014397938026886745), 427 | ('hockey', 0.013932342023677305), 428 | ('win', 0.013706115572901401), 429 | ('year', 0.013297593024390321), 430 | ('play', 0.012533185558169046), 431 | ('baseball', 0.012412743802062559), 432 | ('season', 0.011602725885164318)], 433 | 1: [('patients', 0.022600352291162015), 434 | ('msg', 0.02023877371575874), 435 | ('doctor', 0.018816282737587457), 436 | ('medical', 0.018614407917995103), 437 | ('treatment', 0.0165028251400717), 438 | ('food', 0.01604980195180696), 439 | ('candida', 0.015255961242066143), 440 | ('disease', 0.015115496310099693), 441 | ('pain', 0.014129703072484495), 442 | ('hiv', 0.012884503220341102)], 443 | 2: [('key', 0.028851633177510126), 444 | ('encryption', 0.024375137861044675), 445 | ('clipper', 0.023565947302544528), 446 | ('privacy', 0.019258719348097385), 447 | ('security', 0.018983682856076434), 448 | ('chip', 0.018822199098878365), 449 | ('keys', 0.016060139239615384), 450 | ('internet', 0.01450486904722165), 451 | ('encrypted', 0.013194373119964168), 452 | ('government', 0.01303978311708837)], 453 | ... 
454 | ``` 455 | 456 | The same topics look a bit different when no keyphrase vectorizer is used: 457 | 458 | ```python 459 | from bertopic import BERTopic 460 | from sklearn.datasets import fetch_20newsgroups 461 | 462 | # load text documents 463 | docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] 464 | # only use subset of the data 465 | docs = docs[:5000] 466 | 467 | # train topic model without KeyphraseCountVectorizer 468 | topic_model = BERTopic() 469 | topics, probs = topic_model.fit_transform(docs) 470 | 471 | # get topics 472 | >>> topic_model.topics 473 | {-1: [('the', 0.012864641020408933), 474 | ('to', 0.01187920529994724), 475 | ('and', 0.011431498631699856), 476 | ('of', 0.01099851927541331), 477 | ('is', 0.010995478673036962), 478 | ('in', 0.009908233622158523), 479 | ('for', 0.009903667215879675), 480 | ('that', 0.009619596716087699), 481 | ('it', 0.009578499681829809), 482 | ('you', 0.0095328846440753)], 483 | 0: [('game', 0.013949166096523719), 484 | ('team', 0.012458483177116456), 485 | ('he', 0.012354733462693834), 486 | ('the', 0.01119583508278812), 487 | ('10', 0.010190243555226108), 488 | ('in', 0.0101436249231417), 489 | ('players', 0.009682212470082758), 490 | ('to', 0.00933700544705287), 491 | ('was', 0.009172402203816335), 492 | ('and', 0.008653375901739337)], 493 | 1: [('of', 0.012771267188340924), 494 | ('to', 0.012581337590513296), 495 | ('is', 0.012554884458779008), 496 | ('patients', 0.011983273578628046), 497 | ('and', 0.011863499662237566), 498 | ('that', 0.011616113472989725), 499 | ('it', 0.011581944987387165), 500 | ('the', 0.011475148304229873), 501 | ('in', 0.011395485985801054), 502 | ('msg', 0.010715000656335596)], 503 | 2: [('key', 0.01725282988290282), 504 | ('the', 0.014634841495851404), 505 | ('be', 0.014429762197907552), 506 | ('encryption', 0.013530733999898166), 507 | ('to', 0.013443159534369817), 508 | ('clipper', 0.01296614319927958), 509 | ('of', 0.012164734232650158), 510 | ('is', 0.012128295958613464), 511 | ('and', 0.011972763728732667), 512 | ('chip', 0.010785744492767285)], 513 | ... 
514 | ``` 515 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI - Python](https://img.shields.io/badge/python-%3E%3D3.7-blue)](https://pypi.org/project/keyphrase-vectorizers/) 2 | [![License](https://img.shields.io/badge/License-BSD_3--Clause-green.svg)](https://github.com/TimSchopf/Keyphrase_Vectorizers/blob/master/LICENSE) 3 | [![PyPI - PyPi](https://img.shields.io/pypi/v/keyphrase-vectorizers.svg)](https://pypi.org/project/keyphrase-vectorizers/) 4 | [![Build](https://img.shields.io/github/actions/workflow/status/TimSchopf/KeyphraseVectorizers/testing.yml?branch=master)](https://pypi.org/project/keyphrase-vectorizers/) 5 | [![Documentation Status](https://readthedocs.org/projects/keyphrase-vectorizers/badge/?version=latest)](https://keyphrase-vectorizers.readthedocs.io/en/latest/?badge=latest) 6 | [![DOI:10.5220/0011546600003335](https://zenodo.org/badge/DOI/10.5220/0011546600003335.svg)](https://doi.org/10.5220/0011546600003335) 7 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/patternrank-leveraging-pretrained-language/keyphrase-extraction-on-inspec)](https://paperswithcode.com/sota/keyphrase-extraction-on-inspec?p=patternrank-leveraging-pretrained-language) 8 | 9 | KeyphraseVectorizers 10 | ===================== 11 | 12 | **This package was developed during the writing of our PatternRank paper. You can check out the paper [here](https://arxiv.org/abs/2210.05245). When using KeyphraseVectorizers or PatternRank in academic papers and theses, please use the [BibTeX entry below](#citation-information).** 13 | 14 | Set of vectorizers that extract keyphrases with part-of-speech patterns from a collection of text documents and convert 15 | them into a document-keyphrase matrix. A document-keyphrase matrix is a mathematical matrix that describes the frequency 16 | of keyphrases that occur in a collection of documents. The matrix rows indicate the text documents and columns indicate 17 | the unique keyphrases. 18 | 19 | The package contains wrappers of the 20 | [sklearn.feature_extraction.text.CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=countvectorizer#sklearn.feature_extraction.text.CountVectorizer "scikit-learn CountVectorizer") 21 | and 22 | [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer "scikit-learn TfidfVectorizer") 23 | classes. Instead of using n-gram tokens of a pre-defined range, these classes extract keyphrases from text documents 24 | using part-of-speech tags to compute document-keyphrase matrices. 25 | 26 | Corresponding medium posts can be found [here](https://towardsdatascience.com/enhancing-keybert-keyword-extraction-results-with-keyphrasevectorizers-3796fa93f4db "Keyphrase Extraction with BERT Transformers and Noun Phrases") and [here](https://towardsdatascience.com/unsupervised-keyphrase-extraction-with-patternrank-28ec3ca737f0 "Unsupervised Keyphrase Extraction with PatternRank"). 27 | 28 | Benefits 29 | -------- 30 | 31 | * Extract grammatically accurate keyphases based on their part-of-speech tags. 32 | * No need to specify n-gram ranges. 33 | * Get document-keyphrase matrices. 34 | * Multiple language support. 
35 | * User-defined part-of-speech patterns for keyphrase extraction possible. 36 | 37 | 38 | 39 | Table of Contents 40 | ----------------- 41 | 42 | 43 | 44 | 1. [How does it work?](#how-does-it-work) 45 | 2. [Installation](#installation) 46 | 3. [Usage](#usage) 47 | 1. [KeyphraseCountVectorizer](#keyphrasecountvectorizer) 48 | 1. [English language](#english-language) 49 | 2. [Other languages](#other-languages) 50 | 2. [KeyphraseTfidfVectorizer](#keyphrasetfidfvectorizer) 51 | 3. [Reuse a spaCy Language object](#reuse-a-spacy-language-object) 52 | 4. [Custom POS-tagger](#custom-pos-tagger) 53 | 5. [PatternRank: Keyphrase extraction with KeyphraseVectorizers and KeyBERT](#patternrank-keyphrase-extraction-with-keyphrasevectorizers-and-keybert) 54 | 6. [Topic modeling with BERTopic and KeyphraseVectorizers](#topic-modeling-with-bertopic-and-keyphrasevectorizers) 55 | 7. [Online KeyphraseVectorizers](#online-keyphrasevectorizers) 56 | 4. [Citation information](#citation-information) 57 | 58 | 59 | 60 | 61 | 62 | How does it work? 63 | ----------------- 64 | 65 | First, the document texts are annotated with [spaCy](https://spacy.io "spaCy homepage") part-of-speech tags. A list of 66 | all possible spaCy part-of-speech tags for different languages is 67 | linked [here](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py "spaCy POS tags"). The annotation 68 | requires passing the [spaCy pipeline](https://spacy.io/models "available spaCy pipelines") of the corresponding language 69 | to the vectorizer with the `spacy_pipeline` parameter. 70 | 71 | Second, words are extracted from the document texts whose part-of-speech tags match the regex pattern defined in 72 | the `pos_pattern` 73 | parameter. The keyphrases are a list of unique words extracted from text documents by this method. 74 | 75 | Finally, the vectorizers calculate document-keyphrase matrices. 76 | 77 | 78 | 79 | Installation 80 | ------------ 81 | 82 | ``` 83 | pip install keyphrase-vectorizers 84 | ``` 85 | 86 | 87 | 88 | Usage 89 | ----- 90 | For detailed information visit 91 | the [API Guide](https://keyphrase-vectorizers.readthedocs.io/en/latest/index.html "Keyphrase_Vectorizers API Guide"). 92 | 93 | 94 | 95 | ### KeyphraseCountVectorizer 96 | 97 | [Back to Table of Contents](#toc) 98 | 99 | 100 | 101 | #### English language 102 | 103 | ```python 104 | from keyphrase_vectorizers import KeyphraseCountVectorizer 105 | 106 | docs = ["""Supervised learning is the machine learning task of learning a function that 107 | maps an input to an output based on example input-output pairs. It infers a 108 | function from labeled training data consisting of a set of training examples. 109 | In supervised learning, each example is a pair consisting of an input object 110 | (typically a vector) and a desired output value (also called the supervisory signal). 111 | A supervised learning algorithm analyzes the training data and produces an inferred function, 112 | which can be used for mapping new examples. An optimal scenario will allow for the 113 | algorithm to correctly determine the class labels for unseen instances. This requires 114 | the learning algorithm to generalize from the training data to unseen situations in a 115 | 'reasonable' way (see inductive bias).""", 116 | 117 | """Keywords are defined as phrases that capture the main topics discussed in a document. 118 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
119 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 120 | of keywords can quickly help to determine whether a given document is relevant to their interest. 121 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 122 | by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 123 | in information retrieval."""] 124 | 125 | # Init default vectorizer. 126 | vectorizer = KeyphraseCountVectorizer() 127 | 128 | # Print parameters 129 | print(vectorizer.get_params()) 130 | >>> {'binary': False, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} 131 | ``` 132 | 133 | By default, the vectorizer is initialized for the English language. That means an English `spacy_pipeline` is 134 | specified, English `stop_words` are removed, and the `pos_pattern` extracts keywords that have 0 or more adjectives, 135 | followed by 1 or more nouns using the English spaCy part-of-speech tags. In addition, the spaCy pipeline 136 | components `['parser', 'attribute_ruler', 'lemmatizer', 'ner']` are excluded by default to increase efficiency. If you 137 | choose a different `spacy_pipeline`, you may have to exclude/include different pipeline components using 138 | the `spacy_exclude` parameter for the spaCy POS 139 | tagger to work properly. 140 | 141 | ```python 142 | # After initializing the vectorizer, it can be fitted 143 | # to learn the keyphrases from the text documents. 144 | vectorizer.fit(docs) 145 | ``` 146 | 147 | ```python 148 | # After learning the keyphrases, they can be returned. 149 | keyphrases = vectorizer.get_feature_names_out() 150 | 151 | print(keyphrases) 152 | >>> ['users' 'main topics' 'learning algorithm' 'overlap' 'documents' 'output' 153 | 'keywords' 'precise summary' 'new examples' 'training data' 'input' 154 | 'document content' 'training examples' 'unseen instances' 155 | 'optimal scenario' 'document' 'task' 'supervised learning algorithm' 156 | 'example' 'interest' 'function' 'example input' 'various applications' 157 | 'unseen situations' 'phrases' 'indication' 'inductive bias' 158 | 'supervisory signal' 'document relevance' 'information retrieval' 'set' 159 | 'input object' 'groups' 'output value' 'list' 'learning' 'output pairs' 160 | 'pair' 'class labels' 'supervised learning' 'machine' 161 | 'information retrieval environment' 'algorithm' 'vector' 'way'] 162 | ``` 163 | 164 | ```python 165 | # After fitting, the vectorizer can transform the documents 166 | # to a document-keyphrase matrix. 167 | # Matrix rows indicate the documents and columns indicate the unique keyphrases. 168 | # Each cell represents the count. 169 | document_keyphrase_matrix = vectorizer.transform(docs).toarray() 170 | 171 | print(document_keyphrase_matrix) 172 | >>> [[0 0 2 0 0 3 0 0 1 3 3 0 1 1 1 0 1 1 2 0 3 1 0 1 0 0 1 1 0 0 1 1 0 1 0 6 173 | 1 1 1 3 1 0 3 1 1] 174 | [1 2 0 1 1 0 5 1 0 0 0 1 0 0 0 5 0 0 0 1 0 0 1 0 1 1 0 0 1 2 0 0 1 0 1 0 175 | 0 0 0 0 0 1 0 0 0]] 176 | ``` 177 | 178 | ```python 179 | # Fit and transform can also be executed in one step, 180 | # which is more efficient.
181 | document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray() 182 | 183 | print(document_keyphrase_matrix) 184 | >>> [[0 0 2 0 0 3 0 0 1 3 3 0 1 1 1 0 1 1 2 0 3 1 0 1 0 0 1 1 0 0 1 1 0 1 0 6 185 | 1 1 1 3 1 0 3 1 1] 186 | [1 2 0 1 1 0 5 1 0 0 0 1 0 0 0 5 0 0 0 1 0 0 1 0 1 1 0 0 1 2 0 0 1 0 1 0 187 | 0 0 0 0 0 1 0 0 0]] 188 | ``` 189 | 190 | 191 | 192 | #### Other languages 193 | 194 | [Back to Table of Contents](#toc) 195 | 196 | ```python 197 | german_docs = ["""Goethe stammte aus einer angesehenen bürgerlichen Familie. 198 | Sein Großvater mütterlicherseits war als Stadtschultheiß höchster Justizbeamter der Stadt Frankfurt, 199 | sein Vater Doktor der Rechte und Kaiserlicher Rat. Er und seine Schwester Cornelia erfuhren eine aufwendige 200 | Ausbildung durch Hauslehrer. Dem Wunsch seines Vaters folgend, studierte Goethe in Leipzig und Straßburg 201 | Rechtswissenschaft und war danach als Advokat in Wetzlar und Frankfurt tätig. 202 | Gleichzeitig folgte er seiner Neigung zur Dichtkunst.""", 203 | 204 | """Friedrich Schiller wurde als zweites Kind des Offiziers, Wundarztes und Leiters der Hofgärtnerei in 205 | Marbach am Neckar Johann Kaspar Schiller und dessen Ehefrau Elisabetha Dorothea Schiller, geb. Kodweiß, 206 | die Tochter eines Wirtes und Bäckers war, 1759 in Marbach am Neckar geboren 207 | """] 208 | # Init vectorizer for the German language 209 | vectorizer = KeyphraseCountVectorizer(spacy_pipeline='de_core_news_sm', pos_pattern='<ADJ.*>*<N.*>+', stop_words='german') 210 | ``` 211 | 212 | The German `spacy_pipeline` is specified and German `stop_words` are removed. Because the German spaCy part-of-speech 213 | tags differ from the English ones, the `pos_pattern` parameter is also customized. The regex pattern `<ADJ.*>*<N.*>+` 214 | extracts keywords that have 0 or more adjectives, followed by 1 or more nouns using the German spaCy part-of-speech 215 | tags. 216 | 217 | **Attention!** The spaCy pipeline components `['parser', 'attribute_ruler', 'lemmatizer', 'ner']` are excluded by 218 | default to increase efficiency. If you choose a different `spacy_pipeline`, you may have to exclude/include different 219 | pipeline components using the `spacy_exclude` parameter for the spaCy POS tagger to work properly. 220 | 221 | 222 | 223 | ### KeyphraseTfidfVectorizer 224 | 225 | [Back to Table of Contents](#toc) 226 | 227 | The `KeyphraseTfidfVectorizer` has the same function calls and features as the `KeyphraseCountVectorizer`. The only 228 | difference is that document-keyphrase matrix cells represent tf or tf-idf values, depending on the parameter settings, 229 | instead of counts. 230 | 231 | ```python 232 | from keyphrase_vectorizers import KeyphraseTfidfVectorizer 233 | 234 | docs = ["""Supervised learning is the machine learning task of learning a function that 235 | maps an input to an output based on example input-output pairs. It infers a 236 | function from labeled training data consisting of a set of training examples. 237 | In supervised learning, each example is a pair consisting of an input object 238 | (typically a vector) and a desired output value (also called the supervisory signal). 239 | A supervised learning algorithm analyzes the training data and produces an inferred function, 240 | which can be used for mapping new examples. An optimal scenario will allow for the 241 | algorithm to correctly determine the class labels for unseen instances.
This requires 242 | the learning algorithm to generalize from the training data to unseen situations in a 243 | 'reasonable' way (see inductive bias).""", 244 | 245 | """Keywords are defined as phrases that capture the main topics discussed in a document. 246 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 247 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 248 | of keywords can quickly help to determine whether a given document is relevant to their interest. 249 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 250 | by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 251 | in information retrieval."""] 252 | 253 | # Init default vectorizer for the English language that computes tf-idf values 254 | vectorizer = KeyphraseTfidfVectorizer() 255 | 256 | # Print parameters 257 | print(vectorizer.get_params()) 258 | >>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} 265 | ``` 266 | 267 | To calculate tf values instead, set `use_idf=False`. 268 | 269 | ```python 270 | # Fit and transform to document-keyphrase matrix. 271 | document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray() 272 | 273 | print(document_keyphrase_matrix) 274 | >>> [[0. 0. 0.09245003 0.09245003 0.09245003 0.09245003 275 | 0.2773501 0.09245003 0.2773501 0.2773501 0.09245003 0. 276 | 0. 0.09245003 0. 0.2773501 0.09245003 0.09245003 277 | 0. 0.09245003 0.09245003 0.09245003 0.09245003 0.09245003 278 | 0.5547002 0. 0. 0.09245003 0.09245003 0. 279 | 0.2773501 0.18490007 0.09245003 0. 0.2773501 0. 280 | 0. 0.09245003 0. 0.09245003 0. 0. 281 | 0. 0.18490007 0. ] 282 | [0.11867817 0.11867817 0. 0. 0. 0. 283 | 0. 0. 0. 0. 0. 0.11867817 284 | 0.11867817 0. 0.11867817 0. 0. 0. 285 | 0.11867817 0. 0. 0. 0. 0. 286 | 0. 0.11867817 0.23735633 0. 0. 0.11867817 287 | 0. 0. 0. 0.23735633 0. 0.11867817 288 | 0.11867817 0. 0.59339083 0. 0.11867817 0.11867817 289 | 0.11867817 0. 0.59339083]] 290 | ``` 291 | 292 | ```python 293 | # Return keyphrases 294 | keyphrases = vectorizer.get_feature_names_out() 295 | 296 | print(keyphrases) 297 | >>> ['various applications' 'list' 'task' 'supervisory signal' 298 | 'inductive bias' 'supervised learning algorithm' 'supervised learning' 299 | 'example input' 'input' 'algorithm' 'set' 'precise summary' 'documents' 300 | 'input object' 'interest' 'function' 'class labels' 'machine' 301 | 'document content' 'output pairs' 'new examples' 'unseen situations' 302 | 'vector' 'output value' 'learning' 'document relevance' 'main topics' 303 | 'pair' 'training examples' 'information retrieval environment' 304 | 'training data' 'example' 'optimal scenario' 'information retrieval' 305 | 'output' 'groups' 'indication' 'unseen instances' 'keywords' 'way' 306 | 'phrases' 'overlap' 'users' 'learning algorithm' 'document'] 307 | ``` 308 | 309 | 310 | 311 | ### Reuse a spaCy Language object 312 | 313 | [Back to Table of Contents](#toc) 314 | 315 | KeyphraseVectorizers loads a `spacy.Language` object for every `KeyphraseVectorizer` object.
316 | When using multiple `KeyphraseVectorizer` objects, it is more efficient to load the `spacy.Language` object beforehand and pass it as the `spacy_pipeline` argument. 317 | 318 | ```python 319 | import spacy 320 | from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer 321 | 322 | docs = ["""Supervised learning is the machine learning task of learning a function that 323 | maps an input to an output based on example input-output pairs. It infers a 324 | function from labeled training data consisting of a set of training examples. 325 | In supervised learning, each example is a pair consisting of an input object 326 | (typically a vector) and a desired output value (also called the supervisory signal). 327 | A supervised learning algorithm analyzes the training data and produces an inferred function, 328 | which can be used for mapping new examples. An optimal scenario will allow for the 329 | algorithm to correctly determine the class labels for unseen instances. This requires 330 | the learning algorithm to generalize from the training data to unseen situations in a 331 | 'reasonable' way (see inductive bias).""", 332 | 333 | """Keywords are defined as phrases that capture the main topics discussed in a document. 334 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 335 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 336 | of keywords can quickly help to determine whether a given document is relevant to their interest. 337 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 338 | by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 339 | in information retrieval."""] 340 | 341 | nlp = spacy.load("en_core_web_sm") 342 | 343 | vectorizer1 = KeyphraseCountVectorizer(spacy_pipeline=nlp) 344 | vectorizer2 = KeyphraseTfidfVectorizer(spacy_pipeline=nlp) 345 | 346 | # the following calls use the nlp object 347 | vectorizer1.fit(docs) 348 | vectorizer2.fit(docs) 349 | ``` 350 | 351 | 352 | 353 | ### Custom POS-tagger 354 | 355 | [Back to Table of Contents](#toc) 356 | 357 | To use a different part-of-speech tagger than the ones provided by spaCy, a custom POS-tagger function can be defined and passed to the KeyphraseVectorizers via the `custom_pos_tagger` parameter. This parameter expects a callable function which in turn needs to expect a list of strings in a 'raw_documents' parameter and has to return a list of (word token, POS-tag) tuples. If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored. 358 | 359 | #### Example using [flair](https://github.com/flairNLP/flair "flair GitHub"): 360 | 361 | Flair can be installed via `pip install flair`. 362 | 363 | ```python 364 | from typing import List 365 | import flair 366 | from flair.models import SequenceTagger 367 | from flair.tokenization import SegtokSentenceSplitter 368 | 369 | 370 | docs = ["""Supervised learning is the machine learning task of learning a function that 371 | maps an input to an output based on example input-output pairs. It infers a 372 | function from labeled training data consisting of a set of training examples. 373 | In supervised learning, each example is a pair consisting of an input object 374 | (typically a vector) and a desired output value (also called the supervisory signal). 
375 | A supervised learning algorithm analyzes the training data and produces an inferred function, 376 | which can be used for mapping new examples. An optimal scenario will allow for the 377 | algorithm to correctly determine the class labels for unseen instances. This requires 378 | the learning algorithm to generalize from the training data to unseen situations in a 379 | 'reasonable' way (see inductive bias).""", 380 | 381 | """Keywords are defined as phrases that capture the main topics discussed in a document. 382 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 383 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 384 | of keywords can quickly help to determine whether a given document is relevant to their interest. 385 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 386 | by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 387 | in information retrieval."""] 388 | 389 | # define flair POS-tagger and splitter 390 | tagger = SequenceTagger.load('pos') 391 | splitter = SegtokSentenceSplitter() 392 | 393 | # define custom POS-tagger function using flair 394 | def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTagger = tagger, splitter: flair.tokenization.SegtokSentenceSplitter = splitter)->List[tuple]: 395 | """ 396 | Important: 397 | 398 | The mandatory 'raw_documents' parameter can NOT be named differently and has to expect a list of strings. 399 | Any other parameter of the custom POS-tagger function can be arbitrarily defined, depending on the respective use case. 400 | Furthermore the function has to return a list of (word token, POS-tag) tuples. 
401 | """ 402 | # split texts into sentences 403 | sentences = [] 404 | for doc in raw_documents: 405 | sentences.extend(splitter.split(doc)) 406 | 407 | # predict POS tags 408 | tagger.predict(sentences) 409 | 410 | # iterate through sentences to get word tokens and predicted POS-tags 411 | pos_tags = [] 412 | words = [] 413 | for sentence in sentences: 414 | pos_tags.extend([label.value for label in sentence.get_labels('pos')]) 415 | words.extend([word.text for word in sentence]) 416 | 417 | return list(zip(words, pos_tags)) 418 | 419 | 420 | # check that the custom POS-tagger function returns a list of (word token, POS-tag) tuples 421 | print(custom_pos_tagger(raw_documents=docs)) 422 | 423 | >>> [('Supervised', 'VBN'), ('learning', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('machine', 'NN'), ('learning', 'VBG'), ('task', 'NN'), ('of', 'IN'), ('learning', 'VBG'), ('a', 'DT'), ('function', 'NN'), ('that', 'WDT'), ('maps', 'VBZ'), ('an', 'DT'), ('input', 'NN'), ('to', 'IN'), ('an', 'DT'), ('output', 'NN'), ('based', 'VBN'), ('on', 'IN'), ('example', 'NN'), ('input-output', 'NN'), ('pairs', 'NNS'), ('.', '.'), ('It', 'PRP'), ('infers', 'VBZ'), ('a', 'DT'), ('function', 'NN'), ('from', 'IN'), ('labeled', 'VBN'), ('training', 'NN'), ('data', 'NNS'), ('consisting', 'VBG'), ('of', 'IN'), ('a', 'DT'), ('set', 'NN'), ('of', 'IN'), ('training', 'NN'), ('examples', 'NNS'), ('.', '.'), ('In', 'IN'), ('supervised', 'JJ'), ('learning', 'NN'), (',', ','), ('each', 'DT'), ('example', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('pair', 'NN'), ('consisting', 'VBG'), ('of', 'IN'), ('an', 'DT'), ('input', 'NN'), ('object', 'NN'), ('(', ':'), ('typically', 'RB'), ('a', 'DT'), ('vector', 'NN'), (')', ','), ('and', 'CC'), ('a', 'DT'), ('desired', 'VBN'), ('output', 'NN'), ('value', 'NN'), ('(', ','), ('also', 'RB'), ('called', 'VBN'), ('the', 'DT'), ('supervisory', 'JJ'), ('signal', 'NN'), (')', '-RRB-'), ('.', '.'), ('A', 'DT'), ('supervised', 'JJ'), ('learning', 'NN'), ('algorithm', 'NN'), ('analyzes', 'VBZ'), ('the', 'DT'), ('training', 'NN'), ('data', 'NNS'), ('and', 'CC'), ('produces', 'VBZ'), ('an', 'DT'), ('inferred', 'JJ'), ('function', 'NN'), (',', ','), ('which', 'WDT'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('for', 'IN'), ('mapping', 'VBG'), ('new', 'JJ'), ('examples', 'NNS'), ('.', '.'), ('An', 'DT'), ('optimal', 'JJ'), ('scenario', 'NN'), ('will', 'MD'), ('allow', 'VB'), ('for', 'IN'), ('the', 'DT'), ('algorithm', 'NN'), ('to', 'TO'), ('correctly', 'RB'), ('determine', 'VB'), ('the', 'DT'), ('class', 'NN'), ('labels', 'NNS'), ('for', 'IN'), ('unseen', 'JJ'), ('instances', 'NNS'), ('.', '.'), ('This', 'DT'), ('requires', 'VBZ'), ('the', 'DT'), ('learning', 'NN'), ('algorithm', 'NN'), ('to', 'TO'), ('generalize', 'VB'), ('from', 'IN'), ('the', 'DT'), ('training', 'NN'), ('data', 'NNS'), ('to', 'IN'), ('unseen', 'JJ'), ('situations', 'NNS'), ('in', 'IN'), ('a', 'DT'), ("'", '``'), ('reasonable', 'JJ'), ("'", "''"), ('way', 'NN'), ('(', ','), ('see', 'VB'), ('inductive', 'JJ'), ('bias', 'NN'), (')', '-RRB-'), ('.', '.'), ('Keywords', 'NNS'), ('are', 'VBP'), ('defined', 'VBN'), ('as', 'IN'), ('phrases', 'NNS'), ('that', 'WDT'), ('capture', 'VBP'), ('the', 'DT'), ('main', 'JJ'), ('topics', 'NNS'), ('discussed', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('document', 'NN'), ('.', '.'), ('As', 'IN'), ('they', 'PRP'), ('offer', 'VBP'), ('a', 'DT'), ('brief', 'JJ'), ('yet', 'CC'), ('precise', 'JJ'), ('summary', 'NN'), ('of', 'IN'), ('document', 'NN'), ('content', 'NN'), (',', ','), ('they', 'PRP'), ('can', 'MD'), 
('be', 'VB'), ('utilized', 'VBN'), ('for', 'IN'), ('various', 'JJ'), ('applications', 'NNS'), ('.', '.'), ('In', 'IN'), ('an', 'DT'), ('information', 'NN'), ('retrieval', 'NN'), ('environment', 'NN'), (',', ','), ('they', 'PRP'), ('serve', 'VBP'), ('as', 'IN'), ('an', 'DT'), ('indication', 'NN'), ('of', 'IN'), ('document', 'NN'), ('relevance', 'NN'), ('for', 'IN'), ('users', 'NNS'), (',', ','), ('as', 'IN'), ('the', 'DT'), ('list', 'NN'), ('of', 'IN'), ('keywords', 'NNS'), ('can', 'MD'), ('quickly', 'RB'), ('help', 'VB'), ('to', 'TO'), ('determine', 'VB'), ('whether', 'IN'), ('a', 'DT'), ('given', 'VBN'), ('document', 'NN'), ('is', 'VBZ'), ('relevant', 'JJ'), ('to', 'IN'), ('their', 'PRP$'), ('interest', 'NN'), ('.', '.'), ('As', 'IN'), ('keywords', 'NNS'), ('reflect', 'VBP'), ('a', 'DT'), ('document', 'NN'), ("'s", 'POS'), ('main', 'JJ'), ('topics', 'NNS'), (',', ','), ('they', 'PRP'), ('can', 'MD'), ('be', 'VB'), ('utilized', 'VBN'), ('to', 'TO'), ('classify', 'VB'), ('documents', 'NNS'), ('into', 'IN'), ('groups', 'NNS'), ('by', 'IN'), ('measuring', 'VBG'), ('the', 'DT'), ('overlap', 'NN'), ('between', 'IN'), ('the', 'DT'), ('keywords', 'NNS'), ('assigned', 'VBN'), ('to', 'IN'), ('them', 'PRP'), ('.', '.'), ('Keywords', 'NNS'), ('are', 'VBP'), ('also', 'RB'), ('used', 'VBN'), ('proactively', 'RB'), ('in', 'IN'), ('information', 'NN'), ('retrieval', 'NN'), ('.', '.')] 424 | ``` 425 | 426 | After the custom POS-tagger function is defined, it can be passed to KeyphraseVectorizers via the `custom_pos_tagger` parameter. 427 | 428 | ```python 429 | from keyphrase_vectorizers import KeyphraseCountVectorizer 430 | 431 | # use custom POS-tagger with KeyphraseVectorizers 432 | vectorizer = KeyphraseCountVectorizer(custom_pos_tagger=custom_pos_tagger) 433 | vectorizer.fit(docs) 434 | keyphrases = vectorizer.get_feature_names_out() 435 | print(keyphrases) 436 | 437 | >>> ['output value' 'information retrieval' 'algorithm' 'vector' 'groups' 438 | 'main topics' 'task' 'precise summary' 'supervised learning' 439 | 'inductive bias' 'information retrieval environment' 440 | 'supervised learning algorithm' 'function' 'input' 'pair' 441 | 'document relevance' 'learning' 'class labels' 'new examples' 'keywords' 442 | 'list' 'machine' 'training data' 'unseen situations' 'phrases' 'output' 443 | 'optimal scenario' 'document' 'training examples' 'documents' 'interest' 444 | 'indication' 'learning algorithm' 'inferred function' 445 | 'various applications' 'example' 'set' 'unseen instances' 446 | 'example input-output pairs' 'way' 'users' 'input object' 447 | 'supervisory signal' 'overlap' 'document content'] 448 | ``` 449 | 450 | 451 | 452 | ### [PatternRank:](https://arxiv.org/abs/2210.05245) Keyphrase extraction with KeyphraseVectorizers and [KeyBERT](https://github.com/MaartenGr/KeyBERT "KeyBERT repository") 453 | 454 | [Back to Table of Contents](#toc) 455 | 456 | Using the keyphrase vectorizers together with KeyBERT for keyphrase extraction results in the [PatternRank](https://arxiv.org/abs/2210.05245) approach. PatternRank can extract grammatically correct keyphrases that are most similar to a document. Thereby, the vectorizer first extracts candidate keyphrases from the text documents, which are subsequently ranked by KeyBERT based on their document similarity. The top-n most similar keyphrases can then be 457 | considered as document keywords. 
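Conceptually, this boils down to two steps: (1) extract candidate keyphrases with a keyphrase vectorizer and (2) rank the candidates by the similarity of their embeddings to the document embedding. The snippet below is only a minimal sketch of this two-step idea outside of KeyBERT; it assumes `pip install sentence-transformers` and uses the 'all-MiniLM-L6-v2' model purely for illustration, not as the exact KeyBERT internals. The complete KeyBERT-based example follows below.

```python
# Minimal sketch of the two PatternRank steps (illustration only, not the exact KeyBERT internals).
# Assumes `pip install sentence-transformers`; the 'all-MiniLM-L6-v2' model is an arbitrary example encoder.
from keyphrase_vectorizers import KeyphraseCountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

doc = """Supervised learning is the machine learning task of learning a function that
maps an input to an output based on example input-output pairs."""

# Step 1: extract candidate keyphrases with part-of-speech patterns.
vectorizer = KeyphraseCountVectorizer()
vectorizer.fit([doc])
candidates = vectorizer.get_feature_names_out()

# Step 2: embed the document and the candidates, then rank the candidates by cosine similarity.
model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(list(candidates))
similarities = cosine_similarity(doc_embedding, candidate_embeddings)[0]

# The top-n most similar candidates are considered the document keyphrases.
top_n = 5
print([candidates[index] for index in similarities.argsort()[::-1][:top_n]])
```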
458 | 459 | The advantage of using KeyphraseVectorizers in addition to KeyBERT is that it allows users to get grammatically correct 460 | keyphrases instead of simple n-grams of pre-defined lengths. In KeyBERT, users can specify the `keyphrase_ngram_range` 461 | to define the length of the retrieved keyphrases. However, this raises two issues. First, users usually do not know the 462 | optimal n-gram range and therefore have to spend some time experimenting until they find a suitable n-gram range. 463 | Second, even after finding a good n-gram range, the returned keyphrases are sometimes still not quite grammatically 464 | correct or are slightly off-key. Unfortunately, this limits the quality of the returned keyphrases. 465 | 466 | To address this issue, we can use the vectorizers of this package to first extract candidate keyphrases that consist of 467 | zero or more adjectives, followed by one or multiple nouns in a pre-processing step instead of simple n-grams. [TextRank](https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf), [SingleRank](https://aclanthology.org/C08-1122.pdf), and [EmbedRank](https://aclanthology.org/K18-1022.pdf) already successfully used this noun phrase approach for keyphrase extraction. The extracted candidate keyphrases are subsequently passed to KeyBERT for embedding generation and similarity calculation. To use both packages for keyphrase extraction, we need to 468 | pass KeyBERT a keyphrase vectorizer with the `vectorizer` parameter. Since the length of keyphrases now depends on 469 | part-of-speech tags, there is no need to define an n-gram length anymore. 470 | 471 | #### Example: 472 | 473 | KeyBERT can be installed via `pip install keybert`. 474 | 475 | ```python 476 | from keyphrase_vectorizers import KeyphraseCountVectorizer 477 | from keybert import KeyBERT 478 | 479 | docs = ["""Supervised learning is the machine learning task of learning a function that 480 | maps an input to an output based on example input-output pairs. It infers a 481 | function from labeled training data consisting of a set of training examples. 482 | In supervised learning, each example is a pair consisting of an input object 483 | (typically a vector) and a desired output value (also called the supervisory signal). 484 | A supervised learning algorithm analyzes the training data and produces an inferred function, 485 | which can be used for mapping new examples. An optimal scenario will allow for the 486 | algorithm to correctly determine the class labels for unseen instances. This requires 487 | the learning algorithm to generalize from the training data to unseen situations in a 488 | 'reasonable' way (see inductive bias).""", 489 | 490 | """Keywords are defined as phrases that capture the main topics discussed in a document. 491 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 492 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 493 | of keywords can quickly help to determine whether a given document is relevant to their interest. 494 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 495 | by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 496 | in information retrieval."""] 497 | 498 | kw_model = KeyBERT() 499 | ``` 500 | 501 | Instead of deciding on a suitable n-gram range, which could be e.g. (1,2)...
502 | 503 | ```python 504 | >>> kw_model.extract_keywords(docs=docs, keyphrase_ngram_range=(1,2)) 505 | [[('labeled training', 0.6013), 506 | ('examples supervised', 0.6112), 507 | ('signal supervised', 0.6152), 508 | ('supervised', 0.6676), 509 | ('supervised learning', 0.6779)], 510 | [('keywords assigned', 0.6354), 511 | ('keywords used', 0.6373), 512 | ('list keywords', 0.6375), 513 | ('keywords quickly', 0.6376), 514 | ('keywords defined', 0.6997)]] 515 | ``` 516 | 517 | we can now just let the keyphrase vectorizer decide on suitable keyphrases, without limitations to a maximum or minimum 518 | n-gram range. We only have to pass a keyphrase vectorizer as a parameter to KeyBERT: 519 | 520 | ```python 521 | >>> kw_model.extract_keywords(docs=docs, vectorizer=KeyphraseCountVectorizer()) 522 | [[('learning', 0.4813), 523 | ('training data', 0.5271), 524 | ('learning algorithm', 0.5632), 525 | ('supervised learning', 0.6779), 526 | ('supervised learning algorithm', 0.6992)], 527 | [('document content', 0.3988), 528 | ('information retrieval environment', 0.5166), 529 | ('information retrieval', 0.5792), 530 | ('keywords', 0.6046), 531 | ('document relevance', 0.633)]] 532 | ``` 533 | 534 | This allows us to make sure that we do not cut off important words by defining our n-gram range too short. For 535 | example, we would not have found the keyphrase "supervised learning algorithm" with `keyphrase_ngram_range=(1,2)`. 536 | Furthermore, we avoid getting keyphrases that are slightly off-key like "labeled training", "signal supervised" or 537 | "keywords quickly". 538 | 539 | For more tips on how to use the KeyphraseVectorizers together with KeyBERT, visit [this guide](https://maartengr.github.io/KeyBERT/guides/countvectorizer.html#keyphrasevectorizers "KeyBERT CountVectorizer Guide"). 540 | 541 | 542 | 543 | ### Topic modeling with [BERTopic](https://github.com/MaartenGr/BERTopic "BERTopic repository") and KeyphraseVectorizers 544 | 545 | [Back to Table of Contents](#toc) 546 | 547 | Similar to the application with KeyBERT, the keyphrase vectorizers can be used to obtain grammatically correct keyphrases as 548 | descriptions for topics instead of simple n-grams. This allows us to make sure that we do not cut off important topic 549 | description keyphrases by defining our n-gram range too short. Moreover, we don't need to clean stopwords upfront, we can 550 | get more precise topic models, and we avoid getting topic description keyphrases that are slightly off-key. 551 | 552 | #### Example: 553 | 554 | BERTopic can be installed via `pip install bertopic`.
555 | 556 | ```python 557 | from keyphrase_vectorizers import KeyphraseCountVectorizer 558 | from bertopic import BERTopic 559 | from sklearn.datasets import fetch_20newsgroups 560 | 561 | # load text documents 562 | docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] 563 | # only use subset of the data 564 | docs = docs[:5000] 565 | 566 | # train topic model with KeyphraseCountVectorizer 567 | keyphrase_topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer()) 568 | keyphrase_topics, keyphrase_probs = keyphrase_topic_model.fit_transform(docs) 569 | 570 | # get topics 571 | >>> keyphrase_topic_model.topics 572 | {-1: [('file', 0.007265527630674131), 573 | ('one', 0.007055454904474792), 574 | ('use', 0.00633563957153475), 575 | ('program', 0.006053271092949018), 576 | ('get', 0.006011060091056076), 577 | ('people', 0.005729309058970368), 578 | ('know', 0.005635951168273583), 579 | ('like', 0.0055692449802916015), 580 | ('time', 0.00527028825803415), 581 | ('us', 0.00525564504880084)], 582 | 0: [('game', 0.024134589719090525), 583 | ('team', 0.021852806383170772), 584 | ('players', 0.01749406934044139), 585 | ('games', 0.014397938026886745), 586 | ('hockey', 0.013932342023677305), 587 | ('win', 0.013706115572901401), 588 | ('year', 0.013297593024390321), 589 | ('play', 0.012533185558169046), 590 | ('baseball', 0.012412743802062559), 591 | ('season', 0.011602725885164318)], 592 | 1: [('patients', 0.022600352291162015), 593 | ('msg', 0.02023877371575874), 594 | ('doctor', 0.018816282737587457), 595 | ('medical', 0.018614407917995103), 596 | ('treatment', 0.0165028251400717), 597 | ('food', 0.01604980195180696), 598 | ('candida', 0.015255961242066143), 599 | ('disease', 0.015115496310099693), 600 | ('pain', 0.014129703072484495), 601 | ('hiv', 0.012884503220341102)], 602 | 2: [('key', 0.028851633177510126), 603 | ('encryption', 0.024375137861044675), 604 | ('clipper', 0.023565947302544528), 605 | ('privacy', 0.019258719348097385), 606 | ('security', 0.018983682856076434), 607 | ('chip', 0.018822199098878365), 608 | ('keys', 0.016060139239615384), 609 | ('internet', 0.01450486904722165), 610 | ('encrypted', 0.013194373119964168), 611 | ('government', 0.01303978311708837)], 612 | ... 
613 | ``` 614 | 615 | The same topics look a bit different when no keyphrase vectorizer is used: 616 | 617 | ```python 618 | from bertopic import BERTopic 619 | from sklearn.datasets import fetch_20newsgroups 620 | 621 | # load text documents 622 | docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] 623 | # only use subset of the data 624 | docs = docs[:5000] 625 | 626 | # train topic model without KeyphraseCountVectorizer 627 | topic_model = BERTopic() 628 | topics, probs = topic_model.fit_transform(docs) 629 | 630 | # get topics 631 | >>> topic_model.topics 632 | {-1: [('the', 0.012864641020408933), 633 | ('to', 0.01187920529994724), 634 | ('and', 0.011431498631699856), 635 | ('of', 0.01099851927541331), 636 | ('is', 0.010995478673036962), 637 | ('in', 0.009908233622158523), 638 | ('for', 0.009903667215879675), 639 | ('that', 0.009619596716087699), 640 | ('it', 0.009578499681829809), 641 | ('you', 0.0095328846440753)], 642 | 0: [('game', 0.013949166096523719), 643 | ('team', 0.012458483177116456), 644 | ('he', 0.012354733462693834), 645 | ('the', 0.01119583508278812), 646 | ('10', 0.010190243555226108), 647 | ('in', 0.0101436249231417), 648 | ('players', 0.009682212470082758), 649 | ('to', 0.00933700544705287), 650 | ('was', 0.009172402203816335), 651 | ('and', 0.008653375901739337)], 652 | 1: [('of', 0.012771267188340924), 653 | ('to', 0.012581337590513296), 654 | ('is', 0.012554884458779008), 655 | ('patients', 0.011983273578628046), 656 | ('and', 0.011863499662237566), 657 | ('that', 0.011616113472989725), 658 | ('it', 0.011581944987387165), 659 | ('the', 0.011475148304229873), 660 | ('in', 0.011395485985801054), 661 | ('msg', 0.010715000656335596)], 662 | 2: [('key', 0.01725282988290282), 663 | ('the', 0.014634841495851404), 664 | ('be', 0.014429762197907552), 665 | ('encryption', 0.013530733999898166), 666 | ('to', 0.013443159534369817), 667 | ('clipper', 0.01296614319927958), 668 | ('of', 0.012164734232650158), 669 | ('is', 0.012128295958613464), 670 | ('and', 0.011972763728732667), 671 | ('chip', 0.010785744492767285)], 672 | ... 673 | ``` 674 | 675 | 676 | 677 | ### Online KeyphraseVectorizers 678 | 679 | [Back to Table of Contents](#toc) 680 | 681 | The KeyphraseVectorizers also support online/incremental updates of their representation (similar to 682 | the [OnlineCountVectorizer](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html#onlinecountvectorizer)). 683 | The vectorizer can not only update out-of-vocabulary keyphrases but also implements decay and cleaning functions to 684 | prevent the sparse document-keyphrase matrix from becoming too large. 685 | 686 | **Parameters for online updates:** 687 | 688 | * `decay`: At each iteration, we sum the document-keyphrase representation of the new documents with the 689 | document-keyphrase representation of all documents processed thus far. In other words, the document-keyphrase matrix 690 | keeps increasing with each iteration. However, especially in a streaming setting, older documents might become less 691 | and less relevant as time goes on. Therefore, a decay parameter was implemented that decays the document-keyphrase 692 | frequencies at each iteration before adding the document frequencies of new documents. The decay parameter is a value 693 | between 0 and 1 and indicates the percentage by which the frequencies in the previous document-keyphrase 694 | matrix are reduced.
For example, a value of .1 will decrease the frequencies in the document-keyphrase matrix by 10% at each iteration 695 | before adding the new document-keyphrase matrix. This will make sure that recent data has more weight than previous 696 | iterations. 697 | * `delete_min_df`: We might want to remove keyphrases from the document-keyphrase representation that appear 698 | infrequently. The `min_df` parameter works quite well for that. However, when we have a streaming setting, 699 | the `min_df` does not work as well since a keyphrase's frequency might start below `min_df` but will end up higher 700 | than that over time. Setting that value high might not always be advised. As a result, the list of keyphrases learned 701 | by the vectorizer and the resulting document-keyphrase matrix can become quite large. Similarly, if we implement 702 | the `decay` parameter, then some values will decrease over time until they are below `min_df`. For these reasons, 703 | the `delete_min_df` parameter was implemented. The parameter takes positive integers and indicates, at each iteration, 704 | which keyphrases will be removed from the already learned ones. If the value is set to 5, it will check after each 705 | iteration whether the total frequency of a keyphrase has fallen below that value. If so, the keyphrase will be removed in its 706 | entirety from the list of keyphrases learned by the vectorizer. This helps to keep the document-keyphrase matrix at a 707 | manageable size. 708 | 709 | #### Example: 710 | 711 | ```python 712 | from keyphrase_vectorizers import KeyphraseCountVectorizer 713 | 714 | docs = ["""Supervised learning is the machine learning task of learning a function that 715 | maps an input to an output based on example input-output pairs. It infers a 716 | function from labeled training data consisting of a set of training examples. 717 | In supervised learning, each example is a pair consisting of an input object 718 | (typically a vector) and a desired output value (also called the supervisory signal). 719 | A supervised learning algorithm analyzes the training data and produces an inferred function, 720 | which can be used for mapping new examples. An optimal scenario will allow for the 721 | algorithm to correctly determine the class labels for unseen instances. This requires 722 | the learning algorithm to generalize from the training data to unseen situations in a 723 | 'reasonable' way (see inductive bias).""", 724 | 725 | """Keywords are defined as phrases that capture the main topics discussed in a document. 726 | As they offer a brief yet precise summary of document content, they can be utilized for various applications. 727 | In an information retrieval environment, they serve as an indication of document relevance for users, as the list 728 | of keywords can quickly help to determine whether a given document is relevant to their interest. 729 | As keywords reflect a document's main topics, they can be utilized to classify documents into groups 730 | by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 731 | in information retrieval."""] 732 | 733 | # Init vectorizer with online update parameters.
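# decay=0.5: at each update, the previously accumulated keyphrase frequencies are reduced by 50%
# before the counts of the new documents are added (see the 'decay' description above).
# delete_min_df=3: keyphrases whose accumulated frequency falls below this threshold are removed
# from the learned keyphrases at each update (see the 'delete_min_df' description above).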
734 | vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3) 735 | 736 | # initial vectorizer fit 737 | vectorizer.fit_transform([docs[0]]).toarray() 738 | >>> array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3, 739 | 1, 1, 1]]) 740 | 741 | # check learned keyphrases 742 | print(vectorizer.get_feature_names_out()) 743 | >>> ['output pairs', 'output value', 'function', 'optimal scenario', 744 | 'pair', 'supervised learning', 'supervisory signal', 'algorithm', 745 | 'supervised learning algorithm', 'way', 'training examples', 746 | 'input object', 'example', 'machine', 'output', 747 | 'unseen situations', 'unseen instances', 'inductive bias', 748 | 'new examples', 'input', 'task', 'training data', 'class labels', 749 | 'set', 'vector'] 750 | 751 | # learn additional keyphrases from new documents with partial fit 752 | vectorizer.partial_fit([docs[1]]) 753 | vectorizer.transform([docs[1]]).toarray() 754 | >>> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 755 | 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1]]) 756 | 757 | # check learned keyphrases, including newly learned ones 758 | print(vectorizer.get_feature_names_out()) 759 | >>> ['output pairs', 'output value', 'function', 'optimal scenario', 760 | 'pair', 'supervised learning', 'supervisory signal', 'algorithm', 761 | 'supervised learning algorithm', 'way', 'training examples', 762 | 'input object', 'example', 'machine', 'output', 763 | 'unseen situations', 'unseen instances', 'inductive bias', 764 | 'new examples', 'input', 'task', 'training data', 'class labels', 765 | 'set', 'vector', 'list', 'various applications', 766 | 'information retrieval', 'groups', 'overlap', 'main topics', 767 | 'precise summary', 'document relevance', 'interest', 'indication', 768 | 'information retrieval environment', 'phrases', 'keywords', 769 | 'document content', 'documents', 'document', 'users'] 770 | 771 | # update list of learned keyphrases according to 'delete_min_df' 772 | vectorizer.update_bow([docs[1]]) 773 | vectorizer.transform([docs[1]]).toarray() 774 | >>> array([[5, 5]]) 775 | 776 | # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain) 777 | print(vectorizer.get_feature_names_out()) 778 | >>> ['keywords', 'document'] 779 | 780 | # update again and check the impact of 'decay' on the learned document-keyphrase matrix 781 | vectorizer.update_bow([docs[1]]) 782 | vectorizer.X_.toarray() 783 | >>> array([[7.5, 7.5]]) 784 | ``` 785 | 786 | 787 | 788 | ### Citation information 789 | 790 | [Back to Table of Contents](#toc) 791 | 792 | When citing KeyphraseVectorizers or PatternRank in academic papers and theses, please use this BibTeX entry: 793 | 794 | ```plaintext 795 | @conference{schopf_etal_kdir22, 796 | author={Tim Schopf and Simon Klimek and Florian Matthes}, 797 | title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction}, 798 | booktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR}, 799 | year={2022}, 800 | pages={243-248}, 801 | publisher={SciTePress}, 802 | organization={INSTICC}, 803 | doi={10.5220/0011546600003335}, 804 | isbn={978-989-758-614-9}, 805 | issn={2184-3228}, 806 | } 807 | ``` 808 | --------------------------------------------------------------------------------