├── tests
    ├── __init__.py
    ├── requirements.txt
    ├── test_vectorizers.py
    └── utils.py
├── keyphrase_vectorizers
    ├── _version.py
    ├── __init__.py
    ├── keyphrase_tfidf_vectorizer.py
    ├── keyphrase_vectorizer_mixin.py
    └── keyphrase_count_vectorizer.py
├── pyproject.toml
├── requirements.txt
├── docs
    ├── api.rst
    ├── index.rst
    ├── requirements.txt
    ├── conf.py
    └── KeyphraseVectorizers.md
├── .readthedocs.yaml
├── .github
    └── workflows
    │   └── testing.yml
├── LICENSE
├── setup.py
├── .gitignore
└── README.md


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/keyphrase_vectorizers/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.13'
2 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=42",
4 |     "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"


--------------------------------------------------------------------------------
/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest>=7.0.1
2 | keybert>=0.5.0
3 | flair==0.11.3
4 | scipy==1.7.3
5 | bertopic>=0.16.1
6 | scikit-learn>=1.0.1
7 | umap-learn==0.5.4


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.18.5
2 | spacy>=3.0.1
3 | spacy-transformers>=1.1.6
4 | spacy-curated-transformers>=0.2.2
5 | nltk>=3.6.1
6 | scikit-learn>=1.0
7 | scipy>=1.7.3
8 | psutil>=5.8.0


--------------------------------------------------------------------------------
/keyphrase_vectorizers/__init__.py:
--------------------------------------------------------------------------------
1 | from keyphrase_vectorizers._version import __version__
2 | from keyphrase_vectorizers.keyphrase_count_vectorizer import KeyphraseCountVectorizer
3 | from keyphrase_vectorizers.keyphrase_tfidf_vectorizer import KeyphraseTfidfVectorizer
4 | from keyphrase_vectorizers.keyphrase_vectorizer_mixin import _KeyphraseVectorizerMixin
5 | 


--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
 1 | :mod:`KeyphraseCountVectorizer`
 2 | ===============================
 3 | 
 4 | .. automodule:: keyphrase_vectorizers.keyphrase_count_vectorizer
 5 |    :members:
 6 |    :inherited-members:
 7 | 
 8 | :mod:`KeyphraseTfidfVectorizer`
 9 | ===============================
10 | 
11 | .. automodule:: keyphrase_vectorizers.keyphrase_tfidf_vectorizer
12 |    :members:
13 |    :inherited-members:
14 | 
15 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome to the KeyphraseVectorizers documentation!
 2 | ===================================================
 3 |  
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 |    :caption: User Guide:
 7 |    
 8 |    KeyphraseVectorizers
 9 | 
10 | .. toctree::
11 |    :caption: API Guide:
12 | 
13 |    api
14 | 
15 | 
16 | Indices and tables
17 | ==================
18 | 
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 | 
23 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
 1 | sphinx>=3.2.1
 2 | sphinx-rtd-theme>=0.5.2
 3 | sphinxcontrib-applehelp>=1.0.2
 4 | sphinxcontrib-devhelp>=1.0.2
 5 | sphinxcontrib-htmlhelp>=1.0.3
 6 | sphinxcontrib-jsmath>=1.0.1
 7 | sphinxcontrib-qthelp>=1.0.3
 8 | sphinxcontrib-serializinghtml>=1.1.4
 9 | sphinxcontrib-websupport>=1.2.4
10 | readthedocs-sphinx-search>=0.1.0
11 | sphinx-markdown-tables>=0.0.15
12 | recommonmark>=0.7.1
13 | docutils>=0.16
14 | numpy>=1.18.5
15 | spacy>=3.0.1
16 | spacy-transformers>=1.1.6
17 | spacy-curated-transformers>=0.2.2
18 | nltk>=3.6.1
19 | scikit-learn>=1.0
20 | scipy>=1.7.3
21 | psutil>=5.8.0
22 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |   configuration: docs/conf.py
11 |   builder: html
12 |   fail_on_warning: false
13 | 
14 | # Optionally build your docs in additional formats such as PDF
15 | formats: all
16 | 
17 | # Optionally set the version of Python and requirements required to build your docs
18 | python:
19 |   install:
20 |     - requirements: docs/requirements.txt
21 |     - method: pip
22 |       path: .
23 |       extra_requirements:
24 |         - docs
25 | 
26 | build:
27 |   os: ubuntu-22.04
28 |   tools:
29 |     python: "3.7"
30 | 
31 | submodules:
32 |   include: all
33 | 


--------------------------------------------------------------------------------
/.github/workflows/testing.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Code tests
 5 | 
 6 | on:
 7 |   push:
 8 |     branches:
 9 |     - master
10 |   pull_request:
11 |     branches:
12 |     - master
13 | 
14 | jobs:
15 |   build:
16 |     runs-on: ubuntu-latest
17 |     strategy:
18 |       matrix:
19 |         python-version: [ 3.7, 3.8, 3.9 ]
20 | 
21 |     steps:
22 |     - uses: actions/checkout@v2
23 |     - name: Set up Python ${{ matrix.python-version }}
24 |       uses: actions/setup-python@v1
25 |       with:
26 |         python-version: ${{ matrix.python-version }}
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         pip install -r "requirements.txt"
31 |         pip install -r "tests/requirements.txt"
32 |     - name: Run tests
33 |       run: pytest
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2022, Tim Schopf
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.util import convert_path
 2 | 
 3 | import setuptools
 4 | 
 5 | with open("README.md", "r", encoding="utf-8") as fh:
 6 |     long_description = fh.read()
 7 | 
 8 | main_ns = {}
 9 | ver_path = convert_path('keyphrase_vectorizers/_version.py')
10 | with open(ver_path) as ver_file:
11 |     exec(ver_file.read(), main_ns)
12 | 
13 | ver_path = convert_path('requirements.txt')
14 | with open(ver_path) as ver_file:
15 |     base_packages = ver_file.read().splitlines()
16 | 
17 | setuptools.setup(
18 |     name='keyphrase-vectorizers',
19 |     version=main_ns['__version__'],
20 |     url='https://github.com/TimSchopf/KeyphraseVectorizers',
21 |     license='BSD 3-Clause "New" or "Revised" License',
22 |     author='Tim Schopf',
23 |     author_email='tim.schopf@t-online.de.de',
24 |     description='Set of vectorizers that extract keyphrases with part-of-speech patterns from a collection of text documents and convert them into a document-keyphrase matrix.',
25 |     long_description=long_description,
26 |     long_description_content_type='text/markdown',
27 |     classifiers=[
28 |         "Development Status :: 3 - Alpha",
29 |         "Programming Language :: Python :: 3",
30 |         "Intended Audience :: Science/Research",
31 |         "Intended Audience :: Developers",
32 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
33 |         "Topic :: Scientific/Engineering :: Information Analysis",
34 |         "License :: OSI Approved :: BSD License",
35 |         "Operating System :: OS Independent",
36 |     ],
37 |     install_requires=base_packages,
38 |     package_dir={"": "."},
39 |     packages=setuptools.find_packages(where="."),
40 |     python_requires='>=3.7',
41 |     data_files=[('requirements', ['requirements.txt'])],
42 | )
43 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit tests / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | #PyCharm stuff
 59 | .idea/
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | 
13 | import os
14 | import sys
15 | from distutils.util import convert_path
16 | 
17 | sys.path.insert(0, os.path.abspath('..'))
18 | 
19 | # -- Project information -----------------------------------------------------
20 | 
21 | project = 'KeyphraseVectorizers'
22 | copyright = '2022, Tim Schopf'
23 | author = 'Tim Schopf'
24 | 
25 | main_ns = {}
26 | ver_path = convert_path('../keyphrase_vectorizers/_version.py')
27 | with open(ver_path) as ver_file:
28 |     exec(ver_file.read(), main_ns)
29 | release = main_ns['__version__']
30 | 
31 | # -- General configuration ---------------------------------------------------
32 | 
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
35 | # ones.
36 | extensions = [
37 |     'sphinx_rtd_theme',
38 |     'recommonmark',
39 |     'sphinx.ext.autodoc',
40 |     'sphinx.ext.napoleon',
41 |     'sphinx_markdown_tables',
42 | ]
43 | 
44 | # Add any paths that contain templates here, relative to this directory.
45 | templates_path = ['_templates']
46 | 
47 | # List of patterns, relative to source directory, that match files and
48 | # directories to ignore when looking for source files.
49 | # This pattern also affects html_static_path and html_extra_path.
50 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
51 | 
52 | # -- Options for HTML output -------------------------------------------------
53 | 
54 | # The theme to use for HTML and HTML Help pages.  See the documentation for
55 | # a list of builtin themes.
56 | #
57 | html_theme = 'sphinx_rtd_theme'
58 | 
59 | # Add any paths that contain custom static files (such as style sheets) here,
60 | # relative to this directory. They are copied after the builtin static files,
61 | # so a file named "default.css" will overwrite the builtin "default.css".
62 | html_static_path = []
63 | 
64 | master_doc = 'index'
65 | 
66 | source_parsers = {
67 |     '.md': 'recommonmark.parser.CommonMarkParser',
68 | }
69 | 
70 | source_suffix = ['.rst', '.md']
71 | 


--------------------------------------------------------------------------------
/tests/test_vectorizers.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | 
  3 | import flair
  4 | import spacy
  5 | from bertopic import BERTopic
  6 | from flair.models import SequenceTagger
  7 | from flair.tokenization import SegtokSentenceSplitter
  8 | from keybert import KeyBERT
  9 | from sklearn.datasets import fetch_20newsgroups
 10 | 
 11 | import tests.utils as utils
 12 | from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer
 13 | 
 14 | english_docs = utils.get_english_test_docs()
 15 | german_docs = utils.get_german_test_docs()
 16 | french_docs = utils.get_french_docs()
 17 | 
 18 | 
 19 | def test_default_count_vectorizer():
 20 |     sorted_english_test_keyphrases = utils.get_english_test_keyphrases()
 21 |     sorted_count_matrix = utils.get_sorted_english_count_matrix()
 22 | 
 23 |     vectorizer = KeyphraseCountVectorizer()
 24 |     vectorizer.fit(english_docs)
 25 |     keyphrases = vectorizer.get_feature_names_out()
 26 |     document_keyphrase_matrix = vectorizer.transform(english_docs).toarray()
 27 | 
 28 |     assert [sorted(count_list) for count_list in
 29 |             KeyphraseCountVectorizer().fit_transform(english_docs).toarray()] == sorted_count_matrix
 30 |     assert [sorted(count_list) for count_list in document_keyphrase_matrix] == sorted_count_matrix
 31 |     assert sorted(keyphrases) == sorted_english_test_keyphrases
 32 | 
 33 | 
 34 | def test_spacy_language_argument():
 35 |     sorted_english_test_keyphrases = utils.get_english_test_keyphrases()
 36 |     sorted_count_matrix = utils.get_sorted_english_count_matrix()
 37 | 
 38 |     nlp = spacy.load("en_core_web_sm")
 39 | 
 40 |     vectorizer = KeyphraseCountVectorizer(spacy_pipeline=nlp)
 41 |     vectorizer.fit(english_docs)
 42 |     keyphrases = vectorizer.get_feature_names_out()
 43 |     document_keyphrase_matrix = vectorizer.transform(english_docs).toarray()
 44 | 
 45 |     assert [sorted(count_list) for count_list in
 46 |             KeyphraseCountVectorizer().fit_transform(english_docs).toarray()] == sorted_count_matrix
 47 |     assert [sorted(count_list) for count_list in document_keyphrase_matrix] == sorted_count_matrix
 48 |     assert sorted(keyphrases) == sorted_english_test_keyphrases
 49 | 
 50 | 
 51 | def test_german_count_vectorizer():
 52 |     sorted_german_test_keyphrases = utils.get_german_test_keyphrases()
 53 | 
 54 |     vectorizer = KeyphraseCountVectorizer(spacy_pipeline='de_core_news_sm', pos_pattern='<ADJ.*>*<N.*>+',
 55 |                                           stop_words='german')
 56 |     keyphrases = vectorizer.fit(german_docs).get_feature_names_out()
 57 |     assert sorted(keyphrases) == sorted_german_test_keyphrases
 58 | 
 59 | 
 60 | def test_default_tfidf_vectorizer():
 61 |     sorted_english_test_keyphrases = utils.get_english_test_keyphrases()
 62 |     sorted_english_tfidf_matrix = utils.get_sorted_english_tfidf_matrix()
 63 | 
 64 |     vectorizer = KeyphraseTfidfVectorizer()
 65 |     vectorizer.fit(english_docs)
 66 |     keyphrases = vectorizer.get_feature_names_out()
 67 |     document_keyphrase_matrix = vectorizer.transform(english_docs).toarray()
 68 |     document_keyphrase_matrix = [[round(element, 10) for element in tfidf_list] for tfidf_list in
 69 |                                  document_keyphrase_matrix]
 70 | 
 71 |     assert [sorted(tfidf_list) for tfidf_list in document_keyphrase_matrix] == sorted_english_tfidf_matrix
 72 |     assert sorted(keyphrases) == sorted_english_test_keyphrases
 73 | 
 74 | 
 75 | def test_keybert_integration():
 76 |     english_keybert_keyphrases = utils.get_english_keybert_keyphrases()
 77 |     kw_model = KeyBERT(model="all-MiniLM-L6-v2")
 78 |     keyphrases = kw_model.extract_keywords(docs=english_docs, vectorizer=KeyphraseCountVectorizer())
 79 |     keyphrases = [[element[0] for element in keyphrases_list] for keyphrases_list in keyphrases]
 80 | 
 81 |     assert keyphrases == english_keybert_keyphrases
 82 | 
 83 | 
 84 | def test_french_trf_spacy_pipeline():
 85 |     sorted_french_test_keyphrases = utils.get_french_test_keyphrases()
 86 |     sorted_french_count_matrix = utils.get_sorted_french_count_matrix()
 87 | 
 88 |     vectorizer = KeyphraseCountVectorizer(spacy_pipeline='fr_dep_news_trf', spacy_exclude=[])
 89 |     vectorizer.fit(french_docs)
 90 |     keyphrases = vectorizer.get_feature_names_out()
 91 |     document_keyphrase_matrix = vectorizer.transform(french_docs).toarray()
 92 | 
 93 |     assert [sorted(count_list) for count_list in
 94 |             KeyphraseCountVectorizer(spacy_pipeline='fr_dep_news_trf', spacy_exclude=[]).fit_transform(
 95 |                 french_docs).toarray()] == sorted_french_count_matrix
 96 |     assert [sorted(count_list) for count_list in document_keyphrase_matrix] == sorted_french_count_matrix
 97 |     assert sorted(keyphrases) == sorted_french_test_keyphrases
 98 | 
 99 | 
100 | def test_custom_tagger():
101 |     sorted_english_test_keyphrases = utils.get_sorted_english_keyphrases_custom_flair_tagger()
102 | 
103 |     tagger = SequenceTagger.load('pos')
104 |     splitter = SegtokSentenceSplitter()
105 | 
106 |     # define custom pos tagger function using flair
107 |     def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTagger = tagger,
108 |                           splitter: flair.tokenization.SegtokSentenceSplitter = splitter) -> List[tuple]:
109 |         """
110 |         Important:
111 | 
112 |         The mandatory 'raw_documents' parameter can NOT be named differently and has to expect a list of strings.
113 |         Furthermore the function has to return a list of (word token, POS-tag) tuples.
114 |         """
115 |         # split texts into sentences
116 |         sentences = []
117 |         for doc in raw_documents:
118 |             sentences.extend(splitter.split(doc))
119 | 
120 |         # predict POS tags
121 |         tagger.predict(sentences)
122 | 
123 |         # iterate through sentences to get word tokens and predicted POS-tags
124 |         pos_tags = []
125 |         words = []
126 |         for sentence in sentences:
127 |             pos_tags.extend([label.value for label in sentence.get_labels('pos')])
128 |             words.extend([word.text for word in sentence])
129 | 
130 |         return list(zip(words, pos_tags))
131 | 
132 |     vectorizer = KeyphraseCountVectorizer(custom_pos_tagger=custom_pos_tagger)
133 |     vectorizer.fit(english_docs)
134 |     keyphrases = vectorizer.get_feature_names_out()
135 | 
136 |     assert sorted(keyphrases) == sorted_english_test_keyphrases
137 | 
138 | 
139 | def test_online_vectorizer():
140 |     first_doc_count_matrix = utils.get_sorted_english_first_doc_count_matrix()
141 |     second_doc_count_matrix = utils.get_sorted_english_second_doc_count_matrix()
142 |     first_doc_test_keyphrases = utils.get_english_first_doc_test_keyphrases()
143 |     english_keyphrases = utils.get_english_test_keyphrases()
144 |     frequencies_after_min_df = utils.get_frequencies_after_min_df()
145 |     frequent_keyphrases_after_min_df = utils.get_frequent_keyphrases_after_min_df()
146 |     frequencies_after_bow = utils.get_frequencies_after_bow()
147 | 
148 |     # intitial vectorizer fit
149 |     vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)
150 | 
151 |     assert [sorted(count_list) for count_list in
152 |             vectorizer.fit_transform([english_docs[0]]).toarray()] == first_doc_count_matrix
153 |     assert sorted(vectorizer.get_feature_names_out()) == first_doc_test_keyphrases
154 | 
155 |     # learn additional keyphrases from new documents with partial fit
156 |     vectorizer.partial_fit([english_docs[1]])
157 | 
158 |     assert [sorted(count_list) for count_list in
159 |             vectorizer.transform([english_docs[1]]).toarray()] == second_doc_count_matrix
160 |     assert sorted(vectorizer.get_feature_names_out()) == english_keyphrases
161 | 
162 |     # update list of learned keyphrases according to 'delete_min_df'
163 |     vectorizer.update_bow([english_docs[1]])
164 |     assert (vectorizer.transform([english_docs[1]]).toarray() == frequencies_after_min_df).all()
165 | 
166 |     # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
167 |     assert sorted(vectorizer.get_feature_names_out()) == frequent_keyphrases_after_min_df
168 | 
169 |     # update again and check the impact of 'decay' on the learned document-keyphrase matrix
170 |     vectorizer.update_bow([english_docs[1]])
171 |     assert (vectorizer.X_.toarray() == frequencies_after_bow).all()
172 | 
173 | 
174 | def test_bertopic():
175 |     data = fetch_20newsgroups(subset='train')
176 |     texts = data.data[:100]
177 |     topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
178 |     topics, probs = topic_model.fit_transform(documents=texts)
179 |     new_topics = topic_model.reduce_outliers(texts, topics)
180 |     topic_model.update_topics(texts, topics=new_topics)
181 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | def get_english_test_docs():
  3 |     english_docs = ["""Supervised learning is the machine learning task of learning a function that
  4 |              maps an input to an output based on example input-output pairs. It infers a
  5 |              function from labeled training data consisting of a set of training examples.
  6 |              In supervised learning, each example is a pair consisting of an input object
  7 |              (typically a vector) and a desired output value (also called the supervisory signal). 
  8 |              A supervised learning algorithm analyzes the training data and produces an inferred function, 
  9 |              which can be used for mapping new examples. An optimal scenario will allow for the 
 10 |              algorithm to correctly determine the class labels for unseen instances. This requires 
 11 |              the learning algorithm to generalize from the training data to unseen situations in a 
 12 |              'reasonable' way (see inductive bias).""",
 13 | 
 14 |                     """Keywords are defined as phrases that capture the main topics discussed in a document. 
 15 |                     As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
 16 |                     In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
 17 |                     of keywords can quickly help to determine whether a given document is relevant to their interest. 
 18 |                     As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
 19 |                     by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
 20 |                     in information retrieval."""]
 21 | 
 22 |     return english_docs
 23 | 
 24 | 
 25 | def get_german_test_docs():
 26 |     german_docs = ["""Goethe stammte aus einer angesehenen bürgerlichen Familie. 
 27 |                     Sein Großvater mütterlicherseits war als Stadtschultheiß höchster Justizbeamter der Stadt Frankfurt, 
 28 |                     sein Vater Doktor der Rechte und Kaiserlicher Rat. Er und seine Schwester Cornelia erfuhren eine aufwendige 
 29 |                     Ausbildung durch Hauslehrer. Dem Wunsch seines Vaters folgend, studierte Goethe in Leipzig und Straßburg 
 30 |                     Rechtswissenschaft und war danach als Advokat in Wetzlar und Frankfurt tätig. 
 31 |                     Gleichzeitig folgte er seiner Neigung zur Dichtkunst.""",
 32 | 
 33 |                    """Friedrich Schiller wurde als zweites Kind des Offiziers, Wundarztes und Leiters der Hofgärtnerei in 
 34 |                    Marbach am Neckar Johann Kaspar Schiller und dessen Ehefrau Elisabetha Dorothea Schiller, geb. Kodweiß, 
 35 |                    die Tochter eines Wirtes und Bäckers war, 1759 in Marbach am Neckar geboren
 36 |                    """]
 37 |     return german_docs
 38 | 
 39 | 
 40 | def get_french_docs():
 41 |     french_docs = ["Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs"]
 42 | 
 43 |     return french_docs
 44 | 
 45 | 
 46 | def get_english_test_keyphrases():
 47 |     sorted_english_test_keyphrases = ['algorithm', 'class labels', 'document', 'document content', 'document relevance',
 48 |                                       'documents', 'example', 'function', 'groups', 'indication', 'inductive bias',
 49 |                                       'information retrieval', 'information retrieval environment', 'input',
 50 |                                       'input object', 'interest', 'keywords', 'list', 'machine', 'main topics',
 51 |                                       'new examples', 'optimal scenario', 'output', 'output pairs', 'output value',
 52 |                                       'overlap', 'pair', 'phrases', 'precise summary', 'set', 'supervised learning',
 53 |                                       'supervised learning algorithm', 'supervisory signal', 'task', 'training data',
 54 |                                       'training examples', 'unseen instances', 'unseen situations', 'users',
 55 |                                       'various applications', 'vector', 'way']
 56 | 
 57 |     return sorted_english_test_keyphrases
 58 | 
 59 | 
 60 | def get_english_first_doc_test_keyphrases():
 61 |     sorted_english_first_doc_test_keyphrases = ['algorithm', 'class labels', 'example', 'function', 'inductive bias',
 62 |                                                 'input', 'input object', 'machine', 'new examples', 'optimal scenario',
 63 |                                                 'output', 'output pairs', 'output value', 'pair', 'set',
 64 |                                                 'supervised learning', 'supervised learning algorithm',
 65 |                                                 'supervisory signal', 'task', 'training data', 'training examples',
 66 |                                                 'unseen instances', 'unseen situations', 'vector', 'way']
 67 | 
 68 |     return sorted_english_first_doc_test_keyphrases
 69 | 
 70 | 
 71 | def get_sorted_english_keyphrases_custom_flair_tagger():
 72 |     sorted_english_custom_tagger_keyphrases = ['algorithm', 'class labels', 'document', 'document content',
 73 |                                                'document relevance',
 74 |                                                'documents', 'example', 'example input-output pairs', 'function',
 75 |                                                'groups',
 76 |                                                'indication', 'inductive bias', 'inferred function',
 77 |                                                'information retrieval', 'information retrieval environment', 'input',
 78 |                                                'input object', 'interest', 'keywords', 'learning', 'learning algorithm',
 79 |                                                'list', 'machine', 'main topics', 'new examples',
 80 |                                                'optimal scenario', 'output', 'output value', 'overlap', 'pair',
 81 |                                                'phrases', 'precise summary', 'set', 'supervised learning',
 82 |                                                'supervised learning algorithm', 'supervisory signal', 'task',
 83 |                                                'training data', 'training examples', 'unseen instances',
 84 |                                                'unseen situations', 'users', 'various applications', 'vector', 'way']
 85 | 
 86 |     return sorted_english_custom_tagger_keyphrases
 87 | 
 88 | 
 89 | def get_german_test_keyphrases():
 90 |     sorted_german_test_keyphrases = ['advokat', 'angesehenen bürgerlichen familie', 'ausbildung', 'bäckers',
 91 |                                      'dichtkunst', 'ehefrau elisabetha dorothea schiller', 'frankfurt',
 92 |                                      'friedrich schiller', 'geb. kodweiß', 'goethe', 'großvater', 'hauslehrer',
 93 |                                      'hofgärtnerei', 'höchster justizbeamter', 'kaiserlicher rat', 'leipzig', 'leiters',
 94 |                                      'marbach', 'neckar', 'neckar johann kaspar schiller', 'neigung', 'offiziers',
 95 |                                      'rechte', 'rechtswissenschaft', 'schwester cornelia', 'stadt frankfurt',
 96 |                                      'stadtschultheiß', 'straßburg', 'tochter', 'vater doktor', 'vaters', 'wetzlar',
 97 |                                      'wirtes', 'wundarztes', 'wunsch', 'zweites kind']
 98 |     return sorted_german_test_keyphrases
 99 | 
100 | 
101 | def get_french_test_keyphrases():
102 |     sorted_french_test_keyphrases = ['assurance', 'constructeurs', 'responsabilité', 'voitures']
103 | 
104 |     return sorted_french_test_keyphrases
105 | 
106 | 
107 | def get_sorted_english_count_matrix():
108 |     sorted_english_count_matrix = [
109 |         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
110 |          3, 3, 3, 3, 3],
111 |         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
112 |          1, 2, 2, 5, 5]]
113 | 
114 |     return sorted_english_count_matrix
115 | 
116 | 
117 | def get_sorted_english_first_doc_count_matrix():
118 |     sorted_english_first_doc_count_matrix = [
119 |         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3]]
120 | 
121 |     return sorted_english_first_doc_count_matrix
122 | 
123 | 
124 | def get_sorted_english_second_doc_count_matrix():
125 |     sorted_english_second_doc_count_matrix = [
126 |         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127 |          1, 2, 2, 5, 5]]
128 | 
129 |     return sorted_english_second_doc_count_matrix
130 | 
131 | 
132 | def get_sorted_french_count_matrix():
133 |     sorted_french_count_matrix = [[1, 1, 1, 1]]
134 | 
135 |     return sorted_french_count_matrix
136 | 
137 | 
138 | def get_sorted_english_tfidf_matrix():
139 |     sorted_english_tfidf_matrix = [
140 |         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1147078669,
141 |          0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669,
142 |          0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669, 0.1147078669,
143 |          0.1147078669, 0.2294157339, 0.3441236008, 0.3441236008, 0.3441236008, 0.3441236008, 0.3441236008,
144 |          0.3441236008],
145 |         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
146 |          0.0, 0.0, 0.0, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658,
147 |          0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.1186781658, 0.2373563316,
148 |          0.2373563316, 0.5933908291, 0.5933908291]]
149 | 
150 |     return sorted_english_tfidf_matrix
151 | 
152 | 
153 | def get_english_keybert_keyphrases():
154 |     english_keybert_keyphrases = [
155 |         ['supervised learning algorithm', 'supervised learning', 'training data', 'training examples', 'class labels'],
156 |         ['document relevance', 'keywords', 'information retrieval', 'information retrieval environment',
157 |          'document content']]
158 | 
159 |     return english_keybert_keyphrases
160 | 
161 | 
162 | def get_frequencies_after_min_df():
163 |     frequency_array = np.array([[5, 5]])
164 | 
165 |     return frequency_array
166 | 
167 | 
168 | def get_frequencies_after_bow():
169 |     frequency_array = np.array([[7.5, 7.5]])
170 | 
171 |     return frequency_array
172 | 
173 | 
174 | def get_frequent_keyphrases_after_min_df():
175 |     keyphrases = ['document', 'keywords']
176 | 
177 |     return keyphrases
178 | 


--------------------------------------------------------------------------------
/keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. _spaCy pipeline: https://spacy.io/models
  3 | .. _stopwords available in NLTK: https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip
  4 | .. _POS-tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
  5 | .. _regex pattern: https://docs.python.org/3/library/re.html#regular-expression-syntax
  6 | .. _spaCy part-of-speech tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
  7 | .. _spaCy pipeline components: https://spacy.io/usage/processing-pipelines#built-in
  8 | """
  9 | 
 10 | import warnings
 11 | from typing import List, Union
 12 | 
 13 | import numpy as np
 14 | import psutil
 15 | import spacy
 16 | from sklearn.exceptions import NotFittedError
 17 | from sklearn.feature_extraction.text import TfidfTransformer
 18 | from sklearn.utils.validation import FLOAT_DTYPES
 19 | 
 20 | from keyphrase_vectorizers.keyphrase_count_vectorizer import KeyphraseCountVectorizer
 21 | 
 22 | 
 23 | class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
 24 |     """
 25 |     KeyphraseTfidfVectorizer
 26 | 
 27 |     KeyphraseTfidfVectorizer converts a collection of text documents to a normalized tf or tf-idf document-token matrix.
 28 |     The tokens are keyphrases that are extracted from the text documents based on their part-of-speech tags.
 29 |     The matrix rows indicate the documents and columns indicate the unique keyphrases.
 30 |     Each cell represents the tf or tf-idf value, depending on the parameter settings.
 31 |     The part-of-speech pattern of keyphrases can be defined by the ``pos_pattern`` parameter.
 32 |     By default, keyphrases are extracted, that have 0 or more adjectives, followed by 1 or more nouns.
 33 |     A list of extracted keyphrases matching the defined part-of-speech pattern can be returned after fitting via :class:`get_feature_names_out()`.
 34 | 
 35 |     Attention:
 36 |         If the vectorizer is used for languages other than English, the ``spacy_pipeline`` and ``stop_words`` parameters
 37 |         must be customized accordingly.
 38 |         Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_  differ between languages.
 39 |         Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
 40 |         In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
 41 | 
 42 |     Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.
 43 |     This is a common term weighting scheme in information retrieval,
 44 |     that has also found good use in document classification.
 45 | 
 46 |     The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document
 47 |     is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less
 48 |     informative than features that occur in a small fraction of the training corpus.
 49 | 
 50 |     The formula that is used to compute the tf-idf for a term t of a document d in a document set is
 51 |     tf-idf(t, d) = tf(t, d) * idf(t), and the idf is computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``),
 52 |     where n is the total number of documents in the document set and df(t) is the document frequency of t;
 53 |     the document frequency is the number of documents in the document set that contain the term t.
 54 |     The effect of adding "1" to the idf in the equation above is that terms with zero idf, i.e., terms
 55 |     that occur in all documents in a training set, will not be entirely ignored.
 56 |     (Note that the idf formula above differs from the standard textbook
 57 |     notation that defines the idf as idf(t) = log [ n / (df(t) + 1) ]).
 58 | 
 59 |     If ``smooth_idf=True`` (the default), the constant "1" is added to the numerator and denominator of the idf as
 60 |     if an extra document was seen containing every term in the collection exactly once, which prevents
 61 |     zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.
 62 | 
 63 |     Furthermore, the formulas used to compute tf and idf depend on parameter settings that correspond to
 64 |     the SMART notation used in IR as follows:
 65 | 
 66 |     Tf is "n" (natural) by default, "l" (logarithmic) when ``sublinear_tf=True``.
 67 |     Idf is "t" when use_idf is given, "n" (none) otherwise.
 68 |     Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) when ``norm=None``.
 69 | 
 70 |     Parameters
 71 |     ----------
 72 |     spacy_pipeline : Union[str, spacy.Language], default='en_core_web_sm'
 73 |             A spacy.Language object or the name of the `spaCy pipeline`_, used to tag the parts-of-speech in the text. Standard is the 'en' pipeline.
 74 | 
 75 |     pos_pattern :  str, default='<J.*>*<N.*>+'
 76 |         The `regex pattern`_ of `POS-tags`_ used to extract a sequence of POS-tagged tokens from the text.
 77 |         Standard is to only select keyphrases that have 0 or more adjectives, followed by 1 or more nouns.
 78 | 
 79 |     stop_words : Union[str, List[str]], default='english'
 80 |             Language of stopwords to remove from the document, e.g. 'english'.
 81 |             Supported options are `stopwords available in NLTK`_.
 82 |             Removes unwanted stopwords from keyphrases if 'stop_words' is not None.
 83 |             If given a list of custom stopwords, removes them instead.
 84 | 
 85 |     lowercase : bool, default=True
 86 |         Whether the returned keyphrases should be converted to lowercase.
 87 | 
 88 |     workers :int, default=1
 89 |             How many workers to use for spaCy part-of-speech tagging.
 90 |             If set to -1, use all available worker threads of the machine.
 91 |             SpaCy uses the specified number of cores to tag documents with part-of-speech.
 92 |             Depending on the platform, starting many processes with multiprocessing can add a lot of overhead.
 93 |             In particular, the default start method spawn used in macOS/OS X (as of Python 3.8) and in Windows can be slow.
 94 |             Therefore, carefully consider whether this option is really necessary.
 95 | 
 96 |     spacy_exclude : List[str], default=['parser', 'attribute_ruler', 'lemmatizer', 'ner']
 97 |             A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging.
 98 |             Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed.
 99 | 
100 |     custom_pos_tagger: callable, default=None
101 |             A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples.
102 |             If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored.
103 | 
104 |     max_df : int, default=None
105 |         During fitting ignore keyphrases that have a document frequency strictly higher than the given threshold.
106 | 
107 |     min_df : int, default=None
108 |         During fitting ignore keyphrases that have a document frequency strictly lower than the given threshold.
109 |         This value is also called cut-off in the literature.
110 | 
111 |     binary : bool, default=False
112 |         If True, all non-zero counts are set to 1.
113 |         This is useful for discrete probabilistic models that model binary events rather than integer counts.
114 | 
115 |     dtype : type, default=np.int64
116 |         Type of the matrix returned by fit_transform() or transform().
117 | 
118 |     decay : float, default=None
119 |           A value between [0, 1] to weight the percentage of frequencies
120 |           the previous bag-of-words should be decreased. For example,
121 |           a value of `.1` will decrease the frequencies in the bag-of-words
122 |           matrix with 10% at each iteration.
123 | 
124 |     delete_min_df : float, default=None
125 |           Delete words at each iteration from its vocabulary
126 |           that are below a minimum frequency.
127 |           This will keep the resulting bag-of-words matrix small
128 |           such that it does not explode in size with increasing
129 |           vocabulary. If `decay` is None then this equals `min_df`.
130 | 
131 |     norm : {'l1', 'l2'}, default='l2'
132 |         Each output row will have unit norm, either:
133 |         - 'l2': Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product when l2 norm has been applied.
134 |         - 'l1': Sum of absolute values of vector elements is 1.
135 | 
136 |     use_idf : bool, default=True
137 |         Enable inverse-document-frequency reweighting. If False, idf(t) = 1.
138 | 
139 |     smooth_idf : bool, default=True
140 |         Smooth idf weights by adding one to document frequencies, as if an
141 |         extra document was seen containing every term in the collection
142 |         exactly once. Prevents zero divisions.
143 | 
144 |     sublinear_tf : bool, default=False
145 |         Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
146 | 
147 |     """
148 | 
149 |     def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
150 |                  stop_words: Union[str, List[str]] = 'english',
151 |                  lowercase: bool = True, workers: int = 1,
152 |                  spacy_exclude: List[str] = ['parser', 'attribute_ruler', 'lemmatizer', 'ner'],
153 |                  custom_pos_tagger: callable = None, max_df: int = None, min_df: int = None,
154 |                  binary: bool = False, dtype: np.dtype = np.float64, decay: float = None,
155 |                  delete_min_df: float = None, norm: str = "l2",
156 |                  use_idf: bool = True, smooth_idf: bool = True,
157 |                  sublinear_tf: bool = False):
158 | 
159 |         # triggers a parameter validation
160 |         if not isinstance(workers, int):
161 |             raise ValueError(
162 |                 "'workers' parameter must be of type int"
163 |             )
164 | 
165 |         if (workers < -1) or (workers > psutil.cpu_count(logical=True)) or (workers == 0):
166 |             raise ValueError(
167 |                 "'workers' parameter value cannot be 0 and must be between -1 and " + str(
168 |                     psutil.cpu_count(logical=True))
169 |             )
170 | 
171 |         self.spacy_pipeline = spacy_pipeline
172 |         self.pos_pattern = pos_pattern
173 |         self.stop_words = stop_words
174 |         self.lowercase = lowercase
175 |         self.workers = workers
176 |         self.spacy_exclude = spacy_exclude
177 |         self.custom_pos_tagger = custom_pos_tagger
178 |         self.max_df = max_df
179 |         self.min_df = min_df
180 |         self.binary = binary
181 |         self.dtype = dtype
182 |         self.decay = decay
183 |         self.delete_min_df = delete_min_df
184 |         self.norm = norm
185 |         self.use_idf = use_idf
186 |         self.smooth_idf = smooth_idf
187 |         self.sublinear_tf = sublinear_tf
188 | 
189 |         self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf, smooth_idf=self.smooth_idf,
190 |                                        sublinear_tf=self.sublinear_tf)
191 | 
192 |         super().__init__(spacy_pipeline=self.spacy_pipeline, pos_pattern=self.pos_pattern, stop_words=self.stop_words,
193 |                          lowercase=self.lowercase, workers=self.workers, spacy_exclude=self.spacy_exclude,
194 |                          custom_pos_tagger=self.custom_pos_tagger, max_df=self.max_df, min_df=self.min_df,
195 |                          binary=self.binary, dtype=self.dtype, decay=self.decay, delete_min_df=self.delete_min_df)
196 | 
197 |     def _check_params(self):
198 |         """
199 |         Validate dtype parameter.
200 |         """
201 | 
202 |         if self.dtype not in FLOAT_DTYPES:
203 |             warnings.warn(
204 |                 "Only {} 'dtype' should be used. {} 'dtype' will "
205 |                 "be converted to np.float64.".format(FLOAT_DTYPES, self.dtype),
206 |                 UserWarning,
207 |             )
208 | 
209 |     def fit(self, raw_documents: List[str]) -> object:
210 |         """Learn the keyphrases that match the defined part-of-speech pattern and idf from the list of raw documents.
211 | 
212 |         Parameters
213 |         ----------
214 |         raw_documents : iterable
215 |             An iterable of strings.
216 | 
217 |         Returns
218 |         -------
219 |         self : object
220 |             Fitted vectorizer.
221 |         """
222 | 
223 |         self._check_params()
224 |         X = super().fit_transform(raw_documents)
225 |         self._tfidf.fit(X)
226 |         return self
227 | 
228 |     def fit_transform(self, raw_documents: List[str]) -> List[List[float]]:
229 |         """
230 |         Learn the keyphrases that match the defined part-of-speech pattern and idf from the list of raw documents.
231 |         Then return document-keyphrase matrix.
232 |         This is equivalent to fit followed by transform, but more efficiently implemented.
233 | 
234 |         Parameters
235 |         ----------
236 |         raw_documents : iterable
237 |             An iterable of strings.
238 | 
239 |         Returns
240 |         -------
241 |         X : sparse matrix of (n_samples, n_features)
242 |             Tf-idf-weighted document-keyphrase matrix.
243 |         """
244 | 
245 |         self._check_params()
246 |         X = super().fit_transform(raw_documents)
247 |         self._tfidf.fit(X)
248 |         # X is already a transformed view of raw_documents so
249 |         # we set copy to False
250 |         return self._tfidf.transform(X, copy=False)
251 | 
252 |     def transform(self, raw_documents: List[str]) -> List[List[float]]:
253 |         """
254 |         Transform documents to document-keyphrase matrix.
255 |         Uses the keyphrases and document frequencies (df) learned by fit (or fit_transform).
256 | 
257 |         Parameters
258 |         ----------
259 |         raw_documents : iterable
260 |             An iterable of strings.
261 | 
262 |         Returns
263 |         -------
264 |         X : sparse matrix of (n_samples, n_features)
265 |             Tf-idf-weighted document-keyphrase matrix.
266 |         """
267 | 
268 |         # triggers a parameter validation
269 |         if not hasattr(self, 'keyphrases'):
270 |             raise NotFittedError("Keyphrases not fitted.")
271 | 
272 |         X = super().transform(raw_documents)
273 |         return self._tfidf.transform(X, copy=False)


--------------------------------------------------------------------------------
/keyphrase_vectorizers/keyphrase_vectorizer_mixin.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. _spaCy pipeline: https://spacy.io/models
  3 | .. _stopwords available in NLTK: https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip
  4 | .. _POS-tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
  5 | .. _regex pattern: https://docs.python.org/3/library/re.html#regular-expression-syntax
  6 | .. _spaCy pipeline components: https://spacy.io/usage/processing-pipelines#built-in
  7 | """
  8 | 
  9 | import logging
 10 | import os
 11 | from typing import List, Union
 12 | 
 13 | import nltk
 14 | import numpy as np
 15 | import psutil
 16 | import scipy.sparse as sp
 17 | import spacy
 18 | 
 19 | 
 20 | class _KeyphraseVectorizerMixin():
 21 |     """
 22 |     _KeyphraseVectorizerMixin
 23 | 
 24 |     Provides common code for text vectorizers.
 25 |     """
 26 | 
 27 |     def build_tokenizer(self) -> callable:
 28 |         """
 29 |         Return a function that splits a string into a sequence of tokens.
 30 | 
 31 |         Returns
 32 |         -------
 33 |         tokenizer: callable
 34 |               A function to split a string into a sequence of tokens.
 35 |         """
 36 | 
 37 |         return self._tokenize
 38 | 
 39 |     def _tokenize_simple(self, text: str) -> List[str]:
 40 |         """
 41 |         Simple tokenizer that just splits strings by whitespace.
 42 | 
 43 |         Parameters
 44 |         ----------
 45 |         text : str
 46 |                 The text to tokenize.
 47 | 
 48 |         Returns
 49 |         -------
 50 |         tokens: List[str]
 51 |               A list of tokens.
 52 |         """
 53 | 
 54 |         tokens = text.split()
 55 |         return tokens
 56 | 
 57 |     def _tokenize(self, text: str) -> List[str]:
 58 |         """
 59 |         Custom word tokenizer for sklearn vectorizer that uses a spaCy pipeline for tokenization.
 60 | 
 61 |         Parameters
 62 |         ----------
 63 |         text : str
 64 |                 The text to tokenize.
 65 | 
 66 |         Returns
 67 |         -------
 68 |         tokens: List[str]
 69 |               A list of tokens.
 70 |         """
 71 | 
 72 |         processed_documents, _ = self._get_pos_keyphrases(document_list=[text],
 73 |                                                           stop_words=self.stop_words,
 74 |                                                           spacy_pipeline=self.spacy_pipeline,
 75 |                                                           pos_pattern=self.pos_pattern,
 76 |                                                           lowercase=self.lowercase, workers=self.workers,
 77 |                                                           spacy_exclude=['tok2vec', 'tagger', 'parser',
 78 |                                                                          'attribute_ruler', 'lemmatizer', 'ner',
 79 |                                                                          'textcat'],
 80 |                                                           custom_pos_tagger=self.custom_pos_tagger,
 81 |                                                           extract_keyphrases=False)
 82 | 
 83 |         return self._tokenize_simple(processed_documents[0])
 84 | 
 85 |     def _document_frequency(self, document_keyphrase_count_matrix: List[List[int]]) -> np.array:
 86 |         """
 87 |         Count the number of non-zero values for each feature in sparse a matrix.
 88 | 
 89 |         Parameters
 90 |         ----------
 91 |         document_keyphrase_count_matrix : List[List[int]]
 92 |                 The document-keyphrase count matrix to transform to document frequencies.
 93 | 
 94 |         Returns
 95 |         -------
 96 |         document_frequencies : np.array
 97 |             Numpy array of document frequencies for keyphrases.
 98 |         """
 99 | 
100 |         document_keyphrase_count_matrix = sp.csr_matrix(document_keyphrase_count_matrix)
101 |         document_frequencies = np.bincount(document_keyphrase_count_matrix.indices,
102 |                                            minlength=document_keyphrase_count_matrix.shape[1])
103 | 
104 |         return document_frequencies
105 | 
106 |     def _remove_suffixes(self, text: str, suffixes: List[str]) -> str:
107 |         """
108 |         Removes pre-defined suffixes from a given text string.
109 | 
110 |         Parameters
111 |         ----------
112 |         text : str
113 |             Text string where suffixes should be removed.
114 | 
115 |         suffixes : list
116 |             List of strings that should be removed from the end of the text.
117 | 
118 |         Returns
119 |         -------
120 |         text : Text string with removed suffixes.
121 |         """
122 | 
123 |         for suffix in suffixes:
124 |             if text.lower().endswith(suffix.lower()):
125 |                 return text[:-len(suffix)].strip()
126 |         return text
127 | 
128 |     def _remove_prefixes(self, text: str, prefixes: List[str]) -> str:
129 |         """
130 |         Removes pre-defined prefixes from a given text string.
131 | 
132 |         Parameters
133 |         ----------
134 |         text : str
135 |             Text string where prefixes should be removed.
136 | 
137 |         prefixes :  list
138 |             List of strings that should be removed from the beginning of the text.
139 | 
140 |         Returns
141 |         -------
142 |         text : Text string with removed prefixes.
143 |         """
144 | 
145 |         for prefix in prefixes:
146 |             if text.lower().startswith(prefix.lower()):
147 |                 return text[len(prefix):].strip()
148 |         return text
149 | 
150 |     def _cumulative_length_joiner(self, text_list: List[str], max_text_length: int) -> List[str]:
151 |         """
152 |         Joins strings from list of strings to single string until maximum char length is reached.
153 |         Then join the next strings from list to a single string and so on.
154 | 
155 |         Parameters
156 |         ----------
157 |         text_list : list of strings
158 |             List of strings to join.
159 | 
160 |         max_text_length : int
161 |             Maximum character length of the joined strings.
162 | 
163 |         Returns
164 |         -------
165 |         list_of_joined_strings_with_max_length : List of joined text strings with max char length of 'max_text_length'.
166 |         """
167 | 
168 |         if isinstance(text_list, str):
169 |             raise ValueError("Iterable over raw texts expected, string object received.")
170 | 
171 |         if not isinstance(max_text_length, int) or max_text_length <= 0:
172 |             raise ValueError("max_text_length must be a positive integer.")
173 | 
174 |         joined_strings = []
175 |         current_string = ""
176 | 
177 |         for text in text_list:
178 |             if not text:
179 |                 continue
180 | 
181 |             # If the next text exceeds the max length, start a new string
182 |             if len(current_string) + len(text) + 1 > max_text_length:  # +1 for space character
183 |                 # Append the current string to the result list
184 |                 if current_string:
185 |                     joined_strings.append(current_string.strip())
186 |                 # Start a new string with the current text
187 |                 current_string = text
188 |             else:
189 |                 # Add the text to the current string
190 |                 if current_string:
191 |                     current_string += ' ' + text
192 |                 else:
193 |                     current_string = text
194 | 
195 |         # Append the last string to the result list
196 |         if current_string:
197 |             joined_strings.append(current_string.strip())
198 | 
199 |         return joined_strings
200 | 
201 |     def _split_long_document(self, text: str, max_text_length: int) -> List[str]:
202 |         """
203 |         Split single string in list of strings with a maximum character length.
204 | 
205 |         Parameters
206 |         ----------
207 |         text : str
208 |             Text string that should be split.
209 | 
210 |         max_text_length : int
211 |             Maximum character length of the strings.
212 | 
213 |         Returns
214 |         -------
215 |         splitted_document : List of text strings.
216 |         """
217 |         # triggers a parameter validation
218 |         if not isinstance(text, str):
219 |             raise ValueError(
220 |                 "'text' parameter needs to be a string."
221 |             )
222 | 
223 |         # triggers a parameter validation
224 |         if not isinstance(max_text_length, int):
225 |             raise ValueError(
226 |                 "'max_text_length' parameter needs to be a int"
227 |             )
228 | 
229 |         text = text.replace("? ", "?<stop>")
230 |         text = text.replace("! ", "!<stop>")
231 | 
232 |         if "<stop>" in text:
233 |             splitted_document = text.split("<stop>")
234 |             splitted_document = [s.strip() for s in splitted_document if s.strip()]  # Filter out empty strings
235 |             splitted_document = [
236 |                 self._cumulative_length_joiner(text_list=doc.split(" "), max_text_length=max_text_length) if len(
237 |                     doc) > max_text_length else [doc] for doc in splitted_document]
238 |             return [text for doc in splitted_document for text in doc]
239 |         else:
240 |             # No punctuation marks found, process the entire text
241 |             splitted_document = text.split(" ")
242 |             splitted_document = self._cumulative_length_joiner(text_list=splitted_document,
243 |                                                                max_text_length=max_text_length)
244 |             return splitted_document
245 | 
246 |     def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, List[str]], spacy_pipeline: Union[str, spacy.Language],
247 |                             pos_pattern: str, spacy_exclude: List[str], custom_pos_tagger: callable,
248 |                             lowercase: bool = True, workers: int = 1, extract_keyphrases: bool = True) -> List[str]:
249 |         """
250 |         Select keyphrases with part-of-speech tagging from a text document.
251 |         Parameters
252 |         ----------
253 |         document_list : list of str
254 |             List of text documents from which to extract the keyphrases.
255 | 
256 |         stop_words : Union[str, List[str]]
257 |             Language of stopwords to remove from the document, e.g. 'english'.
258 |             Supported options are `stopwords available in NLTK`_.
259 |             Removes unwanted stopwords from keyphrases if 'stop_words' is not None.
260 |             If given a list of custom stopwords, removes them instead.
261 | 
262 |         spacy_pipeline : Union[str, spacy.Language]
263 |             A spacy.Language object or the name of the `spaCy pipeline`_, used to tag the parts-of-speech in the text.
264 | 
265 |         pos_pattern : str
266 |             The `regex pattern`_ of `POS-tags`_ used to extract a sequence of POS-tagged tokens from the text.
267 | 
268 |         spacy_exclude : List[str]
269 |             A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging.
270 |             Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed.
271 | 
272 |         custom_pos_tagger : callable
273 |             A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples.
274 |             If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored.
275 | 
276 |         lowercase : bool, default=True
277 |             Whether the returned keyphrases should be converted to lowercase.
278 | 
279 |         workers : int, default=1
280 |             How many workers to use for spaCy part-of-speech tagging.
281 |             If set to -1, use all available worker threads of the machine.
282 |             spaCy uses the specified number of cores to tag documents with part-of-speech.
283 |             Depending on the platform, starting many processes with multiprocessing can add a lot of overhead.
284 |             In particular, the default start method spawn used in macOS/OS X (as of Python 3.8) and in Windows can be slow.
285 |             Therefore, carefully consider whether this option is really necessary.
286 | 
287 |         extract_keyphrases : bool, default=True
288 |             Whether to run the keyphrase extraction step or just return an empty list.
289 | 
290 |         Returns
291 |         -------
292 |         keyphrases : List of unique keyphrases of varying length, extracted from the text document with the defined 'pos_pattern'.
293 |         """
294 | 
295 |         # triggers a parameter validation
296 |         if isinstance(document_list, str):
297 |             raise ValueError(
298 |                 "Iterable over raw text documents expected, string object received."
299 |             )
300 | 
301 |         # triggers a parameter validation
302 |         if not hasattr(document_list, '__iter__'):
303 |             raise ValueError(
304 |                 "Iterable over raw text documents expected."
305 |             )
306 | 
307 |         # triggers a parameter validation
308 |         if not isinstance(stop_words, str) and (stop_words is not None) and (not hasattr(stop_words, '__iter__')):
309 |             raise ValueError(
310 |                 "'stop_words' parameter needs to be a string, e.g. 'english' or 'None' or a list of strings."
311 |             )
312 | 
313 |         # triggers a parameter validation
314 |         if not isinstance(spacy_pipeline, (str, spacy.Language)):
315 |             raise ValueError(
316 |                 "'spacy_pipeline' parameter needs to be a spacy.Language object or a spaCy pipeline string. E.g. 'en_core_web_sm'"
317 |             )
318 | 
319 |         # triggers a parameter validation
320 |         if not isinstance(pos_pattern, str):
321 |             raise ValueError(
322 |                 "'pos_pattern' parameter needs to be a regex string. E.g. '<J.*>*<N.*>+'"
323 |             )
324 | 
325 |         # triggers a parameter validation
326 |         if ((not hasattr(spacy_exclude, '__iter__')) and (spacy_exclude is not None)) or (
327 |                 isinstance(spacy_exclude, str)):
328 |             raise ValueError(
329 |                 "'spacy_exclude' parameter needs to be a list of 'spaCy pipeline components' strings."
330 |             )
331 | 
332 |         # triggers a parameter validation
333 |         if not callable(custom_pos_tagger) and (custom_pos_tagger is not None):
334 |             raise ValueError(
335 |                 "'custom_pos_tagger' must be a callable function that gets a list of strings in a 'raw_documents' parameter and returns a list of (word, POS-tag) tuples."
336 |             )
337 | 
338 |         # triggers a parameter validation
339 |         if not isinstance(workers, int):
340 |             raise ValueError(
341 |                 "'workers' parameter must be of type int."
342 |             )
343 | 
344 |         if (workers < -1) or (workers > psutil.cpu_count(logical=True)) or (workers == 0):
345 |             raise ValueError(
346 |                 "'workers' parameter value cannot be 0 and must be between -1 and " + str(
347 |                     psutil.cpu_count(logical=True))
348 |             )
349 | 
350 | 
351 |         stop_words_list = set()
352 |         if isinstance(stop_words, str):
353 |             try:
354 |                 stop_words_list = set(nltk.corpus.stopwords.words(stop_words))
355 |             except LookupError:
356 |                 logger = logging.getLogger('KeyphraseVectorizer')
357 |                 logger.setLevel(logging.WARNING)
358 |                 sh = logging.StreamHandler()
359 |                 sh.setFormatter(logging.Formatter(
360 |                     '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
361 |                 logger.addHandler(sh)
362 |                 logger.setLevel(logging.DEBUG)
363 |                 logger.info(
364 |                     'It looks like you do not have downloaded a list of stopwords yet. It is attempted to download the stopwords now.')
365 |                 nltk.download('stopwords')
366 |                 stop_words_list = set(nltk.corpus.stopwords.words(stop_words))
367 | 
368 |         elif hasattr(stop_words, '__iter__'):
369 |             stop_words_list = set(stop_words)
370 | 
371 |         # add spaCy PoS tags for documents
372 |         if not custom_pos_tagger:
373 |             if isinstance(spacy_pipeline, spacy.Language):
374 |                 nlp = spacy_pipeline
375 |             else:
376 |                 if not spacy_exclude:
377 |                     spacy_exclude = []
378 |                 try:
379 |                     if extract_keyphrases:
380 |                         nlp = spacy.load(spacy_pipeline, exclude=spacy_exclude)
381 |                     else:
382 |                         # only use tokenizer if no keywords are extracted
383 |                         nlp = spacy.blank(spacy_pipeline.split("_")[0])
384 | 
385 |                 except OSError:
386 |                     # set logger
387 |                     logger = logging.getLogger('KeyphraseVectorizer')
388 |                     logger.setLevel(logging.WARNING)
389 |                     sh = logging.StreamHandler()
390 |                     sh.setFormatter(logging.Formatter(
391 |                         '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
392 |                     logger.addHandler(sh)
393 |                     logger.setLevel(logging.DEBUG)
394 |                     logger.info(
395 |                         'It looks like the selected spaCy pipeline is not downloaded yet. It is attempted to download the spaCy pipeline now.')
396 |                     spacy.cli.download(spacy_pipeline)
397 |                     nlp = spacy.load(spacy_pipeline,
398 |                                      exclude=spacy_exclude)
399 | 
400 |         if workers != 1:
401 |             os.environ["TOKENIZERS_PARALLELISM"] = "false"
402 | 
403 |         # add document delimiter, so we can identify the original document split later
404 |         doc_delimiter = "thisisadocumentdelimiternotakeyphrasepleaseignore"
405 |         document_list = [doc_delimiter + " " + doc for doc in document_list]
406 | 
407 |         # split large documents in smaller chunks, so that spacy can process them without memory issues
408 |         docs_list = []
409 |         # set maximal character length of documents for spaCy processing
410 |         max_doc_length = 500
411 |         for document in document_list:
412 |             if len(document) > max_doc_length:
413 |                 docs_list.extend(self._split_long_document(text=document, max_text_length=max_doc_length))
414 |             else:
415 |                 docs_list.append(document)
416 |         document_list = docs_list
417 |         del docs_list
418 | 
419 |         # increase max length of documents that spaCy can parse
420 |         # (should only be done if parser and ner are not used due to memory issues)
421 |         if not custom_pos_tagger:
422 |             nlp.max_length = max([len(doc) for doc in document_list]) + 100
423 | 
424 |         if not custom_pos_tagger:
425 |             pos_tuples = []
426 |             for tagged_doc in nlp.pipe(document_list, n_process=workers):
427 |                 pos_tuples.extend([(word.text, word.tag_) for word in tagged_doc if word.text])
428 |         else:
429 |             pos_tuples = custom_pos_tagger(raw_documents=document_list)
430 | 
431 |         # get the original documents after they were processed by a tokenizer and a POS tagger
432 |         processed_docs = []
433 |         for tup in pos_tuples:
434 |             token = tup[0]
435 |             if lowercase:
436 |                 token = token.lower()
437 |             if token not in stop_words_list:
438 |                 processed_docs.append(token)
439 |         processed_docs = ' '.join(processed_docs)
440 | 
441 |         # add delimiter to stop_words_list to ignore it during keyphrase extraction
442 |         stop_words_list.add(doc_delimiter)
443 | 
444 |         # split processed documents by delimiter
445 |         processed_docs = [doc.strip() for doc in processed_docs.split(doc_delimiter)][1:]
446 | 
447 |         if extract_keyphrases:
448 |             # extract keyphrases that match the NLTK RegexpParser filter
449 |             keyphrases = []
450 |             # prefix_list = [stop_word + ' ' for stop_word in stop_words_list]
451 |             # suffix_list = [' ' + stop_word for stop_word in stop_words_list]
452 |             cp = nltk.RegexpParser('CHUNK: {(' + pos_pattern + ')}')
453 |             tree = cp.parse(pos_tuples)
454 |             for subtree in tree.subtrees(filter=lambda tuple: tuple.label() == 'CHUNK'):
455 |                 # join candidate keyphrase from single words
456 |                 keyphrase = ' '.join([i[0] for i in subtree.leaves() if i[0] not in stop_words_list])
457 | 
458 |                 # convert keyphrase to lowercase
459 |                 if lowercase:
460 |                     keyphrase = keyphrase.lower()
461 | 
462 |                 # remove stopword suffixes
463 |                 # keyphrase = self._remove_suffixes(keyphrase, suffix_list)
464 | 
465 |                 # remove stopword prefixes
466 |                 # keyphrase = self._remove_prefixes(keyphrase, prefix_list)
467 | 
468 |                 # remove whitespace from the beginning and end of keyphrases
469 |                 keyphrase = keyphrase.strip()
470 | 
471 |                 # do not include single keywords that are actually stopwords
472 |                 if keyphrase.lower() not in stop_words_list:
473 |                     keyphrases.append(keyphrase)
474 | 
475 |             # remove potential empty keyphrases
476 |             keyphrases = [keyphrase for keyphrase in keyphrases if keyphrase != '']
477 | 
478 |         else:
479 |             keyphrases = []
480 | 
481 |         return processed_docs, list(set(keyphrases))


--------------------------------------------------------------------------------
/keyphrase_vectorizers/keyphrase_count_vectorizer.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. _spaCy pipeline: https://spacy.io/models
  3 | .. _stopwords available in NLTK: https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip
  4 | .. _POS-tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
  5 | .. _regex pattern: https://docs.python.org/3/library/re.html#regular-expression-syntax
  6 | .. _spaCy part-of-speech tags: https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
  7 | .. _spaCy pipeline components: https://spacy.io/usage/processing-pipelines#built-in
  8 | """
  9 | 
 10 | import warnings
 11 | from typing import List, Union
 12 | 
 13 | import numpy as np
 14 | import psutil
 15 | import spacy
 16 | from scipy import sparse
 17 | from scipy.sparse import csr_matrix
 18 | from sklearn.base import BaseEstimator
 19 | from sklearn.exceptions import NotFittedError
 20 | from sklearn.feature_extraction.text import CountVectorizer
 21 | from sklearn.utils.deprecation import deprecated
 22 | 
 23 | from keyphrase_vectorizers.keyphrase_vectorizer_mixin import _KeyphraseVectorizerMixin
 24 | 
 25 | 
 26 | class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
 27 |     """
 28 |     KeyphraseCountVectorizer
 29 | 
 30 |     KeyphraseCountVectorizer converts a collection of text documents to a matrix of document-token counts.
 31 |     The tokens are keyphrases that are extracted from the text documents based on their part-of-speech tags.
 32 |     The matrix rows indicate the documents and columns indicate the unique keyphrases. Each cell represents the count.
 33 |     The part-of-speech pattern of keyphrases can be defined by the ``pos_pattern`` parameter.
 34 |     By default, keyphrases are extracted, that have 0 or more adjectives, followed by 1 or more nouns.
 35 |     A list of extracted keyphrases matching the defined part-of-speech pattern can be returned after fitting via :class:`get_feature_names_out()`.
 36 | 
 37 |     Attention:
 38 |         If the vectorizer is used for languages other than English, the ``spacy_pipeline`` and ``stop_words`` parameters
 39 |         must be customized accordingly.
 40 |         Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_  differ between languages.
 41 |         Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
 42 |         In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
 43 | 
 44 |     Parameters
 45 |     ----------
 46 |     spacy_pipeline : Union[str, spacy.Language], default='en_core_web_sm'
 47 |             A spacy.Language object or the name of the `spaCy pipeline`_, used to tag the parts-of-speech in the text. Standard is the 'en' pipeline.
 48 | 
 49 |     pos_pattern :  str, default='<J.*>*<N.*>+'
 50 |         The `regex pattern`_ of `POS-tags`_ used to extract a sequence of POS-tagged tokens from the text.
 51 |         Standard is to only select keyphrases that have 0 or more adjectives, followed by 1 or more nouns.
 52 | 
 53 |     stop_words : Union[str, List[str]], default='english'
 54 |           Language of stopwords to remove from the document, e.g. 'english'.
 55 |           Supported options are `stopwords available in NLTK`_.
 56 |           Removes unwanted stopwords from keyphrases if 'stop_words' is not None.
 57 |           If given a list of custom stopwords, removes them instead.
 58 | 
 59 |     lowercase : bool, default=True
 60 |           Whether the returned keyphrases should be converted to lowercase.
 61 | 
 62 |     workers : int, default=1
 63 |           How many workers to use for spaCy part-of-speech tagging.
 64 |           If set to -1, use all available worker threads of the machine.
 65 |           SpaCy uses the specified number of cores to tag documents with part-of-speech.
 66 |           Depending on the platform, starting many processes with multiprocessing can add a lot of overhead.
 67 |           In particular, the default start method spawn used in macOS/OS X (as of Python 3.8) and in Windows can be slow.
 68 |           Therefore, carefully consider whether this option is really necessary.
 69 | 
 70 |     spacy_exclude : List[str], default=['parser', 'attribute_ruler', 'lemmatizer', 'ner']
 71 |           A list of `spaCy pipeline components`_ that should be excluded during the POS-tagging.
 72 |           Removing not needed pipeline components can sometimes make a big difference and improve loading and inference speed.
 73 | 
 74 |     custom_pos_tagger: callable, default=None
 75 |           A callable function which expects a list of strings in a 'raw_documents' parameter and returns a list of (word token, POS-tag) tuples.
 76 |           If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored.
 77 | 
 78 |     max_df : int, default=None
 79 |           During fitting ignore keyphrases that have a document frequency strictly higher than the given threshold.
 80 | 
 81 |     min_df : int, default=None
 82 |           During fitting ignore keyphrases that have a document frequency strictly lower than the given threshold.
 83 |           This value is also called cut-off in the literature.
 84 | 
 85 |     binary : bool, default=False
 86 |           If True, all non zero counts are set to 1.
 87 |           This is useful for discrete probabilistic models that model binary events rather than integer counts.
 88 | 
 89 |     dtype : type, default=np.int64
 90 |           Type of the matrix returned by fit_transform() or transform().
 91 | 
 92 |     decay : float, default=None
 93 |           A value between [0, 1] to weight the percentage of frequencies
 94 |           the previous bag-of-words should be decreased. For example,
 95 |           a value of `.1` will decrease the frequencies in the bag-of-words
 96 |           matrix with 10% at each iteration.
 97 | 
 98 |     delete_min_df : float, default=None
 99 |           Delete words at each iteration from its vocabulary
100 |           that are below a minimum frequency.
101 |           This will keep the resulting bag-of-words matrix small
102 |           such that it does not explode in size with increasing
103 |           vocabulary. If `decay` is None then this equals `min_df`.
104 | """
105 | 
106 |     def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm', pos_pattern: str = '<J.*>*<N.*>+',
107 |                  stop_words: Union[str, List[str]] = 'english', lowercase: bool = True, workers: int = 1,
108 |                  spacy_exclude: List[str] = ['parser', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat'],
109 |                  custom_pos_tagger: callable = None,
110 |                  max_df: int = None, min_df: int = None, binary: bool = False, dtype: np.dtype = np.int64,
111 |                  decay: float = None, delete_min_df: float = None):
112 | 
113 |         # triggers a parameter validation
114 |         if not isinstance(min_df, int) and min_df is not None:
115 |             raise ValueError(
116 |                 "'min_df' parameter must be of type int"
117 |             )
118 | 
119 |         # triggers a parameter validation
120 |         if not isinstance(decay, float) and min_df is not None:
121 |             raise ValueError(
122 |                 "'decay' parameter must be of type int"
123 |             )
124 | 
125 |         # triggers a parameter validation
126 |         if not isinstance(delete_min_df, float) and min_df is not None:
127 |             raise ValueError(
128 |                 "'delete_min_df' parameter must be of type int"
129 |             )
130 | 
131 |         # triggers a parameter validation
132 |         if min_df == 0:
133 |             raise ValueError(
134 |                 "'min_df' parameter must be > 0"
135 |             )
136 | 
137 |         # triggers a parameter validation
138 |         if not isinstance(max_df, int) and max_df is not None:
139 |             raise ValueError(
140 |                 "'max_df' parameter must be of type int"
141 |             )
142 | 
143 |         # triggers a parameter validation
144 |         if max_df == 0:
145 |             raise ValueError(
146 |                 "'max_df' parameter must be > 0"
147 |             )
148 | 
149 |         # triggers a parameter validation
150 |         if max_df and min_df and max_df <= min_df:
151 |             raise ValueError(
152 |                 "'max_df' must be > 'min_df'"
153 |             )
154 | 
155 |         # triggers a parameter validation
156 |         if not isinstance(workers, int):
157 |             raise ValueError(
158 |                 "'workers' parameter must be of type int"
159 |             )
160 | 
161 |         if (workers < -1) or (workers > psutil.cpu_count(logical=True)) or (workers == 0):
162 |             raise ValueError(
163 |                 "'workers' parameter value cannot be 0 and must be between -1 and " + str(
164 |                     psutil.cpu_count(logical=True))
165 |             )
166 | 
167 |         self.spacy_pipeline = spacy_pipeline
168 |         self.pos_pattern = pos_pattern
169 |         self.stop_words = stop_words
170 |         self.lowercase = lowercase
171 |         self.workers = workers
172 |         self.spacy_exclude = spacy_exclude
173 |         self.custom_pos_tagger = custom_pos_tagger
174 |         self.max_df = max_df
175 |         self.min_df = min_df
176 |         self.binary = binary
177 |         self.dtype = dtype
178 |         self.decay = decay
179 |         self.delete_min_df = delete_min_df
180 |         self.running_fit_transform = False
181 | 
182 |     def fit(self, raw_documents: List[str]) -> object:
183 |         """
184 |         Learn the keyphrases that match the defined part-of-speech pattern from the list of raw documents.
185 | 
186 |         Parameters
187 |         ----------
188 |         raw_documents : iterable
189 |             An iterable of strings.
190 | 
191 |         Returns
192 |         -------
193 |         self : object
194 |             Fitted vectorizer.
195 |         """
196 | 
197 |         processed_documents, self.keyphrases = self._get_pos_keyphrases(document_list=raw_documents,
198 |                                                                         stop_words=self.stop_words,
199 |                                                                         spacy_pipeline=self.spacy_pipeline,
200 |                                                                         pos_pattern=self.pos_pattern,
201 |                                                                         lowercase=self.lowercase, workers=self.workers,
202 |                                                                         spacy_exclude=self.spacy_exclude,
203 |                                                                         custom_pos_tagger=self.custom_pos_tagger,
204 |                                                                         extract_keyphrases=True)
205 | 
206 |         # if the fit_transform process is currently running, pass the processed documents, so they do not need to be tokenized again
207 |         if self.running_fit_transform:
208 |             self.processed_documents = processed_documents
209 | 
210 |         # remove keyphrases that have more than 8 words, as they are probably no real keyphrases
211 |         # additionally this prevents memory issues during transformation to a document-keyphrase matrix
212 |         self.keyphrases = [keyphrase for keyphrase in self.keyphrases if len(keyphrase.split()) <= 8]
213 | 
214 |         # compute document frequencies of keyphrases
215 |         if self.max_df or self.min_df:
216 |             document_keyphrase_counts = CountVectorizer(vocabulary=self.keyphrases, ngram_range=(
217 |                 min([len(keyphrase.split()) for keyphrase in self.keyphrases]),
218 |                 max([len(keyphrase.split()) for keyphrase in self.keyphrases])),
219 |                                                         lowercase=self.lowercase, binary=self.binary,
220 |                                                         dtype=self.dtype, tokenizer=self._tokenize_simple).transform(
221 |                 raw_documents=processed_documents).toarray()
222 | 
223 |             document_frequencies = self._document_frequency(document_keyphrase_counts)
224 | 
225 |         # remove keyphrases with document frequencies < min_df and document frequencies > max_df
226 |         if self.max_df:
227 |             self.keyphrases = [keyphrase for index, keyphrase in enumerate(self.keyphrases) if
228 |                                (document_frequencies[index] <= self.max_df)]
229 |         if self.min_df:
230 |             self.keyphrases = [keyphrase for index, keyphrase in enumerate(self.keyphrases) if
231 |                                (document_frequencies[index] >= self.min_df)]
232 | 
233 |         # set n-gram range to zero if no keyphrases could be extracted
234 |         if self.keyphrases:
235 |             self.max_n_gram_length = max([len(keyphrase.split()) for keyphrase in self.keyphrases])
236 |             self.min_n_gram_length = min([len(keyphrase.split()) for keyphrase in self.keyphrases])
237 |         else:
238 |             raise ValueError(
239 |                 "Empty keyphrases. Perhaps the documents do not contain keyphrases that match the 'pos_pattern' argument, only contain stop words, or you set the 'min_df'/'max_df/delete_min_df' arguments too strict.")
240 | 
241 |         return self
242 | 
243 |     def fit_transform(self, raw_documents: List[str]) -> List[List[int]]:
244 |         """
245 |         Learn the keyphrases that match the defined part-of-speech pattern from the list of raw documents
246 |         and return the document-keyphrase matrix.
247 |         This is equivalent to fit followed by transform, but more efficiently implemented.
248 | 
249 |         Parameters
250 |         ----------
251 |         raw_documents : iterable
252 |             An iterable of strings.
253 | 
254 |         Returns
255 |         -------
256 |         X : array of shape (n_samples, n_features)
257 |             Document-keyphrase matrix.
258 |         """
259 | 
260 |         # indicate if the fit_trasnform process is currently running
261 |         self.running_fit_transform = True
262 | 
263 |         # fit
264 |         KeyphraseCountVectorizer.fit(self=self, raw_documents=raw_documents)
265 | 
266 |         # transform
267 |         count_matrix = CountVectorizer(vocabulary=self.keyphrases,
268 |                                        ngram_range=(self.min_n_gram_length, self.max_n_gram_length),
269 |                                        lowercase=self.lowercase, binary=self.binary, dtype=self.dtype,
270 |                                        tokenizer=self._tokenize_simple).fit_transform(
271 |             raw_documents=self.processed_documents)
272 | 
273 |         del self.processed_documents
274 |         self.running_fit_transform = False
275 | 
276 |         return count_matrix
277 | 
278 |     def transform(self, raw_documents: List[str]) -> List[List[int]]:
279 |         """
280 |         Transform documents to document-keyphrase matrix.
281 |         Extract token counts out of raw text documents using the keyphrases
282 |         fitted with fit.
283 | 
284 |         Parameters
285 |         ----------
286 |         raw_documents : iterable
287 |             An iterable of strings.
288 | 
289 |         Returns
290 |         -------
291 |         X : sparse matrix of shape (n_samples, n_features)
292 |             Document-keyphrase matrix.
293 |         """
294 | 
295 |         # triggers a parameter validation
296 |         if not hasattr(self, 'keyphrases'):
297 |             raise NotFittedError("Keyphrases not fitted.")
298 | 
299 |         # triggers a parameter validation
300 |         if self.keyphrases == []:
301 |             raise ValueError(
302 |                 "Empty keyphrases. Perhaps the documents used to fit did not contain any keyphrases or you set the 'min_df'/'max_df/delete_min_df' arguments too strict."
303 |             )
304 | 
305 |         processed_documents, _ = self._get_pos_keyphrases(document_list=raw_documents,
306 |                                                           stop_words=self.stop_words,
307 |                                                           spacy_pipeline=self.spacy_pipeline,
308 |                                                           pos_pattern=self.pos_pattern,
309 |                                                           lowercase=self.lowercase, workers=self.workers,
310 |                                                           spacy_exclude=['tok2vec', 'tagger', 'parser',
311 |                                                                          'attribute_ruler', 'lemmatizer', 'ner',
312 |                                                                          'textcat'],
313 |                                                           custom_pos_tagger=self.custom_pos_tagger,
314 |                                                           extract_keyphrases=False)
315 | 
316 |         return CountVectorizer(vocabulary=self.keyphrases, ngram_range=(self.min_n_gram_length, self.max_n_gram_length),
317 |                                lowercase=self.lowercase, binary=self.binary, dtype=self.dtype,
318 |                                tokenizer=self._tokenize_simple).transform(
319 |             raw_documents=processed_documents)
320 | 
321 |     def inverse_transform(self, X: List[List[int]]) -> List[List[str]]:
322 |         """
323 |         Return keyphrases per document with nonzero entries in X.
324 | 
325 |         Parameters
326 |         ----------
327 |         X : {array-like, sparse matrix} of shape (n_samples, n_features)
328 |             Document-keyphrase matrix.
329 | 
330 |         Returns
331 |         -------
332 |         X_inv : list of arrays of shape (n_samples,)
333 |             List of arrays of keyphrase.
334 |         """
335 | 
336 |         # triggers a parameter validation
337 |         if not hasattr(self, 'keyphrases'):
338 |             raise NotFittedError("Keyphrases not fitted.")
339 | 
340 |         return CountVectorizer(vocabulary=self.keyphrases, ngram_range=(self.min_n_gram_length, self.max_n_gram_length),
341 |                                lowercase=self.lowercase, binary=self.binary, dtype=self.dtype).inverse_transform(X=X)
342 | 
343 |     @deprecated(
344 |         "get_feature_names() is deprecated in scikit-learn 1.0 and will be removed "
345 |         "with scikit-learn 1.2. Please use get_feature_names_out() instead."
346 |     )
347 |     def get_feature_names(self) -> List[str]:
348 |         """
349 |         Array mapping from feature integer indices to feature name.
350 | 
351 |         Returns
352 |         -------
353 |         feature_names : list
354 |             A list of fitted keyphrases.
355 |         """
356 | 
357 |         # triggers a parameter validation
358 |         if not hasattr(self, 'keyphrases'):
359 |             raise NotFittedError("Keyphrases not fitted.")
360 | 
361 |         # raise DeprecationWarning when function is removed from scikit-learn
362 |         try:
363 |             with warnings.catch_warnings():
364 |                 warnings.simplefilter("ignore")
365 |                 return CountVectorizer(vocabulary=self.keyphrases,
366 |                                        ngram_range=(self.min_n_gram_length, self.max_n_gram_length),
367 |                                        lowercase=self.lowercase, binary=self.binary,
368 |                                        dtype=self.dtype).get_feature_names()
369 |         except AttributeError:
370 |             raise DeprecationWarning("get_feature_names() is deprecated. Please use 'get_feature_names_out()' instead.")
371 | 
372 |     def get_feature_names_out(self) -> np.array(str):
373 |         """
374 |         Get fitted keyphrases for transformation.
375 | 
376 |         Returns
377 |         -------
378 |         feature_names_out : ndarray of str objects
379 |             Transformed keyphrases.
380 |         """
381 | 
382 |         # triggers a parameter validation
383 |         if not hasattr(self, 'keyphrases'):
384 |             raise NotFittedError("Keyphrases not fitted.")
385 | 
386 |         return CountVectorizer(vocabulary=self.keyphrases, ngram_range=(self.min_n_gram_length, self.max_n_gram_length),
387 |                                lowercase=self.lowercase, binary=self.binary, dtype=self.dtype).get_feature_names_out()
388 | 
389 |     def partial_fit(self, raw_documents: List[str]) -> None:
390 |         """
391 |         Perform a partial fit and update internal list of keyphrases with OOV keyphrases
392 | 
393 |         Parameters
394 |         ----------
395 |         raw_documents : iterable
396 |             An iterable of strings.
397 | 
398 |         Returns
399 |         -------
400 |         self : object
401 |             Partial fitted vectorizer.
402 |         """
403 | 
404 |         if not hasattr(self, 'keyphrases'):
405 |             return self.fit(raw_documents)
406 | 
407 |         processed_documents, new_keyphrases = self._get_pos_keyphrases(document_list=raw_documents,
408 |                                                                        stop_words=self.stop_words,
409 |                                                                        spacy_pipeline=self.spacy_pipeline,
410 |                                                                        pos_pattern=self.pos_pattern,
411 |                                                                        lowercase=self.lowercase, workers=self.workers,
412 |                                                                        spacy_exclude=self.spacy_exclude,
413 |                                                                        custom_pos_tagger=self.custom_pos_tagger,
414 |                                                                        extract_keyphrases=True)
415 | 
416 |         oov_keyphrases = list(set(new_keyphrases).difference(set(self.keyphrases)))
417 | 
418 |         # remove keyphrases that have more than 8 words, as they are probably no real keyphrases
419 |         # additionally this prevents memory issues during transformation to a document-keyphrase matrix
420 |         oov_keyphrases = [keyphrase for keyphrase in oov_keyphrases if len(keyphrase.split()) <= 8]
421 | 
422 |         # compute document frequencies of keyphrases
423 |         if self.max_df or self.min_df:
424 |             document_keyphrase_counts = CountVectorizer(vocabulary=oov_keyphrases, ngram_range=(
425 |                 min([len(keyphrase.split()) for keyphrase in oov_keyphrases]),
426 |                 max([len(keyphrase.split()) for keyphrase in oov_keyphrases])),
427 |                                                         lowercase=self.lowercase, binary=self.binary,
428 |                                                         dtype=self.dtype, tokenizer=self._tokenize_simple).transform(
429 |                 raw_documents=processed_documents).toarray()
430 | 
431 |             document_frequencies = self._document_frequency(document_keyphrase_counts)
432 | 
433 |         # remove keyphrases with document frequencies < min_df and document frequencies > max_df
434 |         if self.max_df:
435 |             oov_keyphrases = [keyphrase for index, keyphrase in enumerate(oov_keyphrases) if
436 |                               (document_frequencies[index] <= self.max_df)]
437 |         if self.min_df:
438 |             oov_keyphrases = [keyphrase for index, keyphrase in enumerate(oov_keyphrases) if
439 |                               (document_frequencies[index] >= self.min_df)]
440 | 
441 |         if oov_keyphrases:
442 |             self.keyphrases = self.keyphrases + oov_keyphrases
443 |             self.max_n_gram_length = max([len(keyphrase.split()) for keyphrase in self.keyphrases])
444 |             self.min_n_gram_length = min([len(keyphrase.split()) for keyphrase in self.keyphrases])
445 | 
446 |         return self
447 | 
448 |     def update_bow(self, raw_documents: List[str]) -> csr_matrix:
449 |         """
450 |         Create or update the bag-of-keywords matrix
451 | 
452 |         Update the bag-of-keywords matrix by adding the newly transformed
453 |         documents. This may add empty columns if new words are found and/or
454 |         add empty rows if new topics are found.
455 | 
456 |         During this process, the previous bag-of-keywords matrix might be
457 |         decayed if `self.decay` has been set during init. Similarly, words
458 |         that do not exceed `self.delete_min_df` are removed from its
459 |         vocabulary and bag-of-keywords matrix.
460 | 
461 |         Parameters
462 |         ----------
463 |         raw_documents : iterable
464 |             An iterable of strings.
465 | 
466 |         Returns
467 |         -------
468 |         X_ : scipy.sparse.csr_matrix
469 |             Bag-of-keywords matrix
470 |         """
471 | 
472 |         if hasattr(self, "X_"):
473 |             X = self.transform(raw_documents)
474 | 
475 |             # Add empty columns if new words are found
476 |             columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
477 |             self.X_ = sparse.hstack([self.X_, columns])
478 | 
479 |             # Add empty rows if new topics are found
480 |             rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
481 |             self.X_ = sparse.vstack([self.X_, rows])
482 | 
483 |             # Decay of BoW matrix
484 |             if self.decay is not None:
485 |                 self.X_ = self.X_ * (1 - self.decay)
486 | 
487 |             self.X_ += X
488 |         else:
489 |             self.X_ = self.transform(raw_documents)
490 | 
491 |         if self.delete_min_df is not None:
492 |             self._clean_bow()
493 | 
494 |         return self.X_
495 | 
496 |     def _clean_bow(self) -> None:
497 |         """
498 |         Remove words that do not exceed `delete_min_df`
499 |         """
500 | 
501 |         # Only keep words with a minimum frequency
502 |         indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1]
503 |         self.X_ = self.X_[:, indices]
504 | 
505 |         x = np.array(self.keyphrases)
506 |         mask = np.full(len(self.keyphrases), True, dtype=bool)
507 |         mask[indices] = False
508 |         self.keyphrases = list(x[~mask])


--------------------------------------------------------------------------------
/docs/KeyphraseVectorizers.md:
--------------------------------------------------------------------------------
  1 | [![PyPI - Python](https://img.shields.io/badge/python-%3E%3D3.7-blue)](https://pypi.org/project/keyphrase-vectorizers/)
  2 | [![License](https://img.shields.io/badge/License-BSD_3--Clause-green.svg)](https://github.com/TimSchopf/Keyphrase_Vectorizers/blob/master/LICENSE)
  3 | [![PyPI - PyPi](https://img.shields.io/pypi/v/keyphrase-vectorizers.svg)](https://pypi.org/project/keyphrase-vectorizers/)
  4 | [![Build](https://img.shields.io/github/workflow/status/TimSchopf/KeyphraseVectorizers/Code%20tests/master)](https://pypi.org/project/keyphrase-vectorizers/)
  5 | [![Documentation Status](https://readthedocs.org/projects/keyphrase-vectorizers/badge/?version=latest)](https://keyphrase-vectorizers.readthedocs.io/en/latest/?badge=latest)
  6 | 
  7 | KeyphraseVectorizers
  8 | ===================== 
  9 | 
 10 | Set of vectorizers that extract keyphrases with part-of-speech patterns from a collection of text documents and convert
 11 | them into a document-keyphrase matrix. A document-keyphrase matrix is a mathematical matrix that describes the frequency
 12 | of keyphrases that occur in a collection of documents. The matrix rows indicate the text documents and columns indicate
 13 | the unique keyphrases.
 14 | 
 15 | The package contains wrappers of the
 16 | [sklearn.feature_extraction.text.CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=countvectorizer#sklearn.feature_extraction.text.CountVectorizer "scikit-learn CountVectorizer")
 17 | and
 18 | [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer "scikit-learn TfidfVectorizer")
 19 | classes. Instead of using n-gram tokens of a pre-defined range, these classes extract keyphrases from text documents
 20 | using part-of-speech tags to compute document-keyphrase matrices.
 21 | 
 22 | Benefits
 23 | --------
 24 | 
 25 | * Extract grammatically accurate keyphases based on their part-of-speech tags.
 26 | * No need to specify n-gram ranges.
 27 | * Get document-keyphrase matrices.
 28 | * Multiple language support.
 29 | * User-defined part-of-speech patterns for keyphrase extraction possible.
 30 | 
 31 | <a name="toc"/></a>
 32 | 
 33 | Table of Contents
 34 | -----------------
 35 | 
 36 | <!--ts-->
 37 | 
 38 | 1. [How does it work?](#how-does-it-work)
 39 | 2. [Installation](#installation)
 40 | 3. [Usage](#usage)
 41 |     1. [KeyphraseCountVectorizer](#KeyphraseCountVectorizer)
 42 |         1. [English language](#english-language)
 43 |         2. [Other languages](#other-languages)
 44 |     2. [KeyphraseTfidfVectorizer](#KeyphraseTfidfVectorizer)
 45 |     3. [Keyphrase extraction with KeyBERT](#keyphrase-extraction-with-keybert)
 46 |     4. [Topic modeling with BERTopic and KeyphraseVectorizers](#topic-modeling-with-bertopic-and-keyphrasevectorizers)
 47 | 
 48 | <!--te-->
 49 | 
 50 | <a name="#how-does-it-work"/></a>
 51 | 
 52 | How does it work?
 53 | -----------------
 54 | 
 55 | First, the document texts are annotated with [spaCy](https://spacy.io "spaCy homepage") part-of-speech tags. A list of
 56 | all possible spaCy part-of-speech tags for different languages is
 57 | linked [here](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py "spaCy POS tags"). The annotation
 58 | requires passing the [spaCy pipeline](https://spacy.io/models "available spaCy pipelines") of the corresponding language
 59 | to the vectorizer with the `spacy_pipeline` parameter.
 60 | 
 61 | Second, words are extracted from the document texts whose part-of-speech tags match the regex pattern defined in
 62 | the `pos_pattern`
 63 | parameter. The keyphrases are a list of unique words extracted from text documents by this method.
 64 | 
 65 | Finally, the vectorizers calculate document-keyphrase matrices.
 66 | 
 67 | <a name="#installation"/></a>
 68 | 
 69 | Installation
 70 | ------------
 71 | 
 72 | ```
 73 | pip install keyphrase-vectorizers
 74 | ```
 75 | 
 76 | <a name="#usage"/></a>
 77 | 
 78 | Usage
 79 | -----
 80 | For detailed information visit
 81 | the [API Guide](https://keyphrase-vectorizers.readthedocs.io/en/latest/index.html "Keyphrase_Vectorizers API Guide").
 82 | 
 83 | <a name="#KeyphraseCountVectorizer"/></a>
 84 | 
 85 | ### KeyphraseCountVectorizer
 86 | 
 87 | [Back to Table of Contents](#toc)
 88 | 
 89 | <a name="#english-language"/></a>
 90 | 
 91 | #### English language
 92 | 
 93 | ```python
 94 | from keyphrase_vectorizers import KeyphraseCountVectorizer
 95 | 
 96 | docs = ["""Supervised learning is the machine learning task of learning a function that
 97 |          maps an input to an output based on example input-output pairs. It infers a
 98 |          function from labeled training data consisting of a set of training examples.
 99 |          In supervised learning, each example is a pair consisting of an input object
100 |          (typically a vector) and a desired output value (also called the supervisory signal). 
101 |          A supervised learning algorithm analyzes the training data and produces an inferred function, 
102 |          which can be used for mapping new examples. An optimal scenario will allow for the 
103 |          algorithm to correctly determine the class labels for unseen instances. This requires 
104 |          the learning algorithm to generalize from the training data to unseen situations in a 
105 |          'reasonable' way (see inductive bias).""", 
106 |              
107 |         """Keywords are defined as phrases that capture the main topics discussed in a document. 
108 |         As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
109 |         In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
110 |         of keywords can quickly help to determine whether a given document is relevant to their interest. 
111 |         As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
112 |         by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
113 |         in information retrieval."""]
114 |         
115 | # Init default vectorizer.
116 | vectorizer = KeyphraseCountVectorizer()
117 | 
118 | # Print parameters
119 | print(vectorizer.get_params())
120 | >>> {'binary': False, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
121 | ```
122 | 
123 | By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is
124 | specified, English `stop_words` are removed, and the `pos_pattern` extracts keywords that have 0 or more adjectives,
125 | followed by 1 or more nouns using the English spaCy part-of-speech tags.
126 | 
127 | ```python
128 | # After initializing the vectorizer, it can be fitted
129 | # to learn the keyphrases from the text documents.
130 | vectorizer.fit(docs)
131 | ```
132 | 
133 | ```python
134 | # After learning the keyphrases, they can be returned.
135 | keyphrases = vectorizer.get_feature_names_out()
136 | 
137 | print(keyphrases)
138 | >>> ['output' 'training data' 'task' 'way' 'input object' 'documents'
139 |  'unseen instances' 'vector' 'interest' 'learning algorithm'
140 |  'unseen situations' 'training examples' 'machine' 'given document'
141 |  'document' 'document relevance' 'output pairs' 'document content'
142 |  'class labels' 'new examples' 'pair' 'main topics' 'phrases' 'overlap'
143 |  'algorithm' 'various applications' 'information retrieval' 'users' 'list'
144 |  'example input' 'supervised learning' 'optimal scenario'
145 |  'precise summary' 'keywords' 'input' 'supervised learning algorithm'
146 |  'example' 'supervisory signal' 'indication' 'set'
147 |  'information retrieval environment' 'output value' 'inductive bias'
148 |  'groups' 'function']
149 | ```
150 | 
151 | ```python
152 | # After fitting, the vectorizer can transform the documents 
153 | # to a document-keyphrase matrix.
154 | # Matrix rows indicate the documents and columns indicate the unique keyphrases.
155 | # Each cell represents the count.
156 | document_keyphrase_matrix = vectorizer.transform(docs).toarray()
157 | 
158 | print(document_keyphrase_matrix)
159 | >>> [[3 3 1 1 1 0 1 1 0 2 1 1 1 0 0 0 1 0 1 1 1 0 0 0 3 0 0 0 0 1 3 1 0 0 3 1
160 |   2 1 0 1 0 1 1 0 3]
161 |  [0 0 0 0 0 1 0 0 1 0 0 0 0 1 5 1 0 1 0 0 0 2 1 1 0 1 2 1 1 0 0 0 1 5 0 0
162 |   0 0 1 0 1 0 0 1 0]]
163 | ```
164 | 
165 | ```python
166 | # Fit and transform can also be executed in one step, 
167 | # which is more efficient. 
168 | document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray()
169 | 
170 | print(document_keyphrase_matrix)
171 | >>> [[3 3 1 1 1 0 1 1 0 2 1 1 1 0 0 0 1 0 1 1 1 0 0 0 3 0 0 0 0 1 3 1 0 0 3 1
172 |   2 1 0 1 0 1 1 0 3]
173 |  [0 0 0 0 0 1 0 0 1 0 0 0 0 1 5 1 0 1 0 0 0 2 1 1 0 1 2 1 1 0 0 0 1 5 0 0
174 |   0 0 1 0 1 0 0 1 0]]
175 | ```
176 | 
177 | <a name="#other-languages"/></a>
178 | 
179 | #### Other languages
180 | 
181 | [Back to Table of Contents](#toc)
182 | 
183 | ```python
184 | german_docs = ["""Goethe stammte aus einer angesehenen bürgerlichen Familie. 
185 |                 Sein Großvater mütterlicherseits war als Stadtschultheiß höchster Justizbeamter der Stadt Frankfurt, 
186 |                 sein Vater Doktor der Rechte und Kaiserlicher Rat. Er und seine Schwester Cornelia erfuhren eine aufwendige 
187 |                 Ausbildung durch Hauslehrer. Dem Wunsch seines Vaters folgend, studierte Goethe in Leipzig und Straßburg 
188 |                 Rechtswissenschaft und war danach als Advokat in Wetzlar und Frankfurt tätig. 
189 |                 Gleichzeitig folgte er seiner Neigung zur Dichtkunst.""",
190 |               
191 |                """Friedrich Schiller wurde als zweites Kind des Offiziers, Wundarztes und Leiters der Hofgärtnerei in 
192 |                Marbach am Neckar Johann Kaspar Schiller und dessen Ehefrau Elisabetha Dorothea Schiller, geb. Kodweiß, 
193 |                die Tochter eines Wirtes und Bäckers war, 1759 in Marbach am Neckar geboren
194 |                """]
195 | # Init vectorizer for the german language
196 | vectorizer = KeyphraseCountVectorizer(spacy_pipeline='de_core_news_sm', pos_pattern='<ADJ.*>*<N.*>+', stop_words='german')
197 | ```
198 | 
199 | The German `spacy_pipeline` is specified and German `stop_words` are removed. Because the German spaCy part-of-speech
200 | tags differ from the English ones, the `pos_pattern` parameter is also customized. The regex pattern `<ADJ.*>*<N.*>+`
201 | extracts keywords that have 0 or more adjectives, followed by 1 or more nouns using the German spaCy part-of-speech
202 | tags.
203 | 
204 | <a name="#KeyphraseTfidfVectorizer"/></a>
205 | 
206 | ### KeyphraseTfidfVectorizer
207 | 
208 | [Back to Table of Contents](#toc)
209 | 
210 | The `KeyphraseTfidfVectorizer` has the same function calls and features as the `KeyphraseCountVectorizer`. The only
211 | difference is, that document-keyphrase matrix cells represent tf or tf-idf values, depending on the parameter settings,
212 | instead of counts.
213 | 
214 | ```python
215 | from keyphrase_vectorizers import KeyphraseTfidfVectorizer
216 | 
217 | docs = ["""Supervised learning is the machine learning task of learning a function that
218 |          maps an input to an output based on example input-output pairs. It infers a
219 |          function from labeled training data consisting of a set of training examples.
220 |          In supervised learning, each example is a pair consisting of an input object
221 |          (typically a vector) and a desired output value (also called the supervisory signal). 
222 |          A supervised learning algorithm analyzes the training data and produces an inferred function, 
223 |          which can be used for mapping new examples. An optimal scenario will allow for the 
224 |          algorithm to correctly determine the class labels for unseen instances. This requires 
225 |          the learning algorithm to generalize from the training data to unseen situations in a 
226 |          'reasonable' way (see inductive bias).""", 
227 |              
228 |         """Keywords are defined as phrases that capture the main topics discussed in a document. 
229 |         As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
230 |         In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
231 |         of keywords can quickly help to determine whether a given document is relevant to their interest. 
232 |         As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
233 |         by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
234 |         in information retrieval."""]
235 |         
236 | # Init default vectorizer for the English language that computes tf-idf values
237 | vectorizer = KeyphraseTfidfVectorizer()
238 | 
239 | # Print parameters
240 | print(vectorizer.get_params())
241 | >>> {'binary': False, 'dtype': <class 'numpy.float64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'norm': 'l2', 'pos_pattern': '<J.*>*<N.*>+', 'smooth_idf': True, 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'sublinear_tf': False, 'use_idf': True, 'workers': 1}
242 | ```
243 | 
244 | To calculate tf values instead, set `use_idf=False`.
245 | 
246 | ```python
247 | # Fit and transform to document-keyphrase matrix.
248 | document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray()
249 | 
250 | print(document_keyphrase_matrix)
251 | >>> [[0.11111111 0.22222222 0.11111111 0.         0.         0.
252 |   0.11111111 0.         0.11111111 0.11111111 0.33333333 0.
253 |   0.         0.         0.11111111 0.         0.         0.11111111
254 |   0.         0.33333333 0.         0.22222222 0.         0.11111111
255 |   0.11111111 0.11111111 0.11111111 0.11111111 0.33333333 0.11111111
256 |   0.11111111 0.33333333 0.11111111 0.         0.33333333 0.
257 |   0.         0.         0.11111111 0.         0.11111111 0.11111111
258 |   0.         0.33333333 0.11111111]
259 |  [0.         0.         0.         0.11785113 0.11785113 0.11785113
260 |   0.         0.11785113 0.         0.         0.         0.11785113
261 |   0.11785113 0.11785113 0.         0.11785113 0.23570226 0.
262 |   0.23570226 0.         0.58925565 0.         0.11785113 0.
263 |   0.         0.         0.         0.         0.         0.
264 |   0.         0.         0.         0.58925565 0.         0.11785113
265 |   0.11785113 0.11785113 0.         0.11785113 0.         0.
266 |   0.11785113 0.         0.        ]]
267 | ```
268 | 
269 | ```python
270 | # Return keyphrases
271 | keyphrases = vectorizer.get_feature_names_out()
272 | 
273 | print(keyphrases)
274 | >>> ['optimal scenario' 'example' 'input object' 'groups' 'list'
275 |  'precise summary' 'inductive bias' 'phrases' 'training examples'
276 |  'output value' 'function' 'given document' 'documents'
277 |  'information retrieval environment' 'new examples' 'interest'
278 |  'main topics' 'unseen situations' 'information retrieval' 'input'
279 |  'keywords' 'learning algorithm' 'indication' 'set' 'example input'
280 |  'vector' 'machine' 'supervised learning algorithm' 'algorithm' 'pair'
281 |  'task' 'training data' 'way' 'document' 'supervised learning' 'users'
282 |  'document relevance' 'document content' 'supervisory signal' 'overlap'
283 |  'class labels' 'unseen instances' 'various applications' 'output'
284 |  'output pairs']
285 | ```
286 | 
287 | <a name="#keyphrase-extraction-with-keybert"/></a>
288 | 
289 | ### Keyphrase extraction with [KeyBERT](https://github.com/MaartenGr/KeyBERT "KeyBERT repository")
290 | 
291 | [Back to Table of Contents](#toc)
292 | 
293 | The keyphrase vectorizers can be used together with KeyBERT to extract grammatically correct keyphrases that are most
294 | similar to a document. Thereby, the vectorizer first extracts candidate keyphrases from the text documents, which are
295 | subsequently ranked by KeyBERT based on their document similarity. The top-n most similar keyphrases can then be
296 | considered as document keywords.
297 | 
298 | The advantage of using KeyphraseVectorizers in addition to KeyBERT is that it allows users to get grammatically correct
299 | keyphrases instead of simple n-grams of pre-defined lengths. In KeyBERT, users can specify the `keyphrase_ngram_range`
300 | to define the length of the retrieved keyphrases. However, this raises two issues. First, users usually do not know the
301 | optimal n-gram range and therefore have to spend some time experimenting until they find a suitable n-gram range.
302 | Second, even after finding a good n-gram range, the returned keyphrases are sometimes still grammatically not quite
303 | correct or are slightly off-key. Unfortunately, this limits the quality of the returned keyphrases.
304 | 
305 | To adress this issue, we can use the vectorizers of this package to first extract candidate keyphrases that consist of
306 | zero or more adjectives, followed by one or multiple nouns in a pre-processing step instead of simple n-grams.
307 | [Wan and Xiao](https://www.aaai.org/Papers/AAAI/2008/AAAI08-136.pdf) successfully used this noun phrase approach for
308 | keyphrase extraction during their research in 2008. The extracted candidate keyphrases are subsequently passed to
309 | KeyBERT for embedding generation and similarity calculation. To use both packages for keyphrase extraction, we need to
310 | pass KeyBERT a keyphrase vectorizer with the `vectorizer` parameter. Since the length of keyphrases now depends on
311 | part-of-speech tags, there is no need to define an n-gram length anymore.
312 | 
313 | #### Example:
314 | 
315 | KeyBERT can be installed via `pip install keybert`.
316 | 
317 | ```python
318 | from keyphrase_vectorizers import KeyphraseCountVectorizer
319 | from keybert import KeyBERT
320 | 
321 | docs = ["""Supervised learning is the machine learning task of learning a function that
322 |          maps an input to an output based on example input-output pairs. It infers a
323 |          function from labeled training data consisting of a set of training examples.
324 |          In supervised learning, each example is a pair consisting of an input object
325 |          (typically a vector) and a desired output value (also called the supervisory signal). 
326 |          A supervised learning algorithm analyzes the training data and produces an inferred function, 
327 |          which can be used for mapping new examples. An optimal scenario will allow for the 
328 |          algorithm to correctly determine the class labels for unseen instances. This requires 
329 |          the learning algorithm to generalize from the training data to unseen situations in a 
330 |          'reasonable' way (see inductive bias).""", 
331 |              
332 |         """Keywords are defined as phrases that capture the main topics discussed in a document. 
333 |         As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
334 |         In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
335 |         of keywords can quickly help to determine whether a given document is relevant to their interest. 
336 |         As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
337 |         by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
338 |         in information retrieval."""]
339 | 
340 | kw_model = KeyBERT()
341 | ```
342 | 
343 | Instead of deciding on a suitable n-gram range which could be e.g.(1,2)...
344 | 
345 | ```python
346 | >>> kw_model.extract_keywords(docs=docs, keyphrase_ngram_range=(1,2))
347 | [[('labeled training', 0.6013),
348 |   ('examples supervised', 0.6112),
349 |   ('signal supervised', 0.6152),
350 |   ('supervised', 0.6676),
351 |   ('supervised learning', 0.6779)],
352 |  [('keywords assigned', 0.6354),
353 |   ('keywords used', 0.6373),
354 |   ('list keywords', 0.6375),
355 |   ('keywords quickly', 0.6376),
356 |   ('keywords defined', 0.6997)]]
357 | ```
358 | 
359 | we can now just let the keyphrase vectorizer decide on suitable keyphrases, without limitations to a maximum or minimum
360 | n-gram range. We only have to pass a keyphrase vectorizer as parameter to KeyBERT:
361 | 
362 | ```python
363 | >>> kw_model.extract_keywords(docs=docs, vectorizer=KeyphraseCountVectorizer())
364 | [[('training examples', 0.4668),
365 |   ('training data', 0.5271),
366 |   ('learning algorithm', 0.5632),
367 |   ('supervised learning', 0.6779),
368 |   ('supervised learning algorithm', 0.6992)],
369 |  [('given document', 0.4143),
370 |   ('information retrieval environment', 0.5166),
371 |   ('information retrieval', 0.5792),
372 |   ('keywords', 0.6046),
373 |   ('document relevance', 0.633)]]
374 | ```
375 | 
376 | This allows us to make sure that we do not cut off important words caused by defining our n-gram range too short. For
377 | example, we would not have found the keyphrase "supervised learning algorithm" with keyphrase_ngram_range=(1,2).
378 | Furthermore, we avoid to get keyphrases that are slightly off-key like "labeled training", "signal supervised" or
379 | "keywords quickly".
380 | 
381 | <a name="#topic-modeling-with-bertopic-and-keyphrasevectorizers"/></a>
382 | 
383 | ### Topic modeling with [BERTopic](https://github.com/MaartenGr/BERTopic "BERTopic repository") and KeyphraseVectorizers
384 | 
385 | [Back to Table of Contents](#toc)
386 | 
387 | Similar to the application with KeyBERT, the keyphrase vectorizers can be used to obtain grammatically correct
388 | keyphrases as
389 | descriptions for topics instead of simple n-grams. This allows us to make sure that we do not cut off important topic
390 | description keyphrases by defining our n-gram range too short. Moreover, we don't need to clean stopwords upfront, can
391 | get more precise topic models and avoid to get topic description keyphrases that are slightly off-key.
392 | 
393 | #### Example:
394 | 
395 | BERTopic can be installed via `pip install bertopic`.
396 | 
397 | ```python
398 | from keyphrase_vectorizers import KeyphraseCountVectorizer
399 | from bertopic import BERTopic
400 | from sklearn.datasets import fetch_20newsgroups
401 | 
402 | # load text documents
403 | docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
404 | # only use subset of the data 
405 | docs = docs[:5000]
406 | 
407 | # train topic model with KeyphraseCountVectorizer
408 | keyphrase_topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
409 | keyphrase_topics, keyphrase_probs = keyphrase_topic_model.fit_transform(docs)
410 | 
411 | # get topics
412 | >>> keyphrase_topic_model.topics
413 | {-1: [('file', 0.007265527630674131),
414 |   ('one', 0.007055454904474792),
415 |   ('use', 0.00633563957153475),
416 |   ('program', 0.006053271092949018),
417 |   ('get', 0.006011060091056076),
418 |   ('people', 0.005729309058970368),
419 |   ('know', 0.005635951168273583),
420 |   ('like', 0.0055692449802916015),
421 |   ('time', 0.00527028825803415),
422 |   ('us', 0.00525564504880084)],
423 |  0: [('game', 0.024134589719090525),
424 |   ('team', 0.021852806383170772),
425 |   ('players', 0.01749406934044139),
426 |   ('games', 0.014397938026886745),
427 |   ('hockey', 0.013932342023677305),
428 |   ('win', 0.013706115572901401),
429 |   ('year', 0.013297593024390321),
430 |   ('play', 0.012533185558169046),
431 |   ('baseball', 0.012412743802062559),
432 |   ('season', 0.011602725885164318)],
433 |  1: [('patients', 0.022600352291162015),
434 |   ('msg', 0.02023877371575874),
435 |   ('doctor', 0.018816282737587457),
436 |   ('medical', 0.018614407917995103),
437 |   ('treatment', 0.0165028251400717),
438 |   ('food', 0.01604980195180696),
439 |   ('candida', 0.015255961242066143),
440 |   ('disease', 0.015115496310099693),
441 |   ('pain', 0.014129703072484495),
442 |   ('hiv', 0.012884503220341102)],
443 |  2: [('key', 0.028851633177510126),
444 |   ('encryption', 0.024375137861044675),
445 |   ('clipper', 0.023565947302544528),
446 |   ('privacy', 0.019258719348097385),
447 |   ('security', 0.018983682856076434),
448 |   ('chip', 0.018822199098878365),
449 |   ('keys', 0.016060139239615384),
450 |   ('internet', 0.01450486904722165),
451 |   ('encrypted', 0.013194373119964168),
452 |   ('government', 0.01303978311708837)],
453 |   ...
454 | ```
455 | 
456 | The same topics look a bit different when no keyphrase vectorizer is used:
457 | 
458 | ```python
459 | from bertopic import BERTopic
460 | from sklearn.datasets import fetch_20newsgroups
461 | 
462 | # load text documents
463 | docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
464 | # only use subset of the data 
465 | docs = docs[:5000]
466 | 
467 | # train topic model without KeyphraseCountVectorizer
468 | topic_model = BERTopic()
469 | topics, probs = topic_model.fit_transform(docs)
470 | 
471 | # get topics
472 | >>> topic_model.topics
473 | {-1: [('the', 0.012864641020408933),
474 |   ('to', 0.01187920529994724),
475 |   ('and', 0.011431498631699856),
476 |   ('of', 0.01099851927541331),
477 |   ('is', 0.010995478673036962),
478 |   ('in', 0.009908233622158523),
479 |   ('for', 0.009903667215879675),
480 |   ('that', 0.009619596716087699),
481 |   ('it', 0.009578499681829809),
482 |   ('you', 0.0095328846440753)],
483 |  0: [('game', 0.013949166096523719),
484 |   ('team', 0.012458483177116456),
485 |   ('he', 0.012354733462693834),
486 |   ('the', 0.01119583508278812),
487 |   ('10', 0.010190243555226108),
488 |   ('in', 0.0101436249231417),
489 |   ('players', 0.009682212470082758),
490 |   ('to', 0.00933700544705287),
491 |   ('was', 0.009172402203816335),
492 |   ('and', 0.008653375901739337)],
493 |  1: [('of', 0.012771267188340924),
494 |   ('to', 0.012581337590513296),
495 |   ('is', 0.012554884458779008),
496 |   ('patients', 0.011983273578628046),
497 |   ('and', 0.011863499662237566),
498 |   ('that', 0.011616113472989725),
499 |   ('it', 0.011581944987387165),
500 |   ('the', 0.011475148304229873),
501 |   ('in', 0.011395485985801054),
502 |   ('msg', 0.010715000656335596)],
503 |  2: [('key', 0.01725282988290282),
504 |   ('the', 0.014634841495851404),
505 |   ('be', 0.014429762197907552),
506 |   ('encryption', 0.013530733999898166),
507 |   ('to', 0.013443159534369817),
508 |   ('clipper', 0.01296614319927958),
509 |   ('of', 0.012164734232650158),
510 |   ('is', 0.012128295958613464),
511 |   ('and', 0.011972763728732667),
512 |   ('chip', 0.010785744492767285)],
513 |  ...
514 | ```
515 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![PyPI - Python](https://img.shields.io/badge/python-%3E%3D3.7-blue)](https://pypi.org/project/keyphrase-vectorizers/)
  2 | [![License](https://img.shields.io/badge/License-BSD_3--Clause-green.svg)](https://github.com/TimSchopf/Keyphrase_Vectorizers/blob/master/LICENSE)
  3 | [![PyPI - PyPi](https://img.shields.io/pypi/v/keyphrase-vectorizers.svg)](https://pypi.org/project/keyphrase-vectorizers/)
  4 | [![Build](https://img.shields.io/github/actions/workflow/status/TimSchopf/KeyphraseVectorizers/testing.yml?branch=master)](https://pypi.org/project/keyphrase-vectorizers/)
  5 | [![Documentation Status](https://readthedocs.org/projects/keyphrase-vectorizers/badge/?version=latest)](https://keyphrase-vectorizers.readthedocs.io/en/latest/?badge=latest)
  6 | [![DOI:10.5220/0011546600003335](https://zenodo.org/badge/DOI/10.5220/0011546600003335.svg)](https://doi.org/10.5220/0011546600003335)
  7 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/patternrank-leveraging-pretrained-language/keyphrase-extraction-on-inspec)](https://paperswithcode.com/sota/keyphrase-extraction-on-inspec?p=patternrank-leveraging-pretrained-language)
  8 | 
  9 | KeyphraseVectorizers
 10 | ===================== 
 11 | 
 12 | **This package was developed during the writing of our PatternRank paper. You can check out the paper [here](https://arxiv.org/abs/2210.05245). When using KeyphraseVectorizers or PatternRank in academic papers and theses, please use the [BibTeX entry below](#citation-information).**
 13 | 
 14 | Set of vectorizers that extract keyphrases with part-of-speech patterns from a collection of text documents and convert
 15 | them into a document-keyphrase matrix. A document-keyphrase matrix is a mathematical matrix that describes the frequency
 16 | of keyphrases that occur in a collection of documents. The matrix rows indicate the text documents and columns indicate
 17 | the unique keyphrases.
 18 | 
 19 | The package contains wrappers of the
 20 | [sklearn.feature_extraction.text.CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=countvectorizer#sklearn.feature_extraction.text.CountVectorizer "scikit-learn CountVectorizer")
 21 | and
 22 | [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer "scikit-learn TfidfVectorizer")
 23 | classes. Instead of using n-gram tokens of a pre-defined range, these classes extract keyphrases from text documents
 24 | using part-of-speech tags to compute document-keyphrase matrices.
 25 | 
 26 | Corresponding medium posts can be found [here](https://towardsdatascience.com/enhancing-keybert-keyword-extraction-results-with-keyphrasevectorizers-3796fa93f4db "Keyphrase Extraction with BERT Transformers and Noun Phrases") and [here](https://towardsdatascience.com/unsupervised-keyphrase-extraction-with-patternrank-28ec3ca737f0 "Unsupervised Keyphrase Extraction with PatternRank").
 27 | 
 28 | Benefits
 29 | --------
 30 | 
 31 | * Extract grammatically accurate keyphases based on their part-of-speech tags.
 32 | * No need to specify n-gram ranges.
 33 | * Get document-keyphrase matrices.
 34 | * Multiple language support.
 35 | * User-defined part-of-speech patterns for keyphrase extraction possible.
 36 | 
 37 | <a name="toc"/></a>
 38 | 
 39 | Table of Contents
 40 | -----------------
 41 | 
 42 | <!--ts-->
 43 | 
 44 | 1. [How does it work?](#how-does-it-work)
 45 | 2. [Installation](#installation)
 46 | 3. [Usage](#usage)
 47 |    1. [KeyphraseCountVectorizer](#keyphrasecountvectorizer)
 48 |       1. [English language](#english-language)
 49 |       2. [Other languages](#other-languages)
 50 |    2. [KeyphraseTfidfVectorizer](#keyphrasetfidfvectorizer)
 51 |    3. [Reuse a spaCy Language object](#reuse-a-spacy-language-object)
 52 |    4. [Custom POS-tagger](#custom-pos-tagger)
 53 |    5. [PatternRank: Keyphrase extraction with KeyphraseVectorizers and KeyBERT](#patternrank-keyphrase-extraction-with-keyphrasevectorizers-and-keybert)
 54 |    6. [Topic modeling with BERTopic and KeyphraseVectorizers](#topic-modeling-with-bertopic-and-keyphrasevectorizers)
 55 |    7. [Online KeyphraseVectorizers](#online-keyphrasevectorizers)
 56 | 4. [Citation information](#citation-information)
 57 | 
 58 | <!--te-->
 59 | 
 60 | <a name="#how-does-it-work"/></a>
 61 | 
 62 | How does it work?
 63 | -----------------
 64 | 
 65 | First, the document texts are annotated with [spaCy](https://spacy.io "spaCy homepage") part-of-speech tags. A list of
 66 | all possible spaCy part-of-speech tags for different languages is
 67 | linked [here](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py "spaCy POS tags"). The annotation
 68 | requires passing the [spaCy pipeline](https://spacy.io/models "available spaCy pipelines") of the corresponding language
 69 | to the vectorizer with the `spacy_pipeline` parameter.
 70 | 
 71 | Second, words are extracted from the document texts whose part-of-speech tags match the regex pattern defined in
 72 | the `pos_pattern`
 73 | parameter. The keyphrases are a list of unique words extracted from text documents by this method.
 74 | 
 75 | Finally, the vectorizers calculate document-keyphrase matrices.
 76 | 
 77 | <a name="#installation"/></a>
 78 | 
 79 | Installation
 80 | ------------
 81 | 
 82 | ```
 83 | pip install keyphrase-vectorizers
 84 | ```
 85 | 
 86 | <a name="#usage"/></a>
 87 | 
 88 | Usage
 89 | -----
 90 | For detailed information visit
 91 | the [API Guide](https://keyphrase-vectorizers.readthedocs.io/en/latest/index.html "Keyphrase_Vectorizers API Guide").
 92 | 
 93 | <a name="#keyphrasecountvectorizer"/></a>
 94 | 
 95 | ### KeyphraseCountVectorizer
 96 | 
 97 | [Back to Table of Contents](#toc)
 98 | 
 99 | <a name="#english-language"/></a>
100 | 
101 | #### English language
102 | 
103 | ```python
104 | from keyphrase_vectorizers import KeyphraseCountVectorizer
105 | 
106 | docs = ["""Supervised learning is the machine learning task of learning a function that
107 |          maps an input to an output based on example input-output pairs. It infers a
108 |          function from labeled training data consisting of a set of training examples.
109 |          In supervised learning, each example is a pair consisting of an input object
110 |          (typically a vector) and a desired output value (also called the supervisory signal). 
111 |          A supervised learning algorithm analyzes the training data and produces an inferred function, 
112 |          which can be used for mapping new examples. An optimal scenario will allow for the 
113 |          algorithm to correctly determine the class labels for unseen instances. This requires 
114 |          the learning algorithm to generalize from the training data to unseen situations in a 
115 |          'reasonable' way (see inductive bias).""", 
116 |              
117 |         """Keywords are defined as phrases that capture the main topics discussed in a document. 
118 |         As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
119 |         In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
120 |         of keywords can quickly help to determine whether a given document is relevant to their interest. 
121 |         As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
122 |         by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
123 |         in information retrieval."""]
124 |         
125 | # Init default vectorizer.
126 | vectorizer = KeyphraseCountVectorizer()
127 | 
128 | # Print parameters
129 | print(vectorizer.get_params())
130 | >>> {'binary': False, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
131 | ```
132 | 
133 | By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is
134 | specified, English `stop_words` are removed, and the `pos_pattern` extracts keywords that have 0 or more adjectives,
135 | followed by 1 or more nouns using the English spaCy part-of-speech tags. In addition, the spaCy pipeline
136 | components `['parser', 'attribute_ruler', 'lemmatizer', 'ner']` are excluded by default to increase efficiency. If you
137 | choose a different `spacy_pipeline`, you may have to exclude/include different pipeline components using
138 | the `spacy_exclude` parameter for the spaCy POS
139 | tagger to work properly.
140 | 
141 | ```python
142 | # After initializing the vectorizer, it can be fitted
143 | # to learn the keyphrases from the text documents.
144 | vectorizer.fit(docs)
145 | ```
146 | 
147 | ```python
148 | # After learning the keyphrases, they can be returned.
149 | keyphrases = vectorizer.get_feature_names_out()
150 | 
151 | print(keyphrases)
152 | >>> ['users' 'main topics' 'learning algorithm' 'overlap' 'documents' 'output'
153 |  'keywords' 'precise summary' 'new examples' 'training data' 'input'
154 |  'document content' 'training examples' 'unseen instances'
155 |  'optimal scenario' 'document' 'task' 'supervised learning algorithm'
156 |  'example' 'interest' 'function' 'example input' 'various applications'
157 |  'unseen situations' 'phrases' 'indication' 'inductive bias'
158 |  'supervisory signal' 'document relevance' 'information retrieval' 'set'
159 |  'input object' 'groups' 'output value' 'list' 'learning' 'output pairs'
160 |  'pair' 'class labels' 'supervised learning' 'machine'
161 |  'information retrieval environment' 'algorithm' 'vector' 'way']
162 | ```
163 | 
164 | ```python
165 | # After fitting, the vectorizer can transform the documents 
166 | # to a document-keyphrase matrix.
167 | # Matrix rows indicate the documents and columns indicate the unique keyphrases.
168 | # Each cell represents the count.
169 | document_keyphrase_matrix = vectorizer.transform(docs).toarray()
170 | 
171 | print(document_keyphrase_matrix)
172 | >>> [[0 0 2 0 0 3 0 0 1 3 3 0 1 1 1 0 1 1 2 0 3 1 0 1 0 0 1 1 0 0 1 1 0 1 0 6
173 |   1 1 1 3 1 0 3 1 1]
174 |  [1 2 0 1 1 0 5 1 0 0 0 1 0 0 0 5 0 0 0 1 0 0 1 0 1 1 0 0 1 2 0 0 1 0 1 0
175 |   0 0 0 0 0 1 0 0 0]]
176 | ```
177 | 
178 | ```python
179 | # Fit and transform can also be executed in one step, 
180 | # which is more efficient. 
181 | document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray()
182 | 
183 | print(document_keyphrase_matrix)
184 | >>> [[0 0 2 0 0 3 0 0 1 3 3 0 1 1 1 0 1 1 2 0 3 1 0 1 0 0 1 1 0 0 1 1 0 1 0 6
185 |   1 1 1 3 1 0 3 1 1]
186 |  [1 2 0 1 1 0 5 1 0 0 0 1 0 0 0 5 0 0 0 1 0 0 1 0 1 1 0 0 1 2 0 0 1 0 1 0
187 |   0 0 0 0 0 1 0 0 0]]
188 | ```
189 | 
190 | <a name="#other-languages"/></a>
191 | 
192 | #### Other languages
193 | 
194 | [Back to Table of Contents](#toc)
195 | 
196 | ```python
197 | german_docs = ["""Goethe stammte aus einer angesehenen bürgerlichen Familie. 
198 |                 Sein Großvater mütterlicherseits war als Stadtschultheiß höchster Justizbeamter der Stadt Frankfurt, 
199 |                 sein Vater Doktor der Rechte und Kaiserlicher Rat. Er und seine Schwester Cornelia erfuhren eine aufwendige 
200 |                 Ausbildung durch Hauslehrer. Dem Wunsch seines Vaters folgend, studierte Goethe in Leipzig und Straßburg 
201 |                 Rechtswissenschaft und war danach als Advokat in Wetzlar und Frankfurt tätig. 
202 |                 Gleichzeitig folgte er seiner Neigung zur Dichtkunst.""",
203 |               
204 |                """Friedrich Schiller wurde als zweites Kind des Offiziers, Wundarztes und Leiters der Hofgärtnerei in 
205 |                Marbach am Neckar Johann Kaspar Schiller und dessen Ehefrau Elisabetha Dorothea Schiller, geb. Kodweiß, 
206 |                die Tochter eines Wirtes und Bäckers war, 1759 in Marbach am Neckar geboren
207 |                """]
208 | # Init vectorizer for the german language
209 | vectorizer = KeyphraseCountVectorizer(spacy_pipeline='de_core_news_sm', pos_pattern='<ADJ.*>*<N.*>+', stop_words='german')
210 | ```
211 | 
212 | The German `spacy_pipeline` is specified and German `stop_words` are removed. Because the German spaCy part-of-speech
213 | tags differ from the English ones, the `pos_pattern` parameter is also customized. The regex pattern `<ADJ.*>*<N.*>+`
214 | extracts keywords that have 0 or more adjectives, followed by 1 or more nouns using the German spaCy part-of-speech
215 | tags.
216 | 
217 | **Attention!** The spaCy pipeline components `['parser', 'attribute_ruler', 'lemmatizer', 'ner']` are excluded by
218 | default to increase efficiency. If you choose a different `spacy_pipeline`, you may have to exclude/include different
219 | pipeline components using the `spacy_exclude` parameter for the spaCy POS tagger to work properly.
220 | 
221 | <a name="#keyphrasetfidfvectorizer"/></a>
222 | 
223 | ### KeyphraseTfidfVectorizer
224 | 
225 | [Back to Table of Contents](#toc)
226 | 
227 | The `KeyphraseTfidfVectorizer` has the same function calls and features as the `KeyphraseCountVectorizer`. The only
228 | difference is, that document-keyphrase matrix cells represent tf or tf-idf values, depending on the parameter settings,
229 | instead of counts.
230 | 
231 | ```python
232 | from keyphrase_vectorizers import KeyphraseTfidfVectorizer
233 | 
234 | docs = ["""Supervised learning is the machine learning task of learning a function that
235 |          maps an input to an output based on example input-output pairs. It infers a
236 |          function from labeled training data consisting of a set of training examples.
237 |          In supervised learning, each example is a pair consisting of an input object
238 |          (typically a vector) and a desired output value (also called the supervisory signal). 
239 |          A supervised learning algorithm analyzes the training data and produces an inferred function, 
240 |          which can be used for mapping new examples. An optimal scenario will allow for the 
241 |          algorithm to correctly determine the class labels for unseen instances. This requires 
242 |          the learning algorithm to generalize from the training data to unseen situations in a 
243 |          'reasonable' way (see inductive bias).""", 
244 |              
245 |         """Keywords are defined as phrases that capture the main topics discussed in a document. 
246 |         As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
247 |         In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
248 |         of keywords can quickly help to determine whether a given document is relevant to their interest. 
249 |         As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
250 |         by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
251 |         in information retrieval."""]
252 |         
253 | # Init default vectorizer for the English language that computes tf-idf values
254 | vectorizer = KeyphraseTfidfVectorizer()
255 | 
256 | # Print parameters
257 | print(vectorizer.get_params())
258 | >>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
259 | 
260 | 
261 | class 'numpy.int64'>, 'lowercase': True, 'max_df': None
262 | 
263 | , 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner',
264 |                                                                    'textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1}
265 | ```
266 | 
267 | To calculate tf values instead, set `use_idf=False`.
268 | 
269 | ```python
270 | # Fit and transform to document-keyphrase matrix.
271 | document_keyphrase_matrix = vectorizer.fit_transform(docs).toarray()
272 | 
273 | print(document_keyphrase_matrix)
274 | >>> [[0.         0.         0.09245003 0.09245003 0.09245003 0.09245003
275 |   0.2773501  0.09245003 0.2773501  0.2773501  0.09245003 0.
276 |   0.         0.09245003 0.         0.2773501  0.09245003 0.09245003
277 |   0.         0.09245003 0.09245003 0.09245003 0.09245003 0.09245003
278 |   0.5547002  0.         0.         0.09245003 0.09245003 0.
279 |   0.2773501  0.18490007 0.09245003 0.         0.2773501  0.
280 |   0.         0.09245003 0.         0.09245003 0.         0.
281 |   0.         0.18490007 0.        ]
282 |  [0.11867817 0.11867817 0.         0.         0.         0.
283 |   0.         0.         0.         0.         0.         0.11867817
284 |   0.11867817 0.         0.11867817 0.         0.         0.
285 |   0.11867817 0.         0.         0.         0.         0.
286 |   0.         0.11867817 0.23735633 0.         0.         0.11867817
287 |   0.         0.         0.         0.23735633 0.         0.11867817
288 |   0.11867817 0.         0.59339083 0.         0.11867817 0.11867817
289 |   0.11867817 0.         0.59339083]]
290 | ```
291 | 
292 | ```python
293 | # Return keyphrases
294 | keyphrases = vectorizer.get_feature_names_out()
295 | 
296 | print(keyphrases)
297 | >>> ['various applications' 'list' 'task' 'supervisory signal'
298 |  'inductive bias' 'supervised learning algorithm' 'supervised learning'
299 |  'example input' 'input' 'algorithm' 'set' 'precise summary' 'documents'
300 |  'input object' 'interest' 'function' 'class labels' 'machine'
301 |  'document content' 'output pairs' 'new examples' 'unseen situations'
302 |  'vector' 'output value' 'learning' 'document relevance' 'main topics'
303 |  'pair' 'training examples' 'information retrieval environment'
304 |  'training data' 'example' 'optimal scenario' 'information retrieval'
305 |  'output' 'groups' 'indication' 'unseen instances' 'keywords' 'way'
306 |  'phrases' 'overlap' 'users' 'learning algorithm' 'document']
307 | ```
308 | 
309 | <a name="#reuse-a-spacy-language-object"/></a>
310 | 
311 | ### Reuse a spaCy Language object
312 | 
313 | [Back to Table of Contents](#toc)
314 | 
315 | KeyphraseVectorizers loads a `spacy.Language` object for every `KeyphraseVectorizer` object.
316 | When using multiple `KeyphraseVectorizer` objects, it is more efficient to load the `spacy.Language` object beforehand and pass it as the `spacy_pipeline` argument.
317 | 
318 | ```python
319 | import spacy
320 | from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer
321 | 
322 | docs = ["""Supervised learning is the machine learning task of learning a function that
323 |          maps an input to an output based on example input-output pairs. It infers a
324 |          function from labeled training data consisting of a set of training examples.
325 |          In supervised learning, each example is a pair consisting of an input object
326 |          (typically a vector) and a desired output value (also called the supervisory signal). 
327 |          A supervised learning algorithm analyzes the training data and produces an inferred function, 
328 |          which can be used for mapping new examples. An optimal scenario will allow for the 
329 |          algorithm to correctly determine the class labels for unseen instances. This requires 
330 |          the learning algorithm to generalize from the training data to unseen situations in a 
331 |          'reasonable' way (see inductive bias).""", 
332 |              
333 |         """Keywords are defined as phrases that capture the main topics discussed in a document. 
334 |         As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
335 |         In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
336 |         of keywords can quickly help to determine whether a given document is relevant to their interest. 
337 |         As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
338 |         by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
339 |         in information retrieval."""]
340 | 
341 | nlp = spacy.load("en_core_web_sm")
342 | 
343 | vectorizer1 = KeyphraseCountVectorizer(spacy_pipeline=nlp)
344 | vectorizer2 = KeyphraseTfidfVectorizer(spacy_pipeline=nlp)
345 | 
346 | # the following calls use the nlp object
347 | vectorizer1.fit(docs)
348 | vectorizer2.fit(docs)
349 | ```
350 | 
351 | <a name="##custom-pos-tagger"/></a>
352 | 
353 | ### Custom POS-tagger
354 | 
355 | [Back to Table of Contents](#toc)
356 | 
357 | To use a different part-of-speech tagger than the ones provided by spaCy, a custom POS-tagger function can be defined and passed to the KeyphraseVectorizers via the `custom_pos_tagger` parameter. This parameter expects a callable function which in turn needs to expect a list of strings in a 'raw_documents' parameter and has to return a list of (word token, POS-tag) tuples. If this parameter is not None, the custom tagger function is used to tag words with parts-of-speech, while the spaCy pipeline is ignored.
358 | 
359 | #### Example using [flair](https://github.com/flairNLP/flair "flair GitHub"):
360 | 
361 | Flair can be installed via `pip install flair`.
362 | 
363 | ```python
364 | from typing import List
365 | import flair
366 | from flair.models import SequenceTagger
367 | from flair.tokenization import SegtokSentenceSplitter
368 | 
369 | 
370 | docs = ["""Supervised learning is the machine learning task of learning a function that
371 |          maps an input to an output based on example input-output pairs. It infers a
372 |          function from labeled training data consisting of a set of training examples.
373 |          In supervised learning, each example is a pair consisting of an input object
374 |          (typically a vector) and a desired output value (also called the supervisory signal). 
375 |          A supervised learning algorithm analyzes the training data and produces an inferred function, 
376 |          which can be used for mapping new examples. An optimal scenario will allow for the 
377 |          algorithm to correctly determine the class labels for unseen instances. This requires 
378 |          the learning algorithm to generalize from the training data to unseen situations in a 
379 |          'reasonable' way (see inductive bias).""", 
380 |              
381 |         """Keywords are defined as phrases that capture the main topics discussed in a document. 
382 |         As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
383 |         In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
384 |         of keywords can quickly help to determine whether a given document is relevant to their interest. 
385 |         As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
386 |         by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
387 |         in information retrieval."""]
388 | 
389 | # define flair POS-tagger and splitter
390 | tagger = SequenceTagger.load('pos')
391 | splitter = SegtokSentenceSplitter()
392 | 
393 | # define custom POS-tagger function using flair
394 | def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTagger = tagger, splitter: flair.tokenization.SegtokSentenceSplitter = splitter)->List[tuple]:
395 |     """
396 |     Important: 
397 | 
398 |     The mandatory 'raw_documents' parameter can NOT be named differently and has to expect a list of strings. 
399 |     Any other parameter of the custom POS-tagger function can be arbitrarily defined, depending on the respective use case. 
400 |     Furthermore the function has to return a list of (word token, POS-tag) tuples.
401 |     """ 
402 |     # split texts into sentences
403 |     sentences = []
404 |     for doc in raw_documents:
405 |         sentences.extend(splitter.split(doc))
406 | 
407 |     # predict POS tags
408 |     tagger.predict(sentences)
409 | 
410 |     # iterate through sentences to get word tokens and predicted POS-tags
411 |     pos_tags = []
412 |     words = []
413 |     for sentence in sentences:
414 |         pos_tags.extend([label.value for label in sentence.get_labels('pos')])
415 |         words.extend([word.text for word in sentence])
416 |     
417 |     return list(zip(words, pos_tags))
418 | 
419 | 
420 | # check that the custom POS-tagger function returns a list of (word token, POS-tag) tuples
421 | print(custom_pos_tagger(raw_documents=docs))
422 | 
423 | >>> [('Supervised', 'VBN'), ('learning', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('machine', 'NN'), ('learning', 'VBG'), ('task', 'NN'), ('of', 'IN'), ('learning', 'VBG'), ('a', 'DT'), ('function', 'NN'), ('that', 'WDT'), ('maps', 'VBZ'), ('an', 'DT'), ('input', 'NN'), ('to', 'IN'), ('an', 'DT'), ('output', 'NN'), ('based', 'VBN'), ('on', 'IN'), ('example', 'NN'), ('input-output', 'NN'), ('pairs', 'NNS'), ('.', '.'), ('It', 'PRP'), ('infers', 'VBZ'), ('a', 'DT'), ('function', 'NN'), ('from', 'IN'), ('labeled', 'VBN'), ('training', 'NN'), ('data', 'NNS'), ('consisting', 'VBG'), ('of', 'IN'), ('a', 'DT'), ('set', 'NN'), ('of', 'IN'), ('training', 'NN'), ('examples', 'NNS'), ('.', '.'), ('In', 'IN'), ('supervised', 'JJ'), ('learning', 'NN'), (',', ','), ('each', 'DT'), ('example', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('pair', 'NN'), ('consisting', 'VBG'), ('of', 'IN'), ('an', 'DT'), ('input', 'NN'), ('object', 'NN'), ('(', ':'), ('typically', 'RB'), ('a', 'DT'), ('vector', 'NN'), (')', ','), ('and', 'CC'), ('a', 'DT'), ('desired', 'VBN'), ('output', 'NN'), ('value', 'NN'), ('(', ','), ('also', 'RB'), ('called', 'VBN'), ('the', 'DT'), ('supervisory', 'JJ'), ('signal', 'NN'), (')', '-RRB-'), ('.', '.'), ('A', 'DT'), ('supervised', 'JJ'), ('learning', 'NN'), ('algorithm', 'NN'), ('analyzes', 'VBZ'), ('the', 'DT'), ('training', 'NN'), ('data', 'NNS'), ('and', 'CC'), ('produces', 'VBZ'), ('an', 'DT'), ('inferred', 'JJ'), ('function', 'NN'), (',', ','), ('which', 'WDT'), ('can', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('for', 'IN'), ('mapping', 'VBG'), ('new', 'JJ'), ('examples', 'NNS'), ('.', '.'), ('An', 'DT'), ('optimal', 'JJ'), ('scenario', 'NN'), ('will', 'MD'), ('allow', 'VB'), ('for', 'IN'), ('the', 'DT'), ('algorithm', 'NN'), ('to', 'TO'), ('correctly', 'RB'), ('determine', 'VB'), ('the', 'DT'), ('class', 'NN'), ('labels', 'NNS'), ('for', 'IN'), ('unseen', 'JJ'), ('instances', 'NNS'), ('.', '.'), ('This', 'DT'), ('requires', 'VBZ'), ('the', 'DT'), ('learning', 'NN'), ('algorithm', 'NN'), ('to', 'TO'), ('generalize', 'VB'), ('from', 'IN'), ('the', 'DT'), ('training', 'NN'), ('data', 'NNS'), ('to', 'IN'), ('unseen', 'JJ'), ('situations', 'NNS'), ('in', 'IN'), ('a', 'DT'), ("'", '``'), ('reasonable', 'JJ'), ("'", "''"), ('way', 'NN'), ('(', ','), ('see', 'VB'), ('inductive', 'JJ'), ('bias', 'NN'), (')', '-RRB-'), ('.', '.'), ('Keywords', 'NNS'), ('are', 'VBP'), ('defined', 'VBN'), ('as', 'IN'), ('phrases', 'NNS'), ('that', 'WDT'), ('capture', 'VBP'), ('the', 'DT'), ('main', 'JJ'), ('topics', 'NNS'), ('discussed', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('document', 'NN'), ('.', '.'), ('As', 'IN'), ('they', 'PRP'), ('offer', 'VBP'), ('a', 'DT'), ('brief', 'JJ'), ('yet', 'CC'), ('precise', 'JJ'), ('summary', 'NN'), ('of', 'IN'), ('document', 'NN'), ('content', 'NN'), (',', ','), ('they', 'PRP'), ('can', 'MD'), ('be', 'VB'), ('utilized', 'VBN'), ('for', 'IN'), ('various', 'JJ'), ('applications', 'NNS'), ('.', '.'), ('In', 'IN'), ('an', 'DT'), ('information', 'NN'), ('retrieval', 'NN'), ('environment', 'NN'), (',', ','), ('they', 'PRP'), ('serve', 'VBP'), ('as', 'IN'), ('an', 'DT'), ('indication', 'NN'), ('of', 'IN'), ('document', 'NN'), ('relevance', 'NN'), ('for', 'IN'), ('users', 'NNS'), (',', ','), ('as', 'IN'), ('the', 'DT'), ('list', 'NN'), ('of', 'IN'), ('keywords', 'NNS'), ('can', 'MD'), ('quickly', 'RB'), ('help', 'VB'), ('to', 'TO'), ('determine', 'VB'), ('whether', 'IN'), ('a', 'DT'), ('given', 'VBN'), ('document', 'NN'), ('is', 'VBZ'), ('relevant', 'JJ'), ('to', 'IN'), ('their', 'PRP$'), ('interest', 'NN'), ('.', '.'), ('As', 'IN'), ('keywords', 'NNS'), ('reflect', 'VBP'), ('a', 'DT'), ('document', 'NN'), ("'s", 'POS'), ('main', 'JJ'), ('topics', 'NNS'), (',', ','), ('they', 'PRP'), ('can', 'MD'), ('be', 'VB'), ('utilized', 'VBN'), ('to', 'TO'), ('classify', 'VB'), ('documents', 'NNS'), ('into', 'IN'), ('groups', 'NNS'), ('by', 'IN'), ('measuring', 'VBG'), ('the', 'DT'), ('overlap', 'NN'), ('between', 'IN'), ('the', 'DT'), ('keywords', 'NNS'), ('assigned', 'VBN'), ('to', 'IN'), ('them', 'PRP'), ('.', '.'), ('Keywords', 'NNS'), ('are', 'VBP'), ('also', 'RB'), ('used', 'VBN'), ('proactively', 'RB'), ('in', 'IN'), ('information', 'NN'), ('retrieval', 'NN'), ('.', '.')]
424 | ```
425 | 
426 | After the custom POS-tagger function is defined, it can be passed to KeyphraseVectorizers via the `custom_pos_tagger` parameter.
427 | 
428 | ```python
429 | from keyphrase_vectorizers import KeyphraseCountVectorizer
430 | 
431 | # use custom POS-tagger with KeyphraseVectorizers
432 | vectorizer = KeyphraseCountVectorizer(custom_pos_tagger=custom_pos_tagger)
433 | vectorizer.fit(docs)
434 | keyphrases = vectorizer.get_feature_names_out()
435 | print(keyphrases)
436 | 
437 | >>> ['output value' 'information retrieval' 'algorithm' 'vector' 'groups'
438 |  'main topics' 'task' 'precise summary' 'supervised learning'
439 |  'inductive bias' 'information retrieval environment'
440 |  'supervised learning algorithm' 'function' 'input' 'pair'
441 |  'document relevance' 'learning' 'class labels' 'new examples' 'keywords'
442 |  'list' 'machine' 'training data' 'unseen situations' 'phrases' 'output'
443 |  'optimal scenario' 'document' 'training examples' 'documents' 'interest'
444 |  'indication' 'learning algorithm' 'inferred function'
445 |  'various applications' 'example' 'set' 'unseen instances'
446 |  'example input-output pairs' 'way' 'users' 'input object'
447 |  'supervisory signal' 'overlap' 'document content']
448 | ```
449 | 
450 | <a name="#patternrank-keyphrase-extraction-with-keyphrasevectorizers-and-keybert"/></a>
451 | 
452 | ### [PatternRank:](https://arxiv.org/abs/2210.05245) Keyphrase extraction with KeyphraseVectorizers and [KeyBERT](https://github.com/MaartenGr/KeyBERT "KeyBERT repository")
453 | 
454 | [Back to Table of Contents](#toc)
455 | 
456 | Using the keyphrase vectorizers together with KeyBERT for keyphrase extraction results in the [PatternRank](https://arxiv.org/abs/2210.05245) approach. PatternRank can extract grammatically correct keyphrases that are most similar to a document. Thereby, the vectorizer first extracts candidate keyphrases from the text documents, which are subsequently ranked by KeyBERT based on their document similarity. The top-n most similar keyphrases can then be
457 | considered as document keywords.
458 | 
459 | The advantage of using KeyphraseVectorizers in addition to KeyBERT is that it allows users to get grammatically correct
460 | keyphrases instead of simple n-grams of pre-defined lengths. In KeyBERT, users can specify the `keyphrase_ngram_range`
461 | to define the length of the retrieved keyphrases. However, this raises two issues. First, users usually do not know the
462 | optimal n-gram range and therefore have to spend some time experimenting until they find a suitable n-gram range.
463 | Second, even after finding a good n-gram range, the returned keyphrases are sometimes still grammatically not quite
464 | correct or are slightly off-key. Unfortunately, this limits the quality of the returned keyphrases.
465 | 
466 | To adress this issue, we can use the vectorizers of this package to first extract candidate keyphrases that consist of
467 | zero or more adjectives, followed by one or multiple nouns in a pre-processing step instead of simple n-grams. [TextRank](https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf), [SingleRank](https://aclanthology.org/C08-1122.pdf), and [EmbedRank](https://aclanthology.org/K18-1022.pdf) already successfully used this noun phrase approach for keyphrase extraction. The extracted candidate keyphrases are subsequently passed to KeyBERT for embedding generation and similarity calculation. To use both packages for keyphrase extraction, we need to
468 | pass KeyBERT a keyphrase vectorizer with the `vectorizer` parameter. Since the length of keyphrases now depends on
469 | part-of-speech tags, there is no need to define an n-gram length anymore.
470 | 
471 | #### Example:
472 | 
473 | KeyBERT can be installed via `pip install keybert`.
474 | 
475 | ```python
476 | from keyphrase_vectorizers import KeyphraseCountVectorizer
477 | from keybert import KeyBERT
478 | 
479 | docs = ["""Supervised learning is the machine learning task of learning a function that
480 |          maps an input to an output based on example input-output pairs. It infers a
481 |          function from labeled training data consisting of a set of training examples.
482 |          In supervised learning, each example is a pair consisting of an input object
483 |          (typically a vector) and a desired output value (also called the supervisory signal). 
484 |          A supervised learning algorithm analyzes the training data and produces an inferred function, 
485 |          which can be used for mapping new examples. An optimal scenario will allow for the 
486 |          algorithm to correctly determine the class labels for unseen instances. This requires 
487 |          the learning algorithm to generalize from the training data to unseen situations in a 
488 |          'reasonable' way (see inductive bias).""", 
489 |              
490 |         """Keywords are defined as phrases that capture the main topics discussed in a document. 
491 |         As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
492 |         In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
493 |         of keywords can quickly help to determine whether a given document is relevant to their interest. 
494 |         As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
495 |         by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
496 |         in information retrieval."""]
497 | 
498 | kw_model = KeyBERT()
499 | ```
500 | 
501 | Instead of deciding on a suitable n-gram range which could be e.g.(1,2)...
502 | 
503 | ```python
504 | >>> kw_model.extract_keywords(docs=docs, keyphrase_ngram_range=(1,2))
505 | [[('labeled training', 0.6013),
506 |   ('examples supervised', 0.6112),
507 |   ('signal supervised', 0.6152),
508 |   ('supervised', 0.6676),
509 |   ('supervised learning', 0.6779)],
510 |  [('keywords assigned', 0.6354),
511 |   ('keywords used', 0.6373),
512 |   ('list keywords', 0.6375),
513 |   ('keywords quickly', 0.6376),
514 |   ('keywords defined', 0.6997)]]
515 | ```
516 | 
517 | we can now just let the keyphrase vectorizer decide on suitable keyphrases, without limitations to a maximum or minimum
518 | n-gram range. We only have to pass a keyphrase vectorizer as parameter to KeyBERT:
519 | 
520 | ```python
521 | >>> kw_model.extract_keywords(docs=docs, vectorizer=KeyphraseCountVectorizer())
522 | [[('learning', 0.4813), 
523 |   ('training data', 0.5271), 
524 |   ('learning algorithm', 0.5632), 
525 |   ('supervised learning', 0.6779), 
526 |   ('supervised learning algorithm', 0.6992)], 
527 |  [('document content', 0.3988), 
528 |   ('information retrieval environment', 0.5166), 
529 |   ('information retrieval', 0.5792), 
530 |   ('keywords', 0.6046), 
531 |   ('document relevance', 0.633)]]
532 | ```
533 | 
534 | This allows us to make sure that we do not cut off important words caused by defining our n-gram range too short. For
535 | example, we would not have found the keyphrase "supervised learning algorithm" with `keyphrase_ngram_range=(1,2)`.
536 | Furthermore, we avoid to get keyphrases that are slightly off-key like "labeled training", "signal supervised" or
537 | "keywords quickly".
538 | 
539 | For more tips on how to use the KeyphraseVectorizers together with KeyBERT, visit [this guide](https://maartengr.github.io/KeyBERT/guides/countvectorizer.html#keyphrasevectorizers "KeyBERT rCountVectorizer Guide").
540 | 
541 | <a name="#topic-modeling-with-bertopic-and-keyphrasevectorizers"/></a>
542 | 
543 | ### Topic modeling with [BERTopic](https://github.com/MaartenGr/BERTopic "BERTopic repository") and KeyphraseVectorizers
544 | 
545 | [Back to Table of Contents](#toc)
546 | 
547 | Similar to the application with KeyBERT, the keyphrase vectorizers can be used to obtain grammatically correct keyphrases as
548 | descriptions for topics instead of simple n-grams. This allows us to make sure that we do not cut off important topic
549 | description keyphrases by defining our n-gram range too short. Moreover, we don't need to clean stopwords upfront, can
550 | get more precise topic models and avoid to get topic description keyphrases that are slightly off-key.
551 | 
552 | #### Example:
553 | 
554 | BERTopic can be installed via `pip install bertopic`.
555 | 
556 | ```python
557 | from keyphrase_vectorizers import KeyphraseCountVectorizer
558 | from bertopic import BERTopic
559 | from sklearn.datasets import fetch_20newsgroups
560 | 
561 | # load text documents
562 | docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
563 | # only use subset of the data 
564 | docs = docs[:5000]
565 | 
566 | # train topic model with KeyphraseCountVectorizer
567 | keyphrase_topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
568 | keyphrase_topics, keyphrase_probs = keyphrase_topic_model.fit_transform(docs)
569 | 
570 | # get topics
571 | >>> keyphrase_topic_model.topics
572 | {-1: [('file', 0.007265527630674131),
573 |   ('one', 0.007055454904474792),
574 |   ('use', 0.00633563957153475),
575 |   ('program', 0.006053271092949018),
576 |   ('get', 0.006011060091056076),
577 |   ('people', 0.005729309058970368),
578 |   ('know', 0.005635951168273583),
579 |   ('like', 0.0055692449802916015),
580 |   ('time', 0.00527028825803415),
581 |   ('us', 0.00525564504880084)],
582 |  0: [('game', 0.024134589719090525),
583 |   ('team', 0.021852806383170772),
584 |   ('players', 0.01749406934044139),
585 |   ('games', 0.014397938026886745),
586 |   ('hockey', 0.013932342023677305),
587 |   ('win', 0.013706115572901401),
588 |   ('year', 0.013297593024390321),
589 |   ('play', 0.012533185558169046),
590 |   ('baseball', 0.012412743802062559),
591 |   ('season', 0.011602725885164318)],
592 |  1: [('patients', 0.022600352291162015),
593 |   ('msg', 0.02023877371575874),
594 |   ('doctor', 0.018816282737587457),
595 |   ('medical', 0.018614407917995103),
596 |   ('treatment', 0.0165028251400717),
597 |   ('food', 0.01604980195180696),
598 |   ('candida', 0.015255961242066143),
599 |   ('disease', 0.015115496310099693),
600 |   ('pain', 0.014129703072484495),
601 |   ('hiv', 0.012884503220341102)],
602 |  2: [('key', 0.028851633177510126),
603 |   ('encryption', 0.024375137861044675),
604 |   ('clipper', 0.023565947302544528),
605 |   ('privacy', 0.019258719348097385),
606 |   ('security', 0.018983682856076434),
607 |   ('chip', 0.018822199098878365),
608 |   ('keys', 0.016060139239615384),
609 |   ('internet', 0.01450486904722165),
610 |   ('encrypted', 0.013194373119964168),
611 |   ('government', 0.01303978311708837)],
612 |   ...
613 | ```
614 | 
615 | The same topics look a bit different when no keyphrase vectorizer is used:
616 | 
617 | ```python
618 | from bertopic import BERTopic
619 | from sklearn.datasets import fetch_20newsgroups
620 | 
621 | # load text documents
622 | docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
623 | # only use subset of the data 
624 | docs = docs[:5000]
625 | 
626 | # train topic model without KeyphraseCountVectorizer
627 | topic_model = BERTopic()
628 | topics, probs = topic_model.fit_transform(docs)
629 | 
630 | # get topics
631 | >>> topic_model.topics
632 | {-1: [('the', 0.012864641020408933),
633 |   ('to', 0.01187920529994724),
634 |   ('and', 0.011431498631699856),
635 |   ('of', 0.01099851927541331),
636 |   ('is', 0.010995478673036962),
637 |   ('in', 0.009908233622158523),
638 |   ('for', 0.009903667215879675),
639 |   ('that', 0.009619596716087699),
640 |   ('it', 0.009578499681829809),
641 |   ('you', 0.0095328846440753)],
642 |  0: [('game', 0.013949166096523719),
643 |   ('team', 0.012458483177116456),
644 |   ('he', 0.012354733462693834),
645 |   ('the', 0.01119583508278812),
646 |   ('10', 0.010190243555226108),
647 |   ('in', 0.0101436249231417),
648 |   ('players', 0.009682212470082758),
649 |   ('to', 0.00933700544705287),
650 |   ('was', 0.009172402203816335),
651 |   ('and', 0.008653375901739337)],
652 |  1: [('of', 0.012771267188340924),
653 |   ('to', 0.012581337590513296),
654 |   ('is', 0.012554884458779008),
655 |   ('patients', 0.011983273578628046),
656 |   ('and', 0.011863499662237566),
657 |   ('that', 0.011616113472989725),
658 |   ('it', 0.011581944987387165),
659 |   ('the', 0.011475148304229873),
660 |   ('in', 0.011395485985801054),
661 |   ('msg', 0.010715000656335596)],
662 |  2: [('key', 0.01725282988290282),
663 |   ('the', 0.014634841495851404),
664 |   ('be', 0.014429762197907552),
665 |   ('encryption', 0.013530733999898166),
666 |   ('to', 0.013443159534369817),
667 |   ('clipper', 0.01296614319927958),
668 |   ('of', 0.012164734232650158),
669 |   ('is', 0.012128295958613464),
670 |   ('and', 0.011972763728732667),
671 |   ('chip', 0.010785744492767285)],
672 |  ...
673 | ```
674 | 
675 | <a name="#online-keyphrasevectorizers"/></a>
676 | 
677 | ### Online KeyphraseVectorizers
678 | 
679 | [Back to Table of Contents](#toc)
680 | 
681 | The KeyphraseVectorizers also support online/incremental updates of their representation (similar to
682 | the [OnlineCountVectorizer](https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html#onlinecountvectorizer)).
683 | The vectorizer can not only update out-of-vocabulary keyphrases but also implements decay and cleaning functions to
684 | prevent the sparse document-keyphrases matrix to become too large.
685 | 
686 | **Parameters for online updates:**
687 | 
688 | * `decay`: At each iteration, we sum the document-keyphrase representation of the new documents with the
689 |   document-keyphrase representation of all documents processed thus far. In other words, the document-keyphrase matrix
690 |   keeps increasing with each iteration. However, especially in a streaming setting, older documents might become less
691 |   and less relevant as time goes on. Therefore, a decay parameter was implemented that decays the document-keyphrase
692 |   frequencies at each iteration before adding the document frequencies of new documents. The decay parameter is a value
693 |   between 0 and 1 and indicates the percentage of frequencies the previous document-keyphrase matrix should be reduced
694 |   to. For example, a value of .1 will decrease the frequencies in the document-keyphrase matrix by 10% at each iteration
695 |   before adding the new document-keyphrase matrix. This will make sure that recent data has more weight than previous
696 |   iterations.
697 | * `delete_min_df`: We might want to remove keyphrases from the document-keyphrase representation that appear
698 |   infrequently. The `min_df` parameter works quite well for that. However, when we have a streaming setting,
699 |   the `min_df` does not work as well since a keyphrases's frequency might start below `min_df` but will end up higher
700 |   than that over time. Setting that value high might not always be advised. As a result, the list of keyphrases learned
701 |   by the vectorizer and the resulting document-keyphrase matrix can become quite large. Similarly, if we implement
702 |   the `decay` parameter, then some values will decrease over time until they are below `min_df`. For these reasons,
703 |   the `delete_min_df` parameter was implemented. The parameter takes positive integers and indicates, at each iteration,
704 |   which keyphrases will be removed from the already learned ones. If the value is set to 5, it will check after each
705 |   iteration if the total frequency of a keyphrase is exceeded by that value. If so, the keyphrase will be removed in its
706 |   entirety from the list of keyphrases learned by the vectorizer. This helps to keep the document-keyphrase matrix of a
707 |   manageable size.
708 | 
709 | #### Example:
710 | 
711 | ```python
712 | from keyphrase_vectorizers import KeyphraseCountVectorizer
713 | 
714 | docs = ["""Supervised learning is the machine learning task of learning a function that
715 |          maps an input to an output based on example input-output pairs. It infers a
716 |          function from labeled training data consisting of a set of training examples.
717 |          In supervised learning, each example is a pair consisting of an input object
718 |          (typically a vector) and a desired output value (also called the supervisory signal). 
719 |          A supervised learning algorithm analyzes the training data and produces an inferred function, 
720 |          which can be used for mapping new examples. An optimal scenario will allow for the 
721 |          algorithm to correctly determine the class labels for unseen instances. This requires 
722 |          the learning algorithm to generalize from the training data to unseen situations in a 
723 |          'reasonable' way (see inductive bias).""",
724 | 
725 |         """Keywords are defined as phrases that capture the main topics discussed in a document. 
726 |         As they offer a brief yet precise summary of document content, they can be utilized for various applications. 
727 |         In an information retrieval environment, they serve as an indication of document relevance for users, as the list 
728 |         of keywords can quickly help to determine whether a given document is relevant to their interest. 
729 |         As keywords reflect a document's main topics, they can be utilized to classify documents into groups 
730 |         by measuring the overlap between the keywords assigned to them. Keywords are also used proactively 
731 |         in information retrieval."""]
732 | 
733 | # Init default vectorizer.
734 | vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)
735 | 
736 | # intitial vectorizer fit
737 | vectorizer.fit_transform([docs[0]]).toarray()
738 | >>> array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3,
739 |              1, 1, 1]])
740 | 
741 | # check learned keyphrases
742 | print(vectorizer.get_feature_names_out())
743 | >>> ['output pairs', 'output value', 'function', 'optimal scenario',
744 |       'pair', 'supervised learning', 'supervisory signal', 'algorithm',
745 |       'supervised learning algorithm', 'way', 'training examples',
746 |       'input object', 'example', 'machine', 'output',
747 |       'unseen situations', 'unseen instances', 'inductive bias',
748 |       'new examples', 'input', 'task', 'training data', 'class labels',
749 |       'set', 'vector']
750 | 
751 | # learn additional keyphrases from new documents with partial fit
752 | vectorizer.partial_fit([docs[1]])
753 | vectorizer.transform([docs[1]]).toarray()
754 | >>> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
755 |              0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1]])
756 | 
757 | # check learned keyphrases, including newly learned ones
758 | print(vectorizer.get_feature_names_out())
759 | >>> ['output pairs', 'output value', 'function', 'optimal scenario',
760 |       'pair', 'supervised learning', 'supervisory signal', 'algorithm',
761 |       'supervised learning algorithm', 'way', 'training examples',
762 |       'input object', 'example', 'machine', 'output',
763 |       'unseen situations', 'unseen instances', 'inductive bias',
764 |       'new examples', 'input', 'task', 'training data', 'class labels',
765 |       'set', 'vector', 'list', 'various applications',
766 |       'information retrieval', 'groups', 'overlap', 'main topics',
767 |       'precise summary', 'document relevance', 'interest', 'indication',
768 |       'information retrieval environment', 'phrases', 'keywords',
769 |       'document content', 'documents', 'document', 'users']
770 | 
771 | # update list of learned keyphrases according to 'delete_min_df'
772 | vectorizer.update_bow([docs[1]])
773 | vectorizer.transform([docs[1]]).toarray()
774 | >>> array([[5, 5]])
775 | 
776 | # check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
777 | print(vectorizer.get_feature_names_out())
778 | >>> ['keywords', 'document']
779 | 
780 | # update again and check the impact of 'decay' on the learned document-keyphrase matrix
781 | vectorizer.update_bow([docs[1]])
782 | vectorizer.X_.toarray()
783 | >>> array([[7.5, 7.5]])
784 | ```
785 | 
786 | <a name="#citation-information"/></a>
787 | 
788 | ### Citation information
789 | 
790 | [Back to Table of Contents](#toc)
791 | 
792 | When citing KeyphraseVectorizers or PatternRank in academic papers and theses, please use this BibTeX entry:
793 | 
794 | ```plaintext
795 | @conference{schopf_etal_kdir22,
796 | author={Tim Schopf and Simon Klimek and Florian Matthes},
797 | title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
798 | booktitle={Proceedings of the 14th International Joint Conference on Knowledge Discovery, Knowledge Engineering and Knowledge Management (IC3K 2022) - KDIR},
799 | year={2022},
800 | pages={243-248},
801 | publisher={SciTePress},
802 | organization={INSTICC},
803 | doi={10.5220/0011546600003335},
804 | isbn={978-989-758-614-9},
805 | issn={2184-3228},
806 | }
807 | ``` 
808 | 


--------------------------------------------------------------------------------