├── .bandit ├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CITATION ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── bin ├── nb_md.sh ├── preview.py ├── push_pypi.sh └── vis_doc.py ├── code_of_conduct.md ├── dat ├── ars.txt ├── cfc.txt ├── gen.txt ├── lee.txt ├── mih.txt └── suz.txt ├── deprecated ├── WARNING.md ├── example.ipynb ├── oldsrc.py └── test.py ├── docs ├── ack.md ├── apidocs.yml ├── assets │ ├── favicon.png │ ├── logo.png │ └── noam.jpg ├── biblio.jinja ├── biblio.md ├── build.md ├── codecov_io.svg ├── depend.md ├── faq.md ├── glossary.jinja ├── glossary.md ├── index.md ├── javascripts │ └── config.js ├── mkrefs.ttl ├── mkrefs.yml ├── noam.jpg ├── overview.md ├── ref.jinja ├── ref.md ├── setup.md ├── start.md ├── stylesheets │ └── extra.css ├── talks.md ├── todo.md └── tutorial.md ├── environment.yml ├── examples ├── explain_algo.ipynb ├── explain_summ.ipynb └── sample.ipynb ├── lgtm.yml ├── mkdocs.yml ├── pkg_doc.cfg ├── pkg_doc.py ├── pyfixdoc.py ├── pylintrc ├── pyproject.toml ├── pytextrank ├── __init__.py ├── base.py ├── biasedrank.py ├── positionrank.py ├── topicrank.py ├── util.py └── version.py ├── requirements-dev.txt ├── requirements-viz.txt ├── requirements.txt ├── sample.py ├── setup.py ├── tests ├── conftest.py ├── test_base.py ├── test_biasedrank.py ├── test_positionrank.py ├── test_topicrank.py └── trace.py ├── tmp_api.py └── wip └── error.ipynb /.bandit: -------------------------------------------------------------------------------- 1 | [bandit] 2 | exclude_dirs: 3 | - tests 4 | - examples 5 | exclude: setup.py 6 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: ceteri 2 | -------------------------------------------------------------------------------- 
/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Please see the documentation for all configuration options: 2 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | 4 | version: 2 5 | updates: 6 | - package-ecosystem: "pip" 7 | directory: "/" 8 | schedule: 9 | interval: "daily" 10 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | jobs: 6 | # pre-commit: 7 | # name: Run pre-commit 8 | # runs-on: ubuntu-latest 9 | # steps: 10 | # - uses: actions/checkout@v3 11 | # - uses: actions/setup-python@v3 12 | # - uses: pre-commit/action@v3.0.0 13 | 14 | test: 15 | name: Tests for Python ${{ matrix.python-version }} 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: ['3.7', '3.8', '3.9', '3.10'] 20 | fail-fast: false 21 | # needs: pre-commit 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | 26 | - name: Set up Python 27 | uses: actions/setup-python@v3 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | 31 | - name: Install dependencies 32 | run: | 33 | pip install -e . 34 | spacy download en_core_web_sm 35 | pip install pre-commit pytest 36 | 37 | - name: Run tests 38 | run: | 39 | pytest 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # misc. 
2 | venv/ 3 | README.rst 4 | *~ 5 | 6 | # files generated by running PyTextRank 7 | *.dot 8 | graph.png 9 | 10 | # files generated by Jupyter notebooks 11 | .ipynb_checkpoints/ 12 | 13 | # files generated by building the documentation 14 | docs/biblio.md 15 | docs/glossary.md 16 | docs/ex*.md 17 | docs/ex*_files 18 | docs/sample.md 19 | docs/sample_files 20 | site/ 21 | ptr.tgz 22 | 23 | # codecov.io 24 | .cc_token 25 | .coverage 26 | coverage.xml 27 | 28 | # Compiled python modules. 29 | *.pyc 30 | 31 | # Setuptools distribution folder. 32 | /dist/ 33 | /build/ 34 | 35 | # Python egg metadata, regenerated from source files by setuptools. 36 | .eggs/ 37 | /*.egg-info 38 | /*.egg 39 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | default_stages: [commit, push] 4 | default_language_version: 5 | python: python3 6 | exclude: "deprecated" 7 | repos: 8 | - repo: https://github.com/pre-commit/pre-commit-hooks 9 | rev: v3.4.0 10 | hooks: 11 | - id: check-added-large-files 12 | - id: check-builtin-literals 13 | - id: check-executables-have-shebangs 14 | - id: check-merge-conflict 15 | - id: check-yaml 16 | - id: debug-statements 17 | - id: detect-private-key 18 | - repo: https://github.com/PyCQA/bandit 19 | rev: 1.7.0 20 | hooks: 21 | - id: bandit # security vulnerabilities 22 | args: ["--exclude", "setup.py,bin,tests"] 23 | - repo: https://github.com/pre-commit/mirrors-mypy 24 | rev: v0.812 25 | hooks: 26 | - id: mypy # type annotations 27 | exclude: ^tests/,^examples/ 28 | - repo: https://github.com/PyCQA/pylint 29 | rev: pylint-2.7.2 30 | hooks: 31 | - id: pylint 32 | exclude: pyfixdoc 33 | - repo: https://github.com/codespell-project/codespell 34 | rev: v2.0.0 35 | hooks: 36 | - id: codespell # spell-check source 
code 37 | args: ["pytextrank/*.py", "*.md", "docs/*.md"] 38 | exclude: ^examples/ 39 | language: python 40 | types: [text] 41 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # PyTextRank changelog 2 | 3 | ## 3.2.4 4 | 5 | 2022-07-27 6 | 7 | * better support for "ru" and other languages without `noun_chunks` support in spaCy 8 | * updated example notebook to illustrate `TopicRank` algorithm 9 | * made the node bias setting case-independent for `Biased Textrank` algorithm; kudos @Ankush-Chander 10 | * updated summarization tests; kudos @tomaarsen 11 | * reworked some unit tests to be less brittle, less dependent on specific spaCy point releases 12 | 13 | 14 | ## 3.2.3 15 | 16 | 2022-03-06 17 | 18 | * handles missing `noun_chunks` in some language models (e.g., "ru") 19 | * add *TopicRank* algorithm; kudos @tomaarsen 20 | * improved test suite; fixed tests for newer spaCy releases; kudos @tomaarsen 21 | 22 | 23 | ## 3.2.2 24 | 25 | 2021-10-09 26 | 27 | * adjust for changes in `NetworkX` where they are removing `SciPy` as a dependency; kudos @clabornd, @tomaarsen, @duarteocarmo 28 | * more scrubber examples; kudos @dayalstrub-cma 29 | 30 | 31 | ## 3.2.1 32 | 33 | 2021-07-24 34 | 35 | * add "paragraph" option into `summary()` function; kudos @CaptXiong 36 | 37 | 38 | ## 3.2.0 39 | 40 | 2021-07-17 41 | 42 | * **NB: THIS SCRUBBER UPDATE WILL BREAK PREVIOUS RELEASES** 43 | * allow `Span` as scrubber argument, to align with `spaCy` 3.1.x; kudos @Ankush-Chander 44 | * add `lgtm` code reviews (slow, not integrating into GitHub PRs directly) 45 | * evaluating `grayskull` to generate a conda-forge recipe 46 | * add use of `pipdeptree` to analyze dependencies 47 | * use KG from `biblio.ttl` to generate bibliography 48 | * fixed overlooked comment from earlier code; kudos @debraj135 49 | * add visualisation using `altair`; kudos @louisguitton 50 | 
* add scrubber usage in sample notebook; kudos @Ankush-Chander 51 | * integrating use of `MkRefs` to generate semantic reference pages in `docs` 52 | 53 | 54 | ## 3.1.1 55 | 56 | 2021-03-25 57 | 58 | * fix the span length calculation in explanation notebook; kudos @Ankush-Chander 59 | * add `BiasedTextRank` by @Ankush-Chander (many thanks!) 60 | * add conda `environment.yml` plus instructions 61 | * use `bandit` to check for security issues 62 | * use `codespell` to check for spelling errors 63 | * add `pre-commit` checks in general 64 | * update `doc._.phrases` in the call to `change_focus()` so the summarization will sync with the latest focus 65 | 66 | 67 | ## 3.1.0 68 | 69 | 2021-03-12 70 | 71 | * rename `master` branch to `main` 72 | * add a factory class that assigns each doc its own Textrank object; kudos @Ankush-Chander 73 | * refactor the stopwords feature as a constructor argument 74 | * add `get_unit_vector()` method to expose the characteristic *unit vector* 75 | * add `calc_sent_dist()` method to expose the sentence distance measures (for summarization) 76 | * include a unit test for summarization 77 | * updated contributor instructions 78 | * `pylint` coverage for code checking 79 | * linking definitions and citations in source code apidocs to our online docs 80 | * updated links on PyPi 81 | 82 | 83 | ## 3.0.1 84 | 85 | 2021-02-27 86 | 87 | * `mypy` coverage for type annotations 88 | * add DOI to README and CITATION 89 | * now deploying online docs at 90 | 91 | 92 | ## 3.0.0 93 | 94 | 2021-02-14 95 | 96 | * **THIS WILL BREAK THINGS!!!** 97 | * support for `spaCy` 3.0.x; kudos @Lord-V15 98 | * full integration of `PositionRank` 99 | * migrated all unit tests to `pytest` 100 | * removed use of `logger` for debugging, introducing `icecream` instead 101 | 102 | 103 | ## 2.1.0 104 | 105 | 2021-01-31 106 | 107 | * add `PositionRank` by @louisguitton (many thanks!) 
108 | * fixes chunk in `explain_summ.ipynb` by @anna-droid-beep 109 | * add option `preserve_order` in TextRank.summary by @kavorite 110 | * tested with `spaCy` 2.3.5 111 | 112 | 113 | ## 2.0.3 114 | 115 | 2020-09-15 116 | 117 | * try-catch `ZeroDivisionError` in summary method -- kudos @shyamcody 118 | * tested with updated dependencies: `spaCy` 2.3.x and `NetworkX` 2.5 119 | 120 | 121 | ## 2.0.2 122 | 123 | 2020-05-20 124 | 125 | * fixed default value of `._.phrases` to allow for disabling PTR in a pipeline 126 | 127 | 128 | ## 2.0.1 129 | 130 | 2020-03-02 131 | 132 | * fix `KeyError` issue for pre Python 3.6 133 | * integrated `codecov.io` 134 | * added PyTextRank to the spaCy uniVerse 135 | * fixed README.md instructions to download `en_core_web_sm` 136 | 137 | 138 | ## 2.0.0 139 | 140 | 2019-11-05 141 | 142 | * refactored library to run as a `spaCy` extension 143 | * supports multiple languages 144 | * significantly faster, with less memory required 145 | * better extraction of top-ranked phrases 146 | * changed license to MIT 147 | * uses lemma-based stopwords for more precise control 148 | * WIP toward integration with knowledge graph use cases 149 | 150 | 151 | ## 1.2.1 152 | 153 | 2019-11-01 154 | 155 | * fixed error in installation instructions 156 | 157 | 158 | ## 1.2.0 159 | 160 | 2019-11-01 161 | 162 | * updated to fix for current versions of `spaCy` and `NetworkX` -- kudos @dimmu 163 | * removed deprecated argument -- kudos @laxatives 164 | 165 | 166 | ## 1.1.1 167 | 168 | 2017-09-15 169 | 170 | * patch disables use of NER in `spaCy` until an intermittent bug is resolved. 171 | * will probably replace named tuples with `spaCy` spans instead. 
172 | 173 | 174 | ## 1.1.0 175 | 176 | 2017-06-07 177 | 178 | * replaced use of `TextBlob` with `spaCy` 179 | * updated other Py dependencies 180 | * better handling for UTF-8 181 | 182 | 183 | ## 1.0.1 184 | 185 | 2017-04-30 186 | 187 | * updated Jupyter notebook example -- kudos @kjam 188 | * better install/import for `aptagger` 189 | * comparing `spaCy` performance with `TextBlob` 190 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @software{PyTextRank, 2 | author = {Paco Nathan}, 3 | title = {{PyTextRank, a Python implementation of TextRank for phrase extraction and summarization of text documents}}, 4 | year = 2016, 5 | publisher = {Derwen}, 6 | doi = {10.5281/zenodo.4602393}, 7 | url = {https://github.com/DerwenAI/pytextrank} 8 | } 9 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Welcome! 5 | 6 | Thanks for your interest in contributing to **PyTextRank** 🎉 7 | 8 | This page gives a quick overview of how things are organized and most 9 | importantly, how to get involved. 10 | 11 | 12 | ## Issues and bug reports 13 | 14 | First, if you want to report a potential issue with this library, please 15 | [do a quick search](https://github.com/DerwenAI/pytextrank/issues) 16 | to see if the issue has already been reported. 17 | If so, it's best to simply leave a comment on an existing issue, 18 | rather than create a new one. 19 | Older issues may also include helpful info and show solutions to 20 | commonly encountered questions. 
21 | 22 | 23 | ## Opening new issues 24 | 25 | When opening a 26 | [new issue](https://github.com/DerwenAI/pytextrank/issues/new/choose), 27 | please use a **descriptive title** and include information about your 28 | **environment** and library **installation**: 29 | 30 | * Which operating system and version number? 31 | * Which version of Python? 32 | * How did you install? `pip`, `conda`, clone repo then `setup.py`, etc. 33 | 34 | Try to provide as many details as possible. 35 | What exactly is going wrong? 36 | _How_ is it failing? 37 | Is there an error? 38 | 39 | Please understand that in general our developer community does not 40 | provide support via email, Twitter DMs, and other 1:1 messaging. 41 | We believe that help is much more valuable when it gets **shared 42 | publicly**, so that more people can benefit. 43 | 44 | 45 | ## Code of conduct 46 | 47 | In all communications and collaborations, we adhere to the 48 | [Contributor Covenant Code of Conduct](https://github.com/DerwenAI/pytextrank/blob/main/code_of_conduct.md). 49 | By participating, you are expected to follow this code. 50 | 51 | 52 | ## Developer community 53 | 54 | If you'd like to contribute to this open source project, the best way 55 | to get involved with our developer community is to participate in our 56 | [public office hours](https://www.notion.so/KG-Community-Events-Calendar-8aacbe22efa94d9b8b39b7288e22c2d3) 57 | events. 58 | First join the 59 | [*Graph-Based Data Science*](https://www.linkedin.com/groups/6725785/) 60 | group on LinkedIn where these meetings get announced. 61 | We'll also have other developer discussions on that forum, along with 62 | related updates, news, conference coupons, etc. 63 | 64 | The 65 | [Knowledge Graph Conference](https://derwen.ai/docs/kgl/glossary/#knowledge-graph-conference) 66 | hosts several community resources where you can post questions and get 67 | help about **PyTextRank** and related topics. 
68 | Many of our developers are involved there too: 69 | 70 | * [community Slack](https://knowledgegraphconf.slack.com/ssb/redirect) – specifically on the `#ask` channel 71 | 72 | * [Knowledge Tech Q&A site](https://answers.knowledgegraph.tech/) for extended questions posed to experts 73 | 74 | 75 | ## Contributing to the code base 76 | 77 | You don't have to be an expert to contribute, and we're happy to help 78 | you get started. 79 | We'll try to use the 80 | [`good first issue`](https://github.com/DerwenAI/pytextrank/labels/good%20first%20issue) 81 | tags to mark bugs and feature requests that are easy and self-contained. 82 | 83 | If you've decided to take on one of these problems, it's best to 84 | [fork the repo](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-forks) 85 | and do development and testing in your own fork first. 86 | 87 | Please follow the conventions for code formatting, type annotations, 88 | unit tests, code linting, naming conventions, and so on. 89 | Understand that we will not be able to accept pull requests that make 90 | *major overhauls* of the code base or completely change our shared 91 | work on formatting, testing, etc. 92 | 93 | If you need to incorporate other libraries, please discuss this with 94 | the other developers. 95 | There may be issues regarding point releases and compatibility that 96 | would have impact on other parts of the code base. 97 | 98 | Once you're making good progress, don't forget to add a quick comment 99 | to the original issue. 100 | You can also use the issue to ask questions, or share your work in 101 | progress. 102 | Then when you're ready to submit code for review, please use a 103 | [pull request](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request) 104 | on our `main` repo branch. 
105 | 106 | 107 | ## Project roadmap 108 | 109 | The 110 | ["Graph-Based Data Science"](https://derwen.ai/s/kcgh) 111 | talk describes the **PyTextRank** open source project in more detail, 112 | and discusses some aspects of our roadmap. 113 | In other words, what new features and integrations are we working toward? 114 | 115 | See also our: 116 | 117 | * [Project Board](https://github.com/DerwenAI/pytextrank/projects/1) 118 | * [Milestones](https://github.com/DerwenAI/pytextrank/milestones) 119 | 120 | Suggestions and contributions for our documentation and tutorial are 121 | always welcomed. 122 | These tend to be good starting points for new contributors: you'll get 123 | familiar with our code samples and other resources through that. 124 | 125 | Many thanks! 126 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016-2022 Derwen, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CITATION 2 | include LICENSE 3 | include README.md 4 | include pyproject.toml 5 | include requirements.txt 6 | include setup.py 7 | include tests/*.py 8 | prune .ipynb_checkpoints 9 | # added by check-manifest 10 | include *.md 11 | include *.py 12 | include *.txt 13 | include *.yaml 14 | include *.yml 15 | include pylintrc 16 | recursive-include bin *.py 17 | recursive-include bin *.sh 18 | recursive-include dat *.txt 19 | recursive-include deprecated *.ipynb 20 | recursive-include deprecated *.md 21 | recursive-include deprecated *.py 22 | recursive-include docs *.css 23 | recursive-include docs *.jinja 24 | recursive-include docs *.jpg 25 | recursive-include docs *.js 26 | recursive-include docs *.md 27 | recursive-include docs *.png 28 | recursive-include docs *.svg 29 | recursive-include docs *.ttl 30 | recursive-include docs *.yml 31 | recursive-include examples *.ipynb 32 | recursive-include tests *.py 33 | recursive-include wip *.ipynb 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTextRank 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4637885.svg)](https://doi.org/10.5281/zenodo.4637885) 4 | ![Licence](https://img.shields.io/github/license/DerwenAI/pytextrank) 5 | ![Repo size](https://img.shields.io/github/repo-size/DerwenAI/pytextrank) 6 | ![GitHub commit 
activity](https://img.shields.io/github/commit-activity/w/DerwenAI/pytextrank?style=plastic) 7 | [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/) 8 | [![security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit) 9 | ![CI](https://github.com/DerwenAI/pytextrank/workflows/CI/badge.svg) 10 | ![downloads](https://img.shields.io/pypi/dm/pytextrank) 11 | ![sponsor](https://img.shields.io/github/sponsors/ceteri) 12 | 13 | **PyTextRank** is a Python implementation of *TextRank* as a 14 | [spaCy pipeline extension](https://spacy.io/universe/project/spacy-pytextrank), 15 | for graph-based natural language work -- and related knowledge graph practices. 16 | This includes the family of 17 | [*textgraph*](https://derwen.ai/docs/ptr/glossary/#textgraphs) algorithms: 18 | 19 | - *TextRank* by [[mihalcea04textrank]](https://derwen.ai/docs/ptr/biblio/#mihalcea04textrank) 20 | - *PositionRank* by [[florescuc17]](https://derwen.ai/docs/ptr/biblio/#florescuc17) 21 | - *Biased TextRank* by [[kazemi-etal-2020-biased]](https://derwen.ai/docs/ptr/biblio/#kazemi-etal-2020-biased) 22 | - *TopicRank* by [[bougouin-etal-2013-topicrank]](https://derwen.ai/docs/ptr/biblio/#bougouin-etal-2013-topicrank) 23 | 24 | Popular use cases for this library include: 25 | 26 | - *phrase extraction*: get the top-ranked phrases from a text document 27 | - low-cost *extractive summarization* of a text document 28 | - help infer concepts from unstructured text into more structured representation 29 | 30 | See our full documentation at: <https://derwen.ai/docs/ptr/> 31 | 32 | 33 | ## Getting Started 34 | 35 | See the ["Getting Started"](https://derwen.ai/docs/ptr/start/) 36 | section of the online documentation. 
37 | 38 | To install from [PyPi](https://pypi.python.org/pypi/pytextrank): 39 | ``` 40 | python3 -m pip install pytextrank 41 | python3 -m spacy download en_core_web_sm 42 | ``` 43 | 44 | If you work directly from this Git repo, be sure to install the 45 | dependencies as well: 46 | ``` 47 | python3 -m pip install -r requirements.txt 48 | ``` 49 | 50 | Alternatively, to install dependencies using `conda`: 51 | ``` 52 | conda env create -f environment.yml 53 | conda activate pytextrank 54 | ``` 55 | 56 | Then to use the library with a simple use case: 57 | ```python 58 | import spacy 59 | import pytextrank 60 | 61 | # example text 62 | text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types." 63 | 64 | # load a spaCy model, depending on language, scale, etc. 65 | nlp = spacy.load("en_core_web_sm") 66 | 67 | # add PyTextRank to the spaCy pipeline 68 | nlp.add_pipe("textrank") 69 | doc = nlp(text) 70 | 71 | # examine the top-ranked phrases in the document 72 | for phrase in doc._.phrases: 73 | print(phrase.text) 74 | print(phrase.rank, phrase.count) 75 | print(phrase.chunks) 76 | ``` 77 | 78 | See the **tutorial notebooks** in the `examples` subdirectory for 79 | sample code and patterns to use in integrating **PyTextRank** with 80 | related libraries in Python: 81 | 82 | 83 | 84 |
85 | Contributing Code 86 | 87 | We welcome people getting involved as contributors to this open source 88 | project! 89 | 90 | For detailed instructions please see: 91 | [CONTRIBUTING.md](https://github.com/DerwenAI/pytextrank/blob/main/CONTRIBUTING.md) 92 |
93 | 94 |
95 | Build Instructions 96 | 97 | 98 | Note: unless you are contributing code and updates, 99 | in most use cases you won't need to build this package locally. 100 | 101 | 102 | Instead, simply install from 103 | [PyPi](https://pypi.python.org/pypi/pytextrank) 104 | or use [Conda](https://docs.conda.io/). 105 | 106 | To set up the build environment locally, see the 107 | ["Build Instructions"](https://derwen.ai/docs/ptr/build/) 108 | section of the online documentation. 109 |
110 | 111 |
112 | Semantic Versioning 113 | 114 | Generally speaking the major release number of PyTextRank 115 | will track with the major release number of the associated spaCy 116 | version. 117 | 118 | See: 119 | [CHANGELOG.md](https://github.com/DerwenAI/pytextrank/blob/main/CHANGELOG.md) 120 |
121 | 122 | thanks noam! 127 | 128 | 129 | ## License and Copyright 130 | 131 | Source code for **PyTextRank** plus its logo, documentation, and examples 132 | have an [MIT license](https://spdx.org/licenses/MIT.html) which is 133 | succinct and simplifies use in commercial applications. 134 | 135 | All materials herein are Copyright © 2016-2024 Derwen, Inc. 136 | 137 | 138 | ## Attribution 139 | 140 | Please use the following BibTeX entry for citing **PyTextRank** if you 141 | use it in your research or software: 142 | ```bibtex 143 | @software{PyTextRank, 144 | author = {Paco Nathan}, 145 | title = {{PyTextRank, a Python implementation of TextRank for phrase extraction and summarization of text documents}}, 146 | year = 2016, 147 | publisher = {Derwen}, 148 | doi = {10.5281/zenodo.4637885}, 149 | url = {https://github.com/DerwenAI/pytextrank} 150 | } 151 | ``` 152 | 153 | Citations are helpful for the continued development and maintenance of 154 | this library. 155 | For example, see our citations listed on 156 | [Google Scholar](https://scholar.google.com/scholar?q=related:5tl6J4xZlCIJ:scholar.google.com/&scioq=&hl=en&as_sdt=0,5). 
157 | 158 | 159 | ## Kudos 160 | 161 | Many thanks to our open source [sponsors](https://github.com/sponsors/ceteri); 162 | and to our contributors: 163 | [@ceteri](https://github.com/ceteri), 164 | [@louisguitton](https://github.com/louisguitton), 165 | [@Ankush-Chander](https://github.com/Ankush-Chander), 166 | [@tomaarsen](https://github.com/tomaarsen), 167 | [@CaptXiong](https://github.com/CaptXiong), 168 | [@Lord-V15](https://github.com/Lord-V15), 169 | [@anna-droid-beep](https://github.com/anna-droid-beep), 170 | [@dvsrepo](https://github.com/dvsrepo), 171 | [@clabornd](https://github.com/clabornd), 172 | [@dayalstrub-cma](https://github.com/dayalstrub-cma), 173 | [@kavorite](https://github.com/kavorite), 174 | [@0dB](https://github.com/0dB), 175 | [@htmartin](https://github.com/htmartin), 176 | [@williamsmj](https://github.com/williamsmj/), 177 | [@mattkohl](https://github.com/mattkohl), 178 | [@vanita5](https://github.com/vanita5), 179 | [@HarshGrandeur](https://github.com/HarshGrandeur), 180 | [@mnowotka](https://github.com/mnowotka), 181 | [@kjam](https://github.com/kjam), 182 | [@SaiThejeshwar](https://github.com/SaiThejeshwar), 183 | [@laxatives](https://github.com/laxatives), 184 | [@dimmu](https://github.com/dimmu), 185 | [@JasonZhangzy1757](https://github.com/JasonZhangzy1757), 186 | [@jake-aft](https://github.com/jake-aft), 187 | [@junchen1992](https://github.com/junchen1992), 188 | [@shyamcody](https://github.com/shyamcody), 189 | [@chikubee](https://github.com/chikubee); 190 | also to [@mihalcea](https://github.com/mihalcea) who leads outstanding NLP research work, 191 | encouragement from the wonderful folks at Explosion who develop [spaCy](https://github.com/explosion/spaCy), 192 | plus general support from [Derwen, Inc.](https://derwen.ai/) 193 | 194 | ## Star History 195 | 196 | [![Star History Chart](https://api.star-history.com/svg?repos=derwenai/pytextrank&type=Date)](https://star-history.com/#derwenai/pytextrank&Date) 197 | 
-------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Versions which are currently being supported with security updates: 6 | 7 | | Version | Supported | 8 | | ------- | ------------------ | 9 | | > 0.2 | :white_check_mark: | 10 | 11 | ## Reporting a Vulnerability 12 | 13 | To report a vulnerability, please create a new [*issue*](https://github.com/DerwenAI/pytextrank/issues). 14 | We will be notified immediately, and will attempt to respond on the reported issue immediately. 15 | -------------------------------------------------------------------------------- /bin/nb_md.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e -x 2 | 3 | for notebook_path in examples/*.ipynb; do 4 | [ -e "$notebook_path" ] || continue 5 | 6 | notebook=`basename $notebook_path` 7 | stem=`basename $notebook_path .ipynb` 8 | 9 | cp $notebook_path docs/$notebook 10 | jupyter nbconvert docs/$notebook --to markdown 11 | python3 bin/vis_doc.py docs/"$stem".md 12 | rm docs/$notebook 13 | done 14 | -------------------------------------------------------------------------------- /bin/preview.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from flask import Flask, redirect, send_from_directory, url_for # pylint: disable=E0401 5 | from pathlib import PurePosixPath 6 | import os 7 | 8 | DOCS_ROUTE = "/docs/" 9 | DOCS_FILES = "../site" 10 | DOCS_PORT = 8000 11 | 12 | APP = Flask(__name__, static_folder=DOCS_FILES, template_folder=DOCS_FILES) 13 | 14 | APP.config["DEBUG"] = False 15 | APP.config["MAX_CONTENT_LENGTH"] = 52428800 16 | APP.config["SECRET_KEY"] = "Technically, I remain uncommitted." 
17 | APP.config["SEND_FILE_MAX_AGE_DEFAULT"] = 3000 18 | 19 | 20 | @APP.route(DOCS_ROUTE, methods=["GET"]) 21 | @APP.route(DOCS_ROUTE + "", methods=["GET"], defaults={"path": None}) 22 | @APP.route(DOCS_ROUTE + "", methods=["GET"]) 23 | def static_proxy (path=""): 24 | """Serve static files from the /site directory.""" 25 | if not path: 26 | suffix = "" 27 | else: 28 | suffix = PurePosixPath(path).suffix 29 | 30 | if suffix not in [".css", ".js", ".map", ".png", ".svg", ".xml"]: 31 | path = os.path.join(path, "index.html") 32 | 33 | return send_from_directory(DOCS_FILES, path) 34 | 35 | 36 | @APP.route("/index.html") 37 | @APP.route("/home/") 38 | @APP.route("/") 39 | def home_redirects (): 40 | """Serve generated documentation microsite. 41 | 42 | See build.md for more details. 43 | """ 44 | return redirect(url_for("static_proxy")) 45 | 46 | 47 | if __name__ == "__main__": 48 | APP.run(host="0.0.0.0", port=DOCS_PORT, debug=True) 49 | -------------------------------------------------------------------------------- /bin/push_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e -x 2 | 3 | rm -rf dist build pytextrank.egg-info 4 | python3 -m build 5 | twine check dist/* 6 | 7 | # this assumes the use of `~/.pypirc` 8 | # https://packaging.python.org/en/latest/specifications/pypirc/ 9 | 10 | twine upload ./dist/* --verbose 11 | -------------------------------------------------------------------------------- /bin/vis_doc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from selenium import webdriver # pylint: disable=E0401 5 | from selenium.webdriver.chrome.options import Options # pylint: disable=E0401 6 | import os 7 | import pathlib 8 | import re 9 | import sys 10 | import time 11 | 12 | PAT_HEADER = re.compile(r"^(```python\n\# for use.*production:\n.*\n```\n)", re.MULTILINE) 13 | PAT_IFRAME = 
re.compile(r"^(\\<\/iframe\>\n)", re.MULTILINE) 14 | PAT_SOURCE = re.compile(r"\s+src\=\"(\S+)\"") 15 | 16 | 17 | def get_pyvis_html ( 18 | iframe 19 | ): 20 | """ 21 | located the HTML file generated by PyVis, if any 22 | """ 23 | source_html = None 24 | m_source = PAT_SOURCE.search(iframe) 25 | 26 | if m_source: 27 | source_html = m_source.group(1) 28 | 29 | if "tmp.fig" not in source_html: 30 | # the