├── .gitignore ├── .python-version ├── Dockerfile ├── LICENSE ├── README.md ├── config.ini.template ├── embedrank.gif ├── launch.py ├── requirements.txt ├── setup.cfg ├── setup.py └── swisscom_ai ├── __init__.py └── research_keyphrase ├── __init__.py ├── embeddings ├── __init__.py ├── emb_distrib_interface.py └── emb_distrib_local.py ├── model ├── __init__.py ├── extractor.py ├── input_representation.py ├── method.py └── methods_embeddings.py ├── preprocessing ├── __init__.py ├── custom_stanford.py └── postagging.py └── util ├── __init__.py ├── fileIO.py └── solr_fields.py /.gitignore: -------------------------------------------------------------------------------- 1 | stanford-postagger*/ 2 | config.ini 3 | 4 | # JetBrains IDEs 5 | .idea/ 6 | 7 | # Python 8 | __pycache__/ 9 | build/ 10 | dist/ 11 | *.egg-info/ 12 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.6.2 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a base image that comes with NumPy and SciPy pre-installed 2 | FROM publysher/alpine-scipy:1.0.0-numpy1.14.0-python3.6-alpine3.7 3 | # Because of the image, our versions differ from those in the requirements.txt: 4 | # numpy==1.14.0 (instead of 1.13.1) 5 | # scipy==1.0.0 (instead of 0.19.1) 6 | 7 | # Install Java for Stanford Tagger 8 | RUN apk --update add openjdk8-jre 9 | # Set environment 10 | ENV JAVA_HOME /opt/jdk 11 | ENV PATH ${PATH}:${JAVA_HOME}/bin 12 | 13 | # Download CoreNLP full Stanford Tagger for English 14 | RUN wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-02-27.zip && \ 15 | unzip stanford-corenlp-full-*.zip && \ 16 | rm stanford-corenlp-full-*.zip && \ 17 | mv stanford-corenlp-full-* stanford-corenlp 18 | 19 | # Install sent2vec 20 | RUN apk add --update git g++ make && \ 21 | git clone https://github.com/epfml/sent2vec && \ 22 | cd sent2vec && \ 23 | git checkout f827d014a473aa22b2fef28d9e29211d50808d48 && \ 24 | make && \ 25 | apk del git make && \ 26 | rm -rf /var/cache/apk/* && \ 27 | pip install cython && \ 28 | cd src && \ 29 | python setup.py build_ext && \ 30 | pip install . 31 | 32 | 33 | 34 | # Install requirements 35 | WORKDIR /app 36 | ADD requirements.txt . 37 | # Remove NumPy and SciPy from the requirements before installing the rest 38 | RUN cd /app && \ 39 | sed -i '/^numpy.*$/d' requirements.txt && \ 40 | sed -i '/^scipy.*$/d' requirements.txt && \ 41 | pip install -r requirements.txt 42 | 43 | # Download NLTK data 44 | RUN python -c "import nltk; nltk.download('punkt')" 45 | 46 | # Set the paths in config.ini 47 | ADD config.ini.template config.ini 48 | RUN sed -i '6 c\host = localhost' config.ini && \ 49 | sed -i '7 c\port = 9000' config.ini && \ 50 | sed -i '10 c\model_path = /sent2vec/pretrained_model.bin' config.ini 51 | 52 | # Add actual source code 53 | ADD swisscom_ai swisscom_ai/ 54 | ADD launch.py . 55 | 56 | ENTRYPOINT ["/bin/sh"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is the implementation of the following paper: https://arxiv.org/abs/1801.04470
2 |
3 | # Installation
4 |
5 | ## Local Installation
6 |
7 | 1. Download the full Stanford CoreNLP Tagger (version 3.8.0)
8 | http://nlp.stanford.edu/software/stanford-corenlp-full-2018-02-27.zip
9 |
10 | 2. Install sent2vec from
11 | https://github.com/epfml/sent2vec
12 | * Clone/download the repository
13 | * Go to the sent2vec directory
14 | * git checkout f827d014a473aa22b2fef28d9e29211d50808d48
15 | * make
16 | * pip install cython
17 | * Inside the src folder:
18 | * ``python setup.py build_ext``
19 | * ``pip install . ``
20 | * (On macOS) If setup.py throws an **error** (ignore warnings), open setup.py and add '-stdlib=libc++' to the compile_opts list.
21 | * Download a pre-trained model (see the README of the sent2vec repo), for example wiki_bigrams.bin
22 |
23 | 3. Install requirements
24 |
25 | After cloning this repository, go to its root directory and run
26 | ``pip install -r requirements.txt``
27 |
28 | 4. Download NLTK data
29 | ```
30 | import nltk
31 | nltk.download('punkt')
32 | ```
33 |
34 | 5. Launch the Stanford CoreNLP tagger
35 | * Open a new terminal
36 | * Go to the stanford-corenlp-full-2018-02-27 directory
37 | * Run the server `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos -status_port 9000 -port 9000 -timeout 15000 & `
38 |
39 |
40 | 6. Set the paths in config.ini.template
41 | * You can leave the [STANFORDTAGGER] parameters empty
42 | * For [STANFORDCORENLPTAGGER]:
43 | * set host to localhost
44 | * set port to 9000
45 | * For [SENT2VEC]:
46 | * set model_path to the pre-trained model, e.g.
47 | your_path_to_model/wiki_bigrams.bin (if you chose wiki_bigrams.bin)
48 | * Rename config.ini.template to config.ini
49 |
50 | ## Docker
51 |
52 | Probably the easiest way to get started is by using the provided Docker image.
53 | From the project's root directory, the image can be built like so:
54 | ```
55 | $ docker build . -t keyphrase-extraction
56 | ```
57 | This can take a few minutes to finish.
58 | Also, keep in mind that pre-trained sent2vec models will not be downloaded, since each model is several GB in size. Don't forget to allocate enough memory to your Docker container (the models are loaded into RAM).
59 |
60 | To launch the image in interactive mode, so that you can use your own code, run
61 | ```
62 | $ docker run -v {path to wiki_bigrams.bin}:/sent2vec/pretrained_model.bin -it keyphrase-extraction
63 | # Run the corenlp server
64 | /app # cd /stanford-corenlp
65 | /stanford-corenlp # nohup java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos -status_port 9000 -port 9000 -timeout 15000 &
66 | # Press enter to get stdin back
67 | /stanford-corenlp # cd /app
68 | /app # python
69 | >>> import launch
70 | ```
71 | You have to specify the path to your sent2vec model with the `-v` argument.
72 | If you choose not to use the *wiki_bigrams.bin* model, adjust the path accordingly (and remember to remove the curly brackets).
73 |
74 | # Usage
75 |
76 | Once the CoreNLP server is running:
77 |
78 | ```
79 | import launch
80 |
81 | embedding_distributor = launch.load_local_embedding_distributor()
82 | pos_tagger = launch.load_local_corenlp_pos_tagger()
83 |
84 | kp1 = launch.extract_keyphrases(embedding_distributor, pos_tagger, raw_text, 10, 'en')  # extract 10 keyphrases
85 | kp2 = launch.extract_keyphrases(embedding_distributor, pos_tagger, raw_text2, 10, 'en')
86 | ...
87 | ```
88 |
89 | This returns, for each text, a tuple containing three lists:
90 | 1) The top-N candidates (strings), i.e. the keyphrases
91 | 2) For each keyphrase, the associated relevance score
92 | 3) For each keyphrase, a list of aliases (other candidates very similar to the one selected
93 | as keyphrase)
94 |
95 | # Method
96 |
97 | This is the implementation of the following paper:
98 | https://arxiv.org/abs/1801.04470
99 |
100 | ![embedrank](embedrank.gif)
101 |
102 | Using sentence embeddings, EmbedRank embeds both the document and the candidate phrases into the same embedding space.
103 |
104 | N candidates are then selected as keyphrases with Maximal Marginal Relevance (MMR): the cosine
105 | similarity between a candidate and the document models informativeness, while the cosine
106 | similarity between candidates models diversity (a sketch of the selection step is given at the end of this README).
107 |
108 | A hyperparameter, beta (default=0.55), controls the trade-off between
109 | informativeness and diversity when extracting keyphrases
110 | (beta = 1: only informativeness, beta = 0: only diversity).
111 | You can change the beta hyperparameter value when calling extract_keyphrases:
112 |
113 | ```
114 | kp1 = launch.extract_keyphrases(embedding_distributor, pos_tagger, raw_text, 10, 'en', beta=0.8)  # extract 10 keyphrases with beta=0.8
115 |
116 | ```
117 |
118 | If you want to replicate the results of the paper, set beta to 1 or 0.5 and turn off the alias feature by passing alias_threshold=1 to the extract_keyphrases method.
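
For reference, the selection rule implemented in swisscom_ai/research_keyphrase/model/method.py is the usual MMR trade-off: at each step, the candidate with the best beta-weighted score is added to the set of keyphrases. Below is a minimal sketch of one such step; it is illustrative only, the function and variable names are hypothetical, and the actual implementation additionally standardizes the similarity values before applying this rule.

```
# Illustrative sketch of one MMR selection step (not the project's actual API).
import numpy as np

def mmr_step(doc_sim, cand_sim, selected, unselected, beta=0.55):
    """Pick the unselected candidate that is most similar to the document
    and least similar to the already selected keyphrases.

    doc_sim: (n,) cosine similarities between candidates and the document
    cand_sim: (n, n) pairwise cosine similarities between candidates
    selected / unselected: integer index arrays (selected must be non-empty)
    """
    scores = beta * doc_sim[unselected] \
        - (1 - beta) * cand_sim[np.ix_(unselected, selected)].max(axis=1)
    return int(unselected[np.argmax(scores)])
```

A replication run could therefore look like this (alias_threshold=1 disables the alias grouping):

```
kp = launch.extract_keyphrases(embedding_distributor, pos_tagger, raw_text, 10, 'en', beta=1, alias_threshold=1)
```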
119 |
120 |
--------------------------------------------------------------------------------
/config.ini.template:
--------------------------------------------------------------------------------
1 | [STANFORDTAGGER]
2 | jar_path =
3 | model_directory_path =
4 |
5 | [STANFORDCORENLPTAGGER]
6 | host =
7 | port =
8 |
9 | [SENT2VEC]
10 | model_path =
--------------------------------------------------------------------------------
/embedrank.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/embedrank.gif
--------------------------------------------------------------------------------
/launch.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from configparser import ConfigParser
3 |
4 | from swisscom_ai.research_keyphrase.embeddings.emb_distrib_local import EmbeddingDistributorLocal
5 | from swisscom_ai.research_keyphrase.model.input_representation import InputTextObj
6 | from swisscom_ai.research_keyphrase.model.method import MMRPhrase
7 | from swisscom_ai.research_keyphrase.preprocessing.postagging import PosTaggingCoreNLP
8 | from swisscom_ai.research_keyphrase.util.fileIO import read_file
9 |
10 |
11 | def extract_keyphrases(embedding_distrib, ptagger, raw_text, N, lang, beta=0.55, alias_threshold=0.7):
12 |     """
13 |     Extract a set of N keyphrases from a raw text.
14 |
15 |     :param embedding_distrib: An embedding distributor object, see @EmbeddingDistributor
16 |     :param ptagger: A POS tagger object, see @PosTagging
17 |     :param raw_text: A string containing the raw text to extract keyphrases from
18 |     :param N: The number of keyphrases to extract
19 |     :param lang: The language of the text
20 |     :param beta: beta factor for MMR (trade-off between informativeness and diversity)
21 |     :param alias_threshold: similarity threshold used to group candidates as aliases
22 |     :return: A tuple with 3 elements :
23 |     1) list of the top-N candidates (or fewer if there are not enough candidates) (list of string)
24 |     2) list of associated relevance scores (list of float)
25 |     3) list containing for each keyphrase a list of aliases (list of list of string)
26 |     """
27 |     tagged = ptagger.pos_tag_raw_text(raw_text)
28 |     text_obj = InputTextObj(tagged, lang)
29 |     return MMRPhrase(embedding_distrib, text_obj, N=N, beta=beta, alias_threshold=alias_threshold)
30 |
31 |
32 | def load_local_embedding_distributor():
33 |     config_parser = ConfigParser()
34 |     config_parser.read('config.ini')
35 |     sent2vec_model_path = config_parser.get('SENT2VEC', 'model_path')
36 |     return EmbeddingDistributorLocal(sent2vec_model_path)
37 |
38 |
39 | def load_local_corenlp_pos_tagger(host=None, port=None):
40 |     config_parser = ConfigParser()
41 |     config_parser.read('config.ini')
42 |     host = host or config_parser.get('STANFORDCORENLPTAGGER', 'host')  # fall back to config.ini when not given
43 |     port = port or config_parser.get('STANFORDCORENLPTAGGER', 'port')
44 |     return PosTaggingCoreNLP(host, port)
45 |
46 |
47 | if __name__ == '__main__':
48 |     parser = argparse.ArgumentParser(description='Extract keyphrases from raw text')
49 |
50 |     group = parser.add_mutually_exclusive_group(required=True)
51 |     group.add_argument('-raw_text', help='raw text to process')
52 |     group.add_argument('-text_file', help='file containing the raw text to process')
53 |
54 |
55 |     parser.add_argument('-tagger_host', help='CoreNLP host', default='localhost')
56 |     parser.add_argument('-tagger_port', help='CoreNLP port', default=9000)
57 |     parser.add_argument('-N', help='number of keyphrases to extract',
required=True, type=int) 58 | args = parser.parse_args() 59 | 60 | if args.text_file: 61 | raw_text = read_file(args.text_file) 62 | else: 63 | raw_text = args.raw_text 64 | 65 | embedding_distributor = load_local_embedding_distributor() 66 | pos_tagger = load_local_corenlp_pos_tagger(args.tagger_host, args.tagger_port) 67 | print(extract_keyphrases(embedding_distributor, pos_tagger, raw_text, args.N, 'en')) 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langdetect==1.0.7 2 | nltk==3.4.1 3 | numpy==1.14.3 4 | scikit-learn==0.19.0 5 | scipy==0.19.1 6 | six==1.10.0 7 | requests==2.21.0 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """A setuptools based setup module. 2 | 3 | See: 4 | https://packaging.python.org/en/latest/distributing.html 5 | https://github.com/pypa/sampleproject 6 | """ 7 | from codecs import open 8 | 9 | from setuptools import setup, find_packages 10 | 11 | with open('requirements.txt') as f: 12 | required = f.read().splitlines() 13 | 14 | setup( 15 | name='swisscom_ai.research_keyphrase', 16 | 17 | # Versions should comply with PEP440. For a discussion on single-sourcing 18 | # the version across setup.py and the project code, see 19 | # https://packaging.python.org/en/latest/single_source_version.html 20 | version='0.9.5', 21 | 22 | description='Swisscom AI Research Keyphrase Extraction', 23 | url='https://github.com/swisscom/ai-research-keyphrase-extraction', 24 | 25 | author='Swisscom (Schweiz) AG', 26 | 27 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 28 | classifiers=[ 29 | 'Programming Language :: Python :: 3.6', 30 | ], 31 | 32 | # You can just specify the packages manually here if your project is 33 | # simple. Or you can use find_packages(). 34 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 35 | 36 | package_data={'swisscom_ai.research_keyphrase': []}, 37 | include_package_data=True, 38 | 39 | # List run-time dependencies here. These will be installed by pip when 40 | # your project is installed. For an analysis of "install_requires" vs pip's 41 | # requirements files see: 42 | # https://packaging.python.org/en/latest/requirements.html 43 | install_requires=required, 44 | 45 | # List additional groups of dependencies here (e.g. development 46 | # dependencies). 
You can install these using the following syntax, 47 | # for example: 48 | # $ pip install -e .[dev,test] 49 | extras_require={ 50 | 'dev': [], 51 | 'test': [], 52 | }, 53 | ) 54 | -------------------------------------------------------------------------------- /swisscom_ai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/__init__.py -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/__init__.py -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/embeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/embeddings/__init__.py -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/embeddings/emb_distrib_interface.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | from abc import ABC, abstractmethod 7 | 8 | 9 | class Singleton(type): 10 | _instances = {} 11 | 12 | def __call__(cls, *args, **kwargs): 13 | if cls not in cls._instances: 14 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 15 | return cls._instances[cls] 16 | 17 | 18 | class EmbeddingDistributor(ABC): 19 | """ 20 | Abstract class in charge of providing the embeddings of piece of texts 21 | """ 22 | @abstractmethod 23 | def get_tokenized_sents_embeddings(self, sents): 24 | """ 25 | Generate a numpy ndarray with the embedding of each element of sent in each row 26 | :param sents: list of string (sentences/phrases) 27 | :return: ndarray with shape (len(sents), dimension of embeddings) 28 | """ 29 | pass 30 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/embeddings/emb_distrib_local.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 
3 | #
4 | # Authors: Kamil Bennani-Smires, Yann Savary
5 |
6 | import numpy as np
7 |
8 | from swisscom_ai.research_keyphrase.embeddings.emb_distrib_interface import EmbeddingDistributor
9 | import sent2vec
10 |
11 |
12 | class EmbeddingDistributorLocal(EmbeddingDistributor):
13 |     """
14 |     Concrete class of @EmbeddingDistributor using a local installation of sent2vec
15 |     https://github.com/epfml/sent2vec
16 |
17 |     """
18 |
19 |     def __init__(self, fasttext_model):
20 |         self.model = sent2vec.Sent2vecModel()
21 |         self.model.load_model(fasttext_model)
22 |
23 |     def get_tokenized_sents_embeddings(self, sents):
24 |         """
25 |         @see EmbeddingDistributor
26 |         """
27 |         for sent in sents:
28 |             if '\n' in sent:
29 |                 raise RuntimeError('New line is not allowed inside a sentence')
30 |
31 |         return self.model.embed_sentences(sents)
32 |
--------------------------------------------------------------------------------
/swisscom_ai/research_keyphrase/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/model/__init__.py
--------------------------------------------------------------------------------
/swisscom_ai/research_keyphrase/model/extractor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG.
2 | # All rights reserved.
3 | #
4 | # Authors: Kamil Bennani-Smires, Yann Savary
5 |
6 | """Contains the methods that return the lists of candidate phrases and candidate sentences."""
7 |
8 | import re
9 |
10 | import nltk
11 |
12 | GRAMMAR_EN = """  NP:
13 |         {<NN.*|JJ>*<NN.*>}  # Adjective(s)(optional) + Noun(s)"""
14 |
15 | GRAMMAR_DE = """
16 | NBAR:
17 |         {<JJ.*|ART.*|PPOSAT>*<NN.*>+}  # [Adjective(s) or Article(s) or Possessive pronoun](optional) + Noun(s)
18 |         {<NN.*>+<APPR.*|APPRART>*<NN.*>+}
19 |
20 | NP:
21 |         {<NBAR><APPR.*|APPRART>*<NBAR>}  # Above, connected with APPR and APPRART (beim, vom)
22 |         {<NBAR>+}
23 | """
24 |
25 | GRAMMAR_FR = """  NP:
26 |         {<NN.*|JJ>*<NN.*>+<JJ>*}  # Adjective(s)(optional) + Noun(s) + Adjective(s)(optional)"""
27 |
28 |
29 | def get_grammar(lang):
30 |     if lang == 'en':
31 |         grammar = GRAMMAR_EN
32 |     elif lang == 'de':
33 |         grammar = GRAMMAR_DE
34 |     elif lang == 'fr':
35 |         grammar = GRAMMAR_FR
36 |     else:
37 |         raise ValueError('Language not handled')
38 |     return grammar
39 |
40 |
41 | def extract_candidates(text_obj, no_subset=False):
42 |     """
43 |     Based on part-of-speech tags, return a list of candidate phrases.
44 |     :param text_obj: Input text representation, see @InputTextObj
45 |     :param no_subset: if True, drop candidates that are a subset of another candidate
46 |     (the language is taken from text_obj.lang; en, fr and de are currently supported)
47 |     :return: list of candidate phrases (string)
48 |     """
49 |
50 |     keyphrase_candidate = set()
51 |
52 |     np_parser = nltk.RegexpParser(get_grammar(text_obj.lang))  # Noun phrase parser
53 |     trees = np_parser.parse_sents(text_obj.pos_tagged)  # Generator with one tree per sentence
54 |
55 |     for tree in trees:
56 |         for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # For each noun phrase
57 |             # Concatenate the tokens with a space
58 |             keyphrase_candidate.add(' '.join(word for word, tag in subtree.leaves()))
59 |
60 |     keyphrase_candidate = {kp for kp in keyphrase_candidate if len(kp.split()) <= 5}
61 |
62 |     if no_subset:
63 |         keyphrase_candidate = unique_ngram_candidates(keyphrase_candidate)
64 |     else:
65 |         keyphrase_candidate = list(keyphrase_candidate)
66 |
67 |     return keyphrase_candidate
68 |
69 |
70 | def
extract_sent_candidates(text_obj): 71 | """ 72 | 73 | :param text_obj: input Text Representation see @InputTextObj 74 | :return: list of tokenized sentence (string) , each token is separated by a space in the string 75 | """ 76 | return [(' '.join(word for word, tag in sent)) for sent in text_obj.pos_tagged] 77 | 78 | 79 | def unique_ngram_candidates(strings): 80 | """ 81 | ['machine learning', 'machine', 'backward induction', 'induction', 'start'] -> 82 | ['backward induction', 'start', 'machine learning'] 83 | :param strings: List of string 84 | :return: List of string where no string is fully contained inside another string 85 | """ 86 | results = [] 87 | for s in sorted(set(strings), key=len, reverse=True): 88 | if not any(re.search(r'\b{}\b'.format(re.escape(s)), r) for r in results): 89 | results.append(s) 90 | return results 91 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/model/input_representation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | from nltk.stem import PorterStemmer 7 | 8 | 9 | class InputTextObj: 10 | """Represent the input text in which we want to extract keyphrases""" 11 | 12 | def __init__(self, pos_tagged, lang, stem=False, min_word_len=3): 13 | """ 14 | :param pos_tagged: List of list : Text pos_tagged as a list of sentences 15 | where each sentence is a list of tuple (word, TAG). 16 | :param stem: If we want to apply stemming on the text. 17 | """ 18 | self.min_word_len = min_word_len 19 | self.considered_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ'} 20 | self.pos_tagged = [] 21 | self.filtered_pos_tagged = [] 22 | self.isStemmed = stem 23 | self.lang = lang 24 | 25 | if stem: 26 | stemmer = PorterStemmer() 27 | self.pos_tagged = [[(stemmer.stem(t[0]), t[1]) for t in sent] for sent in pos_tagged] 28 | else: 29 | self.pos_tagged = [[(t[0].lower(), t[1]) for t in sent] for sent in pos_tagged] 30 | 31 | temp = [] 32 | for sent in self.pos_tagged: 33 | s = [] 34 | for elem in sent: 35 | if len(elem[0]) < min_word_len: 36 | s.append((elem[0], 'LESS')) 37 | else: 38 | s.append(elem) 39 | temp.append(s) 40 | 41 | self.pos_tagged = temp 42 | # Convert some language-specific tag (NC, NE to NN) or ADJA ->JJ see convert method. 
43 | if lang in ['fr', 'de']: 44 | self.pos_tagged = [[(tagged_token[0], convert(tagged_token[1])) for tagged_token in sentence] for sentence 45 | in 46 | self.pos_tagged] 47 | self.filtered_pos_tagged = [[(t[0].lower(), t[1]) for t in sent if self.is_candidate(t)] for sent in 48 | self.pos_tagged] 49 | 50 | def is_candidate(self, tagged_token): 51 | """ 52 | 53 | :param tagged_token: tuple (word, tag) 54 | :return: True if its a valid candidate word 55 | """ 56 | return tagged_token[1] in self.considered_tags 57 | 58 | def extract_candidates(self): 59 | """ 60 | :return: set of all candidates word 61 | """ 62 | return {tagged_token[0].lower() 63 | for sentence in self.pos_tagged 64 | for tagged_token in sentence 65 | if self.is_candidate(tagged_token) and len(tagged_token[0]) >= self.min_word_len 66 | } 67 | 68 | 69 | def convert(fr_or_de_tag): 70 | if fr_or_de_tag in {'NN', 'NNE', 'NE', 'N', 'NPP', 'NC', 'NOUN'}: 71 | return 'NN' 72 | elif fr_or_de_tag in {'ADJA', 'ADJ'}: 73 | return 'JJ' 74 | else: 75 | return fr_or_de_tag 76 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/model/method.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | import warnings 7 | 8 | import numpy as np 9 | from sklearn.metrics.pairwise import cosine_similarity 10 | 11 | from swisscom_ai.research_keyphrase.model.methods_embeddings import extract_candidates_embedding_for_doc, \ 12 | extract_doc_embedding, extract_sent_candidates_embedding_for_doc 13 | 14 | 15 | def _MMR(embdistrib, text_obj, candidates, X, beta, N, use_filtered, alias_threshold): 16 | """ 17 | Core method using Maximal Marginal Relevance in charge to return the top-N candidates 18 | 19 | :param embdistrib: embdistrib: embedding distributor see @EmbeddingDistributor 20 | :param text_obj: Input text representation see @InputTextObj 21 | :param candidates: list of candidates (string) 22 | :param X: numpy array with the embedding of each candidate in each row 23 | :param beta: hyperparameter beta for MMR (control tradeoff between informativeness and diversity) 24 | :param N: number of candidates to extract 25 | :param use_filtered: if true filter the text by keeping only candidate word before computing the doc embedding 26 | :return: A tuple with 3 elements : 27 | 1)list of the top-N candidates (or less if there are not enough candidates) (list of string) 28 | 2)list of associated relevance scores (list of float) 29 | 3)list containing for each keyphrase a list of alias (list of list of string) 30 | """ 31 | 32 | N = min(N, len(candidates)) 33 | doc_embedd = extract_doc_embedding(embdistrib, text_obj, use_filtered) # Extract doc embedding 34 | doc_sim = cosine_similarity(X, doc_embedd.reshape(1, -1)) 35 | 36 | doc_sim_norm = doc_sim/np.max(doc_sim) 37 | doc_sim_norm = 0.5 + (doc_sim_norm - np.average(doc_sim_norm)) / np.std(doc_sim_norm) 38 | 39 | sim_between = cosine_similarity(X) 40 | np.fill_diagonal(sim_between, np.NaN) 41 | 42 | sim_between_norm = sim_between/np.nanmax(sim_between, axis=0) 43 | sim_between_norm = \ 44 | 0.5 + (sim_between_norm - np.nanmean(sim_between_norm, axis=0)) / np.nanstd(sim_between_norm, axis=0) 45 | 46 | selected_candidates = [] 47 | unselected_candidates = [c for c in range(len(candidates))] 48 | 49 | j = np.argmax(doc_sim) 50 | selected_candidates.append(j) 51 | 
unselected_candidates.remove(j)
52 |
53 |     for _ in range(N - 1):
54 |         selec_array = np.array(selected_candidates)
55 |         unselec_array = np.array(unselected_candidates)
56 |
57 |         distance_to_doc = doc_sim_norm[unselec_array, :]
58 |         dist_between = sim_between_norm[unselec_array][:, selec_array]
59 |         if dist_between.ndim == 1:
60 |             dist_between = dist_between[:, np.newaxis]
61 |         j = np.argmax(beta * distance_to_doc - (1 - beta) * np.max(dist_between, axis=1).reshape(-1, 1))
62 |         item_idx = unselected_candidates[j]
63 |         selected_candidates.append(item_idx)
64 |         unselected_candidates.remove(item_idx)
65 |
66 |     # Not using the normalized version of doc_sim for computing relevance
67 |     relevance_list = max_normalization(doc_sim[selected_candidates]).tolist()
68 |     aliases_list = get_aliases(sim_between[selected_candidates, :], candidates, alias_threshold)
69 |
70 |     return candidates[selected_candidates].tolist(), relevance_list, aliases_list
71 |
72 |
73 | def MMRPhrase(embdistrib, text_obj, beta=0.65, N=10, use_filtered=True, alias_threshold=0.8):
74 |     """
75 |     Extract N keyphrases
76 |
77 |     :param embdistrib: embedding distributor, see @EmbeddingDistributor
78 |     :param text_obj: Input text representation, see @InputTextObj
79 |     :param beta: hyperparameter beta for MMR (controls the trade-off between informativeness and diversity)
80 |     :param N: number of keyphrases to extract
81 |     :param use_filtered: if True, keep only candidate words in the text before computing the doc embedding
82 |     :return: A tuple with 3 elements :
83 |     1) list of the top-N candidates (or fewer if there are not enough candidates) (list of string)
84 |     2) list of associated relevance scores (list of float)
85 |     3) list containing for each keyphrase a list of aliases (list of list of string)
86 |     """
87 |     candidates, X = extract_candidates_embedding_for_doc(embdistrib, text_obj)
88 |
89 |     if len(candidates) == 0:
90 |         warnings.warn('No keyphrase extracted for this document')
91 |         return None, None, None
92 |
93 |     return _MMR(embdistrib, text_obj, candidates, X, beta, N, use_filtered, alias_threshold)
94 |
95 |
96 | def MMRSent(embdistrib, text_obj, beta=0.5, N=10, use_filtered=True, alias_threshold=0.8):
97 |     """
98 |
99 |     Extract N key sentences
100 |
101 |     :param embdistrib: embedding distributor, see @EmbeddingDistributor
102 |     :param text_obj: Input text representation, see @InputTextObj
103 |     :param beta: hyperparameter beta for MMR (controls the trade-off between informativeness and diversity)
104 |     :param N: number of key sentences to extract
105 |     :param use_filtered: if True, keep only candidate words in the text before computing the doc embedding
106 |     :return: same tuple format as MMRPhrase, with the top-N key sentences (or fewer if there are not enough candidates)
107 |     """
108 |     candidates, X = extract_sent_candidates_embedding_for_doc(embdistrib, text_obj)
109 |
110 |     if len(candidates) == 0:
111 |         warnings.warn('No keysentence extracted for this document')
112 |         return []
113 |
114 |     return _MMR(embdistrib, text_obj, candidates, X, beta, N, use_filtered, alias_threshold)
115 |
116 |
117 | def max_normalization(array):
118 |     """
119 |     Compute the maximum normalization (max is set to 1) of the array
120 |     :param array: 1-d array
121 |     :return: 1-d array, max-normalized: each value is multiplied by 1/max value
122 |     """
123 |     return 1/np.max(array) * array.squeeze(axis=1)
124 |
125 |
126 | def get_aliases(kp_sim_between, candidates, threshold):
127 |     """
128 |     Find candidates which are very similar to the keyphrases (aliases)
129 |     :param kp_sim_between: ndarray of shape (nb_kp, nb candidates)
containing the similarity 130 | of each kp with all the candidates. Note that the similarity between the keyphrase and itself should be set to 131 | NaN or 0 132 | :param candidates: array of candidates (array of string) 133 | :return: list containing for each keyphrase a list that contain candidates which are aliases 134 | (very similar) (list of list of string) 135 | """ 136 | 137 | kp_sim_between = np.nan_to_num(kp_sim_between, 0) 138 | idx_sorted = np.flip(np.argsort(kp_sim_between), 1) 139 | aliases = [] 140 | for kp_idx, item in enumerate(idx_sorted): 141 | alias_for_item = [] 142 | for i in item: 143 | if kp_sim_between[kp_idx, i] >= threshold: 144 | alias_for_item.append(candidates[i]) 145 | else: 146 | break 147 | aliases.append(alias_for_item) 148 | 149 | return aliases 150 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/model/methods_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | import numpy as np 7 | 8 | from swisscom_ai.research_keyphrase.model.extractor import extract_candidates, extract_sent_candidates 9 | 10 | 11 | def extract_doc_embedding(embedding_distrib, inp_rpr, use_filtered=False): 12 | """ 13 | Return the embedding of the full document 14 | 15 | :param embedding_distrib: embedding distributor see @EmbeddingDistributor 16 | :param inp_rpr: input text representation see @InputTextObj 17 | :param use_filtered: if true keep only candidate words in the raw text before computing the embedding 18 | :return: numpy array of shape (1, dimension of embeddings) that contains the document embedding 19 | """ 20 | if use_filtered: 21 | tagged = inp_rpr.filtered_pos_tagged 22 | else: 23 | tagged = inp_rpr.pos_tagged 24 | 25 | tokenized_doc_text = ' '.join(token[0].lower() for sent in tagged for token in sent) 26 | return embedding_distrib.get_tokenized_sents_embeddings([tokenized_doc_text]) 27 | 28 | 29 | def extract_candidates_embedding_for_doc(embedding_distrib, inp_rpr): 30 | """ 31 | 32 | Return the list of candidate phrases as well as the associated numpy array that contains their embeddings. 33 | Note that candidates phrases extracted by PosTag rules which are uknown (in term of embeddings) 34 | will be removed from the candidates. 35 | 36 | :param embedding_distrib: embedding distributor see @EmbeddingDistributor 37 | :param inp_rpr: input text representation see @InputTextObj 38 | :return: A tuple of two element containing 1) the list of candidate phrases 39 | 2) a numpy array of shape (number of candidate phrases, dimension of embeddings : 40 | each row is the embedding of one candidate phrase 41 | """ 42 | candidates = np.array(extract_candidates(inp_rpr)) # List of candidates based on PosTag rules 43 | if len(candidates) > 0: 44 | embeddings = np.array(embedding_distrib.get_tokenized_sents_embeddings(candidates)) # Associated embeddings 45 | valid_candidates_mask = ~np.all(embeddings == 0, axis=1) # Only candidates which are not unknown. 46 | return candidates[valid_candidates_mask], embeddings[valid_candidates_mask, :] 47 | else: 48 | return np.array([]), np.array([]) 49 | 50 | 51 | def extract_sent_candidates_embedding_for_doc(embedding_distrib, inp_rpr): 52 | """ 53 | Return the list of candidate senetences as well as the associated numpy array that contains their embeddings. 
54 | Note that candidates sentences which are uknown (in term of embeddings) will be removed from the candidates. 55 | 56 | :param embedding_distrib: embedding distributor see @EmbeddingDistributor 57 | :param inp_rpr: input text representation see @InputTextObj 58 | :return: A tuple of two element containing 1) the list of candidate sentences 59 | 2) a numpy array of shape (number of candidate sentences, dimension of embeddings : 60 | each row is the embedding of one candidate sentence 61 | """ 62 | candidates = np.array(extract_sent_candidates(inp_rpr)) 63 | embeddings = np.array(embedding_distrib.get_tokenized_sents_embeddings(candidates)) 64 | 65 | valid_candidates_mask = ~np.all(embeddings == 0, axis=1) 66 | return candidates[valid_candidates_mask], embeddings[valid_candidates_mask, :] 67 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/preprocessing/__init__.py -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/preprocessing/custom_stanford.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | """Implementation of StanfordPOSTagger with tokenization in the specific language, s.t. the tag and tag_sent methods 7 | perform tokenization in the specific language. 8 | """ 9 | from nltk.tag import StanfordPOSTagger 10 | 11 | 12 | class EnglishStanfordPOSTagger(StanfordPOSTagger): 13 | 14 | @property 15 | def _cmd(self): 16 | return ['edu.stanford.nlp.tagger.maxent.MaxentTagger', 17 | '-model', self._stanford_model, '-textFile', self._input_file_path, 18 | '-outputFormatOptions', 'keepEmptySentences'] 19 | 20 | 21 | class FrenchStanfordPOSTagger(StanfordPOSTagger): 22 | """ 23 | Taken from github mhkuu/french-learner-corpus 24 | Extends the StanfordPosTagger with a custom command that calls the FrenchTokenizerFactory. 25 | """ 26 | 27 | @property 28 | def _cmd(self): 29 | return ['edu.stanford.nlp.tagger.maxent.MaxentTagger', 30 | '-model', self._stanford_model, '-textFile', 31 | self._input_file_path, '-tokenizerFactory', 32 | 'edu.stanford.nlp.international.french.process.FrenchTokenizer$FrenchTokenizerFactory', 33 | '-outputFormatOptions', 'keepEmptySentences'] 34 | 35 | 36 | class GermanStanfordPOSTagger(StanfordPOSTagger): 37 | """ Use english tokenizer for german """ 38 | 39 | @property 40 | def _cmd(self): 41 | return ['edu.stanford.nlp.tagger.maxent.MaxentTagger', 42 | '-model', self._stanford_model, '-textFile', self._input_file_path, 43 | '-outputFormatOptions', 'keepEmptySentences'] 44 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/preprocessing/postagging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 
3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | import argparse 7 | import os 8 | import re 9 | import warnings 10 | from abc import ABC, abstractmethod 11 | 12 | # NLTK imports 13 | import nltk 14 | from nltk.tag.util import tuple2str 15 | from nltk.parse import CoreNLPParser 16 | 17 | import swisscom_ai.research_keyphrase.preprocessing.custom_stanford as custom_stanford 18 | from swisscom_ai.research_keyphrase.util.fileIO import read_file, write_string 19 | 20 | # If you want to use spacy , install it and uncomment the following import 21 | # import spacy 22 | 23 | 24 | class PosTagging(ABC): 25 | @abstractmethod 26 | def pos_tag_raw_text(self, text, as_tuple_list=True): 27 | """ 28 | Tokenize and POS tag a string 29 | Sentence level is kept in the result : 30 | Either we have a list of list (for each sentence a list of tuple (word,tag)) 31 | Or a separator [ENDSENT] if we are requesting a string by putting as_tuple_list = False 32 | 33 | Example : 34 | >>from sentkp.preprocessing import postagger as pt 35 | 36 | >>pt = postagger.PosTagger() 37 | 38 | >>pt.pos_tag_raw_text('Write your python code in a .py file. Thank you.') 39 | [ 40 | [('Write', 'VB'), ('your', 'PRP$'), ('python', 'NN'), 41 | ('code', 'NN'), ('in', 'IN'), ('a', 'DT'), ('.', '.'), ('py', 'NN'), ('file', 'NN'), ('.', '.') 42 | ], 43 | [('Thank', 'VB'), ('you', 'PRP'), ('.', '.')] 44 | ] 45 | 46 | >>pt.pos_tag_raw_text('Write your python code in a .py file. Thank you.', as_tuple_list=False) 47 | 48 | 'Write/VB your/PRP$ python/NN code/NN in/IN a/DT ./.[ENDSENT]py/NN file/NN ./.[ENDSENT]Thank/VB you/PRP ./.' 49 | 50 | 51 | >>pt = postagger.PosTagger(separator='_') 52 | >>pt.pos_tag_raw_text('Write your python code in a .py file. Thank you.', as_tuple_list=False) 53 | Write_VB your_PRP$ python_NN code_NN in_IN a_DT ._. py_NN file_NN ._. 54 | Thank_VB you_PRP ._. 55 | 56 | 57 | 58 | :param as_tuple_list: Return result as list of list (word,Pos_tag) 59 | :param text: String to POS tag 60 | :return: POS Tagged string or Tuple list 61 | """ 62 | 63 | pass 64 | 65 | def pos_tag_file(self, input_path, output_path=None): 66 | 67 | """ 68 | POS Tag a file. 69 | Either we have a list of list (for each sentence a list of tuple (word,tag)) 70 | Or a file with the POS tagged text 71 | 72 | Note : The jumpline is only for readibility purpose , when reading a tagged file we'll use again 73 | sent_tokenize to find the sentences boundaries. 74 | 75 | :param input_path: path of the source file 76 | :param output_path: If set write POS tagged text with separator (self.pos_tag_raw_text with as_tuple_list False) 77 | If not set, return list of list of tuple (self.post_tag_raw_text with as_tuple_list = True) 78 | 79 | :return: resulting POS tagged text as a list of list of tuple or nothing if output path is set. 80 | """ 81 | 82 | original_text = read_file(input_path) 83 | 84 | if output_path is not None: 85 | tagged_text = self.pos_tag_raw_text(original_text, as_tuple_list=False) 86 | # Write to the output the POS-Tagged text. 
87 | write_string(tagged_text, output_path) 88 | else: 89 | return self.pos_tag_raw_text(original_text, as_tuple_list=True) 90 | 91 | def pos_tag_and_write_corpora(self, list_of_path, suffix): 92 | """ 93 | POS tag a list of files 94 | It writes the resulting file in the same directory with the same name + suffix 95 | e.g 96 | pos_tag_and_write_corpora(['/Users/user1/text1', '/Users/user1/direct/text2'] , suffix = _POS) 97 | will create 98 | /Users/user1/text1_POS 99 | /Users/user1/direct/text2_POS 100 | 101 | :param list_of_path: list containing the path (as string) of each file to POS Tag 102 | :param suffix: suffix to append at the end of the original filename for the resulting pos_tagged file. 103 | 104 | """ 105 | for path in list_of_path: 106 | output_file_path = path + suffix 107 | if os.path.isfile(path): 108 | self.pos_tag_file(path, output_file_path) 109 | else: 110 | warnings.warn('file ' + output_file_path + 'does not exists') 111 | 112 | 113 | class PosTaggingStanford(PosTagging): 114 | """ 115 | Concrete class of PosTagging using StanfordPOSTokenizer and StanfordPOSTagger 116 | 117 | tokenizer contains the default nltk tokenizer (PhunktSentenceTokenizer). 118 | tagger contains the StanfordPOSTagger object (which also trigger word tokenization see : -tokenize option in Java). 119 | 120 | """ 121 | 122 | def __init__(self, jar_path, model_path_directory, separator='|', lang='en'): 123 | """ 124 | :param model_path_directory: path of the model directory 125 | :param jar_path: path of the jar for StanfordPOSTagger (override the configuration file) 126 | :param separator: Separator between a token and a tag in the resulting string (default : |) 127 | 128 | """ 129 | 130 | if lang == 'en': 131 | model_path = os.path.join(model_path_directory, 'english-left3words-distsim.tagger') 132 | self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 133 | self.tagger = custom_stanford.EnglishStanfordPOSTagger(model_path, jar_path, java_options='-mx2g') 134 | elif lang == 'de': 135 | model_path = os.path.join(model_path_directory, 'german-hgc.tagger') 136 | self.sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle') 137 | self.tagger = custom_stanford.GermanStanfordPOSTagger(model_path, jar_path, java_options='-mx2g') 138 | elif lang == 'fr': 139 | model_path = os.path.join(model_path_directory, 'french.tagger') 140 | self.sent_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle') 141 | self.tagger = custom_stanford.FrenchStanfordPOSTagger(model_path, jar_path, java_options='-mx2g') 142 | else: 143 | raise ValueError('Language ' + lang + 'not handled') 144 | 145 | self.separator = separator 146 | 147 | def pos_tag_raw_text(self, text, as_tuple_list=True): 148 | """ 149 | Implementation of abstract method from PosTagging 150 | @see PosTagging 151 | """ 152 | tagged_text = self.tagger.tag_sents([self.sent_tokenizer.sentences_from_text(text)]) 153 | 154 | if as_tuple_list: 155 | return tagged_text 156 | return '[ENDSENT]'.join( 157 | [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text]) 158 | 159 | 160 | class PosTaggingSpacy(PosTagging): 161 | """ 162 | Concrete class of PosTagging using StanfordPOSTokenizer and StanfordPOSTagger 163 | """ 164 | 165 | def __init__(self, nlp=None, separator='|' ,lang='en'): 166 | if not nlp: 167 | print('Loading Spacy model') 168 | # self.nlp = spacy.load(lang, entity=False) 169 | print('Spacy model loaded ' + lang) 170 | else: 171 | self.nlp = nlp 172 | self.separator = 
separator 173 | 174 | def pos_tag_raw_text(self, text, as_tuple_list=True): 175 | """ 176 | Implementation of abstract method from PosTagging 177 | @see PosTagging 178 | """ 179 | 180 | # This step is not necessary int the stanford tokenizer. 181 | # This is used to avoid such tags : (' ', 'SP') 182 | text = re.sub('[ ]+', ' ', text).strip() # Convert multiple whitespaces into one 183 | 184 | doc = self.nlp(text) 185 | if as_tuple_list: 186 | return [[(token.text, token.tag_) for token in sent] for sent in doc.sents] 187 | return '[ENDSENT]'.join(' '.join(self.separator.join([token.text, token.tag_]) for token in sent) for sent in doc.sents) 188 | 189 | 190 | class PosTaggingCoreNLP(PosTagging): 191 | """ 192 | Concrete class of PosTagging using a CoreNLP server 193 | Provides a faster way to process several documents using since it doesn't require to load the model each time. 194 | """ 195 | 196 | def __init__(self, host='localhost' ,port=9000, separator='|'): 197 | self.parser = CoreNLPParser(url=f'http://{host}:{port}') 198 | self.separator = separator 199 | 200 | def pos_tag_raw_text(self, text, as_tuple_list=True): 201 | # Unfortunately for the moment there is no method to do sentence split + pos tagging in nltk.parse.corenlp 202 | # Ony raw_tag_sents is available but assumes a list of str (so it assumes the sentence are already split) 203 | # We create a small custom function highly inspired from raw_tag_sents to do both 204 | 205 | def raw_tag_text(): 206 | """ 207 | Perform tokenizing sentence splitting and PosTagging and keep the 208 | sentence splits structure 209 | """ 210 | properties = {'annotators':'tokenize,ssplit,pos'} 211 | tagged_data = self.parser.api_call(text, properties=properties) 212 | for tagged_sentence in tagged_data['sentences']: 213 | yield [(token['word'], token['pos']) for token in tagged_sentence['tokens']] 214 | 215 | tagged_text = list(raw_tag_text()) 216 | 217 | if as_tuple_list: 218 | return tagged_text 219 | return '[ENDSENT]'.join( 220 | [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text]) 221 | 222 | 223 | 224 | 225 | if __name__ == '__main__': 226 | parser = argparse.ArgumentParser(description='Write POS tagged files, the resulting file will be written' 227 | ' at the same location with _POS append at the end of the filename') 228 | 229 | parser.add_argument('tagger', help='which pos tagger to use [stanford, spacy, corenlp]') 230 | parser.add_argument('listing_file_path', help='path to a text file ' 231 | 'containing in each row a path to a file to POS tag') 232 | args = parser.parse_args() 233 | 234 | if args.tagger == 'stanford': 235 | pt = PosTaggingStanford() 236 | suffix = 'STANFORD' 237 | elif args.tagger == 'spacy': 238 | pt = PosTaggingSpacy() 239 | suffix = 'SPACY' 240 | elif args.tagger == 'corenlp': 241 | pt = PosTaggingCoreNLP() 242 | suffix = 'CoreNLP' 243 | 244 | list_of_path = read_file(args.listing_file_path).splitlines() 245 | print('POS Tagging and writing ', len(list_of_path), 'files') 246 | pt.pos_tag_and_write_corpora(list_of_path, suffix) 247 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/util/__init__.py 
--------------------------------------------------------------------------------
/swisscom_ai/research_keyphrase/util/fileIO.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG.
2 | # All rights reserved.
3 | #
4 | # Authors: Kamil Bennani-Smires, Yann Savary
5 |
6 | import codecs
7 |
8 | codecs.register_error('replace_with_space', lambda e: (u' ', e.start + 1))
9 |
10 |
11 | def write_string(s, output_path):
12 |     with open(output_path, 'w') as output_file:
13 |         output_file.write(s)
14 |
15 |
16 | def read_file(input_path):
17 |     with open(input_path, 'r', errors='replace_with_space') as input_file:
18 |         return input_file.read().strip()
19 |
--------------------------------------------------------------------------------
/swisscom_ai/research_keyphrase/util/solr_fields.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG.
2 | # All rights reserved.
3 | #
4 | # Authors: Kamil Bennani-Smires, Yann Savary
5 |
6 | """Module containing helper functions to process the results of a Solr query"""
7 |
8 |
9 | def process_tagged_text(s):
10 |     """
11 |     Return a tagged_text as a list of sentences, where each sentence is a list of tuples (word, tag)
12 |     :param s: string tagged_text coming from Solr: word1|tag1 word2|tag2[ENDSENT]word3|tag3 ...
13 |     :return: (list of list of tuple) list of sentences where each sentence is a list of tuples (word, tag)
14 |     """
15 |
16 |     def str2tuple(tagged_token_text, sep='|'):
17 |         loc = tagged_token_text.rfind(sep)
18 |         if loc >= 0:
19 |             return tagged_token_text[:loc], tagged_token_text[loc + len(sep):]
20 |         else:
21 |             raise RuntimeError('Problem when parsing tagged token ' + tagged_token_text)
22 |
23 |     result = []
24 |     for sent in s.split('[ENDSENT]'):
25 |         sent = [str2tuple(tagged_token) for tagged_token in sent.split(' ')]
26 |         result.append(sent)
27 |     return result
28 |
--------------------------------------------------------------------------------