├── .gitignore ├── .python-version ├── Dockerfile ├── LICENSE ├── README.md ├── config.ini.template ├── embedrank.gif ├── launch.py ├── requirements.txt ├── setup.cfg ├── setup.py └── swisscom_ai ├── __init__.py └── research_keyphrase ├── __init__.py ├── embeddings ├── __init__.py ├── emb_distrib_interface.py └── emb_distrib_local.py ├── model ├── __init__.py ├── extractor.py ├── input_representation.py ├── method.py └── methods_embeddings.py ├── preprocessing ├── __init__.py ├── custom_stanford.py └── postagging.py └── util ├── __init__.py ├── fileIO.py └── solr_fields.py /.gitignore: -------------------------------------------------------------------------------- 1 | stanford-postagger*/ 2 | config.ini 3 | 4 | # JetBrains IDEs 5 | .idea/ 6 | 7 | # Python 8 | __pycache__/ 9 | build/ 10 | dist/ 11 | *.egg-info/ 12 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.6.2 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a base image that comes with NumPy and SciPy pre-installed 2 | FROM publysher/alpine-scipy:1.0.0-numpy1.14.0-python3.6-alpine3.7 3 | # Because of the image, our versions differ from those in the requirements.txt: 4 | # numpy==1.14.0 (instead of 1.13.1) 5 | # scipy==1.0.0 (instead of 0.19.1) 6 | 7 | # Install Java for Stanford Tagger 8 | RUN apk --update add openjdk8-jre 9 | # Set environment 10 | ENV JAVA_HOME /opt/jdk 11 | ENV PATH ${PATH}:${JAVA_HOME}/bin 12 | 13 | # Download CoreNLP full Stanford Tagger for English 14 | RUN wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-02-27.zip && \ 15 | unzip stanford-corenlp-full-*.zip && \ 16 | rm stanford-corenlp-full-*.zip && \ 17 | mv stanford-corenlp-full-* stanford-corenlp 18 | 19 | # Install sent2vec 20 | RUN apk add --update git g++ make && \ 21 | git clone https://github.com/epfml/sent2vec && \ 22 | cd sent2vec && \ 23 | git checkout f827d014a473aa22b2fef28d9e29211d50808d48 && \ 24 | make && \ 25 | apk del git make && \ 26 | rm -rf /var/cache/apk/* && \ 27 | pip install cython && \ 28 | cd src && \ 29 | python setup.py build_ext && \ 30 | pip install . 31 | 32 | 33 | 34 | # Install requirements 35 | WORKDIR /app 36 | ADD requirements.txt . 37 | # Remove NumPy and SciPy from the requirements before installing the rest 38 | RUN cd /app && \ 39 | sed -i '/^numpy.*$/d' requirements.txt && \ 40 | sed -i '/^scipy.*$/d' requirements.txt && \ 41 | pip install -r requirements.txt 42 | 43 | # Download NLTK data 44 | RUN python -c "import nltk; nltk.download('punkt')" 45 | 46 | # Set the paths in config.ini 47 | ADD config.ini.template config.ini 48 | RUN sed -i '6 c\host = localhost' config.ini && \ 49 | sed -i '7 c\port = 9000' config.ini && \ 50 | sed -i '10 c\model_path = /sent2vec/pretrained_model.bin' config.ini 51 | 52 | # Add actual source code 53 | ADD swisscom_ai swisscom_ai/ 54 | ADD launch.py . 55 | 56 | ENTRYPOINT ["/bin/sh"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is the implementation of the following paper: https://arxiv.org/abs/1801.04470
2 |
3 | # Installation
4 |
5 | ## Local Installation
6 |
7 | 1. Download the full Stanford CoreNLP Tagger (version 3.8.0)
8 | http://nlp.stanford.edu/software/stanford-corenlp-full-2018-02-27.zip
9 |
10 | 2. Install sent2vec from
11 | https://github.com/epfml/sent2vec
12 | * Clone/download the repository
13 | * Go to the sent2vec directory
14 | * git checkout f827d014a473aa22b2fef28d9e29211d50808d48
15 | * make
16 | * pip install cython
17 | * Inside the src folder:
18 | * ``python setup.py build_ext``
19 | * ``pip install . ``
20 | * (On macOS) If setup.py throws an **error** (ignore warnings), open setup.py and add '-stdlib=libc++' to the compile_opts list.
21 | * Download a pre-trained model (see the README of the sent2vec repo), for example wiki_bigrams.bin
22 |
23 | 3. Install requirements
24 |
25 | After cloning this repository, go to its root directory and run
26 | ``pip install -r requirements.txt``
27 |
28 | 4. Download NLTK data
29 | ```
30 | import nltk
31 | nltk.download('punkt')
32 | ```
33 |
34 | 5. Launch the Stanford CoreNLP tagger
35 | * Open a new terminal
36 | * Go to the stanford-corenlp-full-2018-02-27 directory
37 | * Run the server `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos -status_port 9000 -port 9000 -timeout 15000 & `
38 |
39 |
40 | 6. Set the paths in config.ini.template
41 | * You can leave the [STANFORDTAGGER] parameters empty
42 | * For [STANFORDCORENLPTAGGER]:
43 | * set host to localhost
44 | * set port to 9000
45 | * For [SENT2VEC]:
46 | * set model_path to the pre-trained model, e.g.
47 | your_path_to_model/wiki_bigrams.bin (if you chose wiki_bigrams.bin)
48 | * Rename config.ini.template to config.ini
49 |
50 | ## Docker
51 |
52 | Probably the easiest way to get started is by using the provided Docker image.
53 | From the project's root directory, the image can be built like so:
54 | ```
55 | $ docker build . -t keyphrase-extraction
56 | ```
57 | This can take a few minutes to finish.
58 | Also, keep in mind that pre-trained sent2vec models will not be downloaded, since each model is several GB in size. Don't forget to allocate enough memory to your Docker container (the models are loaded into RAM).
59 |
60 | To launch the image in interactive mode, so that you can use your own code, run
61 | ```
62 | $ docker run -v {path to wiki_bigrams.bin}:/sent2vec/pretrained_model.bin -it keyphrase-extraction
63 | # Run the corenlp server
64 | /app # cd /stanford-corenlp
65 | /stanford-corenlp # nohup java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos -status_port 9000 -port 9000 -timeout 15000 &
66 | # Press enter to get stdin back
67 | /stanford-corenlp # cd /app
68 | /app # python
69 | >>> import launch
70 | ```
71 | You have to specify the path to your sent2vec model with the `-v` argument.
72 | If you choose not to use the *wiki_bigrams.bin* model, adjust the path accordingly (and remember to remove the curly brackets).
73 |
74 | # Usage
75 |
76 | Once the CoreNLP server is running:
77 |
78 | ```
79 | import launch
80 |
81 | embedding_distributor = launch.load_local_embedding_distributor()
82 | pos_tagger = launch.load_local_corenlp_pos_tagger()
83 |
84 | kp1 = launch.extract_keyphrases(embedding_distributor, pos_tagger, raw_text, 10, 'en')  # extract 10 keyphrases
85 | kp2 = launch.extract_keyphrases(embedding_distributor, pos_tagger, raw_text2, 10, 'en')
86 | ...
87 | ```
88 |
89 | This returns, for each text, a tuple containing three lists:
90 | 1) The top-N candidates (strings), i.e. the keyphrases
91 | 2) For each keyphrase, the associated relevance score
92 | 3) For each keyphrase, a list of aliases (other candidates very similar to the one selected
93 | as keyphrase)
94 |
95 | # Method
96 |
97 | This is the implementation of the following paper:
98 | https://arxiv.org/abs/1801.04470
99 |
100 | ![embedrank](embedrank.gif)
101 |
102 | Using sentence embeddings, EmbedRank embeds both the document and the candidate phrases into the same embedding space.
103 |
104 | N candidates are then selected as keyphrases with Maximal Marginal Relevance (MMR): the cosine
105 | similarity between a candidate and the document models informativeness, while the cosine
106 | similarity between candidates models diversity (a sketch of the selection step is given at the end of this README).
107 |
108 | A hyperparameter, beta (default=0.55), controls the trade-off between
109 | informativeness and diversity when extracting keyphrases
110 | (beta = 1: only informativeness, beta = 0: only diversity).
111 | You can change the beta hyperparameter value when calling extract_keyphrases:
112 |
113 | ```
114 | kp1 = launch.extract_keyphrases(embedding_distributor, pos_tagger, raw_text, 10, 'en', beta=0.8)  # extract 10 keyphrases with beta=0.8
115 |
116 | ```
117 |
118 | If you want to replicate the results of the paper, set beta to 1 or 0.5 and turn off the alias feature by passing alias_threshold=1 to the extract_keyphrases method.
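
For reference, the selection rule implemented in swisscom_ai/research_keyphrase/model/method.py is the usual MMR trade-off: at each step, the candidate with the best beta-weighted score is added to the set of keyphrases. Below is a minimal sketch of one such step; it is illustrative only, the function and variable names are hypothetical, and the actual implementation additionally standardizes the similarity values before applying this rule.

```
# Illustrative sketch of one MMR selection step (not the project's actual API).
import numpy as np

def mmr_step(doc_sim, cand_sim, selected, unselected, beta=0.55):
    """Pick the unselected candidate that is most similar to the document
    and least similar to the already selected keyphrases.

    doc_sim: (n,) cosine similarities between candidates and the document
    cand_sim: (n, n) pairwise cosine similarities between candidates
    selected / unselected: integer index arrays (selected must be non-empty)
    """
    scores = beta * doc_sim[unselected] \
        - (1 - beta) * cand_sim[np.ix_(unselected, selected)].max(axis=1)
    return int(unselected[np.argmax(scores)])
```

A replication run could therefore look like this (alias_threshold=1 disables the alias grouping):

```
kp = launch.extract_keyphrases(embedding_distributor, pos_tagger, raw_text, 10, 'en', beta=1, alias_threshold=1)
```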
119 |
120 |
--------------------------------------------------------------------------------
/config.ini.template:
--------------------------------------------------------------------------------
1 | [STANFORDTAGGER]
2 | jar_path =
3 | model_directory_path =
4 |
5 | [STANFORDCORENLPTAGGER]
6 | host =
7 | port =
8 |
9 | [SENT2VEC]
10 | model_path =
--------------------------------------------------------------------------------
/embedrank.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/embedrank.gif
--------------------------------------------------------------------------------
/launch.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from configparser import ConfigParser
3 |
4 | from swisscom_ai.research_keyphrase.embeddings.emb_distrib_local import EmbeddingDistributorLocal
5 | from swisscom_ai.research_keyphrase.model.input_representation import InputTextObj
6 | from swisscom_ai.research_keyphrase.model.method import MMRPhrase
7 | from swisscom_ai.research_keyphrase.preprocessing.postagging import PosTaggingCoreNLP
8 | from swisscom_ai.research_keyphrase.util.fileIO import read_file
9 |
10 |
11 | def extract_keyphrases(embedding_distrib, ptagger, raw_text, N, lang, beta=0.55, alias_threshold=0.7):
12 |     """
13 |     Extract a set of N keyphrases from a raw text.
14 |
15 |     :param embedding_distrib: An embedding distributor object, see @EmbeddingDistributor
16 |     :param ptagger: A POS tagger object, see @PosTagging
17 |     :param raw_text: A string containing the raw text to extract keyphrases from
18 |     :param N: The number of keyphrases to extract
19 |     :param lang: The language of the text
20 |     :param beta: beta factor for MMR (trade-off between informativeness and diversity)
21 |     :param alias_threshold: similarity threshold used to group candidates as aliases
22 |     :return: A tuple with 3 elements :
23 |     1) list of the top-N candidates (or fewer if there are not enough candidates) (list of string)
24 |     2) list of associated relevance scores (list of float)
25 |     3) list containing for each keyphrase a list of aliases (list of list of string)
26 |     """
27 |     tagged = ptagger.pos_tag_raw_text(raw_text)
28 |     text_obj = InputTextObj(tagged, lang)
29 |     return MMRPhrase(embedding_distrib, text_obj, N=N, beta=beta, alias_threshold=alias_threshold)
30 |
31 |
32 | def load_local_embedding_distributor():
33 |     config_parser = ConfigParser()
34 |     config_parser.read('config.ini')
35 |     sent2vec_model_path = config_parser.get('SENT2VEC', 'model_path')
36 |     return EmbeddingDistributorLocal(sent2vec_model_path)
37 |
38 |
39 | def load_local_corenlp_pos_tagger(host=None, port=None):
40 |     config_parser = ConfigParser()
41 |     config_parser.read('config.ini')
42 |     host = host or config_parser.get('STANFORDCORENLPTAGGER', 'host')  # fall back to config.ini when not given
43 |     port = port or config_parser.get('STANFORDCORENLPTAGGER', 'port')
44 |     return PosTaggingCoreNLP(host, port)
45 |
46 |
47 | if __name__ == '__main__':
48 |     parser = argparse.ArgumentParser(description='Extract keyphrases from raw text')
49 |
50 |     group = parser.add_mutually_exclusive_group(required=True)
51 |     group.add_argument('-raw_text', help='raw text to process')
52 |     group.add_argument('-text_file', help='file containing the raw text to process')
53 |
54 |
55 |     parser.add_argument('-tagger_host', help='CoreNLP host', default='localhost')
56 |     parser.add_argument('-tagger_port', help='CoreNLP port', default=9000)
57 |     parser.add_argument('-N', help='number of keyphrases to extract',
required=True, type=int) 58 | args = parser.parse_args() 59 | 60 | if args.text_file: 61 | raw_text = read_file(args.text_file) 62 | else: 63 | raw_text = args.raw_text 64 | 65 | embedding_distributor = load_local_embedding_distributor() 66 | pos_tagger = load_local_corenlp_pos_tagger(args.tagger_host, args.tagger_port) 67 | print(extract_keyphrases(embedding_distributor, pos_tagger, raw_text, args.N, 'en')) 68 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langdetect==1.0.7 2 | nltk==3.4.1 3 | numpy==1.14.3 4 | scikit-learn==0.19.0 5 | scipy==0.19.1 6 | six==1.10.0 7 | requests==2.21.0 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """A setuptools based setup module. 2 | 3 | See: 4 | https://packaging.python.org/en/latest/distributing.html 5 | https://github.com/pypa/sampleproject 6 | """ 7 | from codecs import open 8 | 9 | from setuptools import setup, find_packages 10 | 11 | with open('requirements.txt') as f: 12 | required = f.read().splitlines() 13 | 14 | setup( 15 | name='swisscom_ai.research_keyphrase', 16 | 17 | # Versions should comply with PEP440. For a discussion on single-sourcing 18 | # the version across setup.py and the project code, see 19 | # https://packaging.python.org/en/latest/single_source_version.html 20 | version='0.9.5', 21 | 22 | description='Swisscom AI Research Keyphrase Extraction', 23 | url='https://github.com/swisscom/ai-research-keyphrase-extraction', 24 | 25 | author='Swisscom (Schweiz) AG', 26 | 27 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 28 | classifiers=[ 29 | 'Programming Language :: Python :: 3.6', 30 | ], 31 | 32 | # You can just specify the packages manually here if your project is 33 | # simple. Or you can use find_packages(). 34 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 35 | 36 | package_data={'swisscom_ai.research_keyphrase': []}, 37 | include_package_data=True, 38 | 39 | # List run-time dependencies here. These will be installed by pip when 40 | # your project is installed. For an analysis of "install_requires" vs pip's 41 | # requirements files see: 42 | # https://packaging.python.org/en/latest/requirements.html 43 | install_requires=required, 44 | 45 | # List additional groups of dependencies here (e.g. development 46 | # dependencies). 
You can install these using the following syntax, 47 | # for example: 48 | # $ pip install -e .[dev,test] 49 | extras_require={ 50 | 'dev': [], 51 | 'test': [], 52 | }, 53 | ) 54 | -------------------------------------------------------------------------------- /swisscom_ai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/__init__.py -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/__init__.py -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/embeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/embeddings/__init__.py -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/embeddings/emb_distrib_interface.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | from abc import ABC, abstractmethod 7 | 8 | 9 | class Singleton(type): 10 | _instances = {} 11 | 12 | def __call__(cls, *args, **kwargs): 13 | if cls not in cls._instances: 14 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 15 | return cls._instances[cls] 16 | 17 | 18 | class EmbeddingDistributor(ABC): 19 | """ 20 | Abstract class in charge of providing the embeddings of piece of texts 21 | """ 22 | @abstractmethod 23 | def get_tokenized_sents_embeddings(self, sents): 24 | """ 25 | Generate a numpy ndarray with the embedding of each element of sent in each row 26 | :param sents: list of string (sentences/phrases) 27 | :return: ndarray with shape (len(sents), dimension of embeddings) 28 | """ 29 | pass 30 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/embeddings/emb_distrib_local.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 
3 | #
4 | # Authors: Kamil Bennani-Smires, Yann Savary
5 |
6 | import numpy as np
7 |
8 | from swisscom_ai.research_keyphrase.embeddings.emb_distrib_interface import EmbeddingDistributor
9 | import sent2vec
10 |
11 |
12 | class EmbeddingDistributorLocal(EmbeddingDistributor):
13 |     """
14 |     Concrete class of @EmbeddingDistributor using a local installation of sent2vec
15 |     https://github.com/epfml/sent2vec
16 |
17 |     """
18 |
19 |     def __init__(self, fasttext_model):
20 |         self.model = sent2vec.Sent2vecModel()
21 |         self.model.load_model(fasttext_model)
22 |
23 |     def get_tokenized_sents_embeddings(self, sents):
24 |         """
25 |         @see EmbeddingDistributor
26 |         """
27 |         for sent in sents:
28 |             if '\n' in sent:
29 |                 raise RuntimeError('New line is not allowed inside a sentence')
30 |
31 |         return self.model.embed_sentences(sents)
32 |
--------------------------------------------------------------------------------
/swisscom_ai/research_keyphrase/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/model/__init__.py
--------------------------------------------------------------------------------
/swisscom_ai/research_keyphrase/model/extractor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG.
2 | # All rights reserved.
3 | #
4 | # Authors: Kamil Bennani-Smires, Yann Savary
5 |
6 | """Contains the methods that return the lists of candidate phrases and candidate sentences."""
7 |
8 | import re
9 |
10 | import nltk
11 |
12 | GRAMMAR_EN = """  NP:
13 |         {<NN.*|JJ>*<NN.*>}  # Adjective(s)(optional) + Noun(s)"""
14 |
15 | GRAMMAR_DE = """
16 | NBAR:
17 |         {<JJ.*|ART.*|PPOSAT>*<NN.*>+}  # [Adjective(s) or Article(s) or Possessive pronoun](optional) + Noun(s)
18 |         {<NN.*>+<APPR.*|APPRART>*<NN.*>+}
19 |
20 | NP:
21 |         {<NBAR><APPR.*|APPRART>*<NBAR>}  # Above, connected with APPR and APPRART (beim, vom)
22 |         {<NBAR>+}
23 | """
24 |
25 | GRAMMAR_FR = """  NP:
26 |         {<NN.*|JJ>*<NN.*>+<JJ>*}  # Adjective(s)(optional) + Noun(s) + Adjective(s)(optional)"""
27 |
28 |
29 | def get_grammar(lang):
30 |     if lang == 'en':
31 |         grammar = GRAMMAR_EN
32 |     elif lang == 'de':
33 |         grammar = GRAMMAR_DE
34 |     elif lang == 'fr':
35 |         grammar = GRAMMAR_FR
36 |     else:
37 |         raise ValueError('Language not handled')
38 |     return grammar
39 |
40 |
41 | def extract_candidates(text_obj, no_subset=False):
42 |     """
43 |     Based on part-of-speech tags, return a list of candidate phrases.
44 |     :param text_obj: Input text representation, see @InputTextObj
45 |     :param no_subset: if True, drop candidates that are a subset of another candidate
46 |     (the language is taken from text_obj.lang; en, fr and de are currently supported)
47 |     :return: list of candidate phrases (string)
48 |     """
49 |
50 |     keyphrase_candidate = set()
51 |
52 |     np_parser = nltk.RegexpParser(get_grammar(text_obj.lang))  # Noun phrase parser
53 |     trees = np_parser.parse_sents(text_obj.pos_tagged)  # Generator with one tree per sentence
54 |
55 |     for tree in trees:
56 |         for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # For each noun phrase
57 |             # Concatenate the tokens with a space
58 |             keyphrase_candidate.add(' '.join(word for word, tag in subtree.leaves()))
59 |
60 |     keyphrase_candidate = {kp for kp in keyphrase_candidate if len(kp.split()) <= 5}
61 |
62 |     if no_subset:
63 |         keyphrase_candidate = unique_ngram_candidates(keyphrase_candidate)
64 |     else:
65 |         keyphrase_candidate = list(keyphrase_candidate)
66 |
67 |     return keyphrase_candidate
68 |
69 |
70 | def
extract_sent_candidates(text_obj): 71 | """ 72 | 73 | :param text_obj: input Text Representation see @InputTextObj 74 | :return: list of tokenized sentence (string) , each token is separated by a space in the string 75 | """ 76 | return [(' '.join(word for word, tag in sent)) for sent in text_obj.pos_tagged] 77 | 78 | 79 | def unique_ngram_candidates(strings): 80 | """ 81 | ['machine learning', 'machine', 'backward induction', 'induction', 'start'] -> 82 | ['backward induction', 'start', 'machine learning'] 83 | :param strings: List of string 84 | :return: List of string where no string is fully contained inside another string 85 | """ 86 | results = [] 87 | for s in sorted(set(strings), key=len, reverse=True): 88 | if not any(re.search(r'\b{}\b'.format(re.escape(s)), r) for r in results): 89 | results.append(s) 90 | return results 91 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/model/input_representation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | from nltk.stem import PorterStemmer 7 | 8 | 9 | class InputTextObj: 10 | """Represent the input text in which we want to extract keyphrases""" 11 | 12 | def __init__(self, pos_tagged, lang, stem=False, min_word_len=3): 13 | """ 14 | :param pos_tagged: List of list : Text pos_tagged as a list of sentences 15 | where each sentence is a list of tuple (word, TAG). 16 | :param stem: If we want to apply stemming on the text. 17 | """ 18 | self.min_word_len = min_word_len 19 | self.considered_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ'} 20 | self.pos_tagged = [] 21 | self.filtered_pos_tagged = [] 22 | self.isStemmed = stem 23 | self.lang = lang 24 | 25 | if stem: 26 | stemmer = PorterStemmer() 27 | self.pos_tagged = [[(stemmer.stem(t[0]), t[1]) for t in sent] for sent in pos_tagged] 28 | else: 29 | self.pos_tagged = [[(t[0].lower(), t[1]) for t in sent] for sent in pos_tagged] 30 | 31 | temp = [] 32 | for sent in self.pos_tagged: 33 | s = [] 34 | for elem in sent: 35 | if len(elem[0]) < min_word_len: 36 | s.append((elem[0], 'LESS')) 37 | else: 38 | s.append(elem) 39 | temp.append(s) 40 | 41 | self.pos_tagged = temp 42 | # Convert some language-specific tag (NC, NE to NN) or ADJA ->JJ see convert method. 
43 | if lang in ['fr', 'de']: 44 | self.pos_tagged = [[(tagged_token[0], convert(tagged_token[1])) for tagged_token in sentence] for sentence 45 | in 46 | self.pos_tagged] 47 | self.filtered_pos_tagged = [[(t[0].lower(), t[1]) for t in sent if self.is_candidate(t)] for sent in 48 | self.pos_tagged] 49 | 50 | def is_candidate(self, tagged_token): 51 | """ 52 | 53 | :param tagged_token: tuple (word, tag) 54 | :return: True if its a valid candidate word 55 | """ 56 | return tagged_token[1] in self.considered_tags 57 | 58 | def extract_candidates(self): 59 | """ 60 | :return: set of all candidates word 61 | """ 62 | return {tagged_token[0].lower() 63 | for sentence in self.pos_tagged 64 | for tagged_token in sentence 65 | if self.is_candidate(tagged_token) and len(tagged_token[0]) >= self.min_word_len 66 | } 67 | 68 | 69 | def convert(fr_or_de_tag): 70 | if fr_or_de_tag in {'NN', 'NNE', 'NE', 'N', 'NPP', 'NC', 'NOUN'}: 71 | return 'NN' 72 | elif fr_or_de_tag in {'ADJA', 'ADJ'}: 73 | return 'JJ' 74 | else: 75 | return fr_or_de_tag 76 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/model/method.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | import warnings 7 | 8 | import numpy as np 9 | from sklearn.metrics.pairwise import cosine_similarity 10 | 11 | from swisscom_ai.research_keyphrase.model.methods_embeddings import extract_candidates_embedding_for_doc, \ 12 | extract_doc_embedding, extract_sent_candidates_embedding_for_doc 13 | 14 | 15 | def _MMR(embdistrib, text_obj, candidates, X, beta, N, use_filtered, alias_threshold): 16 | """ 17 | Core method using Maximal Marginal Relevance in charge to return the top-N candidates 18 | 19 | :param embdistrib: embdistrib: embedding distributor see @EmbeddingDistributor 20 | :param text_obj: Input text representation see @InputTextObj 21 | :param candidates: list of candidates (string) 22 | :param X: numpy array with the embedding of each candidate in each row 23 | :param beta: hyperparameter beta for MMR (control tradeoff between informativeness and diversity) 24 | :param N: number of candidates to extract 25 | :param use_filtered: if true filter the text by keeping only candidate word before computing the doc embedding 26 | :return: A tuple with 3 elements : 27 | 1)list of the top-N candidates (or less if there are not enough candidates) (list of string) 28 | 2)list of associated relevance scores (list of float) 29 | 3)list containing for each keyphrase a list of alias (list of list of string) 30 | """ 31 | 32 | N = min(N, len(candidates)) 33 | doc_embedd = extract_doc_embedding(embdistrib, text_obj, use_filtered) # Extract doc embedding 34 | doc_sim = cosine_similarity(X, doc_embedd.reshape(1, -1)) 35 | 36 | doc_sim_norm = doc_sim/np.max(doc_sim) 37 | doc_sim_norm = 0.5 + (doc_sim_norm - np.average(doc_sim_norm)) / np.std(doc_sim_norm) 38 | 39 | sim_between = cosine_similarity(X) 40 | np.fill_diagonal(sim_between, np.NaN) 41 | 42 | sim_between_norm = sim_between/np.nanmax(sim_between, axis=0) 43 | sim_between_norm = \ 44 | 0.5 + (sim_between_norm - np.nanmean(sim_between_norm, axis=0)) / np.nanstd(sim_between_norm, axis=0) 45 | 46 | selected_candidates = [] 47 | unselected_candidates = [c for c in range(len(candidates))] 48 | 49 | j = np.argmax(doc_sim) 50 | selected_candidates.append(j) 51 | 
unselected_candidates.remove(j)
52 |
53 |     for _ in range(N - 1):
54 |         selec_array = np.array(selected_candidates)
55 |         unselec_array = np.array(unselected_candidates)
56 |
57 |         distance_to_doc = doc_sim_norm[unselec_array, :]
58 |         dist_between = sim_between_norm[unselec_array][:, selec_array]
59 |         if dist_between.ndim == 1:
60 |             dist_between = dist_between[:, np.newaxis]
61 |         j = np.argmax(beta * distance_to_doc - (1 - beta) * np.max(dist_between, axis=1).reshape(-1, 1))
62 |         item_idx = unselected_candidates[j]
63 |         selected_candidates.append(item_idx)
64 |         unselected_candidates.remove(item_idx)
65 |
66 |     # Not using the normalized version of doc_sim for computing relevance
67 |     relevance_list = max_normalization(doc_sim[selected_candidates]).tolist()
68 |     aliases_list = get_aliases(sim_between[selected_candidates, :], candidates, alias_threshold)
69 |
70 |     return candidates[selected_candidates].tolist(), relevance_list, aliases_list
71 |
72 |
73 | def MMRPhrase(embdistrib, text_obj, beta=0.65, N=10, use_filtered=True, alias_threshold=0.8):
74 |     """
75 |     Extract N keyphrases
76 |
77 |     :param embdistrib: embedding distributor, see @EmbeddingDistributor
78 |     :param text_obj: Input text representation, see @InputTextObj
79 |     :param beta: hyperparameter beta for MMR (controls the trade-off between informativeness and diversity)
80 |     :param N: number of keyphrases to extract
81 |     :param use_filtered: if True, keep only candidate words in the text before computing the doc embedding
82 |     :return: A tuple with 3 elements :
83 |     1) list of the top-N candidates (or fewer if there are not enough candidates) (list of string)
84 |     2) list of associated relevance scores (list of float)
85 |     3) list containing for each keyphrase a list of aliases (list of list of string)
86 |     """
87 |     candidates, X = extract_candidates_embedding_for_doc(embdistrib, text_obj)
88 |
89 |     if len(candidates) == 0:
90 |         warnings.warn('No keyphrase extracted for this document')
91 |         return None, None, None
92 |
93 |     return _MMR(embdistrib, text_obj, candidates, X, beta, N, use_filtered, alias_threshold)
94 |
95 |
96 | def MMRSent(embdistrib, text_obj, beta=0.5, N=10, use_filtered=True, alias_threshold=0.8):
97 |     """
98 |
99 |     Extract N key sentences
100 |
101 |     :param embdistrib: embedding distributor, see @EmbeddingDistributor
102 |     :param text_obj: Input text representation, see @InputTextObj
103 |     :param beta: hyperparameter beta for MMR (controls the trade-off between informativeness and diversity)
104 |     :param N: number of key sentences to extract
105 |     :param use_filtered: if True, keep only candidate words in the text before computing the doc embedding
106 |     :return: same tuple format as MMRPhrase, with the top-N key sentences (or fewer if there are not enough candidates)
107 |     """
108 |     candidates, X = extract_sent_candidates_embedding_for_doc(embdistrib, text_obj)
109 |
110 |     if len(candidates) == 0:
111 |         warnings.warn('No keysentence extracted for this document')
112 |         return []
113 |
114 |     return _MMR(embdistrib, text_obj, candidates, X, beta, N, use_filtered, alias_threshold)
115 |
116 |
117 | def max_normalization(array):
118 |     """
119 |     Compute the maximum normalization (max is set to 1) of the array
120 |     :param array: 1-d array
121 |     :return: 1-d array, max-normalized: each value is multiplied by 1/max value
122 |     """
123 |     return 1/np.max(array) * array.squeeze(axis=1)
124 |
125 |
126 | def get_aliases(kp_sim_between, candidates, threshold):
127 |     """
128 |     Find candidates which are very similar to the keyphrases (aliases)
129 |     :param kp_sim_between: ndarray of shape (nb_kp, nb candidates)
containing the similarity 130 | of each kp with all the candidates. Note that the similarity between the keyphrase and itself should be set to 131 | NaN or 0 132 | :param candidates: array of candidates (array of string) 133 | :return: list containing for each keyphrase a list that contain candidates which are aliases 134 | (very similar) (list of list of string) 135 | """ 136 | 137 | kp_sim_between = np.nan_to_num(kp_sim_between, 0) 138 | idx_sorted = np.flip(np.argsort(kp_sim_between), 1) 139 | aliases = [] 140 | for kp_idx, item in enumerate(idx_sorted): 141 | alias_for_item = [] 142 | for i in item: 143 | if kp_sim_between[kp_idx, i] >= threshold: 144 | alias_for_item.append(candidates[i]) 145 | else: 146 | break 147 | aliases.append(alias_for_item) 148 | 149 | return aliases 150 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/model/methods_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | import numpy as np 7 | 8 | from swisscom_ai.research_keyphrase.model.extractor import extract_candidates, extract_sent_candidates 9 | 10 | 11 | def extract_doc_embedding(embedding_distrib, inp_rpr, use_filtered=False): 12 | """ 13 | Return the embedding of the full document 14 | 15 | :param embedding_distrib: embedding distributor see @EmbeddingDistributor 16 | :param inp_rpr: input text representation see @InputTextObj 17 | :param use_filtered: if true keep only candidate words in the raw text before computing the embedding 18 | :return: numpy array of shape (1, dimension of embeddings) that contains the document embedding 19 | """ 20 | if use_filtered: 21 | tagged = inp_rpr.filtered_pos_tagged 22 | else: 23 | tagged = inp_rpr.pos_tagged 24 | 25 | tokenized_doc_text = ' '.join(token[0].lower() for sent in tagged for token in sent) 26 | return embedding_distrib.get_tokenized_sents_embeddings([tokenized_doc_text]) 27 | 28 | 29 | def extract_candidates_embedding_for_doc(embedding_distrib, inp_rpr): 30 | """ 31 | 32 | Return the list of candidate phrases as well as the associated numpy array that contains their embeddings. 33 | Note that candidates phrases extracted by PosTag rules which are uknown (in term of embeddings) 34 | will be removed from the candidates. 35 | 36 | :param embedding_distrib: embedding distributor see @EmbeddingDistributor 37 | :param inp_rpr: input text representation see @InputTextObj 38 | :return: A tuple of two element containing 1) the list of candidate phrases 39 | 2) a numpy array of shape (number of candidate phrases, dimension of embeddings : 40 | each row is the embedding of one candidate phrase 41 | """ 42 | candidates = np.array(extract_candidates(inp_rpr)) # List of candidates based on PosTag rules 43 | if len(candidates) > 0: 44 | embeddings = np.array(embedding_distrib.get_tokenized_sents_embeddings(candidates)) # Associated embeddings 45 | valid_candidates_mask = ~np.all(embeddings == 0, axis=1) # Only candidates which are not unknown. 46 | return candidates[valid_candidates_mask], embeddings[valid_candidates_mask, :] 47 | else: 48 | return np.array([]), np.array([]) 49 | 50 | 51 | def extract_sent_candidates_embedding_for_doc(embedding_distrib, inp_rpr): 52 | """ 53 | Return the list of candidate senetences as well as the associated numpy array that contains their embeddings. 
54 | Note that candidates sentences which are uknown (in term of embeddings) will be removed from the candidates. 55 | 56 | :param embedding_distrib: embedding distributor see @EmbeddingDistributor 57 | :param inp_rpr: input text representation see @InputTextObj 58 | :return: A tuple of two element containing 1) the list of candidate sentences 59 | 2) a numpy array of shape (number of candidate sentences, dimension of embeddings : 60 | each row is the embedding of one candidate sentence 61 | """ 62 | candidates = np.array(extract_sent_candidates(inp_rpr)) 63 | embeddings = np.array(embedding_distrib.get_tokenized_sents_embeddings(candidates)) 64 | 65 | valid_candidates_mask = ~np.all(embeddings == 0, axis=1) 66 | return candidates[valid_candidates_mask], embeddings[valid_candidates_mask, :] 67 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/preprocessing/__init__.py -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/preprocessing/custom_stanford.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | """Implementation of StanfordPOSTagger with tokenization in the specific language, s.t. the tag and tag_sent methods 7 | perform tokenization in the specific language. 8 | """ 9 | from nltk.tag import StanfordPOSTagger 10 | 11 | 12 | class EnglishStanfordPOSTagger(StanfordPOSTagger): 13 | 14 | @property 15 | def _cmd(self): 16 | return ['edu.stanford.nlp.tagger.maxent.MaxentTagger', 17 | '-model', self._stanford_model, '-textFile', self._input_file_path, 18 | '-outputFormatOptions', 'keepEmptySentences'] 19 | 20 | 21 | class FrenchStanfordPOSTagger(StanfordPOSTagger): 22 | """ 23 | Taken from github mhkuu/french-learner-corpus 24 | Extends the StanfordPosTagger with a custom command that calls the FrenchTokenizerFactory. 25 | """ 26 | 27 | @property 28 | def _cmd(self): 29 | return ['edu.stanford.nlp.tagger.maxent.MaxentTagger', 30 | '-model', self._stanford_model, '-textFile', 31 | self._input_file_path, '-tokenizerFactory', 32 | 'edu.stanford.nlp.international.french.process.FrenchTokenizer$FrenchTokenizerFactory', 33 | '-outputFormatOptions', 'keepEmptySentences'] 34 | 35 | 36 | class GermanStanfordPOSTagger(StanfordPOSTagger): 37 | """ Use english tokenizer for german """ 38 | 39 | @property 40 | def _cmd(self): 41 | return ['edu.stanford.nlp.tagger.maxent.MaxentTagger', 42 | '-model', self._stanford_model, '-textFile', self._input_file_path, 43 | '-outputFormatOptions', 'keepEmptySentences'] 44 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/preprocessing/postagging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG. 2 | # All rights reserved. 
3 | # 4 | #Authors: Kamil Bennani-Smires, Yann Savary 5 | 6 | import argparse 7 | import os 8 | import re 9 | import warnings 10 | from abc import ABC, abstractmethod 11 | 12 | # NLTK imports 13 | import nltk 14 | from nltk.tag.util import tuple2str 15 | from nltk.parse import CoreNLPParser 16 | 17 | import swisscom_ai.research_keyphrase.preprocessing.custom_stanford as custom_stanford 18 | from swisscom_ai.research_keyphrase.util.fileIO import read_file, write_string 19 | 20 | # If you want to use spacy , install it and uncomment the following import 21 | # import spacy 22 | 23 | 24 | class PosTagging(ABC): 25 | @abstractmethod 26 | def pos_tag_raw_text(self, text, as_tuple_list=True): 27 | """ 28 | Tokenize and POS tag a string 29 | Sentence level is kept in the result : 30 | Either we have a list of list (for each sentence a list of tuple (word,tag)) 31 | Or a separator [ENDSENT] if we are requesting a string by putting as_tuple_list = False 32 | 33 | Example : 34 | >>from sentkp.preprocessing import postagger as pt 35 | 36 | >>pt = postagger.PosTagger() 37 | 38 | >>pt.pos_tag_raw_text('Write your python code in a .py file. Thank you.') 39 | [ 40 | [('Write', 'VB'), ('your', 'PRP$'), ('python', 'NN'), 41 | ('code', 'NN'), ('in', 'IN'), ('a', 'DT'), ('.', '.'), ('py', 'NN'), ('file', 'NN'), ('.', '.') 42 | ], 43 | [('Thank', 'VB'), ('you', 'PRP'), ('.', '.')] 44 | ] 45 | 46 | >>pt.pos_tag_raw_text('Write your python code in a .py file. Thank you.', as_tuple_list=False) 47 | 48 | 'Write/VB your/PRP$ python/NN code/NN in/IN a/DT ./.[ENDSENT]py/NN file/NN ./.[ENDSENT]Thank/VB you/PRP ./.' 49 | 50 | 51 | >>pt = postagger.PosTagger(separator='_') 52 | >>pt.pos_tag_raw_text('Write your python code in a .py file. Thank you.', as_tuple_list=False) 53 | Write_VB your_PRP$ python_NN code_NN in_IN a_DT ._. py_NN file_NN ._. 54 | Thank_VB you_PRP ._. 55 | 56 | 57 | 58 | :param as_tuple_list: Return result as list of list (word,Pos_tag) 59 | :param text: String to POS tag 60 | :return: POS Tagged string or Tuple list 61 | """ 62 | 63 | pass 64 | 65 | def pos_tag_file(self, input_path, output_path=None): 66 | 67 | """ 68 | POS Tag a file. 69 | Either we have a list of list (for each sentence a list of tuple (word,tag)) 70 | Or a file with the POS tagged text 71 | 72 | Note : The jumpline is only for readibility purpose , when reading a tagged file we'll use again 73 | sent_tokenize to find the sentences boundaries. 74 | 75 | :param input_path: path of the source file 76 | :param output_path: If set write POS tagged text with separator (self.pos_tag_raw_text with as_tuple_list False) 77 | If not set, return list of list of tuple (self.post_tag_raw_text with as_tuple_list = True) 78 | 79 | :return: resulting POS tagged text as a list of list of tuple or nothing if output path is set. 80 | """ 81 | 82 | original_text = read_file(input_path) 83 | 84 | if output_path is not None: 85 | tagged_text = self.pos_tag_raw_text(original_text, as_tuple_list=False) 86 | # Write to the output the POS-Tagged text. 
87 | write_string(tagged_text, output_path) 88 | else: 89 | return self.pos_tag_raw_text(original_text, as_tuple_list=True) 90 | 91 | def pos_tag_and_write_corpora(self, list_of_path, suffix): 92 | """ 93 | POS tag a list of files 94 | It writes the resulting file in the same directory with the same name + suffix 95 | e.g 96 | pos_tag_and_write_corpora(['/Users/user1/text1', '/Users/user1/direct/text2'] , suffix = _POS) 97 | will create 98 | /Users/user1/text1_POS 99 | /Users/user1/direct/text2_POS 100 | 101 | :param list_of_path: list containing the path (as string) of each file to POS Tag 102 | :param suffix: suffix to append at the end of the original filename for the resulting pos_tagged file. 103 | 104 | """ 105 | for path in list_of_path: 106 | output_file_path = path + suffix 107 | if os.path.isfile(path): 108 | self.pos_tag_file(path, output_file_path) 109 | else: 110 | warnings.warn('file ' + output_file_path + 'does not exists') 111 | 112 | 113 | class PosTaggingStanford(PosTagging): 114 | """ 115 | Concrete class of PosTagging using StanfordPOSTokenizer and StanfordPOSTagger 116 | 117 | tokenizer contains the default nltk tokenizer (PhunktSentenceTokenizer). 118 | tagger contains the StanfordPOSTagger object (which also trigger word tokenization see : -tokenize option in Java). 119 | 120 | """ 121 | 122 | def __init__(self, jar_path, model_path_directory, separator='|', lang='en'): 123 | """ 124 | :param model_path_directory: path of the model directory 125 | :param jar_path: path of the jar for StanfordPOSTagger (override the configuration file) 126 | :param separator: Separator between a token and a tag in the resulting string (default : |) 127 | 128 | """ 129 | 130 | if lang == 'en': 131 | model_path = os.path.join(model_path_directory, 'english-left3words-distsim.tagger') 132 | self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 133 | self.tagger = custom_stanford.EnglishStanfordPOSTagger(model_path, jar_path, java_options='-mx2g') 134 | elif lang == 'de': 135 | model_path = os.path.join(model_path_directory, 'german-hgc.tagger') 136 | self.sent_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle') 137 | self.tagger = custom_stanford.GermanStanfordPOSTagger(model_path, jar_path, java_options='-mx2g') 138 | elif lang == 'fr': 139 | model_path = os.path.join(model_path_directory, 'french.tagger') 140 | self.sent_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle') 141 | self.tagger = custom_stanford.FrenchStanfordPOSTagger(model_path, jar_path, java_options='-mx2g') 142 | else: 143 | raise ValueError('Language ' + lang + 'not handled') 144 | 145 | self.separator = separator 146 | 147 | def pos_tag_raw_text(self, text, as_tuple_list=True): 148 | """ 149 | Implementation of abstract method from PosTagging 150 | @see PosTagging 151 | """ 152 | tagged_text = self.tagger.tag_sents([self.sent_tokenizer.sentences_from_text(text)]) 153 | 154 | if as_tuple_list: 155 | return tagged_text 156 | return '[ENDSENT]'.join( 157 | [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text]) 158 | 159 | 160 | class PosTaggingSpacy(PosTagging): 161 | """ 162 | Concrete class of PosTagging using StanfordPOSTokenizer and StanfordPOSTagger 163 | """ 164 | 165 | def __init__(self, nlp=None, separator='|' ,lang='en'): 166 | if not nlp: 167 | print('Loading Spacy model') 168 | # self.nlp = spacy.load(lang, entity=False) 169 | print('Spacy model loaded ' + lang) 170 | else: 171 | self.nlp = nlp 172 | self.separator = 
separator 173 | 174 | def pos_tag_raw_text(self, text, as_tuple_list=True): 175 | """ 176 | Implementation of abstract method from PosTagging 177 | @see PosTagging 178 | """ 179 | 180 | # This step is not necessary int the stanford tokenizer. 181 | # This is used to avoid such tags : (' ', 'SP') 182 | text = re.sub('[ ]+', ' ', text).strip() # Convert multiple whitespaces into one 183 | 184 | doc = self.nlp(text) 185 | if as_tuple_list: 186 | return [[(token.text, token.tag_) for token in sent] for sent in doc.sents] 187 | return '[ENDSENT]'.join(' '.join(self.separator.join([token.text, token.tag_]) for token in sent) for sent in doc.sents) 188 | 189 | 190 | class PosTaggingCoreNLP(PosTagging): 191 | """ 192 | Concrete class of PosTagging using a CoreNLP server 193 | Provides a faster way to process several documents using since it doesn't require to load the model each time. 194 | """ 195 | 196 | def __init__(self, host='localhost' ,port=9000, separator='|'): 197 | self.parser = CoreNLPParser(url=f'http://{host}:{port}') 198 | self.separator = separator 199 | 200 | def pos_tag_raw_text(self, text, as_tuple_list=True): 201 | # Unfortunately for the moment there is no method to do sentence split + pos tagging in nltk.parse.corenlp 202 | # Ony raw_tag_sents is available but assumes a list of str (so it assumes the sentence are already split) 203 | # We create a small custom function highly inspired from raw_tag_sents to do both 204 | 205 | def raw_tag_text(): 206 | """ 207 | Perform tokenizing sentence splitting and PosTagging and keep the 208 | sentence splits structure 209 | """ 210 | properties = {'annotators':'tokenize,ssplit,pos'} 211 | tagged_data = self.parser.api_call(text, properties=properties) 212 | for tagged_sentence in tagged_data['sentences']: 213 | yield [(token['word'], token['pos']) for token in tagged_sentence['tokens']] 214 | 215 | tagged_text = list(raw_tag_text()) 216 | 217 | if as_tuple_list: 218 | return tagged_text 219 | return '[ENDSENT]'.join( 220 | [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text]) 221 | 222 | 223 | 224 | 225 | if __name__ == '__main__': 226 | parser = argparse.ArgumentParser(description='Write POS tagged files, the resulting file will be written' 227 | ' at the same location with _POS append at the end of the filename') 228 | 229 | parser.add_argument('tagger', help='which pos tagger to use [stanford, spacy, corenlp]') 230 | parser.add_argument('listing_file_path', help='path to a text file ' 231 | 'containing in each row a path to a file to POS tag') 232 | args = parser.parse_args() 233 | 234 | if args.tagger == 'stanford': 235 | pt = PosTaggingStanford() 236 | suffix = 'STANFORD' 237 | elif args.tagger == 'spacy': 238 | pt = PosTaggingSpacy() 239 | suffix = 'SPACY' 240 | elif args.tagger == 'corenlp': 241 | pt = PosTaggingCoreNLP() 242 | suffix = 'CoreNLP' 243 | 244 | list_of_path = read_file(args.listing_file_path).splitlines() 245 | print('POS Tagging and writing ', len(list_of_path), 'files') 246 | pt.pos_tag_and_write_corpora(list_of_path, suffix) 247 | -------------------------------------------------------------------------------- /swisscom_ai/research_keyphrase/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swisscom/ai-research-keyphrase-extraction/78c0b13633f0e443cf43892b098b4c8dabf3dad9/swisscom_ai/research_keyphrase/util/__init__.py 
--------------------------------------------------------------------------------
/swisscom_ai/research_keyphrase/util/fileIO.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG.
2 | # All rights reserved.
3 | #
4 | # Authors: Kamil Bennani-Smires, Yann Savary
5 |
6 | import codecs
7 |
8 | codecs.register_error('replace_with_space', lambda e: (u' ', e.start + 1))
9 |
10 |
11 | def write_string(s, output_path):
12 |     with open(output_path, 'w') as output_file:
13 |         output_file.write(s)
14 |
15 |
16 | def read_file(input_path):
17 |     with open(input_path, 'r', errors='replace_with_space') as input_file:
18 |         return input_file.read().strip()
19 |
--------------------------------------------------------------------------------
/swisscom_ai/research_keyphrase/util/solr_fields.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Swisscom (Schweiz) AG.
2 | # All rights reserved.
3 | #
4 | # Authors: Kamil Bennani-Smires, Yann Savary
5 |
6 | """Module containing helper functions to process the results of a Solr query"""
7 |
8 |
9 | def process_tagged_text(s):
10 |     """
11 |     Return a tagged_text as a list of sentences, where each sentence is a list of tuples (word, tag)
12 |     :param s: string tagged_text coming from Solr: word1|tag1 word2|tag2[ENDSENT]word3|tag3 ...
13 |     :return: (list of list of tuple) list of sentences where each sentence is a list of tuples (word, tag)
14 |     """
15 |
16 |     def str2tuple(tagged_token_text, sep='|'):
17 |         loc = tagged_token_text.rfind(sep)
18 |         if loc >= 0:
19 |             return tagged_token_text[:loc], tagged_token_text[loc + len(sep):]
20 |         else:
21 |             raise RuntimeError('Problem when parsing tagged token ' + tagged_token_text)
22 |
23 |     result = []
24 |     for sent in s.split('[ENDSENT]'):
25 |         sent = [str2tuple(tagged_token) for tagged_token in sent.split(' ')]
26 |         result.append(sent)
27 |     return result
28 |
--------------------------------------------------------------------------------