├── .dockerignore
├── .gitignore
├── LICENSE.md
├── README.md
├── dockerfile
├── requirements.txt
├── src
│   ├── embedrank.py
│   ├── example.py
│   └── nlp_uitl.py
└── web
    └── api.py

/.dockerignore:
--------------------------------------------------------------------------------
data/
runs/
trunk/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
runs/
data/
model/
trunk/

### https://raw.github.com/github/gitignore/72190ee30bd1e2ccc233222341435adacb7a6500/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json


### https://raw.github.com/github/gitignore/72190ee30bd1e2ccc233222341435adacb7a6500/Global/macOS.gitignore

# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
Copyright 2019 yag_ays

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# EmbedRank

A Python implementation of "[Simple Unsupervised Keyphrase Extraction using Sentence Embeddings](https://arxiv.org/abs/1801.04470)".

## Usage

EmbedRank requires pretrained document embeddings (currently only doc2vec is supported). See [my blog](https://yag-ays.github.io/project/pretrained_doc2vec_wikipedia/) for how to use pretrained Japanese doc2vec models.

```py
from gensim.models.doc2vec import Doc2Vec

from embedrank import EmbedRank
from nlp_uitl import tokenize

model = Doc2Vec.load("model/jawiki.doc2vec.dbow300d.model")
embedrank = EmbedRank(model=model, tokenize=tokenize)

text = """バーレーンの首都マナマ(マナーマとも)で現在開催されている
ユネスコ(国際連合教育科学文化機関)の第42回世界遺産委員会は日本の推薦していた
「長崎と天草地方の潜伏キリシタン関連遺産」 (長崎県、熊本県)を30日、
世界遺産に登録することを決定した。文化庁が同日発表した。
日本国内の文化財の世界遺産登録は昨年に登録された福岡県の
「『神宿る島』宗像・沖ノ島と関連遺産群」に次いで18件目。
2013年の「富士山-信仰の対象と芸術の源泉」の文化遺産登録から6年連続となった。"""
```

```py
In []: embedrank.extract_keyword(text)
[('世界遺産登録', 0.61837685), ('(長崎県', 0.517046), ('ユネスコ(国際連合教育科学文化機関)', 0.5726031), ('潜伏キリシタン関連遺産', 0.544827), ('首都マナマ(マナーマ', 0.4898381)]
```

(Source: [潜伏キリシタン関連遺産、世界遺産登録 \- ウィキニュース](https://ja.wikinews.org/wiki/%E6%BD%9C%E4%BC%8F%E3%82%AD%E3%83%AA%E3%82%B7%E3%82%BF%E3%83%B3%E9%96%A2%E9%80%A3%E9%81%BA%E7%94%A3%E3%80%81%E4%B8%96%E7%95%8C%E9%81%BA%E7%94%A3%E7%99%BB%E9%8C%B2))

## Docker

Place the extracted doc2vec model in the `model/` directory and run the following commands:

```sh
$ docker build -t embedrank .
$ docker run --rm -p 8080:8080 --memory 7g -it embedrank
```

```sh
$ curl -XPOST "localhost:8080/embedrank" --data-urlencode text='バーレーンの首都マナマ(マナーマとも)で現在開催されている
ユネスコ(国際連合教育科学文化機関)の第42回世界遺産委員会は日本の推薦していた
「長崎と天草地方の潜伏キリシタン関連遺産」 (長崎県、熊本県)を30日、
世界遺産に登録することを決定した。文化庁が同日発表した。
日本国内の文化財の世界遺産登録は昨年に登録された福岡県の
「『神宿る島』宗像・沖ノ島と関連遺産群」に次いで18件目。
2013年の「富士山-信仰の対象と芸術の源泉」の文化遺産登録から6年連続となった。' -d 'num_keywords=3'

{
  "keywords": [
    {
      "keyword": "世界遺産登録",
      "score": "0.58336747"
    },
    {
      "keyword": "天草地方",
      "score": "0.52296615"
    },
    {
      "keyword": "首都マナマ(マナーマ",
      "score": "0.5126816"
    }
  ]
}
```
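The same request can be made from Python. The sketch below assumes the container above is running on `localhost:8080` and that the third-party `requests` package (not part of `requirements.txt`) is installed:

```py
import requests

# article text from the example above
text = "バーレーンの首都マナマ(マナーマとも)で現在開催されている..."

res = requests.post("http://localhost:8080/embedrank",
                    data={"text": text, "num_keywords": 3})
print(res.json()["keywords"])
```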
Caution:

- You need to allocate more than 7 GB of memory to the container.
- The container image is very large (7.38 GB).
--------------------------------------------------------------------------------
/dockerfile:
--------------------------------------------------------------------------------
FROM python:3.6

RUN apt-get update && apt-get install -y \
    libmecab-dev \
    mecab-ipadic-utf8 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

COPY . /app
WORKDIR /app/

RUN pip install -U pip
RUN pip install -r requirements.txt

WORKDIR /app/web/

ENTRYPOINT ["python", "api.py"]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
mecab-python3==0.7
gensim==3.7.0
scikit-learn==0.20.3
bottle==0.12.16
--------------------------------------------------------------------------------
/src/embedrank.py:
--------------------------------------------------------------------------------
import numpy as np
from gensim.models.doc2vec import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity

from nlp_uitl import extract_keyphrase_candidates, tokenize


class EmbedRank():
    def __init__(self, model, tokenize, N=5, l=0.55):
        self.model = model
        self.tokenize = tokenize
        self.N = N  # number of keyphrases to extract
        self.l = l  # MMR trade-off between relevance and diversity
        self.phrases = []
        self.phrase_embeddings = []
        self.document_embedding = []
        self.document_similarity = []

    def extract_keyword(self, text):
        phrase_indices = self._mmr(text)
        output = []
        for idx in phrase_indices:
            output.append((self.phrases[idx], self.document_similarity[idx][0]))
        return output

    def _mmr(self, document):
        self.document_embedding = self.model.infer_vector(self.tokenize(document))

        self.phrases = []
        self.phrase_embeddings = []
        for candidate_tokens in extract_keyphrase_candidates(document):
            candidate_text = "".join(candidate_tokens)
            self.phrases.append(candidate_text)
            self.phrase_embeddings.append(self.model.infer_vector(self.tokenize(candidate_text)))
        self.phrase_embeddings = np.array(self.phrase_embeddings)

        if len(self.phrases) == 0:
            return []
        if len(self.phrases) < self.N:
            # The number of candidate phrases is smaller than N:
            # extract all phrases and rerank them by MMR
            self.N = len(self.phrases)

        self.document_similarity = cosine_similarity(self.phrase_embeddings, self.document_embedding.reshape(1, -1))
        phrase_similarity_matrix = cosine_similarity(self.phrase_embeddings)

        # MMR
        # 1st iteration
        unselected = list(range(len(self.phrases)))
        select_idx = np.argmax(self.document_similarity)

        selected = [select_idx]
        unselected.remove(select_idx)

        # other iterations
        for _ in range(self.N - 1):
            mmr_distance_to_doc = self.document_similarity[unselected, :]
            mmr_distance_between_phrases = np.max(phrase_similarity_matrix[unselected][:, selected], axis=1)

            mmr = self.l * mmr_distance_to_doc - (1 - self.l) * mmr_distance_between_phrases.reshape(-1, 1)
            mmr_idx = unselected[np.argmax(mmr)]

            selected.append(mmr_idx)
            unselected.remove(mmr_idx)

        return selected
--------------------------------------------------------------------------------
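`_mmr` above implements Maximal Marginal Relevance: the first pick is the candidate phrase most similar to the document, and each subsequent pick trades relevance to the document (weight `l`) against redundancy with the phrases already selected (weight `1 - l`). A toy sketch of one such selection step, using made-up 3-dimensional vectors rather than real embeddings:

```py
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

l = 0.55  # same relevance/diversity trade-off as EmbedRank's default

doc = np.array([[1.0, 1.0, 0.0]])        # toy document embedding
phrases = np.array([[1.0, 0.05, 0.0],    # 0: most relevant candidate
                    [1.0, 0.0, 0.0],     # 1: near-duplicate of 0
                    [0.0, 1.0, 0.0]])    # 2: covers a different aspect

doc_sim = cosine_similarity(phrases, doc)  # relevance of each phrase
phrase_sim = cosine_similarity(phrases)    # pairwise redundancy

selected = [int(np.argmax(doc_sim))]       # 1st pick: most relevant (0)
unselected = [i for i in range(len(phrases)) if i not in selected]

redundancy = np.max(phrase_sim[unselected][:, selected], axis=1).reshape(-1, 1)
mmr = l * doc_sim[unselected, :] - (1 - l) * redundancy
selected.append(unselected[int(np.argmax(mmr))])

print(selected)  # [0, 2]: the near-duplicate loses to the diverse phrase
```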
/src/example.py:
--------------------------------------------------------------------------------
from gensim.models.doc2vec import Doc2Vec

from embedrank import EmbedRank
from nlp_uitl import tokenize

model = Doc2Vec.load("model/jawiki.doc2vec.dbow300d.model")
embedrank = EmbedRank(model=model, tokenize=tokenize)

text = """バーレーンの首都マナマ(マナーマとも)で現在開催されている
ユネスコ(国際連合教育科学文化機関)の第42回世界遺産委員会は日本の推薦していた
「長崎と天草地方の潜伏キリシタン関連遺産」 (長崎県、熊本県)を30日、
世界遺産に登録することを決定した。文化庁が同日発表した。
日本国内の文化財の世界遺産登録は昨年に登録された福岡県の
「『神宿る島』宗像・沖ノ島と関連遺産群」に次いで18件目。
2013年の「富士山-信仰の対象と芸術の源泉」の文化遺産登録から6年連続となった。"""
# source: https://ja.wikinews.org/wiki/%E6%BD%9C%E4%BC%8F%E3%82%AD%E3%83%AA%E3%82%B7%E3%82%BF%E3%83%B3%E9%96%A2%E9%80%A3%E9%81%BA%E7%94%A3%E3%80%81%E4%B8%96%E7%95%8C%E9%81%BA%E7%94%A3%E7%99%BB%E9%8C%B2

print(embedrank.extract_keyword(text))
--------------------------------------------------------------------------------
/src/nlp_uitl.py:
--------------------------------------------------------------------------------
import MeCab


def tokenize(text):
    wakati = MeCab.Tagger("-O wakati")
    wakati.parse("")
    return wakati.parse(text).strip().split(" ")


def extract_keyphrase_candidates(text):
    tagger = MeCab.Tagger()
    tagger.parse("")

    node = tagger.parseToNode(text)

    keyphrase_candidates = []
    phrase = []
    phrase_noun = []
    is_adj_candidate = False
    is_multinoun_candidate = False

    while node:
        # adjectives + nouns
        if node.feature.startswith('形容詞'):
            is_adj_candidate = True
            phrase.append(node.surface)
        if node.feature.startswith("名詞") and is_adj_candidate:
            phrase.append(node.surface)
        elif len(phrase) >= 2:
            keyphrase_candidates.append(phrase)

            is_adj_candidate = False
            phrase = []

        # multiple nouns
        if node.feature.startswith("名詞"):
            phrase_noun.append(node.surface)
            is_multinoun_candidate = True
        elif len(phrase_noun) >= 2:
            keyphrase_candidates.append(phrase_noun)

            is_multinoun_candidate = False
            phrase_noun = []
        else:
            is_multinoun_candidate = False
            phrase_noun = []

        node = node.next

    return keyphrase_candidates
--------------------------------------------------------------------------------
/web/api.py:
--------------------------------------------------------------------------------
import sys
import json
sys.path.append("../src/")

import bottle
from bottle import route, run, request, response, static_file
from gensim.models.doc2vec import Doc2Vec

from embedrank import EmbedRank
from nlp_uitl import tokenize


@route("/")
def hello():
    return "It works!"


@route("/embedrank", method="POST")
def result():
    embedrank = EmbedRank(model=model, tokenize=tokenize, N=int(request.forms.num_keywords))
    result = embedrank.extract_keyword(request.forms.text)

    response.content_type = 'application/json'
    return json.dumps({"keywords": [{"keyword": t[0], "score": str(t[1])} for t in result]}, ensure_ascii=False)


if __name__ == "__main__":
    model = Doc2Vec.load("../model/jawiki.doc2vec.dbow300d.model")
    run(host="0.0.0.0", port=8080, debug=True)
--------------------------------------------------------------------------------
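For intuition on `src/nlp_uitl.py` above: `extract_keyphrase_candidates` collects adjective-plus-noun sequences and runs of two or more consecutive nouns as candidate phrases. A minimal sketch of both helpers (assuming MeCab with the ipadic dictionary is installed, as in the dockerfile; the exact splits depend on the dictionary):

```py
from nlp_uitl import extract_keyphrase_candidates, tokenize

sentence = "世界遺産委員会は世界遺産登録を決定した"

print(tokenize(sentence))
# e.g. ['世界', '遺産', '委員', '会', 'は', '世界', '遺産', '登録', 'を', '決定', 'し', 'た']

for tokens in extract_keyphrase_candidates(sentence):
    print("".join(tokens))
# e.g.
# 世界遺産委員会
# 世界遺産登録
```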