├── .dockerignore
├── .gitignore
├── LICENSE.md
├── README.md
├── dockerfile
├── requirements.txt
├── src
│   ├── embedrank.py
│   ├── example.py
│   └── nlp_uitl.py
└── web
    └── api.py

/.dockerignore:
--------------------------------------------------------------------------------
data/
runs/
trunk/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
runs/
data/
model/
trunk/

### https://raw.github.com/github/gitignore/72190ee30bd1e2ccc233222341435adacb7a6500/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json


### https://raw.github.com/github/gitignore/72190ee30bd1e2ccc233222341435adacb7a6500/Global/macOS.gitignore

# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
Copyright 2019 yag_ays

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# EmbedRank

A Python implementation of "[Simple Unsupervised Keyphrase Extraction using Sentence Embeddings](https://arxiv.org/abs/1801.04470)".

## Usage

EmbedRank requires pretrained document embeddings (currently only doc2vec is supported). See [my blog](https://yag-ays.github.io/project/pretrained_doc2vec_wikipedia/) for how to use pretrained Japanese doc2vec models.

```py
from gensim.models.doc2vec import Doc2Vec

from embedrank import EmbedRank
from nlp_uitl import tokenize

model = Doc2Vec.load("model/jawiki.doc2vec.dbow300d.model")
embedrank = EmbedRank(model=model, tokenize=tokenize)

text = """バーレーンの首都マナマ(マナーマとも)で現在開催されている
ユネスコ(国際連合教育科学文化機関)の第42回世界遺産委員会は日本の推薦していた
「長崎と天草地方の潜伏キリシタン関連遺産」 (長崎県、熊本県)を30日、
世界遺産に登録することを決定した。文化庁が同日発表した。
日本国内の文化財の世界遺産登録は昨年に登録された福岡県の
「『神宿る島』宗像・沖ノ島と関連遺産群」に次いで18件目。
2013年の「富士山-信仰の対象と芸術の源泉」の文化遺産登録から6年連続となった。"""
```

```py
In []: embedrank.extract_keyword(text)
[('世界遺産登録', 0.61837685), ('(長崎県', 0.517046), ('ユネスコ(国際連合教育科学文化機関)', 0.5726031), ('潜伏キリシタン関連遺産', 0.544827), ('首都マナマ(マナーマ', 0.4898381)]
```

(Source: [潜伏キリシタン関連遺産、世界遺産登録 \- ウィキニュース](https://ja.wikinews.org/wiki/%E6%BD%9C%E4%BC%8F%E3%82%AD%E3%83%AA%E3%82%B7%E3%82%BF%E3%83%B3%E9%96%A2%E9%80%A3%E9%81%BA%E7%94%A3%E3%80%81%E4%B8%96%E7%95%8C%E9%81%BA%E7%94%A3%E7%99%BB%E9%8C%B2))

## Docker

Place the extracted doc2vec model in the `model/` directory and run the following commands:

```sh
$ docker build -t embedrank .
$ docker run --rm -p 8080:8080 --memory 7g -it embedrank
```

```sh
$ curl -XPOST "localhost:8080/embedrank" --data-urlencode text='バーレーンの首都マナマ(マナーマとも)で現在開催されている
ユネスコ(国際連合教育科学文化機関)の第42回世界遺産委員会は日本の推薦していた
「長崎と天草地方の潜伏キリシタン関連遺産」 (長崎県、熊本県)を30日、
世界遺産に登録することを決定した。文化庁が同日発表した。
日本国内の文化財の世界遺産登録は昨年に登録された福岡県の
「『神宿る島』宗像・沖ノ島と関連遺産群」に次いで18件目。
2013年の「富士山-信仰の対象と芸術の源泉」の文化遺産登録から6年連続となった。' -d 'num_keywords=3'

{
  "keywords": [
    {
      "keyword": "世界遺産登録",
      "score": "0.58336747"
    },
    {
      "keyword": "天草地方",
      "score": "0.52296615"
    },
    {
      "keyword": "首都マナマ(マナーマ",
      "score": "0.5126816"
    }
  ]
}
```
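The same request can be made from Python. The sketch below assumes the container above is running on `localhost:8080` and that the third-party `requests` package (not part of `requirements.txt`) is installed:

```py
import requests

# article text from the example above
text = "バーレーンの首都マナマ(マナーマとも)で現在開催されている..."

res = requests.post("http://localhost:8080/embedrank",
                    data={"text": text, "num_keywords": 3})
print(res.json()["keywords"])
```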
Caution:

- You need to allocate more than 7 GB of memory to the container.
- The container image is very large (7.38 GB).
--------------------------------------------------------------------------------
/dockerfile:
--------------------------------------------------------------------------------
FROM python:3.6

RUN apt-get update && apt-get install -y \
    libmecab-dev \
    mecab-ipadic-utf8 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

COPY . /app
WORKDIR /app/

RUN pip install -U pip
RUN pip install -r requirements.txt

WORKDIR /app/web/

ENTRYPOINT ["python", "api.py"]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
mecab-python3==0.7
gensim==3.7.0
scikit-learn==0.20.3
bottle==0.12.16
--------------------------------------------------------------------------------
/src/embedrank.py:
--------------------------------------------------------------------------------
import numpy as np
from gensim.models.doc2vec import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity

from nlp_uitl import extract_keyphrase_candidates, tokenize


class EmbedRank():
    def __init__(self, model, tokenize, N=5, l=0.55):
        self.model = model
        self.tokenize = tokenize
        self.N = N  # number of keyphrases to extract
        self.l = l  # MMR trade-off between relevance and diversity
        self.phrases = []
        self.phrase_embeddings = []
        self.document_embedding = []
        self.document_similarity = []

    def extract_keyword(self, text):
        phrase_indices = self._mmr(text)
        output = []
        for idx in phrase_indices:
            output.append((self.phrases[idx], self.document_similarity[idx][0]))
        return output

    def _mmr(self, document):
        self.document_embedding = self.model.infer_vector(self.tokenize(document))

        self.phrases = []
        self.phrase_embeddings = []
        for candidate_tokens in extract_keyphrase_candidates(document):
            candidate_text = "".join(candidate_tokens)
            self.phrases.append(candidate_text)
            self.phrase_embeddings.append(self.model.infer_vector(self.tokenize(candidate_text)))
        self.phrase_embeddings = np.array(self.phrase_embeddings)

        if len(self.phrases) == 0:
            return []
        if len(self.phrases) < self.N:
            # The number of candidate phrases is smaller than N:
            # extract all phrases and rerank them by MMR
            self.N = len(self.phrases)

        self.document_similarity = cosine_similarity(self.phrase_embeddings, self.document_embedding.reshape(1, -1))
        phrase_similarity_matrix = cosine_similarity(self.phrase_embeddings)

        # MMR
        # 1st iteration
        unselected = list(range(len(self.phrases)))
        select_idx = np.argmax(self.document_similarity)

        selected = [select_idx]
        unselected.remove(select_idx)

        # other iterations
        for _ in range(self.N - 1):
            mmr_distance_to_doc = self.document_similarity[unselected, :]
            mmr_distance_between_phrases = np.max(phrase_similarity_matrix[unselected][:, selected], axis=1)

            mmr = self.l * mmr_distance_to_doc - (1 - self.l) * mmr_distance_between_phrases.reshape(-1, 1)
            mmr_idx = unselected[np.argmax(mmr)]

            selected.append(mmr_idx)
            unselected.remove(mmr_idx)

        return selected
--------------------------------------------------------------------------------
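`_mmr` above implements Maximal Marginal Relevance: the first pick is the candidate phrase most similar to the document, and each subsequent pick trades relevance to the document (weight `l`) against redundancy with the phrases already selected (weight `1 - l`). A toy sketch of one such selection step, using made-up 3-dimensional vectors rather than real embeddings:

```py
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

l = 0.55  # same relevance/diversity trade-off as EmbedRank's default

doc = np.array([[1.0, 1.0, 0.0]])        # toy document embedding
phrases = np.array([[1.0, 0.05, 0.0],    # 0: most relevant candidate
                    [1.0, 0.0, 0.0],     # 1: near-duplicate of 0
                    [0.0, 1.0, 0.0]])    # 2: covers a different aspect

doc_sim = cosine_similarity(phrases, doc)  # relevance of each phrase
phrase_sim = cosine_similarity(phrases)    # pairwise redundancy

selected = [int(np.argmax(doc_sim))]       # 1st pick: most relevant (0)
unselected = [i for i in range(len(phrases)) if i not in selected]

redundancy = np.max(phrase_sim[unselected][:, selected], axis=1).reshape(-1, 1)
mmr = l * doc_sim[unselected, :] - (1 - l) * redundancy
selected.append(unselected[int(np.argmax(mmr))])

print(selected)  # [0, 2]: the near-duplicate loses to the diverse phrase
```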
/src/example.py:
--------------------------------------------------------------------------------
from gensim.models.doc2vec import Doc2Vec

from embedrank import EmbedRank
from nlp_uitl import tokenize

model = Doc2Vec.load("model/jawiki.doc2vec.dbow300d.model")
embedrank = EmbedRank(model=model, tokenize=tokenize)

text = """バーレーンの首都マナマ(マナーマとも)で現在開催されている
ユネスコ(国際連合教育科学文化機関)の第42回世界遺産委員会は日本の推薦していた
「長崎と天草地方の潜伏キリシタン関連遺産」 (長崎県、熊本県)を30日、
世界遺産に登録することを決定した。文化庁が同日発表した。
日本国内の文化財の世界遺産登録は昨年に登録された福岡県の
「『神宿る島』宗像・沖ノ島と関連遺産群」に次いで18件目。
2013年の「富士山-信仰の対象と芸術の源泉」の文化遺産登録から6年連続となった。"""
# source: https://ja.wikinews.org/wiki/%E6%BD%9C%E4%BC%8F%E3%82%AD%E3%83%AA%E3%82%B7%E3%82%BF%E3%83%B3%E9%96%A2%E9%80%A3%E9%81%BA%E7%94%A3%E3%80%81%E4%B8%96%E7%95%8C%E9%81%BA%E7%94%A3%E7%99%BB%E9%8C%B2

print(embedrank.extract_keyword(text))
--------------------------------------------------------------------------------
/src/nlp_uitl.py:
--------------------------------------------------------------------------------
import MeCab


def tokenize(text):
    wakati = MeCab.Tagger("-O wakati")
    wakati.parse("")
    return wakati.parse(text).strip().split(" ")


def extract_keyphrase_candidates(text):
    tagger = MeCab.Tagger()
    tagger.parse("")

    node = tagger.parseToNode(text)

    keyphrase_candidates = []
    phrase = []
    phrase_noun = []
    is_adj_candidate = False
    is_multinoun_candidate = False

    while node:
        # adjectives + nouns
        if node.feature.startswith('形容詞'):
            is_adj_candidate = True
            phrase.append(node.surface)
        if node.feature.startswith("名詞") and is_adj_candidate:
            phrase.append(node.surface)
        elif len(phrase) >= 2:
            keyphrase_candidates.append(phrase)

            is_adj_candidate = False
            phrase = []

        # multiple nouns
        if node.feature.startswith("名詞"):
            phrase_noun.append(node.surface)
            is_multinoun_candidate = True
        elif len(phrase_noun) >= 2:
            keyphrase_candidates.append(phrase_noun)

            is_multinoun_candidate = False
            phrase_noun = []
        else:
            is_multinoun_candidate = False
            phrase_noun = []

        node = node.next

    return keyphrase_candidates
--------------------------------------------------------------------------------
/web/api.py:
--------------------------------------------------------------------------------
import sys
import json
sys.path.append("../src/")

import bottle
from bottle import route, run, request, response, static_file
from gensim.models.doc2vec import Doc2Vec

from embedrank import EmbedRank
from nlp_uitl import tokenize


@route("/")
def hello():
    return "It works!"


@route("/embedrank", method="POST")
def result():
    embedrank = EmbedRank(model=model, tokenize=tokenize, N=int(request.forms.num_keywords))
    result = embedrank.extract_keyword(request.forms.text)

    response.content_type = 'application/json'
    return json.dumps({"keywords": [{"keyword": t[0], "score": str(t[1])} for t in result]}, ensure_ascii=False)


if __name__ == "__main__":
    model = Doc2Vec.load("../model/jawiki.doc2vec.dbow300d.model")
    run(host="0.0.0.0", port=8080, debug=True)
--------------------------------------------------------------------------------
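For intuition on `src/nlp_uitl.py` above: `extract_keyphrase_candidates` collects adjective-plus-noun sequences and runs of two or more consecutive nouns as candidate phrases. A minimal sketch of both helpers (assuming MeCab with the ipadic dictionary is installed, as in the dockerfile; the exact splits depend on the dictionary):

```py
from nlp_uitl import extract_keyphrase_candidates, tokenize

sentence = "世界遺産委員会は世界遺産登録を決定した"

print(tokenize(sentence))
# e.g. ['世界', '遺産', '委員', '会', 'は', '世界', '遺産', '登録', 'を', '決定', 'し', 'た']

for tokens in extract_keyphrase_candidates(sentence):
    print("".join(tokens))
# e.g.
# 世界遺産委員会
# 世界遺産登録
```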