├── setup.py ├── .gitignore ├── README.md ├── key2vec ├── __init__.py ├── glove.py ├── cleaner.py ├── constants.json ├── phrase_graph.py ├── key2vec.py ├── docs.py └── constants.py ├── tests ├── test_glove.py └── test_docs.py ├── requirements.txt ├── test.py └── test.txt /setup.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | __pycache__ 3 | data 4 | .ipynb_checkpoints 5 | *.ipynb -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Key2Vec 2 | 3 | Python implementation of Mahata, Kuriakose, et al.'s [Key2Vec](http://aclweb.org/anthology/N18-2100) -------------------------------------------------------------------------------- /key2vec/__init__.py: -------------------------------------------------------------------------------- 1 | from . import cleaner 2 | from . import constants 3 | from . import docs 4 | from . import glove 5 | from . import key2vec 6 | from . 
import phrase_graph -------------------------------------------------------------------------------- /tests/test_glove.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from key2vec.glove import Glove 3 | 4 | def test_glove(): 5 | path = '../data/glove.6B/glove.6B.50d.txt' 6 | glove = Glove(path) 7 | assert glove.dim == 50 8 | assert glove.embeddings.get('the', None) is not None -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | blis==0.4.1 2 | certifi==2019.9.11 3 | chardet==3.0.4 4 | cymem==2.0.2 5 | en-core-web-sm==2.2.0 6 | idna==2.8 7 | murmurhash==1.0.2 8 | nltk==3.4.5 9 | numpy==1.17.3 10 | plac==0.9.6 11 | preshed==3.0.2 12 | python-dotenv==0.10.3 13 | requests==2.22.0 14 | scipy==1.3.1 15 | six==1.12.0 16 | spacy==2.2.1 17 | srsly==0.1.0 18 | thinc==7.1.1 19 | tqdm==4.36.1 20 | urllib3==1.25.6 21 | wasabi==0.2.2 22 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import key2vec 2 | 3 | path = './data/glove.6B/glove.6B.50d.txt' 4 | glove = key2vec.glove.Glove(path) 5 | with open('./test.txt', 'r') as f: 6 | test = f.read() 7 | m = key2vec.key2vec.Key2Vec(test, glove) 8 | m.extract_candidates() 9 | m.set_theme_weights() 10 | m.build_candidate_graph() 11 | ranked = m.page_rank_candidates() 12 | 13 | for row in ranked: 14 | print('{}. 
class Glove(object):
    """GloVe vectors.

    Parameters
    ----------
    path : str, required
        Path to the GloVe embeddings. The file is read as UTF-8 text with
        one entry per line: the word followed by its vector components,
        separated by single spaces.

    Attributes
    ----------
    embeddings : Dict[str, np.ndarray]
        Dictionary of GloVe embeddings
    dim : int
        Dimension of GloVe embeddings

    Raises
    ------
    ValueError
        If the file contains no embeddings, or a component is not a number.
    """

    def __init__(self, path: str) -> None:
        self.embeddings = self.__read_glove(path)
        self.dim = self.__get_dim()

    def __read_glove(self, path: str) -> Dict[str, np.ndarray]:
        """Reads GloVe vectors into a dictionary, where
        the words are the keys, and the vectors are the values.

        Returns
        -------
        word_vectors : Dict[str, np.ndarray]
        """
        word_vectors = {}
        # Explicit encoding: the vocabulary contains non-ASCII tokens, so the
        # platform default must not decide how the file is decoded.
        with open(path, 'r', encoding='utf-8') as f:
            # Stream the file instead of materialising every line first;
            # embedding files are routinely hundreds of megabytes.
            for row in f:
                stripped_row = row.rstrip('\n')
                if not stripped_row:
                    # A blank (e.g. trailing) line would otherwise register
                    # the empty string with a zero-length vector.
                    continue
                word, *values = stripped_row.split(' ')
                word_vectors[word] = np.array([float(el) for el in values])
        return word_vectors

    def __get_dim(self) -> int:
        """Returns the length of the first embedding that was read."""
        if not self.embeddings:
            raise ValueError('no embeddings were found in the GloVe file')
        return len(next(iter(self.embeddings.values())))
is_determiner 23 | or has_punct) 24 | else: 25 | pass 26 | if not remove: 27 | transformed_text.append(token.text) 28 | 29 | if transformed_text == []: 30 | return '' 31 | elif '-' in transformed_text: 32 | dash_index = transformed_text.index('-') 33 | first_half = ' '.join(transformed_text[:dash_index]) 34 | sec_half = ' '.join(transformed_text[dash_index + 1:]) 35 | return ' '.join([first_half, sec_half]).lower() 36 | else: 37 | return ' '.join(transformed_text).lower() -------------------------------------------------------------------------------- /key2vec/constants.json: -------------------------------------------------------------------------------- 1 | { 2 | "punctuation": [ 3 | "\\", 4 | "]", 5 | ";", 6 | "%", 7 | "(", 8 | "_", 9 | "@", 10 | ",", 11 | "-", 12 | "–", 13 | "=", 14 | "!", 15 | ":", 16 | "[", 17 | "\"", 18 | ")", 19 | "?", 20 | "}", 21 | "&", 22 | "'", 23 | "|", 24 | "/", 25 | "#", 26 | "<", 27 | "$", 28 | "^", 29 | ".", 30 | "`", 31 | "*", 32 | "+", 33 | "~", 34 | "{", 35 | ">", 36 | "\n", 37 | "\t", 38 | ], 39 | "pos_blacklist": [ 40 | "INTJ", 41 | "AUX", 42 | "CCONJ", 43 | "ADP", 44 | "DET", 45 | "NUM", 46 | "PART", 47 | "PRON", 48 | "SCONJ", 49 | "PUNCT", 50 | "SYM", 51 | "X", 52 | ], 53 | "ents_to_ignore": [ 54 | "DATE", 55 | "TIME", 56 | "PERCENT", 57 | "MONEY", 58 | "QUANTITY", 59 | "ORDINAL", 60 | "CARDINAL", 61 | ], 62 | "determiners": [ 63 | "the", 64 | "a", 65 | "an", 66 | "this", 67 | "that", 68 | "these", 69 | "those", 70 | "my", 71 | "your", 72 | "his", 73 | "her", 74 | "its", 75 | "our", 76 | "their", 77 | "a few", 78 | "a little", 79 | "much", 80 | "many", 81 | "a lot of", 82 | "most", 83 | "some", 84 | "any", 85 | "enough", 86 | "one", 87 | "ten", 88 | "thirty", 89 | "all", 90 | "both", 91 | "either", 92 | "neither", 93 | "each", 94 | "every", 95 | "other", 96 | "another", 97 | "such", 98 | "what", 99 | "rather", 100 | "quite", 101 | ] 102 | } -------------------------------------------------------------------------------- 
class PhraseNode(object):
    """Node in Phrase Graph.

    Parameters
    ----------
    phrase : Phrase, required
        Candidate phrase wrapped by this node.

    Attributes
    ----------
    key : str
        Text of the phrase; used to index neighbours.
    phrase : Phrase
    incoming_edges : int
        Number of distinct nodes that have linked to this node.
    adj_nodes : Dict[str, PhraseNode]
        Neighbours this node links to, keyed by their `key`.
    adj_weights : Dict[str, float]
        Edge weight for every entry of `adj_nodes`.
    """

    def __init__(self, phrase: "Phrase"):
        self.key = phrase.text
        self.phrase = phrase
        self.incoming_edges = 0
        self.adj_nodes = {}
        self.adj_weights = {}

    def __repr__(self):
        return str(self.key)

    def __lt__(self, other):
        return self.key < other.key

    def add_neighbor(self, neighbor, candidates, weight=0):
        """Links this node to `neighbor` if the two phrases co-occur.

        The edge weight is the cosine similarity of the two phrase
        embeddings multiplied by their point-wise mutual information.

        Parameters
        ----------
        neighbor : PhraseNode, required
            Node to link to.
        candidates : int, required
            Total number of candidate phrases; forwarded to
            `Phrase.calc_pmi`.
        weight : optional
            Only validated (must not be None); the stored weight is always
            computed from the phrases.

        Raises
        ------
        TypeError
            If `neighbor` or `weight` is None.
        """
        if neighbor is None or weight is None:
            raise TypeError('neighbor or weight cannot be None')
        if not self.__in_window(neighbor):
            return

        # Compute the weight before touching any state so that a failure
        # leaves both nodes unchanged.
        # need to rewrite api to allow candidates to be calculated
        pmi = self.phrase.calc_pmi(neighbor.phrase, candidates)
        if pmi == float('-inf'):
            # No co-occurrence was counted: there is nothing to weight.
            return
        cosine_score = cosine_similarity(self.phrase.embedding,
                                         neighbor.phrase.embedding)

        if neighbor.key not in self.adj_nodes:
            # Count each source node once, even if the edge is re-added.
            neighbor.incoming_edges += 1
        self.adj_weights[neighbor.key] = cosine_score * pmi
        self.adj_nodes[neighbor.key] = neighbor

    def __in_window(self, neighbor):
        """Returns True if any occurrence of `neighbor` starts or ends
        inside this phrase's window."""
        window = self.phrase.window
        # Membership test rather than truthiness: word index 0 is a valid
        # position even though the value stored for it is falsy.
        return any(start in window or end in window
                   for start, end in neighbor.phrase.positions)


class PhraseGraph(object):
    """Graph of candidate phrases.

    Edges are stored per direction on the source node; a caller that
    wants a bi-directional graph adds both directions.

    Parameters
    ----------
    candidates : List[Phrase], required
        All candidate phrases. Its length is the `candidates` count used
        when an edge is added through `add_edge`.

    Attributes
    ----------
    nodes : Dict[Phrase, PhraseNode]
        Nodes of the graph, keyed by the phrase they wrap.
    candidates : List[Phrase]
    """

    def __init__(self, candidates: "List[Phrase]"):
        self.nodes = {}
        self.candidates = candidates

    def add_node(self, key):
        """Returns the node for phrase `key`, creating it when missing.

        Raises
        ------
        TypeError
            If `key` is None.
        """
        if key is None:
            raise TypeError('key cannot be None')
        if key not in self.nodes:
            self.nodes[key] = PhraseNode(key)
        return self.nodes[key]

    def add_edge(self, source_key, dest_key, weight=0):
        """Links `source_key` to `dest_key`, creating missing nodes.

        Raises
        ------
        KeyError
            If either key is None.
        """
        if source_key is None or dest_key is None:
            raise KeyError('Invalid key')
        # add_node returns the existing node when the key is already known.
        source = self.add_node(source_key)
        dest = self.add_node(dest_key)
        source.add_neighbor(dest, len(self.candidates), weight)
def __extract_tokens(self, doc, candidates):
    """Adds every usable single token of `doc` to `candidates`.

    A token is kept when its lower-cased text is non-empty, shares no
    character with PUNCT_SET, is not in STOPWORDS and is not already a
    key of `candidates`.

    Parameters
    ----------
    doc : iterable of tokens, required
        Parsed sentence; every token must expose a `text` attribute.
    candidates : Dict[str, Phrase], required
        Candidates collected so far, keyed by lower-cased text.
        Updated in place.

    Returns
    -------
    candidates : Dict[str, Phrase]
        The dictionary that was passed in.
    """
    for token in doc:
        text = token.text.lower()
        if not text or text in candidates or text in STOPWORDS:
            continue
        if not set(text).isdisjoint(PUNCT_SET):
            continue
        try:
            candidates[text] = Phrase(text, self.doc, self.glove)
        except KeyError:
            # Phrase looks its words up in the document's word positions
            # and raises KeyError for a word that is not there; such a
            # token is simply not a candidate.
            continue
    return candidates
def page_rank_candidates(self, top_n: int = 10,
                         damping: float = 0.85) -> "List[Phrase]":
    """Scores and ranks the candidate phrases.

    Every node gets a single PageRank-style update:
    `theme_weight * (1 - damping) + damping * sum(w / incoming)`, where
    the sum runs over the node's neighbours, `w` is the edge weight and
    `incoming` is the neighbour's `incoming_edges` count. `score` and
    `rank` are then set on every candidate.

    Requires `set_theme_weights` and `build_candidate_graph` to have run
    first; a candidate whose `theme_weight` is still None cannot be scored.

    Parameters
    ----------
    top_n : int, optional (default = 10)
        How many top keyphrases to return.
    damping : float, optional (default = 0.85)
        Weight of the neighbour contribution against the theme weight.

    Returns
    -------
    sorted_candidates : List[Phrase]
        The `top_n` candidates with the highest score, best first. Empty
        when no candidate graph has been built.
    """
    if self.candidate_graph is None:
        # Keep the return type iterable for callers that loop over it.
        return []

    for node in self.candidate_graph.nodes.values():
        neighbor_sum = sum(
            node.adj_weights[key] / neighbor.incoming_edges
            for key, neighbor in node.adj_nodes.items())
        node.phrase.score = (node.phrase.theme_weight * (1 - damping)
                             + damping * neighbor_sum)

    # Ascending sort followed by a reversal, so candidates with equal
    # scores keep the same relative order as before.
    sorted_candidates = sorted(self.candidates, key=lambda c: c.score)
    sorted_candidates.reverse()

    for rank, candidate in enumerate(sorted_candidates, start=1):
        candidate.rank = rank

    return sorted_candidates[:top_n]
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Returns the cosine similarity of two vectors.

    Parameters
    ----------
    a, b : np.ndarray
        Vectors of the same length.

    Returns
    -------
    float
        Cosine of the angle between `a` and `b`, or -1.0 when either
        vector has zero norm and the similarity is undefined.
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return -1.0
    # Cast so callers always receive a built-in float, not np.float64.
    return float(np.dot(a, b) / (norm_a * norm_b))


def _filter_words(text: str) -> List[str]:
    """Tokenizes `text` and returns the lower-cased tokens that contain
    no character from PUNCT_SET."""
    tokens = wordpunct_tokenize(text)
    return [word.lower() for word in tokens
            if set(word).isdisjoint(PUNCT_SET)]


class Document(object):
    """Document to be embedded. May be a word, a sentence, etc.

    Parameters
    ----------
    text : str, required
        The text to be embedded
    glove : Glove, required
        GloVe embeddings

    Attributes
    ----------
    text : str
    dim : int
        Dimension of GloVe embeddings.
    embedding : np.ndarray
        Document embedding built from average of GloVe embeddings.
    """

    def __init__(self,
                 text: str,
                 glove: "Glove") -> None:
        self.text = text
        self.dim = glove.dim
        self.embedding = self.__embed_document(glove.embeddings)

    def __embed_document(self,
                         embeddings: Dict[str, np.ndarray]) -> np.ndarray:
        """Averages the embeddings of all tokens of the lower-cased text.

        Tokens without an embedding contribute a zero vector but still
        count towards the average. Text without any token embeds to the
        zero vector.
        """
        words = wordpunct_tokenize(self.text.lower())
        vector = np.zeros(self.dim)
        if not words:
            # Nothing to average; dividing by zero would yield NaNs.
            return vector
        for word in words:
            embedding = embeddings.get(word)
            if embedding is not None:
                vector += embedding
        return vector / len(words)

    def get_word_positions(self) -> Dict[str, List[int]]:
        """Maps every filtered word of the text to the indices at which
        it occurs in the filtered word list."""
        word_positions = {}
        for i, word in enumerate(_filter_words(self.text)):
            word_positions.setdefault(word, []).append(i)
        return word_positions


class Phrase(Document):
    """Phrase to be embedded. Inherits from Document object.

    Parameters
    ----------
    text : str, required
        The text to be embedded
    parent : Document, required
        Document where the Phrase is from
    glove : Glove, required
        GloVe embeddings

    Attributes
    ----------
    text : str
    dim : int
    embedding : np.ndarray
    parent : Document
    positions : List[Tuple[int]]
        List of indices where a given phrase is located.
        Each index is represented as a Tuple where the first
        element is the index of the first word of the phrase
        and the second element is the index of its last word.
        If a phrase is a unigram, a position Tuple
        is (position, position).
    window : Dict[int, int]
        Word indices that lie in the neighbourhood of any occurrence
        of the phrase, each mapped to itself.
    similarity : float
        Cosine similarity between the parent document and the phrase.
    theme_weight : float, None
        Min/Max scaling of the cosine similarity in relation to the
        other candidate keyphrases. None until `set_theme_weight` runs.
    score : float, None
        Ranking score; assigned from outside the class.
    rank : int, None
        Phrase ranking with respect to the score in descending order;
        assigned from outside the class.

    Raises
    ------
    KeyError
        If a word of the phrase does not occur in the parent document.
    """

    def __init__(self,
                 text: str,
                 parent: Document,
                 glove: "Glove") -> None:
        super().__init__(text, glove)
        self.parent = parent
        # Tokenize the parent once and share the result with both helpers.
        parent_word_positions = parent.get_word_positions()
        self.positions = self.__get_positions(parent_word_positions)
        self.window = self.__expand_window(parent_word_positions)
        self.similarity = cosine_similarity(parent.embedding,
                                            self.embedding)
        self.theme_weight = None
        self.score = None
        self.rank = None

    def __str__(self) -> str:
        return self.text

    def set_theme_weight(self,
                         min_: float,
                         max_: float) -> None:
        """Min/max-scales `similarity` into `theme_weight`.

        Parameters
        ----------
        min_, max_ : float
            Smallest and largest similarity among all candidates.
        """
        # NOTE(original author): "THIS SHOULD BE SET_THEME_EMBEDDING"
        diff = max_ - min_
        if diff == 0:
            # All candidates are equally similar (e.g. a single
            # candidate); the numerator is zero as well, so use 0.
            self.theme_weight = 0.0
            return
        self.theme_weight = (self.similarity - min_) / diff

    def calc_pmi(self, phrase, candidates: int):
        """Calculates point-wise mutual information between
        one candidate phrase and another.

        Parameters
        ----------
        phrase : Phrase
            The other candidate phrase.
        candidates : int
            Total number of candidates, used as the denominator of all
            probabilities.

        Returns
        -------
        float
            Log of the co-occurrence probability over the product of
            the two occurrence probabilities; negative infinity when
            `phrase` never starts or ends inside this phrase's window.
        """
        # Membership test rather than truthiness: word index 0 is a
        # valid position although the value stored for it is falsy.
        cooccur = sum(1 for start, end in phrase.positions
                      if start in self.window or end in self.window)
        if cooccur == 0:
            return -np.inf
        prob_phrase_one = len(self.positions) / candidates
        prob_phrase_two = len(phrase.positions) / candidates
        prob_cooccur = cooccur / candidates
        return np.log(prob_cooccur / (prob_phrase_one * prob_phrase_two))

    def __get_positions(self,
                        parent_word_positions=None) -> List[Tuple[int]]:
        """Gets positions a phrase is in.

        `parent_word_positions` is the result of
        `parent.get_word_positions()`; it is fetched when omitted.
        Raises KeyError if a word of the phrase is not in the parent.
        """
        if parent_word_positions is None:
            parent_word_positions = self.parent.get_word_positions()
        phrase_split = self.text.lower().split(' ')
        # Indexing (not .get) on purpose: callers rely on the KeyError
        # to discard phrases whose words are not in the document.
        occurrences = [set(parent_word_positions[word])
                       for word in phrase_split]
        last_offset = len(phrase_split) - 1
        positions = []
        for start in parent_word_positions[phrase_split[0]]:
            # Every following word must sit at its expected offset, not
            # only the last one.
            if all(start + offset in occurrences[offset]
                   for offset in range(1, len(phrase_split))):
                positions.append((start, start + last_offset))
        return positions

    def __expand_window(self,
                        parent_word_positions=None) -> Dict[int, int]:
        """Returns dictionary of positions in a phrase's
        adj. window.

        The window spans 5 words before and 5 words after every
        occurrence, clipped to the parent document.
        """
        if parent_word_positions is None:
            parent_word_positions = self.parent.get_word_positions()
        # Positions index the filtered word list, so the upper bound is
        # the number of filtered words, not a space-split word count.
        doc_len = sum(len(indices)
                      for indices in parent_word_positions.values())
        window = {}
        for start, end in self.positions:
            for i in range(max(start - 5, 0), min(end + 6, doc_len)):
                window[i] = i
        return window
"am", "among", "amongst", "an", "and", 43 | "another", "any", "anybody", "anyhow", "anyone", "anything", 44 | "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", 45 | "approach", "appropriate", "are", "area", "areas", "aren't", 46 | "around", "as", "aside", "ask", "asked", "asking", "asks", 47 | "associated", "at", "available", "away", "awfully", "b", "back", 48 | "backed", "backing", "backs", "bad", "based", "be", "became", 49 | "because", "become", "becomes", "becoming", "been", "before", 50 | "beforehand", "began", "behind", "being", "beings", "believe", 51 | "below", "beside", "besides", "best", "better", "between", 52 | "beyond", "big", "bit", "both", "brief", "bring", "but", "by", 53 | "c", "c'mon", "c's", "came", "can", "can't", "cannot", "cant", 54 | "case", "cases", "cause", "causes", "certain", "certainly", 55 | "changes", "clear", "clearly", "co", "com", "come", "comes", 56 | "concerning", "consequently", "consider", "considering", 57 | "contain", "containing", "contains", "continue", "corresponding", 58 | "could", "couldn't", "course", "currently", "d", "definitely", 59 | "described", "despite", "did", "didn't", "differ", "different", 60 | "differently", "do", "does", "doesn't", "doing", "don't", "done", 61 | "down", "downed", "downing", "downs", "downwards", "dr", "during", 62 | "e", "each", "earlier", "early", "edu", "eg", "eight", "either", 63 | "else", "elsewhere", "end", "ended", "ending", "ends", "enough", 64 | "entirely", "especially", "et", "etc", "even", "evenly", "ever", 65 | "every", "everybody", "everyone", "everything", "everywhere", "ex", 66 | "exactly", "example", "except", "f", "face", "faces", "fact", 67 | "facts", "far", "felt", "few", "fifth", "find", "finds", "first", 68 | "five", "flawed", "focusing", "followed", "following", "follows", 69 | "for", "former", "formerly", "forth", "four", "from", "full", 70 | "fully", "fun", "further", "furthered", "furthering", 71 | "furthermore", "furthers", "g", "gave", "general", 
"generally", 72 | "get", "gets", "getting", "gigot", "give", "given", "gives", "go", 73 | "goes", "going", "gone", "good", "goods", "got", "gotten", "great", 74 | "greater", "greatest", "greetings", "group", "grouped", "grouping", 75 | "groups", "h", "had", "hadn't", "half", "happens", "hardly", "has", 76 | "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", 77 | "he's", "held", "hello", "help", "hence", "her", "here", "here's", 78 | "hereafter", "hereby", "herein", "hereupon", "hers", "herself", 79 | "hi", "high", "higher", "highest", "him", "himself", "his", 80 | "hither", "hopefully", "how", "how's", "howbeit", "however", "i", 81 | "i'd", "i'll", "i'm", "i've", "ie", "if", "ignored", "ii", 82 | "immediate", "immediately", "important", "in", "inasmuch", "inc", 83 | "include", "including", "indeed", "indicate", "indicated", 84 | "indicates", "inevitable", "inner", "insofar", "instead", 85 | "interest", "interested", "interesting", "interests", "into", 86 | "involving", "inward", "is", "isn't", "issue", "it", "it'd", 87 | "it'll", "it's", "its", "itself", "ix", "j", "just", "k", "keep", 88 | "keeps", "kept", "kind", "knew", "know", "known", "knows", "l", 89 | "large", "largely", "last", "lately", "later", "latest", 90 | "latter", "latterly", "lead", "least", "led", "less", "lest", 91 | "let", "let's", "lets", "letting", "like", "liked", "likely", 92 | "likes", "line", "listen", "little", "long", "longer", "longest", 93 | "look", "looking", "looks", "lot", "ltd", "m", "m.d", "made", 94 | "mainly", "make", "makes", "making", "man", "many", "may", "maybe", 95 | "me", "mean", "meant", "meanwhile", "member", "members", "men", 96 | "merely", "messrs", "met", "might", "more", "moreover", "most", 97 | "mostly", "move", "mr", "mrs", "ms", "much", "must", "mustn't", 98 | "my", "myself", "n", "name", "namely", "nd", "near", "nearly", 99 | "necessary", "need", "needed", "needing", "needs", "neither", 100 | "never", "nevertheless", "new", "newer", "newest", "next", 
101 | "nine", "no", "nobody", "non", "none", "nonetheless", "noone", 102 | "nor", "normally", "not", "nothing", "novel", "now", "nowhere", 103 | "number", "numbers", "o", "obviously", "of", "off", "often", 104 | "oh", "ok", "okay", "old", "older", "oldest", "on", "once", 105 | "one", "ones", "only", "onto", "open", "opened", "opening", 106 | "opens", "or", "order", "ordered", "ordering", "orders", 107 | "other", "others", "otherwise", "ought", "our", "ours", 108 | "ourselves", "out", "outside", "over", "overall", 109 | "overwhelming", "own", "p", "part", "parted", "particular", 110 | "particularly", "parting", "parts", "people", "per", "perhaps", 111 | "place", "placed", "places", "please", "plus", "point", "pointed", 112 | "pointing", "points", "possible", "prefer", "present", "presented", 113 | "presenting", "presents", "presumably", "probably", "problem", 114 | "problems", "prof", "provides", "put", "puts", "putting", "q", 115 | "que", "quite", "qv", "r", "rather", "rd", "re", "really", 116 | "reasonably", "recently", "regarding", "regardless", "regards", 117 | "relatively", "respectively", "right", "room", "rooms", "s", 118 | "said", "same", "saw", "say", "saying", "says", "sec", "second", 119 | "secondly", "seconds", "see", "seeing", "seem", "seemed", 120 | "seeming", "seemingly", "seems", "seen", "sees", "self", "selves", 121 | "sensible", "sent", "serious", "seriously", "set", "seven", 122 | "several", "shall", "shan't", "she", "she'd", "she'll", "she's", 123 | "shortly", "should", "shouldn't", "show", "showed", "showing", 124 | "shows", "side", "sides", "simply", "since", "six", "small", 125 | "smaller", "smallest", "so", "some", "somebody", "somehow", 126 | "someone", "something", "sometime", "sometimes", "somewhat", 127 | "somewhere", "soon", "sorry", "specified", "specify", "specifying", 128 | "st", "state", "states", "still", "sub", "such", "sup", "sure", 129 | "t", "t's", "take", "taken", "tell", "tends", "th", "than", 130 | "thank", "thanks", "thanx", 
"that", "that's", "thats", "the", 131 | "their", "theirs", "them", "themselves", "then", "thence", "there", 132 | "there's", "thereafter", "thereby", "therefore", "therein", 133 | "theres", "thereupon", "these", "they", "they'd", "they'll", 134 | "they're", "they've", "thing", "things", "think", "thinks", 135 | "third", "this", "thorough", "thoroughly", "those", "though", 136 | "thought", "thoughts", "three", "through", "throughout", "thru", 137 | "thus", "to", "today", "together", "told", "too", "took", "top", 138 | "toward", "towards", "tried", "tries", "truly", "try", "trying", 139 | "turn", "turned", "turning", "turns", "twice", "two", "u", "un", 140 | "under", "unfortunately", "unless", "unlike", "unlikely", "until", 141 | "unto", "up", "upon", "us", "use", "used", "useful", "uses", 142 | "using", "usually", "uucp", "v", "value", "various", "very", "via", 143 | "viz", "vs", "w", "want", "wanted", "wanting", "wants", "was", 144 | "wasn't", "watched", "way", "ways", "we", "we'd", "we'll", "we're", 145 | "we've", "welcome", "well", "wells", "went", "were", "weren't", 146 | "what", "what's", "whatever", "when", "when's", "whence", 147 | "whenever", "where", "where's", "whereafter", "whereas", "whereby", 148 | "wherein", "whereupon", "wherever", "whether", "which", "while", 149 | "whither", "who", "who's", "whoever", "whole", "whom", "whose", 150 | "why", "why's", "will", "willing", "wish", "with", "within", 151 | "without", "won't", "wonder", "work", "worked", "working", 152 | "works", "worst", "would", "wouldn't", "x", "y", "year", "years", 153 | "yes", "yet", "you", "you'd", "you'll", "you're", "you've", 154 | "young", "younger", "youngest", "your", "yours", "yourself", 155 | "yourselves", "z", "zero", "mr", "ms", "mrs", "mssrs", "mssr", 156 | "also", "said", "should", "could", "would", "week", "weeks", 157 | "month", "months", "year", "years"] -------------------------------------------------------------------------------- /test.txt: 
-------------------------------------------------------------------------------- 1 | Crown Prince Mohammed bin Salman of Saudi Arabia authorized a secret campaign to silence dissenters — which included the surveillance, kidnapping, detention and torture of Saudi citizens — more than a year before the killing of Jamal Khashoggi, according to American officials who have read classified intelligence reports about the campaign. 2 | 3 | At least some of the clandestine missions were carried out by members of the same team that killed and dismembered Mr. Khashoggi in Istanbul in October, suggesting that his killing was a particularly egregious part of a wider campaign to silence Saudi dissidents, according to the officials and associates of some of the Saudi victims. 4 | 5 | Members of the team that killed Mr. Khashoggi, which American officials called the Saudi Rapid Intervention Group, were involved in at least a dozen operations starting in 2017, the officials said. 6 | 7 | Some of the operations involved forcibly repatriating Saudis from other Arab countries and detaining and abusing prisoners in palaces belonging to the crown prince and his father, King Salman, the officials and associates said. 8 | 9 | One of the Saudis detained by the group, a university lecturer in linguistics who wrote a blog about women in Saudi Arabia, tried to kill herself last year after being subjected to psychological torture, according to American intelligence reports and others briefed on her situation. 10 | 11 | The rapid intervention team had been so busy that last June its leader asked a top adviser to Prince Mohammed whether the crown prince would give the team bonuses for Eid al-Fitr, the holiday marking the end of Ramadan, according to American officials familiar with the intelligence reports. 
12 | 13 | Details about the operations come from American officials who have read classified intelligence assessments about the Saudi campaign, as well as from Saudis with direct knowledge of some of the operations. They spoke on the condition of anonymity for fear of repercussions from disclosing classified information or, in the case of the Saudis, from angering the Saudi government. 14 | 15 | A spokesman for the Saudi Embassy in Washington said the kingdom “takes any allegations of ill treatment of defendants awaiting trial or prisoners serving their sentences very seriously.” 16 | 17 | Saudi laws prohibit torture and hold accountable those involved in such abuses of power, the spokesman said, and judges cannot accept confessions obtained under duress. The kingdom’s public prosecutor and the Saudi Human Rights Commission are investigating “recent allegations,” he said. 18 | 19 | The Saudi government insists that the killing of Mr. Khashoggi — a dissident journalist living in the United States who wrote for The Washington Post — was not an assassination ordered from Riyadh. The decision to kill him was made by the team on the spot, government officials say, and those responsible are being prosecuted. Turkey and American intelligence agencies say the killing was premeditated. 20 | 21 | The kingdom says that 11 Saudis are facing criminal charges for the killing and that prosecutors are seeking the death penalty for five of them, but officials have not publicly identified the accused. 22 | 23 | Turkish forensic experts searching for the remains of Mr. Khashoggi at a villa in Turkey in November. Credit: Erdem Sahin/EPA, via Shutterstock 24 | After the killing of Mr. Khashoggi, Saudi officials acknowledged that the Saudi intelligence service had a standing order to bring dissidents home. What they did not acknowledge was that a specific team had been built to do it.
25 | 26 | Saudi officials declined to confirm or deny that such a team existed, or answer questions about its work. 27 | 28 | Saudi Arabia has a history of going after dissidents and other Saudi citizens abroad, but the crackdown escalated sharply after Prince Mohammed was elevated to crown prince in 2017, a period when he was moving quickly to consolidate power. He pushed aside Prince Mohammed bin Nayef, who oversaw the security services, giving the young prince sway over the intelligence agencies. 29 | 30 | Since then, Saudi security forces have detained dozens of clerics, intellectuals and activists who were perceived to pose a threat, as well as people who had posted critical or sarcastic comments about the government on Twitter. 31 | 32 | “We’ve never seen it on a scale like this,” said Bruce Riedel, a former C.I.A. analyst now with the Brookings Institution. “A dissident like Jamal Khashoggi in the past wouldn’t have been considered worth the effort.” 33 | 34 | Mr. Khashoggi was killed inside the Saudi Consulate in Istanbul and dismembered with a bone saw. Turkey used surveillance video and audio recordings to uncover the crime, identify the team that carried it out and leak their names and photos to the news media. 35 | 36 | A still image from security footage of Maher Abdulaziz Mutreb, a Saudi intelligence officer, at the Saudi Consulate in Istanbul shortly before the disappearance of Mr. Khashoggi. 37 | 38 | Mr. Riedel said the team’s sloppiness showed that it was used to operating freely inside the kingdom and not under the watchful eye of an adversary’s intelligence service. 39 | 40 | The Rapid Intervention Group was authorized by Prince Mohammed and overseen by Saud al-Qahtani, a top aide whose official job was media czar at the royal court, American officials said. His deputy, Maher Abdulaziz Mutreb, an intelligence officer who has traveled abroad with the crown prince, led the team in the field. 
41 | 42 | Another operative on the team was Thaar Ghaleb al-Harbi, a member of the royal guard who was promoted in 2017 for valor during an attack on a palace of Prince Mohammed’s. 43 | 44 | Mr. Mutreb and Mr. al-Harbi are on trial in Riyadh for charges connected to Mr. Khashoggi’s death, a Saudi official said, while Mr. Qahtani is under house arrest, has been banned from travel and is under investigation, making it unclear whether the team is still operating. 45 | 46 | American intelligence reports did not specify how involved Prince Mohammed was with the group’s work, but said that the operatives saw Mr. al-Qahtani as a “conduit” to the prince. 47 | 48 | When Prince Mohammed locked hundreds of princes, businessmen and former officials in the Riyadh Ritz-Carlton in 2017 on accusations of corruption, Mr. al-Qahtani and Mr. Mutreb worked in the hotel, helping pressure detainees to sign over assets, according to associates of detainees who saw them. 49 | 50 | The Ritz-Carlton in Riyadh, where Prince Mohammed detained hundreds of princes, businessmen and former officials in 2017 on accusations of corruption. 51 | 52 | Many of those detained at the Ritz were subject to physical abuse, and one died in custody, according to witnesses. It is not known whether members of the rapid intervention team were involved in the abuse. The Saudi government has denied that any physical abuse took place there. 53 | 54 | But it was only after Mr. Khashoggi’s killing that the extent of the team’s work began to emerge. Mr. Mutreb and Mr. al-Harbi were both in the consulate when Mr. Khashoggi was killed, according to Turkish officials. American intelligence about the team’s previous operations informed the assessment by the C.I.A. in November that Prince Mohammed had ordered Mr. Khashoggi’s killing, United States officials said. 55 | 56 | The C.I.A. declined to comment. 
57 | 58 | United States intelligence agencies do not appear to have conclusive, smoking-gun proof that Prince Mohammed ordered the killing, but they have pieced together a pattern of similar operations carried out by Saudi operatives under the prince’s authority. 59 | 60 | The agencies continue to gather evidence about Prince Mohammed’s role in the operations, and in December the National Security Agency produced a report saying that in 2017, the prince told a top aide that he would use “a bullet” on Mr. Khashoggi if he did not return to the kingdom and end his criticism of the government. 61 | 62 | Intelligence analysts concluded that Prince Mohammed may have not spoken literally — that he was not ordering Mr. Khashoggi to be shot — but that he intended to silence the journalist by killing him if the circumstances required it. 63 | 64 | The C.I.A. assessment has created tension between American spy agencies and President Trump, who has made warm relations with the Saudis a cornerstone of his foreign policy. The crown prince has been a close ally of the Trump White House, especially Jared Kushner, the president’s son-in-law and senior adviser. Despite the C.I.A.’s assessment that Prince Mohammed ordered the operation, Mr. Trump has said repeatedly that the evidence was not conclusive. 65 | 66 | The grisly killing of Mr. Khashoggi led to a storm of outrage in foreign capitals and a new scrutiny of the powerful crown prince, who had billed himself as a forward-thinking reformer with a grand vision to modernize the kingdom. But the journalist’s killing was just the latest in a string of clandestine operations against less high-profile Saudis, including members of the royal family. 67 | 68 | American intelligence officials said that some of those detained in these operations were held at secret locations, including opulent palaces used by King Salman and his son, until November 2017, when many were moved to the compound surrounding the Riyadh Ritz-Carlton.
At the time, the hotel was being used as a five-star jail in the kingdom’s anti-corruption campaign. 69 | 70 | That crackdown became a cover for clandestine operations against Saudi dissidents, who were moved into detention in the Ritz at that time, according to American officials. 71 | 72 | The Rapid Intervention Group also appears to have been involved in the detention and abuse of about a dozen women’s rights activists, who were detained last spring and summer. The activists, who had campaigned for lifting the kingdom’s ban on driving by women, included several well-known figures: Loujain al-Hathloul, who had been jailed for trying to drive her car into the kingdom from the United Arab Emirates; Aziza al-Yousef, a retired computer science professor; and Eman al-Nafjan, the linguistics lecturer. 73 | 74 | At first, the women were not held in a prison, but were detained informally in what appeared to be an unused palace in the Red Sea port city of Jidda, according to Ms. al-Hathloul’s sister, Alia. Each woman was locked in a small room, and the windows were covered. Some of the women were frequently taken downstairs for interrogation, which included beatings, electric shocks, waterboarding and threats of rape and murder. 75 | 76 | In an Op-Ed article for The New York Times, Alia al-Hathloul wrote that Mr. al-Qahtani was “present several times” when her sister was tortured, and that he threatened to kill her and throw her body in the sewer. 77 | 78 | The treatment was so harsh that Ms. al-Nafjan tried to commit suicide, according to a United States intelligence assessment. 79 | 80 | The women were later moved to the Dhahban Prison in Jidda, where the physical abuse stopped and their relatives were allowed to visit, Ms. al-Hathloul said. 81 | 82 | Their trial opened in Riyadh on Wednesday, but journalists and diplomats were not permitted to attend, and the government did not announce the charges against them. 83 | 84 | The Saudi official said that Ms. 
al-Hathloul, Ms. al-Yousef and Ms. al-Nafjan were being tried “in connection with activities that threatened the kingdom’s national security.” 85 | 86 | In the kingdom’s effort to forcibly repatriate Saudi citizens living abroad, it was not always clear which operations were carried out by the rapid intervention team and which by other parts of the security services. 87 | 88 | At least one Saudi who was detained in the Ritz and accused of corruption, Rami al-Naimi, a son of a former Saudi oil minister, was forcibly repatriated from the United Arab Emirates in November 2017. An associate of a senior member of the royal family, Faisal al-Jarba, was snatched in a midnight raid on an apartment in Jordan last June and returned to Saudi Arabia. His family has struggled to get information on where he is or why he is being held. 89 | 90 | In August 2017, a minor prince, Saud bin al-Muntasir bin Saud, was sent back to the kingdom from Morocco. Last May, a university student who had dual Saudi-Qatari citizenship was arrested during a visit to Kuwait and sent home. 91 | 92 | --------------------------------------------------------------------------------