├── README.md
├── cluster-triples.py
└── extract_ReVerb_patterns_PT.py

/README.md:
--------------------------------------------------------------------------------
# information-extraction-PT

An example of how to perform relationship extraction/information extraction for Portuguese, using only a part-of-speech tagger and a named-entity recognizer.

http://davidsbatista.net/blog/2017/05/08/ReVerb/
--------------------------------------------------------------------------------
/cluster-triples.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import pickle

import numpy as np
import requests

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

from extract_ReVerb_patterns_PT import Triple

__author__ = "David S. Batista"
__email__ = "dsbatista@gmail.com"


def generate_embeddings(text):
    # represent a phrase as the sum of its word embeddings
    embeddings_vector = np.zeros(400)
    for token in text.split():
        try:
            embeddings_vector += get_word_embedding(token)
        except KeyError:
            print "Not Found:", token
        except ValueError:
            print "Value Error:", token
    return embeddings_vector


def get_word_embedding(word):
    # queries a local web service serving pre-trained 400-dimensional vectors
    payload = {'word': word}
    answer = requests.get('http://127.0.0.1:8889/get_vector', params=payload)
    return np.array(answer.json()['vector'])


def compute_embeddings_vectors():
    triples = []
    count = 0
    with open('triples.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for row in reader:
            e1, e1_type, rel, e2, e2_type = row[0], row[1], row[2], row[3], row[4]
            triple = Triple(e1, e1_type, rel, e2, e2_type)
            triple.vector = generate_embeddings(rel)
            triples.append(triple)
            count += 1
            if count % 10000 == 0:
                print count
    with open('triples_vectors.pkl', 'wb') as out_file:
        pickle.dump(triples, out_file)


def compute_pairwise_distances(triples, vectors):
    size = len(vectors)
    distances_matrix = np.zeros((size, size))
    for i, ele_1 in enumerate(vectors):
        for j, ele_2 in enumerate(vectors):
            # the matrix is symmetric, no need to compute every position
            if j >= i:
                break
            distance = cosine_distances(ele_1.reshape(1, -1), ele_2.reshape(1, -1))
            distances_matrix[i, j] = distance[0][0]
            distances_matrix[j, i] = distance[0][0]
        if i % 500 == 0:
            print i
    return distances_matrix


def main():

    """
    # alternative: cluster over the embedding vectors instead of TF-IDF weights
    compute_embeddings_vectors()
    print "Reading embedding vectors"
    with open('triples_vectors.pkl', 'rb') as in_file:
        triples = pickle.load(in_file)
    vectors = [t.vector for t in triples]
    """

    text = []
    triples = []
    with open('triples.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for row in reader:
            e1, e1_type, rel, e2, e2_type = row[0], row[1], row[2], row[3], row[4]
            triples.append(Triple(e1, e1_type, rel, e2, e2_type))
            text.append(rel)

    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(text)

    print "Clustering"
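    # DBSCAN over the sparse TF-IDF matrix: eps is the maximum cosine distance
    # for two relational phrases to count as neighbours, min_samples the
    # neighbourhood size needed to form a dense region; scikit-learn's
    # tree-based neighbour searches do not support the cosine metric, so the
    # brute-force search is required here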
    dbscan = DBSCAN(eps=0.4, min_samples=15, metric='cosine',
                    algorithm='brute', leaf_size=30, p=None, n_jobs=1)
    labels = dbscan.fit_predict(tfidf_matrix)
    with open('triples_labels.txt', 'w') as out_file:
        for label in labels:
            out_file.write(str(label) + '\n')

    print "Reading cluster labels"
    labels = []
    with open('triples_labels.txt', 'r') as in_file:
        for label in in_file:
            labels.append(int(label.strip()))

    for i in range(len(triples)):
        triples[i].label = labels[i]

    # count how many triples each cluster holds; label -1 is DBSCAN's noise
    clusters = dict()
    for t in triples:
        try:
            clusters[t.label] += 1
        except KeyError:
            clusters[t.label] = 1

    print clusters

    # print the relational phrases grouped by cluster
    for x in sorted(clusters):
        print x, clusters[x]
        for t in triples:
            if t.label == x:
                print t.rel
        print
        print


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/extract_ReVerb_patterns_PT.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import codecs
import os

import nltk

from BeautifulSoup import BeautifulSoup
from polyglot.text import Text

__author__ = "David S. Batista"
__email__ = "dsbatista@gmail.com"


# noisy strings produced by the NER that should never count as entities
ignore_entities = [".", ". .", ". . .", "! . . .", "»", "?", "E", "-",
                   ". . »", ". . . »", ". . . )", "sr"]

categories = ['Nacional', 'Mundo', 'Economia', 'Sociedade', 'Cultura']

DEBUG = 0
SDT_OUTPUT = 0


class Triple(object):

    def __init__(self, e1, e1_type, rel, e2, e2_type):
        self.e1 = e1
        self.e1_type = e1_type
        self.rel = rel
        self.e2 = e2
        self.e2_type = e2_type

    def __str__(self):
        out = self.e1+'\t'+self.e1_type+'\t'+self.rel+'\t'+self.e2+'\t'+self.e2_type
        return out.encode("utf8")


def extract_triples(reverb_pattern, text):

    triples = []

    for s in text.sentences:
        try:
            if len(s.entities) < 2:
                continue
        except ValueError:
            continue

        if DEBUG == 1:
            print s
            print
            print s.entities
            print

        # consider each pair of consecutive entities in the sentence
        for i in range(len(s.entities)):
            if i + 1 == len(s.entities):
                break

            e1 = s.entities[i]
            e2 = s.entities[i + 1]
            entity1 = " ".join(e1)
            entity2 = " ".join(e2)

            if entity1.encode("utf8") in ignore_entities \
                    or entity2.encode("utf8") in ignore_entities:
                continue

            if entity1.islower() or entity2.islower():
                continue

            # keep only entity pairs at most 8 tokens apart
            context = s.words[e1.end:e2.start]
            if len(context) > 8 or len(context) == 0:
                continue

            if DEBUG == 1:
                print entity1, '\t', entity2
                print s.pos_tags[e1.end:e2.start]
                print

            # chunk the POS tags between the two entities with the ReVerb pattern
            rel = reverb_pattern.parse(s.pos_tags[e1.end:e2.start])
            for x in rel:
                if isinstance(x, nltk.Tree) and x.label() == 'REL_PHRASE':
                    rel_phrase = " ".join([t[0] for t in x.leaves()])
                    triple = Triple(entity1, e1.tag, rel_phrase, entity2, e2.tag)
                    triples.append(triple)

    if SDT_OUTPUT == 1:
        for t in triples:
            print t

    return triples
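# E.g., in "Além da nova fábrica, a NC² está instalando um escritório em
# São Paulo.", the entities "NC²" and "São Paulo" are consecutive and fewer
# than 8 tokens apart, and the tagged span between them (roughly
# AUX VERB DET NOUN ADP) matches the ReVerb pattern, yielding the triple
# (NC², "está instalando um escritório em", São Paulo).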
def process_chave(reverb_pattern):

    input_base_path = "/Users/dbatista/Downloads/CHAVEPublico/"
    triples = []

    for root, dirs, files in os.walk(input_base_path):
        for news_file in files:
            file_path = os.path.join(root, news_file)
            print news_file
            with codecs.open(file_path, "r", encoding='latin_1') as input_file:

                # open the SGML file and get the text sections
                sgml_file = input_file.read().encode("utf8")
                soup = BeautifulSoup(sgml_file)

                # get the article category
                for doc in soup.findAll("doc"):
                    children = doc.findChildren()
                    # docs with only 4 children carry no category element
                    if len(children) == 4:
                        continue
                    category = children[3]

                    # filter by category and extract triples from the article text
                    if category.getText() in categories:
                        article_text = children[-1].getText()
                        text = Text(article_text, hint_language_code='pt')
                        extracted_triples = extract_triples(reverb_pattern, text)
                        triples.extend(extracted_triples)

    with open('triples.csv', 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='"')
        for t in triples:
            writer.writerow([t.e1.encode("utf8"), t.e1_type,
                             t.rel.encode("utf8"),
                             t.e2.encode("utf8"), t.e2_type])

    print len(triples), "extracted"


def test_patterns(reverb_pattern):

    sentences = [

        "-- Vai amanhã à missa no Sagrado Coração, em memória do ministro Guy Malary, "
        "assassinado há um ano pela Junta?",

        "«O tribunal (5º Juízo Cível da Comarca de Lisboa) confirmou a suspensão de Paco "
        "Bandeira e Fernando Luso Soares da Sociedade Portuguesa de Autores, recusando "
        "a providência cautelar que aqueles haviam requerido para que fosse suspensa "
        "a sua expulsão», divulgou ontem em comunicado a Sociedade Portuguesa de "
        "Autores (SPA).",

        'O director-geral do FMI, Michel Camdessus, alertou na semana passada Moscovo '
        'para as dificuldades que se colocarão para a concessão de novos empréstimos.',

        "Ontem, o assessor de Imprensa da Casa Branca, George Stephanopoulos, "
        "em declarações à cadeia de televisão ABC, classificou de «oportunismo político» "
        "o pedido dos republicanos, afirmando que Clinton entregou todos os documentos "
        "ao Departamento de Justiça.",

        'Além da nova fábrica, a NC² está instalando um escritório em São Paulo.',

        'O Conselho de Reitores das Universidades Portuguesas (CRUP) vai entretanto enviar a '
        'Marçal Grilo, no princípio da próxima semana, um pedido de audiência e um convite '
        'à sua participação numa reunião plenária da estrutura, para que possam revelar ao '
        'novo ministro a sua perspectiva sobre os grandes problemas do ensino superior.',

        'António Pires de Lima, que mantém divergências políticas públicas com '
        'Nobre Guedes, considerou que a ausência do ex-ministro do Ambiente vai limitar '
        'o âmbito do próprio Congresso.',
    ]
    for s in sentences:
        text = Text(s, hint_language_code='pt')
        for x in extract_triples(reverb_pattern, text):
            print x
        print
        print "==================="
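# The ReVerb relational pattern of Fader et al. (2011):
#
#     V | V P | V W* P
#     V = verb particle? adv?
#     W = (noun | adj | adv | pron | det)
#     P = (prep | particle | inf. marker)
#
# expressed below over the coarse universal part-of-speech tags that
# polyglot assigns.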
)+ " % (verb, word, preposition) 180 | grammar_long = '''REL_PHRASE: {%s}''' % rel_pattern 181 | 182 | print grammar_long 183 | reverb_pattern = nltk.RegexpParser(grammar_long) 184 | 185 | # test_patterns(reverb_pattern) 186 | 187 | process_chave(reverb_pattern) 188 | 189 | 190 | if __name__ == "__main__": 191 | main() 192 | --------------------------------------------------------------------------------