├── README.md
├── cluster-triples.py
└── extract_ReVerb_patterns_PT.py


/README.md:
--------------------------------------------------------------------------------
1 | # information-extraction-PT
2 | 
3 | An example on how to perform relationship extraction/information extraction on Portuguese, using only a part-of-speech tagger, and a named-entity recognizer.
4 | 
5 | http://davidsbatista.net/blog/2017/05/08/ReVerb/
6 | 


--------------------------------------------------------------------------------
/cluster-triples.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import csv
  5 | import pickle
  6 | import numpy as np
  7 | 
  8 | import requests
  9 | import scipy
 10 | 
 11 | from sklearn.cluster import DBSCAN
 12 | from sklearn.feature_extraction.text import TfidfVectorizer
 13 | from extract_ReVerb_patterns_PT import Triple
 14 | from sklearn.metrics.pairwise import cosine_distances
 15 | from sklearn.metrics.pairwise import pairwise_distances
 16 | 
 17 | __author__ = "David S. Batista"
 18 | __email__ = "dsbatista@gmail.com"
 19 | 
 20 | 
 21 | def generate_embeddings(text):
 22 | 
 23 |     embeddings_vector = np.zeros(400)
 24 | 
 25 |     for token in text.split():
 26 |         try:
 27 |             embeddings_vector += get_word_embedding(token)
 28 |         except KeyError:
 29 |             print "Not Found:", token
 30 | 
 31 |         except ValueError:
 32 |             print "Value Error:", token
 33 | 
 34 |     return embeddings_vector
 35 | 
 36 | 
 37 | def get_word_embedding(word):
 38 |     payload = {'word': word}
 39 |     answer = requests.get('http://127.0.0.1:8889/get_vector?', params=payload)
 40 |     return np.array(answer.json()['vector'])
 41 | 
 42 | 
 43 | def compute_embeddings_vectors():
 44 |     triples = []
 45 |     count = 0
 46 |     with open('triples.csv', 'r') as csvfile:
 47 |         reader = csv.reader(csvfile, delimiter='\t')
 48 |         for t in reader:
 49 |             e1, e1_type, rel, e2, e2_type = t[0], t[1], t[2], t[3], t[4]
 50 |             vector = generate_embeddings(rel)
 51 |             t = Triple(e1, e1_type, rel, e2, e2_type)
 52 |             t.vector = vector
 53 |             triples.append(t)
 54 |             count += 1
 55 |             if count % 10000 == 0:
 56 |                 print count
 57 |     with open('triples_vectors.pkl', 'w') as out_file:
 58 |         pickle.dump(triples, out_file)
 59 | 
 60 | 
 61 | def compute_pairwise_distances(triples, vectors):
 62 |     # size = len(vectors)
 63 |     size = 69213
 64 |     distances_matrix = np.zeros((size, size))
 65 |     for i, ele_1 in enumerate(vectors):
 66 |         for j, ele_2 in enumerate(vectors):
 67 |             # Matrix is symmetrical, no need to calculate every position
 68 |             if j >= i:
 69 |                 break
 70 |             # distance = cosine_distances(ele_1.reshape(1, -1), ele_2.reshape(1, -1))
 71 |             distance = cosine_distances(ele_1, ele_2)
 72 |             distances_matrix[i, j] = distance[0][0]
 73 |             distances_matrix[j, i] = distance[0][0]
 74 | 
 75 |         if i % 500 == 0:
 76 |             print i
 77 | 
 78 |     return distances_matrix
 79 | 
 80 | 
 81 | def main():
 82 | 
 83 |     """
 84 |     compute_embeddings_vectors()
 85 |     print "Reading embedding vectors"
 86 |     with open('triples_vectors.pkl', 'r') as in_file:
 87 |         triples = pickle.load(in_file)
 88 |     vectors = []
 89 |     for t in triples:
 90 |         vectors.append(t.vector)
 91 |     """
 92 | 
 93 |     text = []
 94 |     triples = []
 95 |     with open('triples.csv', 'r') as csvfile:
 96 |         reader = csv.reader(csvfile, delimiter='\t')
 97 |         for t in reader:
 98 |             e1, e1_type, rel, e2, e2_type = t[0], t[1], t[2], t[3], t[4]
 99 |             t = Triple(e1, e1_type, rel, e2, e2_type)
100 |             text.append(rel)
101 |             triples.append(t)
102 | 
103 |     tfidf = TfidfVectorizer()
104 |     tfidf_matrix = tfidf.fit_transform(text)
105 | 
106 |     print "Clustering"
107 |     dbscan = DBSCAN(eps=0.4, min_samples=15, metric='cosine', algorithm='brute',
108 |                     leaf_size=30, p=None, n_jobs=1)
109 |     labels = dbscan.fit_predict(tfidf_matrix)
110 |     with open('triples_labels.txt', 'w') as out_file:
111 |         for l in labels:
112 |             out_file.write(str(l) + '\n')
113 | 
114 |     print "Reading cluster labels"
115 |     labels = []
116 |     with open('triples_labels.txt', 'r') as in_file:
117 |         for label in in_file:
118 |             labels.append(int(label.strip()))
119 | 
120 |     for i in range(len(triples)):
121 |         triples[i].label = labels[i]
122 | 
123 |     clusters = dict()
124 |     for t in triples:
125 |         try:
126 |             clusters[t.label] += 1
127 |         except KeyError:
128 |             clusters[t.label] = 1
129 | 
130 |     print clusters
131 |     exit(-1)
132 |     # print len(clusters)
133 | 
134 |     # top-terms for each cluster
135 |     for x in range(-1, len(clusters)):
136 |         print x, len(clusters[x])
137 |         for t in triples:
138 |             if t.label == str(x):
139 |                 print t.rel
140 |         print
141 |         print
142 | 
143 | 
144 | if __name__ == "__main__":
145 |     main()
146 | 


--------------------------------------------------------------------------------
/extract_ReVerb_patterns_PT.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import csv
  5 | import codecs
  6 | import nltk
  7 | import os
  8 | 
  9 | from BeautifulSoup import BeautifulSoup
 10 | from polyglot.text import Text
 11 | 
 12 | __author__ = "David S. Batista"
 13 | __email__ = "dsbatista@gmail.com"
 14 | 
 15 | 
 16 | ignore_entities = [".", ". .", ". . .",  "! . . .", "»", "?", "E", "-",
 17 |                    ". . »", ". . . »", ". . . )", "sr"]
 18 | 
 19 | categories = ['Nacional', 'Mundo', 'Economia', 'Sociedade', 'Cultura']
 20 | 
 21 | DEBUG = 0
 22 | SDT_OUTPUT = 0
 23 | 
 24 | 
 25 | class Triple(object):
 26 | 
 27 |     def __init__(self, e1, e1_type, rel, e2, e2_type):
 28 |         self.e1 = e1
 29 |         self.e1_type = e1_type
 30 |         self.rel = rel
 31 |         self.e2 = e2
 32 |         self.e2_type = e2_type
 33 | 
 34 |     def __str__(self):
 35 |         out = self.e1+'\t'+self.e1_type+'\t'+self.rel+'\t'+self.e2+'\t'+self.e2_type
 36 |         return out.encode("utf8")
 37 | 
 38 | 
 39 | def extract_triples(reverb_pattern, text):
 40 | 
 41 |     triples = []
 42 | 
 43 |     for s in text.sentences:
 44 |         try:
 45 |             if len(s.entities) < 2:
 46 |                 continue
 47 |         except ValueError:
 48 |             continue
 49 | 
 50 |         if DEBUG == 1:
 51 |             print s
 52 |             print
 53 |             print s.entities
 54 |             print
 55 | 
 56 |         for i in range(len(s.entities)):
 57 |             if i + 1 == len(s.entities):
 58 |                 break
 59 | 
 60 |             e1 = s.entities[i]
 61 |             e2 = s.entities[i + 1]
 62 |             entity1 = " ".join(e1)
 63 |             entity2 = " ".join(e2)
 64 | 
 65 |             if entity1.encode("utf8") in ignore_entities \
 66 |                     or entity2.encode("utf8") in ignore_entities:
 67 |                 continue
 68 | 
 69 |             if entity1.islower() or entity1.islower():
 70 |                 continue
 71 | 
 72 |             context = s.words[e1.end:e2.start]
 73 |             if len(context) > 8 or len(context) == 0:
 74 |                 continue
 75 | 
 76 |             if DEBUG == 1:
 77 |                 print entity1, '\t', entity2
 78 |                 print s.pos_tags[e1.end:e2.start]
 79 |                 print
 80 |             rel = reverb_pattern.parse(s.pos_tags[e1.end:e2.start])
 81 |             for x in rel:
 82 |                 if isinstance(x, nltk.Tree) and x.label() == 'REL_PHRASE':
 83 |                     rel_phrase = " ".join([t[0] for t in x.leaves()])
 84 |                     triple = Triple(entity1, e1.tag, rel_phrase, entity2, e2.tag)
 85 |                     triples.append(triple)
 86 | 
 87 |     if SDT_OUTPUT == 1:
 88 |         for t in triples:
 89 |             print t
 90 | 
 91 |     return triples
 92 | 
 93 | 
 94 | def process_chave(reverb_pattern):
 95 | 
 96 |     input_base_path = "/Users/dbatista/Downloads/CHAVEPublico/"
 97 |     triples = []
 98 | 
 99 |     for root, dirs, files in os.walk(input_base_path):
100 |         for news_file in files:
101 |             file_path = os.path.join(root, news_file)
102 |             print news_file
103 |             with codecs.open(file_path, "r", encoding='latin_1') as input_file:
104 | 
105 |                 # open SGML file and get text sections
106 |                 sgml_file = input_file.read().encode("utf8")
107 |                 soup = BeautifulSoup(sgml_file)
108 | 
109 |                 # get article category
110 |                 for doc in soup.findAll("doc"):
111 |                     children = doc.findChildren()
112 |                     if len(children) == 4:
113 |                         continue
114 |                     category = children[3]
115 | 
116 |                     # filter by category and extract triples
117 |                     if category.getText() in categories:
118 |                         article_text = children[-1].getText()
119 |                         text = Text(article_text, hint_language_code='pt')
120 |                         extracted_triples = extract_triples(reverb_pattern, text)
121 |                         triples.extend(extracted_triples)
122 | 
123 |     with open('triples.csv', 'w') as csvfile:
124 |         writer = csv.writer(csvfile, delimiter='\t', quotechar='"')
125 |         for t in triples:
126 |             writer.writerow([t.e1.encode("utf8"), t.e1_type,
127 |                              t.rel.encode("utf8"),
128 |                              t.e2.encode("utf8"), t.e2_type])
129 | 
130 |     print len(triples), "extracted"
131 | 
132 | 
133 | def test_patterns(reverb_pattern):
134 | 
135 |     sentences = [
136 | 
137 |         "-- Vai amanhã à missa no Sagrado Coração, em memória do ministro Guy Malary, "
138 |         "assassinado há um ano pela Junta?",
139 | 
140 |         "«O tribunal (5º Juízo Cível da Comarca de Lisboa) confirmou a suspensão de Paco "
141 |         "Bandeira e Fernando Luso Soares da Sociedade Portuguesa de Autores, recusando "
142 |         "a providência cautelar que aqueles haviam requerido para que fosse suspensa "
143 |         "a sua expulsão», divulgou ontem em comunicado a Sociedade Portuguesa de "
144 |         "Autores (SPA).",
145 | 
146 |         'O director-geral do FMI, Michel Camdessus, alertou na semana passada Moscovo '
147 |         'para as dificuldades que se colocarão para a concessão de novos empréstimos.',
148 | 
149 |         "Ontem, o assessor de Imprensa da Casa Branca, George Stephanopoulos, "
150 |         "em declarações à cadeia de televisão ABC, classificou de «oportunismo político» "
151 |         "o pedido dos republicanos, afirmando que Clinton entregou todos os documentos "
152 |         "ao Departamento de Justiça.",
153 | 
154 |         'Além da nova fábrica, a NC² está instalando um escritório em São Paulo.',
155 | 
156 |         'O Conselho de Reitores das Universidades Portuguesas(CRUP) vai entretanto enviar a '
157 |         'Marçal Grilo, no princípio da próxima semana, um pedido de audiência e um convite'
158 |         'à sua participação numa reunião plenária da estrutura, para que possam revelar ao'
159 |         'novo ministro a sua perspectiva sobre os grandes problemas do ensino superior.',
160 | 
161 |         'António Pires de Lima, que mantém divergências políticas públicas com '
162 |         'Nobre Guedes, considerou que a ausência do ex-ministro do Ambiente vai limitar '
163 |         'o âmbito do próprio Congresso.',
164 |     ]
165 |     for s in sentences:
166 |         text = Text(s, hint_language_code='pt')
167 |         for x in extract_triples(reverb_pattern, text):
168 |             print x
169 |         print
170 |         print "==================="
171 | 
172 | 
173 | def main():
174 | 
175 |     verb = "<ADV>*<AUX>*<VERB><PART>*<ADV>*"
176 |     word = "<NOUN|ADJ|ADV|DET|ADP>"
177 |     preposition = "<ADP|ADJ>"
178 | 
179 |     rel_pattern = "( %s (%s* (%s)+ )? )+ " % (verb, word, preposition)
180 |     grammar_long = '''REL_PHRASE: {%s}''' % rel_pattern
181 | 
182 |     print grammar_long
183 |     reverb_pattern = nltk.RegexpParser(grammar_long)
184 | 
185 |     # test_patterns(reverb_pattern)
186 | 
187 |     process_chave(reverb_pattern)
188 | 
189 | 
190 | if __name__ == "__main__":
191 |     main()
192 | 


--------------------------------------------------------------------------------