├── .gitignore
├── utils.py
├── README.md
├── relation_provider.py
├── relation.py
├── pattern_matcher.py
├── text_extractor_pipe.py
├── matcher_pipe.py
├── text_extractor.py
├── requirements.txt
├── such_as_pattern_matcher.py
├── or_other_pattern_matcher.py
├── and_other_pattern_matcher.py
├── knowledge_graph.py
├── especially_pattern_matcher.py
└── including_pattern_matcher.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
text/
.idea/
__pycache__

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from nltk import Tree


def buildTree(token):
    # Recursively convert a spaCy dependency subtree into an nltk Tree;
    # tokens without children become leaves.
    if token.n_lefts + token.n_rights > 0:
        return Tree(token, [buildTree(child) for child in token.children])
    else:
        return token

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Support repository for the [Programmerbackpack blog](https://programmerbackpack.com/)

[Python Knowledge Graph: Understanding Semantic Relationships](https://programmerbackpack.com/python-knowledge-graph-understanding-semantic-relationships/)

--------------------------------------------------------------------------------
/relation_provider.py:
--------------------------------------------------------------------------------
from relation import Relation


class RelationProvider:

    __relations: [Relation]

    def __init__(self, relations: [Relation]):
        self.__relations = relations

    def getRelations(self):
        return self.__relations

--------------------------------------------------------------------------------
/relation.py:
--------------------------------------------------------------------------------
class Relation:

    __hypernym: str
    __hyponym: str

    def __init__(self, hypernym, hyponym):
        self.__hypernym = hypernym
        self.__hyponym = hyponym

    def getHypernym(self):
        return self.__hypernym

    def getHyponym(self):
        return self.__hyponym

--------------------------------------------------------------------------------
/pattern_matcher.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
from spacy.matcher import Matcher
from spacy.tokens import Doc
from relation import Relation


class PatternMatcher(ABC):

    def __init__(self, pattern, nlp, matcherId):
        self._nlp = nlp
        self._matcher = Matcher(nlp.vocab)
        # spaCy 2.x call signature (spacy==2.3.2 is pinned in requirements.txt):
        # add(key, on_match, pattern)
        self._matcher.add(matcherId, None, pattern)

    @abstractmethod
    def getRelations(self, doc: Doc) -> [Relation]:
        ...
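As an aside, buildTree from utils.py is not called anywhere else in the repository. A minimal sketch of its intended use, assuming the pinned en_core_web_sm model; the file name and sentence are illustrative, not part of the repo:

--------------------------------------------------------------------------------
/example_build_tree.py (illustrative sketch, not part of the repository):
--------------------------------------------------------------------------------
# Hypothetical usage example -- not part of the original repository.
import spacy

from utils import buildTree

nlp = spacy.load("en_core_web_sm")
doc = nlp("Cities such as London attract many tourists.")

# Print the dependency tree of the first sentence, rooted at its head verb.
print(buildTree(next(doc.sents).root))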

--------------------------------------------------------------------------------
/text_extractor_pipe.py:
--------------------------------------------------------------------------------
from text_extractor import TextExtractor


class TextExtractorPipe:

    __textExtractors: [TextExtractor]

    def __init__(self):
        self.__textExtractors = []

    def addTextExtractor(self, textExtractor: TextExtractor):
        self.__textExtractors.append(textExtractor)

    def extract(self) -> str:
        result = ''
        for textExtractor in self.__textExtractors:
            result = result + textExtractor.getText()
        return result

--------------------------------------------------------------------------------
/matcher_pipe.py:
--------------------------------------------------------------------------------
from pattern_matcher import PatternMatcher
from relation import Relation
from spacy.tokens import Doc


class MatcherPipe:

    __matchers: [PatternMatcher]

    def __init__(self):
        self.__matchers = []

    def addMatcher(self, matcher: PatternMatcher):
        self.__matchers.append(matcher)

    def extract(self, doc: Doc) -> [Relation]:
        results = []
        for matcher in self.__matchers:
            results.extend(matcher.getRelations(doc))
        return results

--------------------------------------------------------------------------------
/text_extractor.py:
--------------------------------------------------------------------------------
import os

import wikipedia


class TextExtractor:

    __pageTitle: str
    __pageId: str

    def __init__(self, pageTitle, pageId):
        self.__pageTitle = pageTitle
        self.__pageId = pageId

    def extract(self):
        # Fetch the article and cache it under ./text/ (gitignored),
        # creating the directory if it does not exist yet.
        page = wikipedia.page(title=self.__pageTitle, pageid=self.__pageId)
        os.makedirs("./text", exist_ok=True)
        with open("./text/" + self.__pageTitle + ".txt", "w") as f:
            f.write(page.content)

    def getText(self):
        with open("./text/" + self.__pageTitle + ".txt", "r") as f:
            return f.read()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.9.1
blis==0.4.1
catalogue==1.0.0
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
cycler==0.10.0
cymem==2.0.3
decorator==4.4.2
en-core-web-sm==2.3.1
idna==2.10
importlib-metadata==1.7.0
joblib==0.16.0
kiwisolver==1.2.0
matplotlib==3.3.0
murmurhash==1.0.2
networkx==2.4
nltk==3.5
numpy==1.19.1
Pillow==7.2.0
plac==1.1.3
preshed==3.0.2
pyparsing==2.4.7
python-dateutil==2.8.1
regex==2020.7.14
requests==2.24.0
six==1.15.0
soupsieve==2.0.1
spacy==2.3.2
srsly==1.0.2
thinc==7.4.1
tqdm==4.48.2
urllib3==1.25.10
wasabi==0.7.1
wikipedia==1.4.0
zipp==3.1.0

--------------------------------------------------------------------------------
/such_as_pattern_matcher.py:
--------------------------------------------------------------------------------
from pattern_matcher import PatternMatcher
from spacy.tokens import Doc
from relation import Relation


class SuchAsPatternMatcher(PatternMatcher):

    def __init__(self, nlp):
        # Matches e.g. "fruits(,) such as apples".
        pattern = [{'POS': 'NOUN'},
                   {'IS_PUNCT': True, 'OP': '?'},
                   {'LOWER': 'such'},
                   {'LOWER': 'as'},
                   {'POS': 'NOUN'}]
        PatternMatcher.__init__(self, pattern, nlp, "suchAs")

    def getRelations(self, doc: Doc) -> [Relation]:
        relations = []
        matches = self._matcher(doc)
        for match_id, start, end in matches:
            span = doc[start:end]
            hypernym = span.root.text
            hyponym = span.text.split()[-1]
            relations.append(Relation(hypernym, hyponym))
            # Nouns conjoined to the right of the span ("... apples and
            # oranges") are further hyponyms of the same hypernym.
            for right in span.rights:
                if right.pos_ == "NOUN":
                    relations.append(Relation(hypernym, right.text))
        return relations
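A minimal smoke test for SuchAsPatternMatcher might look like the following; the sentence is illustrative, and the exact output depends on the parses the pinned model produces:

--------------------------------------------------------------------------------
/example_such_as.py (illustrative sketch, not part of the repository):
--------------------------------------------------------------------------------
# Hypothetical usage example -- not part of the original repository.
import spacy

from such_as_pattern_matcher import SuchAsPatternMatcher

nlp = spacy.load("en_core_web_sm")
matcher = SuchAsPatternMatcher(nlp)
doc = nlp("He enjoys fruits such as apples and oranges.")

for relation in matcher.getRelations(doc):
    print(relation.getHypernym(), "->", relation.getHyponym())
# With a typical parse this prints something like:
#   fruits -> apples
#   fruits -> oranges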
nlp, "suchAs") 16 | 17 | def getRelations(self, doc: Doc) -> [Relation]: 18 | relations = [] 19 | matches = self._matcher(doc) 20 | for match_id, start, end in matches: 21 | span = doc[start:end] 22 | hypernym = span.root.text 23 | hyponym = span.text.split()[-1] 24 | relations.append(Relation(hypernym, hyponym)) 25 | for right in span.rights: 26 | if right.pos_ == "NOUN": 27 | relations.append(Relation(hypernym, right.text)) 28 | return relations -------------------------------------------------------------------------------- /or_other_pattern_matcher.py: -------------------------------------------------------------------------------- 1 | from pattern_matcher import PatternMatcher 2 | from spacy.tokens import Doc 3 | from relation import Relation 4 | 5 | 6 | class OrOtherPatternMatcher(PatternMatcher): 7 | 8 | def __init__(self, nlp): 9 | pattern = [{'POS': 'NOUN'}, 10 | {'LOWER': 'or'}, 11 | {'LOWER': 'other'}, 12 | {'POS': 'NOUN'}] 13 | PatternMatcher.__init__(self, pattern, nlp, "orOther") 14 | 15 | def getRelations(self, doc: Doc) -> [Relation]: 16 | relations = [] 17 | matches = self._matcher(doc) 18 | for match_id, start, end in matches: 19 | span = doc[start:end] 20 | firstToken = span.root.head 21 | results = [firstToken] 22 | while firstToken and firstToken.head.pos_ == "NOUN": 23 | results.append(firstToken.head) 24 | firstToken = firstToken.head 25 | hypernym = span.text.split()[-1] 26 | relations.append(Relation(hypernym, span.text.split()[0])) 27 | if len(results) > 0: 28 | for result in results: 29 | relations.append(Relation(hypernym, result.text)) 30 | return relations 31 | 32 | 33 | -------------------------------------------------------------------------------- /and_other_pattern_matcher.py: -------------------------------------------------------------------------------- 1 | from pattern_matcher import PatternMatcher 2 | from spacy.tokens import Doc 3 | from relation import Relation 4 | 5 | 6 | class AndOtherPatternMatcher(PatternMatcher): 7 | 8 | 9 | def __init__(self, nlp): 10 | pattern = [{'POS': 'NOUN'}, 11 | {'LOWER': 'and'}, 12 | {'LOWER': 'other'}, 13 | {'POS': 'NOUN'}] 14 | PatternMatcher.__init__(self, pattern, nlp, "andOther") 15 | 16 | def getRelations(self, doc: Doc) -> [Relation]: 17 | relations = [] 18 | matches = self._matcher(doc) 19 | for match_id, start, end in matches: 20 | span = doc[start:end] 21 | firstToken = span.root.head 22 | results = [firstToken] 23 | while firstToken and firstToken.head.pos_ == "NOUN": 24 | results.append(firstToken.head) 25 | firstToken = firstToken.head 26 | hypernym = span.text.split()[-1] 27 | relations.append(Relation(hypernym, span.text.split()[0])) 28 | if len(results) > 0: 29 | for result in results: 30 | relations.append(Relation(hypernym, result.text)) 31 | return relations 32 | 33 | 34 | -------------------------------------------------------------------------------- /knowledge_graph.py: -------------------------------------------------------------------------------- 1 | from relation import Relation 2 | import networkx as nx 3 | import matplotlib.pyplot as plt 4 | 5 | class KnowledgeGraph: 6 | 7 | __relations: [Relation] 8 | __graph: nx.Graph 9 | __colors: {} 10 | 11 | def __init__(self, relations): 12 | self.__relations = relations 13 | self.__graph = nx.Graph() 14 | self.__colors = {} 15 | 16 | def build(self): 17 | for relation in self.__relations: 18 | self.__graph.add_node(relation.getHypernym()) 19 | self.__colors[relation.getHypernym()] = '#e34234' 20 | self.__graph.add_node(relation.getHyponym()) 21 | 
--------------------------------------------------------------------------------
/especially_pattern_matcher.py:
--------------------------------------------------------------------------------
from pattern_matcher import PatternMatcher
from spacy.tokens import Doc
from relation import Relation


class EspeciallyPatternMatcher(PatternMatcher):

    def __init__(self, nlp):
        # Matches e.g. "European countries(,) especially France".
        pattern = [{'POS': 'NOUN'},
                   {'IS_PUNCT': True, 'OP': '?'},
                   {'LOWER': 'especially'},
                   {'POS': 'NOUN'}]
        PatternMatcher.__init__(self, pattern, nlp, "especially")

    def getRelations(self, doc: Doc) -> [Relation]:
        relations = []
        matches = self._matcher(doc)
        for match_id, start, end in matches:
            span = doc[start:end]
            candidates = set()
            for sent in doc.sents:
                for token in sent:
                    # Find the sentence containing the matched hypernym.
                    if token.i == span.root.i:
                        for token2 in sent:
                            # token2 is the first hyponym, attached directly
                            # to the hypernym.
                            if token2.head.i == token.i:
                                # Collect further nouns on the dependency
                                # paths leading back to the first hyponym.
                                for token3 in sent:
                                    startToken = token3
                                    while startToken and startToken.head.i != sent.root.i and startToken.i != token2.i:
                                        if startToken.pos_ == "NOUN":
                                            candidates.add(startToken)
                                        startToken = startToken.head
            if len(candidates) > 0:
                hypernym = span.text.split()[0].replace(',', '')
                for candidate in candidates:
                    relations.append(Relation(hypernym, candidate.text))
        return relations

--------------------------------------------------------------------------------
/including_pattern_matcher.py:
--------------------------------------------------------------------------------
from pattern_matcher import PatternMatcher
from spacy.tokens import Doc
from relation import Relation


class IncludingPatternMatcher(PatternMatcher):

    def __init__(self, nlp):
        # Matches e.g. "team sports(,) including football".
        pattern = [{'POS': 'NOUN'},
                   {'IS_PUNCT': True, 'OP': '?'},
                   {'LOWER': 'including'},
                   {'POS': 'NOUN'}]
        PatternMatcher.__init__(self, pattern, nlp, "including")

    def getRelations(self, doc: Doc) -> [Relation]:
        relations = []
        matches = self._matcher(doc)
        for match_id, start, end in matches:
            span = doc[start:end]
            for sent in doc.sents:
                for token in sent:
                    # Find the "including" that belongs to this match.
                    if token.text == "including" and token.head.i == span.root.i:
                        for token2 in sent:
                            # token2 is the first hyponym, attached to "including".
                            if token2.head.i == token.i:
                                results = set()
                                results.add(span.text.split()[-1])
                                # Collect the other hyponyms along the
                                # dependency paths back to the first one.
                                for token3 in sent:
                                    startToken = token3
                                    while startToken and startToken.head.i != sent.root.i and startToken.i != token2.i:
                                        if startToken.pos_ == "NOUN":
                                            results.add(startToken.text)
                                        startToken = startToken.head
                                if len(results) > 0:
                                    hypernym = span.text.split()[0].replace(',', '')
                                    for result in results:
                                        relations.append(Relation(hypernym, result))
        return relations

--------------------------------------------------------------------------------
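The repository does not ship a main script. A sketch of how the pieces could be wired together end to end; the Wikipedia page title and id below are placeholders, not values taken from the repository:

--------------------------------------------------------------------------------
/example_main.py (illustrative sketch, not part of the repository):
--------------------------------------------------------------------------------
# Hypothetical end-to-end driver -- not part of the original repository.
import spacy

from and_other_pattern_matcher import AndOtherPatternMatcher
from especially_pattern_matcher import EspeciallyPatternMatcher
from including_pattern_matcher import IncludingPatternMatcher
from knowledge_graph import KnowledgeGraph
from matcher_pipe import MatcherPipe
from or_other_pattern_matcher import OrOtherPatternMatcher
from such_as_pattern_matcher import SuchAsPatternMatcher
from text_extractor import TextExtractor
from text_extractor_pipe import TextExtractorPipe

nlp = spacy.load("en_core_web_sm")

# Page title and id are placeholders; TextExtractor caches the article text
# under ./text/ so repeated runs do not hit the Wikipedia API again.
extractor = TextExtractor("London", "17867")
extractor.extract()

textPipe = TextExtractorPipe()
textPipe.addTextExtractor(extractor)
doc = nlp(textPipe.extract())

matcherPipe = MatcherPipe()
for matcher in [SuchAsPatternMatcher(nlp), IncludingPatternMatcher(nlp),
                AndOtherPatternMatcher(nlp), OrOtherPatternMatcher(nlp),
                EspeciallyPatternMatcher(nlp)]:
    matcherPipe.addMatcher(matcher)

graph = KnowledgeGraph(matcherPipe.extract(doc))
graph.build()
graph.show()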