├── .gitignore
├── utils.py
├── README.md
├── relation_provider.py
├── relation.py
├── pattern_matcher.py
├── text_extractor_pipe.py
├── matcher_pipe.py
├── text_extractor.py
├── requirements.txt
├── such_as_pattern_matcher.py
├── or_other_pattern_matcher.py
├── and_other_pattern_matcher.py
├── knowledge_graph.py
├── especially_pattern_matcher.py
└── including_pattern_matcher.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
text/
.idea/
__pycache__

--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from nltk import Tree


def buildTree(token):
    # Recursively convert a spaCy dependency subtree into an nltk Tree;
    # tokens without children become leaves.
    if token.n_lefts + token.n_rights > 0:
        return Tree(token, [buildTree(child) for child in token.children])
    else:
        return token

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Support repository for the [Programmerbackpack blog](https://programmerbackpack.com/)

[Python Knowledge Graph: Understanding Semantic Relationships](https://programmerbackpack.com/python-knowledge-graph-understanding-semantic-relationships/)

--------------------------------------------------------------------------------
/relation_provider.py:
--------------------------------------------------------------------------------
from relation import Relation


class RelationProvider:

    __relations: [Relation]

    def __init__(self, relations: [Relation]):
        self.__relations = relations

    def getRelations(self):
        return self.__relations

--------------------------------------------------------------------------------
/relation.py:
--------------------------------------------------------------------------------
class Relation:

    __hypernym: str
    __hyponym: str

    def __init__(self, hypernym, hyponym):
        self.__hypernym = hypernym
        self.__hyponym = hyponym

    def getHypernym(self):
        return self.__hypernym

    def getHyponym(self):
        return self.__hyponym

--------------------------------------------------------------------------------
/pattern_matcher.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
from spacy.matcher import Matcher
from spacy.tokens import Doc
from relation import Relation


class PatternMatcher(ABC):

    def __init__(self, pattern, nlp, matcherId):
        self._nlp = nlp
        self._matcher = Matcher(nlp.vocab)
        # spaCy 2.x call signature (spacy==2.3.2 is pinned in requirements.txt):
        # add(key, on_match, pattern)
        self._matcher.add(matcherId, None, pattern)

    @abstractmethod
    def getRelations(self, doc: Doc) -> [Relation]:
        ...
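As an aside, buildTree from utils.py is not called anywhere else in the repository. A minimal sketch of its intended use, assuming the pinned en_core_web_sm model; the file name and sentence are illustrative, not part of the repo:

--------------------------------------------------------------------------------
/example_build_tree.py (illustrative sketch, not part of the repository):
--------------------------------------------------------------------------------
# Hypothetical usage example -- not part of the original repository.
import spacy

from utils import buildTree

nlp = spacy.load("en_core_web_sm")
doc = nlp("Cities such as London attract many tourists.")

# Print the dependency tree of the first sentence, rooted at its head verb.
print(buildTree(next(doc.sents).root))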

--------------------------------------------------------------------------------
/text_extractor_pipe.py:
--------------------------------------------------------------------------------
from text_extractor import TextExtractor


class TextExtractorPipe:

    __textExtractors: [TextExtractor]

    def __init__(self):
        self.__textExtractors = []

    def addTextExtractor(self, textExtractor: TextExtractor):
        self.__textExtractors.append(textExtractor)

    def extract(self) -> str:
        result = ''
        for textExtractor in self.__textExtractors:
            result = result + textExtractor.getText()
        return result

--------------------------------------------------------------------------------
/matcher_pipe.py:
--------------------------------------------------------------------------------
from pattern_matcher import PatternMatcher
from relation import Relation
from spacy.tokens import Doc


class MatcherPipe:

    __matchers: [PatternMatcher]

    def __init__(self):
        self.__matchers = []

    def addMatcher(self, matcher: PatternMatcher):
        self.__matchers.append(matcher)

    def extract(self, doc: Doc) -> [Relation]:
        results = []
        for matcher in self.__matchers:
            results.extend(matcher.getRelations(doc))
        return results

--------------------------------------------------------------------------------
/text_extractor.py:
--------------------------------------------------------------------------------
import os

import wikipedia


class TextExtractor:

    __pageTitle: str
    __pageId: str

    def __init__(self, pageTitle, pageId):
        self.__pageTitle = pageTitle
        self.__pageId = pageId

    def extract(self):
        # Fetch the article and cache it under ./text/ (gitignored),
        # creating the directory if it does not exist yet.
        page = wikipedia.page(title=self.__pageTitle, pageid=self.__pageId)
        os.makedirs("./text", exist_ok=True)
        with open("./text/" + self.__pageTitle + ".txt", "w") as f:
            f.write(page.content)

    def getText(self):
        with open("./text/" + self.__pageTitle + ".txt", "r") as f:
            return f.read()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.9.1
blis==0.4.1
catalogue==1.0.0
certifi==2020.6.20
chardet==3.0.4
click==7.1.2
cycler==0.10.0
cymem==2.0.3
decorator==4.4.2
en-core-web-sm==2.3.1
idna==2.10
importlib-metadata==1.7.0
joblib==0.16.0
kiwisolver==1.2.0
matplotlib==3.3.0
murmurhash==1.0.2
networkx==2.4
nltk==3.5
numpy==1.19.1
Pillow==7.2.0
plac==1.1.3
preshed==3.0.2
pyparsing==2.4.7
python-dateutil==2.8.1
regex==2020.7.14
requests==2.24.0
six==1.15.0
soupsieve==2.0.1
spacy==2.3.2
srsly==1.0.2
thinc==7.4.1
tqdm==4.48.2
urllib3==1.25.10
wasabi==0.7.1
wikipedia==1.4.0
zipp==3.1.0

--------------------------------------------------------------------------------
/such_as_pattern_matcher.py:
--------------------------------------------------------------------------------
from pattern_matcher import PatternMatcher
from spacy.tokens import Doc
from relation import Relation


class SuchAsPatternMatcher(PatternMatcher):

    def __init__(self, nlp):
        # Matches e.g. "fruits(,) such as apples".
        pattern = [{'POS': 'NOUN'},
                   {'IS_PUNCT': True, 'OP': '?'},
                   {'LOWER': 'such'},
                   {'LOWER': 'as'},
                   {'POS': 'NOUN'}]
        PatternMatcher.__init__(self, pattern, nlp, "suchAs")

    def getRelations(self, doc: Doc) -> [Relation]:
        relations = []
        matches = self._matcher(doc)
        for match_id, start, end in matches:
            span = doc[start:end]
            hypernym = span.root.text
            hyponym = span.text.split()[-1]
            relations.append(Relation(hypernym, hyponym))
            # Nouns conjoined to the right of the span ("... apples and
            # oranges") are further hyponyms of the same hypernym.
            for right in span.rights:
                if right.pos_ == "NOUN":
                    relations.append(Relation(hypernym, right.text))
        return relations
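A minimal smoke test for SuchAsPatternMatcher might look like the following; the sentence is illustrative, and the exact output depends on the parses the pinned model produces:

--------------------------------------------------------------------------------
/example_such_as.py (illustrative sketch, not part of the repository):
--------------------------------------------------------------------------------
# Hypothetical usage example -- not part of the original repository.
import spacy

from such_as_pattern_matcher import SuchAsPatternMatcher

nlp = spacy.load("en_core_web_sm")
matcher = SuchAsPatternMatcher(nlp)
doc = nlp("He enjoys fruits such as apples and oranges.")

for relation in matcher.getRelations(doc):
    print(relation.getHypernym(), "->", relation.getHyponym())
# With a typical parse this prints something like:
#   fruits -> apples
#   fruits -> oranges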
nlp, "suchAs") 16 | 17 | def getRelations(self, doc: Doc) -> [Relation]: 18 | relations = [] 19 | matches = self._matcher(doc) 20 | for match_id, start, end in matches: 21 | span = doc[start:end] 22 | hypernym = span.root.text 23 | hyponym = span.text.split()[-1] 24 | relations.append(Relation(hypernym, hyponym)) 25 | for right in span.rights: 26 | if right.pos_ == "NOUN": 27 | relations.append(Relation(hypernym, right.text)) 28 | return relations -------------------------------------------------------------------------------- /or_other_pattern_matcher.py: -------------------------------------------------------------------------------- 1 | from pattern_matcher import PatternMatcher 2 | from spacy.tokens import Doc 3 | from relation import Relation 4 | 5 | 6 | class OrOtherPatternMatcher(PatternMatcher): 7 | 8 | def __init__(self, nlp): 9 | pattern = [{'POS': 'NOUN'}, 10 | {'LOWER': 'or'}, 11 | {'LOWER': 'other'}, 12 | {'POS': 'NOUN'}] 13 | PatternMatcher.__init__(self, pattern, nlp, "orOther") 14 | 15 | def getRelations(self, doc: Doc) -> [Relation]: 16 | relations = [] 17 | matches = self._matcher(doc) 18 | for match_id, start, end in matches: 19 | span = doc[start:end] 20 | firstToken = span.root.head 21 | results = [firstToken] 22 | while firstToken and firstToken.head.pos_ == "NOUN": 23 | results.append(firstToken.head) 24 | firstToken = firstToken.head 25 | hypernym = span.text.split()[-1] 26 | relations.append(Relation(hypernym, span.text.split()[0])) 27 | if len(results) > 0: 28 | for result in results: 29 | relations.append(Relation(hypernym, result.text)) 30 | return relations 31 | 32 | 33 | -------------------------------------------------------------------------------- /and_other_pattern_matcher.py: -------------------------------------------------------------------------------- 1 | from pattern_matcher import PatternMatcher 2 | from spacy.tokens import Doc 3 | from relation import Relation 4 | 5 | 6 | class AndOtherPatternMatcher(PatternMatcher): 7 | 8 | 9 | def __init__(self, nlp): 10 | pattern = [{'POS': 'NOUN'}, 11 | {'LOWER': 'and'}, 12 | {'LOWER': 'other'}, 13 | {'POS': 'NOUN'}] 14 | PatternMatcher.__init__(self, pattern, nlp, "andOther") 15 | 16 | def getRelations(self, doc: Doc) -> [Relation]: 17 | relations = [] 18 | matches = self._matcher(doc) 19 | for match_id, start, end in matches: 20 | span = doc[start:end] 21 | firstToken = span.root.head 22 | results = [firstToken] 23 | while firstToken and firstToken.head.pos_ == "NOUN": 24 | results.append(firstToken.head) 25 | firstToken = firstToken.head 26 | hypernym = span.text.split()[-1] 27 | relations.append(Relation(hypernym, span.text.split()[0])) 28 | if len(results) > 0: 29 | for result in results: 30 | relations.append(Relation(hypernym, result.text)) 31 | return relations 32 | 33 | 34 | -------------------------------------------------------------------------------- /knowledge_graph.py: -------------------------------------------------------------------------------- 1 | from relation import Relation 2 | import networkx as nx 3 | import matplotlib.pyplot as plt 4 | 5 | class KnowledgeGraph: 6 | 7 | __relations: [Relation] 8 | __graph: nx.Graph 9 | __colors: {} 10 | 11 | def __init__(self, relations): 12 | self.__relations = relations 13 | self.__graph = nx.Graph() 14 | self.__colors = {} 15 | 16 | def build(self): 17 | for relation in self.__relations: 18 | self.__graph.add_node(relation.getHypernym()) 19 | self.__colors[relation.getHypernym()] = '#e34234' 20 | self.__graph.add_node(relation.getHyponym()) 21 | 
--------------------------------------------------------------------------------
/especially_pattern_matcher.py:
--------------------------------------------------------------------------------
from pattern_matcher import PatternMatcher
from spacy.tokens import Doc
from relation import Relation


class EspeciallyPatternMatcher(PatternMatcher):

    def __init__(self, nlp):
        # Matches e.g. "European countries(,) especially France".
        pattern = [{'POS': 'NOUN'},
                   {'IS_PUNCT': True, 'OP': '?'},
                   {'LOWER': 'especially'},
                   {'POS': 'NOUN'}]
        PatternMatcher.__init__(self, pattern, nlp, "especially")

    def getRelations(self, doc: Doc) -> [Relation]:
        relations = []
        matches = self._matcher(doc)
        for match_id, start, end in matches:
            span = doc[start:end]
            candidates = set()
            for sent in doc.sents:
                for token in sent:
                    # Find the sentence containing the matched hypernym.
                    if token.i == span.root.i:
                        for token2 in sent:
                            # token2 is the first hyponym, attached directly
                            # to the hypernym.
                            if token2.head.i == token.i:
                                # Collect further nouns on the dependency
                                # paths leading back to the first hyponym.
                                for token3 in sent:
                                    startToken = token3
                                    while startToken and startToken.head.i != sent.root.i and startToken.i != token2.i:
                                        if startToken.pos_ == "NOUN":
                                            candidates.add(startToken)
                                        startToken = startToken.head
            if len(candidates) > 0:
                hypernym = span.text.split()[0].replace(',', '')
                for candidate in candidates:
                    relations.append(Relation(hypernym, candidate.text))
        return relations

--------------------------------------------------------------------------------
/including_pattern_matcher.py:
--------------------------------------------------------------------------------
from pattern_matcher import PatternMatcher
from spacy.tokens import Doc
from relation import Relation


class IncludingPatternMatcher(PatternMatcher):

    def __init__(self, nlp):
        # Matches e.g. "team sports(,) including football".
        pattern = [{'POS': 'NOUN'},
                   {'IS_PUNCT': True, 'OP': '?'},
                   {'LOWER': 'including'},
                   {'POS': 'NOUN'}]
        PatternMatcher.__init__(self, pattern, nlp, "including")

    def getRelations(self, doc: Doc) -> [Relation]:
        relations = []
        matches = self._matcher(doc)
        for match_id, start, end in matches:
            span = doc[start:end]
            for sent in doc.sents:
                for token in sent:
                    # Find the "including" that belongs to this match.
                    if token.text == "including" and token.head.i == span.root.i:
                        for token2 in sent:
                            # token2 is the first hyponym, attached to "including".
                            if token2.head.i == token.i:
                                results = set()
                                results.add(span.text.split()[-1])
                                # Collect the other hyponyms along the
                                # dependency paths back to the first one.
                                for token3 in sent:
                                    startToken = token3
                                    while startToken and startToken.head.i != sent.root.i and startToken.i != token2.i:
                                        if startToken.pos_ == "NOUN":
                                            results.add(startToken.text)
                                        startToken = startToken.head
                                if len(results) > 0:
                                    hypernym = span.text.split()[0].replace(',', '')
                                    for result in results:
                                        relations.append(Relation(hypernym, result))
        return relations

--------------------------------------------------------------------------------
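The repository does not ship a main script. A sketch of how the pieces could be wired together end to end; the Wikipedia page title and id below are placeholders, not values taken from the repository:

--------------------------------------------------------------------------------
/example_main.py (illustrative sketch, not part of the repository):
--------------------------------------------------------------------------------
# Hypothetical end-to-end driver -- not part of the original repository.
import spacy

from and_other_pattern_matcher import AndOtherPatternMatcher
from especially_pattern_matcher import EspeciallyPatternMatcher
from including_pattern_matcher import IncludingPatternMatcher
from knowledge_graph import KnowledgeGraph
from matcher_pipe import MatcherPipe
from or_other_pattern_matcher import OrOtherPatternMatcher
from such_as_pattern_matcher import SuchAsPatternMatcher
from text_extractor import TextExtractor
from text_extractor_pipe import TextExtractorPipe

nlp = spacy.load("en_core_web_sm")

# Page title and id are placeholders; TextExtractor caches the article text
# under ./text/ so repeated runs do not hit the Wikipedia API again.
extractor = TextExtractor("London", "17867")
extractor.extract()

textPipe = TextExtractorPipe()
textPipe.addTextExtractor(extractor)
doc = nlp(textPipe.extract())

matcherPipe = MatcherPipe()
for matcher in [SuchAsPatternMatcher(nlp), IncludingPatternMatcher(nlp),
                AndOtherPatternMatcher(nlp), OrOtherPatternMatcher(nlp),
                EspeciallyPatternMatcher(nlp)]:
    matcherPipe.addMatcher(matcher)

graph = KnowledgeGraph(matcherPipe.extract(doc))
graph.build()
graph.show()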