├── README.md
├── requirements.txt
└── skg
    ├── __init__.py
    ├── es_queries.py
    ├── ingest_xml.py
    └── skg.py

/README.md:
--------------------------------------------------------------------------------
# Semantic Knowledge Graph
Implementation of a semantic knowledge graph with Elasticsearch 6.6 and Python 3.7.

Based on Trey Grainger's presentation here:
https://www.youtube.com/watch?v=JvuQX92zyi0&t=2124s
and white paper here:
https://arxiv.org/pdf/1609.00464.pdf

To recreate the Jean Grey example from the presentation, download scifi.stackexchange.com.7z from https://archive.org/download/stackexchange, then run ingest_xml.py to populate the Elasticsearch indexes with the archive data. As written, the code looks for Posts.xml and Comments.xml in the same directory as ingest_xml.py.

skg.py takes the two provided nodes and displays the terms that show how the nodes are related, effectively describing the edge that connects them. It does this with a single Elasticsearch query containing two match_phrase conditions, one for each supplied node; the results come from a significant text aggregation over the matching documents. There is also an optional 'depth' parameter that controls recursion depth. When it is greater than one, skg iterates through the returned significant terms and pairs each of them with the original nodes to find further related terms (see the example calls quoted below).

ingest_xml.py also takes an optional parameter, index_entities. When set to True, spaCy is used to identify the entities in each text, and these are indexed in a separate 'entities' field. This makes it possible to run skg.py and get a significant terms aggregation based on the entities identified in each document rather than just the terms in the text.
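The example calls at the bottom of skg.py are the quickest way to try this. As a rough sketch, they assume a local Elasticsearch instance on the default port and indexes already populated by ingest_xml.py; the entities variant additionally assumes the indexes were built with index_entities=True:

```python
# Describe the edge between two nodes using the raw post and comment text.
get_edge_for_two_nodes(["jean grey", "in love"], "body_text",
                       ["scifi_posts", "scifi_comments"])

# Same idea over the spaCy-extracted 'entities' field, recursing one extra
# level through the returned significant terms.
get_edge_for_two_nodes(["bruce banner", "iron man"], "entities",
                       ["scifi_posts", "scifi_comments"], 2)
```

Each output line shows a significant term, its document count among the matching documents, its background count, and the significance score.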
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
certifi==2024.7.4
chardet==3.0.4
cymem==2.0.2
cytoolz==0.9.0.1
dill==0.2.9
elasticsearch==6.3.1
en-core-web-sm==2.0.0
idna==3.7
msgpack==0.5.6
msgpack-numpy==0.4.3.2
murmurhash==1.0.2
numpy==1.22.0
plac==0.9.6
preshed==2.0.1
regex==2018.1.10
requests==2.32.2
six==1.12.0
spacy==2.0.18
thinc==6.12.1
toolz==0.9.0
tqdm==4.66.3
ujson==5.4.0
urllib3==1.26.19
wrapt==1.10.11
--------------------------------------------------------------------------------
/skg/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/skg/es_queries.py:
--------------------------------------------------------------------------------
# Query that matches documents containing both node phrases and aggregates the
# significant terms that connect them.
find_edge = lambda node_one, node_two, field_name: {
    "query": {
        "bool": {
            "must": [
                {
                    "match_phrase": {
                        field_name: node_one
                    }
                },
                {
                    "match_phrase": {
                        field_name: node_two
                    }
                }
            ]
        }
    },
    "aggregations": {
        "keywords": {
            "significant_text": {
                "field": field_name, "min_doc_count": 2, "gnd": {}
            }
        }
    }
}

# Index settings: a comma tokenizer for the 'entities' field plus mappings for
# the fields populated by ingest_xml.py.
index_settings = lambda doc_type: {
    "settings": {
        "index": {
            "analysis": {
                "tokenizer": {
                    "comma": {
                        "type": "pattern",
                        "pattern": ","
                    }
                },
                "analyzer": {
                    "entity_analyzer": {
                        "type": "custom",
                        "tokenizer": "comma",
                        "filter": ["trim", "lowercase"]
                    }
                }
            }
        }
    },
    "mappings": {
        doc_type: {
            "properties": {
                "entities": {
                    "type": "text",
                    "analyzer": "entity_analyzer"
                },
                "body_text": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword"
                        }
                    }
                },
                "tags": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword"
                        }
                    }
                }
            }
        }
    }
}
--------------------------------------------------------------------------------
/skg/ingest_xml.py:
--------------------------------------------------------------------------------
from xml.dom import minidom
from elasticsearch import Elasticsearch
import es_queries
import re
import en_core_web_sm


es = Elasticsearch()
# Strips HTML tags, quotes and line breaks from the source text.
pattern = re.compile('(<[^>]*>)|(\\"|\\n|\\r)')
nlp = en_core_web_sm.load()


def populate_index(source_file, index_name, doc_type, field_mapping, index_entities=False):
    es.indices.create(index_name, es_queries.index_settings(doc_type))

    items = source_file.getElementsByTagName('row')
    for item in items:
        item_dict = {}
        for k, v in field_mapping.items():
            item_dict[v] = pattern.sub('', item.getAttribute(k))

        if index_entities:
            item_dict['entities'] = get_entities(item_dict['body_text'])

        if len(item_dict['body_text']) > 0:
            es.index(index=index_name, doc_type=doc_type, id=item.getAttribute('Id'), body=item_dict)

    es.indices.refresh(index=index_name)


def get_entities(body_text):
    # Comma-separated entity texts, matching the comma tokenizer in es_queries.py.
    doc = nlp(body_text)
    return ",".join([e.text for e in doc.ents])


parsed_doc = minidom.parse("Comments.xml")
populate_index(parsed_doc, "scifi_comments", "comments", {'Text': 'body_text', 'PostId': 'post_id'}, True)
parsed_doc = minidom.parse("Posts.xml")
populate_index(parsed_doc, "scifi_posts", "posts", {'Body': 'body_text', 'Tags': 'tags'}, True)
--------------------------------------------------------------------------------
/skg/skg.py:
--------------------------------------------------------------------------------
from elasticsearch import Elasticsearch
import es_queries


es = Elasticsearch()


def get_edge_for_two_nodes(nodes, search_field, indexes_to_search, depth=1, level=1):
    query_body = es_queries.find_edge(nodes[0], nodes[1], search_field)

    result = es.search(index=indexes_to_search, body=query_body)
    buckets = result['aggregations']['keywords']['buckets']

    print("level: {} nodes: {} | {}".format(level, nodes[0], nodes[1]))
    for b in buckets:
        if b['key'] not in nodes:
            print("term: {} | docs: {} | bg_count: {} | score: {}%".format(b['key'], b['doc_count'],
                  b['bg_count'], int(b['score']*100)))
            if depth > 1:
                # Pair each significant term with the original nodes and recurse.
                for n in nodes:
                    get_edge_for_two_nodes([n, b['key']], search_field, indexes_to_search, depth-1, level+1)


get_edge_for_two_nodes(["jean grey", "in love"], "body_text", ["scifi_posts", "scifi_comments"])
#get_edge_for_two_nodes(["vader", "the force"], "body_text", ["scifi_posts", "scifi_comments"], 3)
#get_edge_for_two_nodes(["marty mcfly", "time travel"], "body_text", ["scifi_posts", "scifi_comments"], 5)

get_edge_for_two_nodes(["bruce banner", "iron man"], "entities", ["scifi_posts", "scifi_comments"], 2)
#get_edge_for_two_nodes(["darth vader", "luke skywalker"], "entities", ["scifi_posts"], 3)
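# Sketch (same assumptions as the calls above: a local Elasticsearch instance
# and indexes populated by ingest_xml.py): the raw significant_text buckets can
# also be inspected directly by reusing es_queries.find_edge, which is all that
# get_edge_for_two_nodes does before formatting its output.
#raw = es.search(index=["scifi_posts", "scifi_comments"],
#                body=es_queries.find_edge("jean grey", "in love", "body_text"))
#print(raw['aggregations']['keywords']['buckets'])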
--------------------------------------------------------------------------------