├── web_app ├── oke │ ├── core │ │ ├── __init__.py │ │ ├── models │ │ │ ├── summarisation │ │ │ │ ├── word_graph_summariser │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── LICENSE.md │ │ │ │ │ ├── resources │ │ │ │ │ │ ├── stopwords.en.dat │ │ │ │ │ │ └── stopwords.fr.dat │ │ │ │ │ └── README.md │ │ │ │ ├── multi_sentence_compressor.py │ │ │ │ └── neural_sentence_summariser.py │ │ │ ├── classification │ │ │ │ ├── eurovoc_classifier.py │ │ │ │ ├── concept_classifier.py │ │ │ │ └── sentence_classifier.py │ │ │ ├── knowledge_extraction │ │ │ │ ├── lattice_builder.py │ │ │ │ ├── ontology_builder.py │ │ │ │ ├── couple_abstractor.py │ │ │ │ └── couple_extractor.py │ │ │ └── model_manager.py │ │ └── misc │ │ │ ├── levenshtein_lib.py │ │ │ ├── onto_reader.py │ │ │ ├── tfidf_lib.py │ │ │ ├── tree_cluster_builder.py │ │ │ ├── adjacency_matrix.py │ │ │ ├── jsonld_lib.py │ │ │ ├── graph_builder.py │ │ │ └── doc_reader.py │ ├── documents │ │ └── yai4law │ │ │ ├── Rome II_EN.pdf │ │ │ ├── Rome I_EN.pdf │ │ │ └── BrusselsReg_EN_1215-20212.pdf │ ├── requirements.txt │ ├── server.py │ └── server_interface.py ├── yai │ ├── requirements.txt │ ├── static │ │ ├── css │ │ │ ├── style.css │ │ │ └── tree.css │ │ ├── img │ │ │ └── favicon.ico │ │ ├── html │ │ │ └── index.html │ │ └── js │ │ │ ├── app.js │ │ │ ├── stage_builder │ │ │ ├── api_lib.js │ │ │ ├── item_stage_builder.js │ │ │ └── domain_stage_builder.js │ │ │ ├── template │ │ │ ├── jsonld_handler.js │ │ │ └── template_lib.js │ │ │ └── vue_component │ │ │ └── explanation_components.js │ └── server.py ├── server.sh └── setup.sh ├── .gitignore ├── kg_hinge ├── kg_hinge.png └── kg_hinge.graphml ├── kg_taxonomy └── kg_taxonomy.png └── README.md /web_app/oke/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web_app/yai/requirements.txt: -------------------------------------------------------------------------------- 1 | bottle 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/__init__.py: -------------------------------------------------------------------------------- 1 | from .takahe import * 2 | -------------------------------------------------------------------------------- /kg_hinge/kg_hinge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/kg_hinge/kg_hinge.png -------------------------------------------------------------------------------- /web_app/yai/static/css/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-size: 18px; 3 | } 4 | 5 | .detail_btn { 6 | color: blue; 7 | text-decoration: underline; 8 | cursor: pointer; 9 | } -------------------------------------------------------------------------------- /kg_taxonomy/kg_taxonomy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/kg_taxonomy/kg_taxonomy.png 
-------------------------------------------------------------------------------- /web_app/yai/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/web_app/yai/static/img/favicon.ico -------------------------------------------------------------------------------- /web_app/oke/documents/yai4law/Rome II_EN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/web_app/oke/documents/yai4law/Rome II_EN.pdf -------------------------------------------------------------------------------- /web_app/oke/documents/yai4law/Rome I_EN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/web_app/oke/documents/yai4law/Rome I_EN.pdf -------------------------------------------------------------------------------- /web_app/oke/documents/yai4law/BrusselsReg_EN_1215-20212.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/web_app/oke/documents/yai4law/BrusselsReg_EN_1215-20212.pdf -------------------------------------------------------------------------------- /web_app/server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MY_DIR="`python -c "import os; print(os.path.split(os.path.realpath('$0'))[0])"`" 4 | cd $MY_DIR 5 | 6 | # PyClean 7 | (find ./ -name __pycache__ -type d | xargs rm -r) && (find ./ -name *.pyc -type f | xargs rm -r) 8 | 9 | # Run OKE Server 10 | cd oke 11 | echo 'Running OKE server..' 12 | source .env/bin/activate 13 | python3 server.py $1 &> server.log & 14 | disown 15 | cd .. 16 | 17 | # Run YAI Server 18 | cd yai 19 | echo 'Running YAI server..' 20 | source .env/bin/activate 21 | python3 server.py $1 &> server.log & 22 | disown 23 | cd .. 24 | -------------------------------------------------------------------------------- /web_app/oke/requirements.txt: -------------------------------------------------------------------------------- 1 | bottle 2 | 3 | tensorflow==2.1.0 # machine learning 4 | tensorflow_hub==0.8.0 # pre-trained models 5 | tensorflow_text==2.1.1 # question answering 6 | #torch==1.4.0 # machine learning 7 | torchvision==0.5.0 8 | keras==2.2.4 9 | spacy==2.3.2 # NLP 10 | gensim==3.8.3 # TFIDF 11 | nltk==3.5 # stemming, wordnet, etc.. 
12 | concepts==0.9.1 # formal concept analysis 13 | pywsd==1.2.4 # word sense disambiguation 14 | wikipedia==1.4.0 # definition extraction 15 | transformers==3.0.2 # text summarization and other NLP tasks 16 | 17 | more_itertools==8.2.0 18 | sortedcontainers==2.1.0 19 | python-Levenshtein # levenshtein string distance 20 | bs4 # xml to txt 21 | scikit-learn 22 | scipy==1.4.1 23 | wordcloud==1.7.0 # graphics 24 | tika # pdf to txt 25 | matplotlib # graphics 26 | pydotplus==2.0.2 # graphs 27 | networkx # graphs 28 | pandas 29 | lxml 30 | -------------------------------------------------------------------------------- /web_app/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MY_DIR="`python -c "import os; print(os.path.realpath('$1'))"`" 4 | cd $MY_DIR 5 | 6 | # Run YAI server 7 | cd yai 8 | echo 'Setting up YAI server..' 9 | python3 -m venv .env 10 | source .env/bin/activate 11 | pip install -U pip setuptools wheel twine 12 | pip install -r requirements.txt 13 | cd .. 14 | 15 | # Run OKE Server 16 | cd oke 17 | echo 'Setting up OKE server..' 18 | python3 -m venv .env 19 | source .env/bin/activate 20 | pip install -U pip setuptools wheel twine 21 | # cd .env/lib 22 | # git clone https://github.com/huggingface/neuralcoref.git 23 | # cd neuralcoref 24 | # pip install -r requirements.txt 25 | # pip install -e . 26 | # cd .. 27 | # cd ../.. 28 | pip install -r requirements.txt 29 | python3 -m spacy download en_core_web_md 30 | # python3 -m spacy download en_core_web_sm 31 | python3 -m nltk.downloader stopwords punkt averaged_perceptron_tagger framenet_v17 wordnet brown 32 | cd .. 33 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/levenshtein_lib.py: -------------------------------------------------------------------------------- 1 | import Levenshtein 2 | 3 | def remove_similar_labels(tuple_list, threshold=0.3): 4 | fetch_value = lambda x: x[0] if isinstance(x, (list,tuple)) else x 5 | new_tuple_list = [] 6 | for t in tuple_list: 7 | is_unique = True 8 | for other_t in new_tuple_list: 9 | if labels_are_similar(fetch_value(t),fetch_value(other_t),threshold): 10 | is_unique = False 11 | break 12 | if is_unique: 13 | new_tuple_list.append(t) 14 | return new_tuple_list 15 | 16 | def get_normalized_sintactic_distance(a,b): 17 | return Levenshtein.distance(a,b)/max(len(a),len(b)) 18 | 19 | def labels_are_similar(a,b, threshold=0.3): 20 | return get_normalized_sintactic_distance(a,b) < threshold 21 | 22 | def get_most_similar_label(label,other_label_list): 23 | distance, most_similar_label = min(map(lambda x: (Levenshtein.distance(label,x),x), other_label_list), key=lambda x:x[0]) 24 | return most_similar_label# if min(1.,distance/len(label)) < 0.2 else label 25 | -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Florian Boudin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to 
the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /web_app/oke/core/models/classification/eurovoc_classifier.py: -------------------------------------------------------------------------------- 1 | from models.classification.concept_classifier import ConceptClassifier 2 | from more_itertools import unique_everseen 3 | import os 4 | import pandas as pd 5 | 6 | class EuroVocClassifier(ConceptClassifier): 7 | EUROVOC_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)),'data/eurovoc.csv') 8 | DEFAULT_OPTIONS = { 9 | 'spacy_model': 'en_core_web_md', 10 | 'tf_model':'USE_Transformer', 11 | 'with_semantic_shifting':True, 12 | 'with_centered_similarity':True, 13 | 'tfidf_importance': 3/4, 14 | 'default_similarity_threshold': 0.8, 15 | } 16 | 17 | def __init__(self, model_options=DEFAULT_OPTIONS): 18 | super().__init__(model_options) 19 | eurovoc_df = pd.read_csv(self.EUROVOC_PATH, sep=';') 20 | unique_term_list = tuple(unique_everseen(eurovoc_df['TERMS (PT-NPT)'].values)) 21 | concept_description_dict = {t:[t] for t in unique_term_list} 22 | self.set_concept_description_dict(concept_description_dict) 23 | 24 | def get_concept_dict(self, concept_counter_dict={}, similarity_threshold=None, with_numbers=True, size=1): 25 | return super().get_concept_dict(concept_counter_dict=concept_counter_dict, similarity_threshold=similarity_threshold, with_numbers=with_numbers, size=size) 26 | -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/resources/stopwords.en.dat: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | after 5 | again 6 | against 7 | all 8 | am 9 | an 10 | and 11 | any 12 | as 13 | at 14 | be 15 | because 16 | been 17 | before 18 | being 19 | below 20 | between 21 | both 22 | but 23 | by 24 | cannot 25 | could 26 | did 27 | do 28 | does 29 | doing 30 | down 31 | during 32 | each 33 | few 34 | for 35 | from 36 | further 37 | had 38 | he 39 | her 40 | here 41 | hers 42 | herself 43 | him 44 | himself 45 | his 46 | how 47 | how's 48 | i 49 | if 50 | in 51 | into 52 | is 53 | it 54 | its 55 | itself 56 | me 57 | more 58 | most 59 | my 60 | myself 61 | no 62 | nor 63 | not 64 | of 65 | off 66 | on 67 | once 68 | only 69 | or 70 | other 71 | ought 72 | our 73 | ours 74 | ourselves 75 | out 76 | over 77 | own 78 | same 79 | she 80 | so 81 | some 82 | such 83 | than 84 | that 85 | the 86 | their 87 | theirs 88 | them 89 | themselves 90 | then 91 | there 92 | these 93 | they 94 | this 95 | those 96 | through 97 | to 98 | too 99 | under 100 | until 101 | up 102 | very 103 | was 104 | we 105 | what 106 | when 107 | where 108 | which 109 | while 110 | who 111 | whom 112 | why 113 | with 114 | would 115 | you 116 | your 117 | yours 118 | yourself 119 | 
yourselves 120 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/onto_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import pandas as pd 5 | from misc.jsonld_lib import * 6 | from misc.doc_reader import get_document_list 7 | 8 | def get_dataframe_dict(ontology_dir): 9 | doc_list = get_document_list(ontology_dir) 10 | dataframe_dict = {} 11 | for obj_path in doc_list: 12 | if obj_path.endswith(('.csv',)): 13 | print('Parsing:', obj_path) 14 | _, filename = os.path.split(obj_path) 15 | class_name = filename.split('.')[0] 16 | dataframe_dict[class_name] = pd.read_csv(obj_path, sep=';') 17 | return dataframe_dict 18 | 19 | def get_concept_description_dict(ontology_dir): 20 | dataframe_dict = get_dataframe_dict(ontology_dir) 21 | 22 | concept_dict = {} 23 | for concept, df in dataframe_dict.items(): 24 | concept_dict[concept] = [explode_concept_key(concept).lower().strip()] 25 | sub_classes = df['SubClasses'].values.tolist() 26 | concept_dict.update({ 27 | sc: [explode_concept_key(sc).lower().strip()] 28 | for sc in sub_classes 29 | }) 30 | return concept_dict 31 | 32 | def get_concept_description_dict_from_jsonld(ontology_path, key): 33 | with open(ontology_path,'r') as f: 34 | graph = json.load(f) 35 | 36 | return { 37 | sub_graph['@id']: [explode_concept_key(sub_graph[key]).lower().strip()] 38 | for sub_graph in graph 39 | if key in sub_graph 40 | } 41 | 42 | ''' 43 | import sys 44 | _, ontology_path, skos_path = sys.argv 45 | 46 | print(get_concept_description_dict(ontology_path, skos_path)) 47 | ''' -------------------------------------------------------------------------------- /web_app/oke/core/misc/tfidf_lib.py: -------------------------------------------------------------------------------- 1 | import gensim # for the tf-idf model 2 | from gensim.test.utils import get_tmpfile 3 | 4 | def build_tfidf(words_vector, very_big_corpus=False): 5 | # The code in the following block comes from: https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python 6 | ########################## START BLOCK ########################## 7 | # Build word dictionary 8 | dictionary = gensim.corpora.Dictionary(words_vector) 9 | # Build the Bag-of-Words corpus from lemmatized documents 10 | corpus = [dictionary.doc2bow(gen_doc) for gen_doc in words_vector] 11 | # Build the tf-idf model from the corpus 12 | tfidf_model = gensim.models.TfidfModel(corpus) 13 | # Build similarities cache 14 | # Similarity with cache into temporary file is slower than MatrixSimilarity but it can handle bigger corpus 15 | if very_big_corpus: 16 | tfidf_corpus_similarities = gensim.similarities.Similarity(get_tmpfile("index"), tfidf_model[corpus], num_features=len(dictionary)) 17 | else: 18 | tfidf_corpus_similarities = gensim.similarities.MatrixSimilarity(tfidf_model[corpus], num_features=len(dictionary)) 19 | ########################## END BLOCK ########################## 20 | return dictionary, tfidf_model, tfidf_corpus_similarities 21 | 22 | def get_query_tfidf_similarity(words_vector, dictionary, tfidf_model, tfidf_corpus_similarities): 23 | # Get query BoW (Bag of Words) 24 | query_bow = dictionary.doc2bow(words_vector) 25 | # Get query tf-idf 26 | query_tfidf = tfidf_model[query_bow] 27 | # Get query similarity vector 28 | return tfidf_corpus_similarities[query_tfidf] 29 | -------------------------------------------------------------------------------- 
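The two helpers in misc/tfidf_lib.py are meant to be used together: build_tfidf indexes a tokenised corpus once, and get_query_tfidf_similarity then scores a tokenised query against every indexed document. Below is a minimal usage sketch, not part of the repository: it assumes the oke/core directory is on the Python path (so that misc.tfidf_lib is importable) and uses a made-up toy corpus purely for illustration.

```python
# Hypothetical usage of misc/tfidf_lib.py; the corpus below is illustrative only.
from misc.tfidf_lib import build_tfidf, get_query_tfidf_similarity

# Each document is a list of (already lemmatised, lowercased) tokens.
corpus_tokens = [
    ['contract', 'law', 'applicable', 'obligation'],
    ['non', 'contractual', 'obligation', 'damage', 'tort'],
    ['jurisdiction', 'court', 'member', 'state'],
]

# Build the dictionary, the tf-idf model and the similarity index once.
dictionary, tfidf_model, index = build_tfidf(corpus_tokens, very_big_corpus=False)

# Score a tokenised query against all indexed documents.
query_tokens = ['applicable', 'law', 'contract']
scores = get_query_tfidf_similarity(query_tokens, dictionary, tfidf_model, index)

# Print documents from most to least similar.
for doc_id, score in sorted(enumerate(scores), key=lambda x: -x[1]):
    print(doc_id, round(float(score), 3))
```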
/README.md: -------------------------------------------------------------------------------- 1 | Legal Knowledge Extraction for Knowledge Graph Based Question-Answering 2 | ========== 3 | 4 | Extra documentation (knowledge graph, images, etc.) for the paper "Legal Knowledge Extraction for Knowledge Graph Based Question-Answering". 5 | 6 | ## Usage and Installation 7 | This project has been tested on Debian 9 and macOS Mojave 10.14 with Python 3.7.9. 8 | The [web_app](web_app) folder contains the code of the answer retriever. The [web_app/setup.sh](web_app/setup.sh) script installs the software. To run the web app, execute ```./web_app/server.sh port_num```, where port_num is the port to listen on. Once "server.sh" is running, the web app is reachable from your browser at http://localhost:port_num (replace port_num with the port number you chose). 9 | 10 | **N.B.** Before running setup.sh, install: virtualenv, python3-dev, python3-pip and make. 11 | 12 | ## Files 13 | 14 | * [The whole Knowledge Graph in graphml format](graph.graphml) 15 | * [The Taxonomy within the Knowledge Graph](kg_taxonomy) 16 | * [The ontological hinge obtained by aligning the Ontology Design Patterns to the Taxonomy](kg_hinge) 17 | 18 | ## Sources 19 | 20 | The Knowledge Graph has been extracted from: 21 | * [Rome I Regulation EC 593/2008](https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32008R0593&from=EN) 22 | * [Rome II Regulation EC 864/2007](https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32007R0864&from=EN) 23 | * [Brussels I bis Regulation EU 1215/2012](https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32012R1215&from=EN) 24 | 25 | ## KG's Taxonomy 26 | ![KG's Taxonomy](kg_taxonomy/kg_taxonomy.png) 27 | 28 | ## Taxonomical Hinge 29 | ![Taxonomical Hinge](kg_hinge/kg_hinge.png) 30 | 31 | ## Contact 32 | 33 | To report issues, use GitHub Issues.
34 | For other queries, contact Francesco Sovrano: 35 | * 36 | * -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/resources/stopwords.fr.dat: -------------------------------------------------------------------------------- 1 | alors 2 | alors 3 | après 4 | au 5 | au 6 | aucun 7 | aujourd'hui 8 | aussi 9 | autre 10 | autre 11 | avant 12 | avant 13 | avec 14 | avec 15 | avoir 16 | avoir 17 | bien 18 | bon 19 | bon 20 | c' 21 | car 22 | car 23 | ce 24 | ce 25 | cela 26 | ces 27 | ces 28 | cette 29 | ceux 30 | chaque 31 | chez 32 | ci 33 | comme 34 | comme 35 | comment 36 | comment 37 | d' 38 | dans 39 | dans 40 | de 41 | de 42 | dedans 43 | dehors 44 | demain 45 | depuis 46 | des 47 | des 48 | deux 49 | deux 50 | devrait 51 | dire 52 | dit 53 | doit 54 | donc 55 | donner 56 | dont 57 | dos 58 | droite 59 | du 60 | du 61 | début 62 | elle 63 | elle 64 | elles 65 | elles 66 | en 67 | en 68 | encore 69 | encore 70 | enfant 71 | ensuite 72 | entre 73 | essai 74 | et 75 | et 76 | eu 77 | eux 78 | faire 79 | fait 80 | fait 81 | faites 82 | femme 83 | fois 84 | font 85 | force 86 | grand 87 | haut 88 | hier 89 | homme 90 | hors 91 | ici 92 | ici 93 | il 94 | il 95 | ils 96 | ils 97 | j' 98 | jamais 99 | je 100 | je 101 | juste 102 | l' 103 | la 104 | le 105 | les 106 | leur 107 | lui 108 | là 109 | m' 110 | ma 111 | maintenant 112 | mais 113 | me 114 | mes 115 | mine 116 | moi 117 | moins 118 | mon 119 | même 120 | ne 121 | ni 122 | n' 123 | non 124 | nos 125 | notre 126 | nous 127 | on 128 | ou 129 | où 130 | par 131 | parce 132 | pas 133 | petit 134 | peu 135 | plupart 136 | plus 137 | pour 138 | pourquoi 139 | près 140 | puis 141 | quand 142 | que 143 | quel 144 | quelle 145 | quelles 146 | quels 147 | qui 148 | quoi 149 | s' 150 | sa 151 | sans 152 | se 153 | ses 154 | seulement 155 | si 156 | sien 157 | soi 158 | son 159 | sous 160 | sujet 161 | sur 162 | ta 163 | tandis 164 | tard 165 | te 166 | tellement 167 | tels 168 | tes 169 | toi 170 | ton 171 | toujours 172 | tous 173 | tout 174 | toute 175 | trop 176 | très 177 | tu 178 | tôt 179 | un 180 | une 181 | valeur 182 | voici 183 | vos 184 | votre 185 | vous 186 | vu 187 | y 188 | à 189 | ça 190 | être 191 | lundi 192 | mardi 193 | mercredi 194 | jeudi 195 | vendredi 196 | samedi 197 | dimanche 198 | soir 199 | matin 200 | midi 201 | demain 202 | aujourd'hui 203 | hier -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/multi_sentence_compressor.py: -------------------------------------------------------------------------------- 1 | from models.model_manager import ModelManager 2 | import models.summarisation.word_graph_summariser as wgs 3 | 4 | class MultiSentenceCompressor(ModelManager): 5 | 6 | def __init__(self, model_options): 7 | super().__init__(model_options) 8 | self.disable_spacy_component = ["ner", "textcat", "neuralcoref"] 9 | 10 | def summarise_sentence_list(self, sentence_list, n=1, min_words_n=20, candidates_horizon=1000, ranking_strategy='boudin-morin', cached=True): 11 | taggedsentences=[ 12 | ' '.join(( 13 | token.text+"/"+(token.tag_ if token.pos_ != 'PUNCT' else 'PUNCT') 14 | for token in doc 15 | )).strip() 16 | for doc in self.nlp([sentence]) 17 | ] 18 | # print(taggedsentences) 19 | 20 | # Create a word graph from the set of sentences with parameters : 21 | # - minimal number of words in the compression : 6 22 | # - language of the input sentences : en (english) 
23 | # - POS tag for punctuation marks : PUNCT 24 | compresser = wgs.word_graph(taggedsentences, nb_words=min_words_n, lang='en', punct_tag="PUNCT") 25 | 26 | # Get the 50 best paths 27 | candidates = compresser.get_compression(candidates_horizon) 28 | 29 | if ranking_strategy == 'boudin-morin': 30 | # 2. Rerank compressions by keyphrases (Boudin and Morin's method) 31 | reranker = wgs.keyphrase_reranker(taggedsentences, candidates, lang = 'en') 32 | reranked_candidates = reranker.rerank_nbest_compressions() 33 | best_candidates = sorted(( # Normalize path score by path length 34 | { 35 | 'score': score, 36 | 'text': ' '.join([u[0] for u in path]) 37 | } 38 | for score, path in reranked_candidates 39 | ), key=lambda x: x['score'], reverse=True) 40 | else: #if ranking_strategy == 'filippova': 41 | # 1. Rerank compressions by path length (Filippova's method) 42 | best_candidates = sorted(( # Normalize path score by path length 43 | { 44 | 'score': cummulative_score/len(path), 45 | 'text': ' '.join([u[0] for u in path]) 46 | } 47 | for cummulative_score, path in candidates 48 | ), key=lambda x: x['score'], reverse=True) 49 | 50 | if len(best_candidates) == 0: 51 | best_candidates = [{'text':sentence}] 52 | elif n: 53 | best_candidates = best_candidates[:n] 54 | return tuple(map(lambda x:x['text'], best_candidates)) 55 | -------------------------------------------------------------------------------- /web_app/yai/server.py: -------------------------------------------------------------------------------- 1 | from bottle import run, get, post, route, hook, request, response, static_file 2 | import sys 3 | port = int(sys.argv[1]) 4 | 5 | ############################################################### 6 | # CORS 7 | 8 | @route('/<:re:.*>', method='OPTIONS') 9 | def enable_cors_generic_route(): 10 | """ 11 | This route takes priority over all others. So any request with an OPTIONS 12 | method will be handled by this function. 13 | 14 | See: https://github.com/bottlepy/bottle/issues/402 15 | 16 | NOTE: This means we won't 404 any invalid path that is an OPTIONS request. 17 | """ 18 | add_cors_headers() 19 | 20 | @hook('after_request') 21 | def enable_cors_after_request_hook(): 22 | """ 23 | This executes after every route. We use it to attach CORS headers when 24 | applicable. 
25 | """ 26 | add_cors_headers() 27 | 28 | def add_cors_headers(): 29 | try: 30 | response.headers['Access-Control-Allow-Origin'] = '*' 31 | response.headers['Access-Control-Allow-Methods'] = 'GET, POST, PUT, OPTIONS' 32 | response.headers['Access-Control-Allow-Headers'] = 'Origin, Accept, Content-Type, X-Requested-With, X-CSRF-Token' 33 | except Exception as e: 34 | print('Error:',e) 35 | 36 | ############################################################### 37 | # Static Routes 38 | 39 | @get("/favicon.ico") 40 | def favicon(): 41 | return static_file("favicon.ico", root="static/img/") 42 | 43 | @get("/resources/static/") 44 | def css(filepath): 45 | return static_file(filepath, root="static/css/") 46 | 47 | @get("/resources/static/") 48 | def font(filepath): 49 | return static_file(filepath, root="static/css/") 50 | 51 | @get("/resources/static/") 52 | def img(filepath): 53 | return static_file(filepath, root="static/img/") 54 | 55 | @get("/resources/static/") 56 | def js(filepath): 57 | return static_file(filepath, root="static/js/") 58 | 59 | @get("/resources/static/") 60 | def js(filepath): 61 | return static_file(filepath, root="static/json/") 62 | 63 | @get("/documents/") 64 | def docs(filepath): 65 | return static_file(filepath, root="../oke/documents/") 66 | 67 | @get("/") 68 | def html(filepath): 69 | print(filepath) 70 | return static_file(filepath, root="static/html/") 71 | 72 | @get("/") 73 | def home(): 74 | return static_file('index.html', root="static/html/") 75 | 76 | if __name__ == "__main__": 77 | run(host='0.0.0.0', port=port, debug=True) 78 | -------------------------------------------------------------------------------- /web_app/yai/static/css/tree.css: -------------------------------------------------------------------------------- 1 | #tree,#expand{ 2 | font-size: medium; 3 | overflow-y:auto; 4 | overflow-x: hidden; 5 | /*border:1px solid silver; */ 6 | /*min-height:100px; */ 7 | max-height: 400px; 8 | /*width: 770px; */ 9 | margin: 1rem 1rem 0 1rem; 10 | background: white; 11 | /*padding: 0 1rem;*/ 12 | } 13 | 14 | .card { 15 | /*margin: 0.25rem 0 !important;*/ 16 | margin: none !important; 17 | /*enable wrapping*/ 18 | white-space : normal !important; 19 | /*ensure lower nodes move down*/ 20 | height : auto !important; 21 | /*border-style: solid dashed;*/ 22 | border-bottom: none; 23 | border-right: none; 24 | border-radius: 0 !important; 25 | } 26 | 27 | .card-header { 28 | border-top: none; 29 | border-left: none; 30 | border-right: none; 31 | padding: 0.5rem !important; 32 | background-color: rgba(0,0,0,.03) !important; 33 | } 34 | 35 | .card-leaf { 36 | padding: 0.5rem !important; 37 | } 38 | 39 | .card-body { 40 | padding: 0.5rem 0 0.5rem 1.25rem !important; 41 | } 42 | 43 | .btn { 44 | text-align: left !important; 45 | padding: 0 !important; 46 | } 47 | 48 | .link { 49 | /*color: darkblue; */ 50 | text-decoration: underline; 51 | } 52 | 53 | /*.link:after { 54 | font-family: FontAwesome; 55 | content: "\f05a"; 56 | display:inline-block; 57 | position: relative; 58 | top:-5px; 59 | font-size: 75%; 60 | }*/ 61 | 62 | .link:hover { 63 | /*font-weight: bold;*/ 64 | color: blue !important; 65 | cursor: pointer; 66 | } 67 | 68 | .link .link { 69 | color: blue !important; 70 | text-decoration: underline !important; 71 | } 72 | 73 | .card input[type=number] { 74 | /*for absolutely positioning spinners*/ 75 | position: relative; 76 | padding: 0.5rem; 77 | /*padding-right: 2.5rem;*/ 78 | width: 4rem; 79 | } 80 | 81 | .card 
input[type=number]::-webkit-inner-spin-button, 82 | .card input[type=number]::-webkit-outer-spin-button { 83 | opacity: 1; 84 | } 85 | 86 | .card input[type=number]::-webkit-outer-spin-button, 87 | .card input[type=number]::-webkit-inner-spin-button { 88 | -webkit-appearance: inner-spin-button !important; 89 | /*width: 25px;*/ 90 | position: absolute; 91 | top: 0; 92 | right: 0; 93 | height: 100%; 94 | } 95 | 96 | .initial_explanans { 97 | padding: 4%; 98 | background: blanchedalmond; 99 | } 100 | 101 | .h-divider { 102 | margin: auto; 103 | position: relative; 104 | } 105 | 106 | .h-divider .shadow { 107 | overflow: hidden; 108 | height: 20px; 109 | box-shadow: none !important; 110 | } 111 | 112 | .h-divider .shadow:after { 113 | content: ''; 114 | display: block; 115 | margin: -25px auto 0; 116 | width: 100%; 117 | height: 25px; 118 | border-radius: 125px/12px; 119 | box-shadow: 0 0 8px black; 120 | } 121 | -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/README.md: -------------------------------------------------------------------------------- 1 | # takahe 2 | 3 | takahe is a multi-sentence compression module. Given a set of redundant sentences, a word-graph is constructed by iteratively adding sentences to it. The best compression is obtained by finding the shortest path in the word graph. The original algorithm was published and described in: 4 | 5 | * Katja Filippova, Multi-Sentence Compression: Finding Shortest Paths in Word Graphs, *Proceedings of the 23rd International Conference on Computational Linguistics (Coling 2010)*, pages 322-330, 2010. 6 | 7 | A keyphrase-based reranking method can be applied to generate more informative compressions. The reranking method is described in: 8 | 9 | * Florian Boudin and Emmanuel Morin, Keyphrase Extraction for N-best Reranking in Multi-Sentence Compression, *Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2013)*, 2013. 10 | 11 | 12 | ## Dependancies 13 | 14 | As of today, takahe is built for Python 2. 15 | 16 | You may need to install the following libraries : 17 | 18 | - [networkx](http://networkx.github.io/) (installation guide is available [here](http://networkx.github.io/documentation/latest/install.html)) 19 | - [graphviz](http://www.graphviz.org/) and graphviz-dev 20 | - [pygraphviz](http://pygraphviz.github.io/documentation/latest/install.html) 21 | 22 | 23 | 24 | ## Example 25 | A typical usage of this module is: 26 | 27 | import takahe 28 | 29 | # Create a word graph from the set of sentences with parameters : 30 | # - minimal number of words in the compression : 6 31 | # - language of the input sentences : en (english) 32 | # - POS tag for punctuation marks : PUNCT 33 | compresser = takahe.word_graph( sentences, 34 | nb_words = 6, 35 | lang = 'en', 36 | punct_tag = "PUNCT" ) 37 | 38 | # Get the 50 best paths 39 | candidates = compresser.get_compression(50) 40 | 41 | # 1. Rerank compressions by path length (Filippova's method) 42 | for cummulative_score, path in candidates: 43 | 44 | # Normalize path score by path length 45 | normalized_score = cummulative_score / len(path) 46 | 47 | # Print normalized score and compression 48 | print round(normalized_score, 3), ' '.join([u[0] for u in path]) 49 | 50 | # Write the word graph in the dot format 51 | compresser.write_dot('test.dot') 52 | 53 | # 2. 
Rerank compressions by keyphrases (Boudin and Morin's method) 54 | reranker = takahe.keyphrase_reranker( sentences, 55 | candidates, 56 | lang = 'en' ) 57 | 58 | reranked_candidates = reranker.rerank_nbest_compressions() 59 | 60 | # Loop over the best reranked candidates 61 | for score, path in reranked_candidates: 62 | 63 | # Print the best reranked candidates 64 | print round(score, 3), ' '.join([u[0] for u in path]) 65 | -------------------------------------------------------------------------------- /web_app/yai/static/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
[HTML tags lost in extraction; only the visible text of the index.html Vue template is recoverable]
Title/brand: "QA4Law"
Prompt: "Here you can ask any English question concerning the content of the following documents:" (followed by the document list, whose markup did not survive)
Loading state: "Loading answers, please wait a while.."
Results view: "Question: {{ question_text }}" above a table with the columns Pertinence | Source | Document, bound to {{ answer.confidence }}, «{{ answer.sentence }}» and {{ answer.document.name }}
Empty state: "No answer found."
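Behind this page, static/js/app.js sends the typed question to the OKE back-end's /answer endpoint, which (per server.sh and oke/server.py) listens on the YAI port plus two. The sketch below shows the same request made directly from Python; the port number and the question text are illustrative assumptions, not values taken from the repository.

```python
# Sketch of querying the OKE question-answering API directly.
# Assumes the servers were started with `./web_app/server.sh 8000`,
# so the YAI front-end listens on 8000 and the OKE API on 8002.
import json
import urllib.parse
import urllib.request

question = 'Which law applies to a contractual obligation?'  # illustrative question
url = 'http://localhost:8002/answer?' + urllib.parse.urlencode({'question': question})

with urllib.request.urlopen(url) as response:
    # The endpoint returns a JSON object mapping the question to its retrieved answers.
    answers = json.loads(response.read().decode('utf-8'))

print(json.dumps(answers, indent=2))
```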
84 | 85 | -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/neural_sentence_summariser.py: -------------------------------------------------------------------------------- 1 | import math 2 | import json 3 | from models.model_manager import ModelManager 4 | 5 | class NeuralSentenceSummariser(ModelManager): 6 | 7 | def __init__(self, model_options): 8 | model_options['hf_model']['type'] = 'summarization' 9 | super().__init__(model_options) 10 | self.debug = model_options.get('debug',False) 11 | self.max_input_token_count = self.get_hf_model()['config'].max_position_embeddings 12 | 13 | @staticmethod 14 | def sentify(s): 15 | return ' '.join(( 16 | p[0].upper() + p[1:] + ('.' if p[-1] != '.' else '') 17 | for p in s.split(' . ') 18 | if p 19 | )) 20 | 21 | def summarise_sentence(self, sentence, sentence_id=None, n=1, options=None, min_size=None): 22 | # if len(sentence) < 100: 23 | # return (sentence,) 24 | if not options: 25 | options = {} 26 | # Format sentence 27 | tokenizer = self.get_hf_model()['tokenizer'] 28 | tokenized_sentence = tokenizer.convert_ids_to_tokens(tokenizer.encode(sentence)) 29 | if min_size and len(tokenized_sentence) < min_size: 30 | return (sentence,) 31 | tokenized_sentence = tokenized_sentence[:self.max_input_token_count-3] # the 1st and last token are a BoS (Begin of String) and a EoS (End of String), furthermore a task token is added to the beginning of the sentence 32 | formatted_sentence = tokenizer.convert_tokens_to_string(tokenized_sentence) 33 | # print(formatted_sentence) 34 | # sentence = ' '.join(sentence.split(' ')[:self.max_input_token_count]) 35 | summary_ids = self.run_hf_task( 36 | [formatted_sentence], 37 | min_length=3, 38 | max_length=self.max_input_token_count, 39 | num_return_sequences=n, # default 1 40 | **options 41 | # do_sample=True, # default False 42 | )[0] 43 | return tuple(map(lambda x: x['summary_text'], summary_ids)) 44 | 45 | @staticmethod 46 | def integrate_summary_tree_list(integration_map, summary_tree_list): 47 | for summary_tree in summary_tree_list: 48 | sentence = summary_tree.get('sentence',None) 49 | if sentence: 50 | integration = integration_map.get(sentence, None) 51 | if integration: 52 | summary_tree.update(integration) 53 | if 'children' in summary_tree: 54 | NeuralSentenceSummariser.integrate_summary_tree_list(integration_map, summary_tree['children']) 55 | 56 | def summarise_sentence_list(self, sentence_list, tree_arity=2, cut_factor=1, depth=None, options=None, min_size=None): 57 | def get_elements_to_merge(slist, merge_size): 58 | # slist = tuple(filter(lambda x: x[-1]=='.', slist)) 59 | return tuple( 60 | slist[i:i+merge_size] 61 | for i in range(0,len(slist),merge_size) 62 | ) 63 | 64 | root_set = tuple(map( 65 | lambda s: self.summarise_sentence(s, n=1, options=options, min_size=min_size)[0], 66 | sentence_list 67 | )) 68 | root_set = tuple(map( 69 | self.sentify, 70 | root_set 71 | )) 72 | summary_tree = [ 73 | { 74 | 'summary': k, 75 | 'children': [{'sentence':v}] 76 | } 77 | for k,v in zip(root_set, sentence_list) 78 | ] 79 | limit = 1 if not depth else math.ceil(len(root_set)/(tree_arity**depth)) 80 | while len(root_set) > limit: 81 | root_set = tuple( 82 | self.summarise_sentence( 83 | ' '.join(map(self.sentify, etm)), 84 | n=1, 85 | options=options, 86 | min_size=min_size 87 | )[0] if len(etm) > 1 else etm[0] 88 | for etm in get_elements_to_merge(root_set, tree_arity) 89 | ) 90 | root_set = tuple(map(self.sentify, root_set)) 91 | 
summary_tree = [ 92 | { 93 | 'summary': summary, 94 | 'children': summary_tree[i*tree_arity:(i+1)*tree_arity] 95 | } 96 | for i,summary in enumerate(root_set) 97 | ] 98 | if cut_factor > 1 and len(root_set) > 2: 99 | root_set = root_set[:math.ceil(len(root_set)/cut_factor)] 100 | # print(json.dumps(summary_tree, indent=4)) 101 | return summary_tree 102 | -------------------------------------------------------------------------------- /web_app/yai/static/js/app.js: -------------------------------------------------------------------------------- 1 | const OKE_SERVER_URL = location.protocol+'//'+location.hostname+(location.port ? ':'+(parseInt(location.port,10)+2): '')+'/'; 2 | console.log('OKE_SERVER_URL:', OKE_SERVER_URL); 3 | const GET_OVERVIEW_API = OKE_SERVER_URL+"overview"; 4 | const GET_ANSWER_API = OKE_SERVER_URL+"answer"; 5 | const GET_ANNOTATION_API = OKE_SERVER_URL+"annotation"; 6 | 7 | var app = new Vue({ 8 | el: '#app', 9 | data: { 10 | answer_list: [], 11 | empty_answers: true, 12 | loading_answers: false, 13 | question_text: '', 14 | important_answer_list: [], 15 | summary_answer: '', 16 | show_details: false, 17 | single_answer_details: [], 18 | documents: { 19 | 'myfile:BrusselsReg_EN_1215-20212': { 20 | url: 'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32012R1215&from=EN', // 'documents/BrusselsReg_EN_1215-20212.pdf', 21 | name: 'Brussels I bis Regulation EU 1215/2012', 22 | }, 23 | 'myfile:Rome_I_EN': { 24 | url: 'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32008R0593&from=EN', // 'documents/Rome I_EN.pdf', 25 | name: 'Rome I Regulation EC 593/2008', 26 | }, 27 | 'myfile:Rome_II_EN': { 28 | url: 'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32007R0864&from=EN', // 'documents/Rome II_EN.pdf', 29 | name: 'Rome II Regulation EC 864/2007', 30 | } 31 | } 32 | }, 33 | methods: { 34 | question: function(event) { 35 | // console.log(event); 36 | var self = this; 37 | self.loading_answers = true; 38 | self.answer_list = []; 39 | self.show_details = false; 40 | 41 | var x = event.target.value.replace(/(\r\n|\n|\r)/gm, "").trim(); 42 | x = x.charAt(0).toUpperCase() + x.slice(1); 43 | console.log('Sending question:',x); 44 | $.ajax({ 45 | type: "GET", 46 | url: GET_ANSWER_API, 47 | responseType:'application/json', 48 | data: { 49 | 'question': x, 50 | // 'summarised': true 51 | }, 52 | success: function (result) { 53 | // console.log('Getting answer:',JSON.stringify(result)); 54 | self.loading_answers = false; 55 | if (!result) 56 | { 57 | self.empty_answers = true; 58 | return; 59 | } 60 | var question = Object.keys(result)[0]; 61 | var important_answer_list = result[question]; 62 | self.empty_answers = false; 63 | self.question_text = question; 64 | self.important_answer_list = []; 65 | self.single_answer_details = []; 66 | // console.log('Getting answer:',JSON.stringify(important_answer_list)); 67 | console.log('Getting answer..'); 68 | for (var answer of important_answer_list) { 69 | answer.confidence = (answer.confidence*100).toFixed(2).toString()+'%'; 70 | if (answer.annotation) 71 | { 72 | var jsonld = build_minimal_entity_graph(tuple_list_to_formatted_jsonld(answer.annotation)); 73 | KNOWN_ENTITY_DICT = get_entity_dict(jsonld); 74 | var source_dict = jsonld[0]; // the biggest 75 | var doc_id = get_description(source_dict[prefixed_string_to_uri('my:docID')]); 76 | if (!doc_id) 77 | { 78 | answer.document = ''; 79 | continue; 80 | } 81 | answer.document = self.documents[uri_to_prefixed_string(doc_id)]; 82 | 83 | var article_id 
= get_description(source_dict[prefixed_string_to_uri('my:article_id')]); 84 | if (article_id) 85 | { 86 | var article = [article_id]; 87 | var paragraph_id = get_description(source_dict[prefixed_string_to_uri('my:paragraph_id')]); 88 | if (paragraph_id) 89 | article.push(paragraph_id); 90 | answer.article = article.map(get_known_label).join('.'); 91 | } 92 | } 93 | else 94 | { 95 | answer.document = ''; 96 | continue; 97 | } 98 | self.important_answer_list.push(answer); 99 | } 100 | if (self.important_answer_list.length == 0) 101 | self.empty_answers = true; 102 | }, 103 | }); 104 | } 105 | } 106 | }) 107 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/tree_cluster_builder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.cluster.hierarchy as hierarchy 3 | from scipy.spatial import distance 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | 6 | def build_hierarchical_cluster(elements, labels, method='centroid', metric='euclidean', optimal_ordering=False): 7 | cluster = hierarchy.linkage( 8 | elements, 9 | method=method, 10 | metric=metric, 11 | optimal_ordering=optimal_ordering 12 | ) 13 | 14 | cophentic_correlation_distance, cophenetic_distance_matrix = hierarchy.cophenet(cluster, distance.pdist(elements)) 15 | #print('Cophenetic Distance Matrix', cophenetic_distance_matrix) 16 | #print('Cophentic Correlation Distance:', cophentic_correlation_distance) 17 | 18 | clusters_dict = {} 19 | for i, merge in enumerate(cluster): 20 | # if it is an original point read it from the centers array # other wise read the cluster that has been created 21 | a = int(merge[0]) if merge[0] <= len(cluster) else clusters_dict[int(merge[0])] 22 | b = int(merge[1]) if merge[1] <= len(cluster) else clusters_dict[int(merge[1])] 23 | # the clusters_dict are 1-indexed by scipy 24 | clusters_dict[1 + i + len(cluster)] = [a,b] 25 | 26 | cluster_nested_list = clusters_dict[1 + i + len(cluster)] 27 | 28 | def flatten(container): # iterative flatten, this way we avoid 29 | list_to_flat = [container] 30 | while len(list_to_flat) > 0: 31 | current_list_to_flat = list_to_flat.pop() 32 | for element in current_list_to_flat: 33 | if isinstance(element, (list,tuple)): 34 | list_to_flat.append(element) 35 | else: 36 | yield element 37 | 38 | def build_centroid_tree(nested_list): 39 | # lazy building, this way we avoid 40 | def let_centroid_tree(nested_list): 41 | if isinstance(nested_list, (list,tuple)): 42 | centroid = np.average([elements[e] for e in flatten(nested_list)], 0) 43 | return {'centroid': centroid, 'sub_tree': (let_centroid_tree(l) for l in nested_list)} 44 | return {'label': labels[nested_list], 'value': elements[nested_list], 'idx': nested_list} 45 | centroid_tree = let_centroid_tree(nested_list) 46 | # eager building 47 | tree_to_build = [centroid_tree] 48 | while len(tree_to_build) > 0: 49 | current_tree_to_build = tree_to_build.pop() 50 | if 'sub_tree' in current_tree_to_build: 51 | current_tree_to_build['sub_tree'] = tuple(current_tree_to_build['sub_tree']) 52 | tree_to_build.extend(current_tree_to_build['sub_tree']) 53 | return centroid_tree 54 | 55 | return build_centroid_tree(cluster_nested_list), cophentic_correlation_distance 56 | 57 | def get_most_similar_leaf(dendrogram, entity_embedding): 58 | # iterative version to avoid 59 | leaf_list = [] 60 | tree_to_look = [dendrogram] 61 | while len(tree_to_look) > 0: 62 | current_tree_to_look = tree_to_look.pop() 63 | 
if 'sub_tree' in current_tree_to_look: 64 | tree_to_look.extend(current_tree_to_look['sub_tree']) 65 | elif 'value' in current_tree_to_look: 66 | leaf_list.append(current_tree_to_look) 67 | if len(leaf_list) == 0: 68 | return None 69 | value_list = [ 70 | leaf['value'] 71 | for leaf in leaf_list 72 | #if leaf['label'] not in centroid_set 73 | ] 74 | similarity_vec = cosine_similarity([entity_embedding], value_list) 75 | best_idx = np.argmax(similarity_vec) 76 | return leaf_list[best_idx]['label'] 77 | 78 | def build_edge_list(root_dendrogram): # iterative version to avoid 79 | centroid_set = set() 80 | edge_list = [] 81 | tree_to_build = [(None,root_dendrogram)] 82 | while len(tree_to_build) > 0: 83 | parent_label, dendrogram = tree_to_build.pop() 84 | if 'centroid' in dendrogram: 85 | centroid_embedding = dendrogram['centroid'] 86 | centroid_label = get_most_similar_leaf(dendrogram, centroid_embedding) 87 | if centroid_label is not None: 88 | centroid_set.add(centroid_label) 89 | if parent_label is not None: 90 | edge_list.append((parent_label, 'related_to', centroid_label)) 91 | tree_to_build.extend((centroid_label,sub_d) for sub_d in dendrogram['sub_tree']) 92 | elif 'value' in dendrogram: 93 | if dendrogram['label'] not in centroid_set: # centroids cannot be leaves 94 | edge_list.append((parent_label, 'related_to', dendrogram['label'])) 95 | return edge_list 96 | -------------------------------------------------------------------------------- /web_app/oke/server.py: -------------------------------------------------------------------------------- 1 | import os 2 | # os.environ["CUDA_VISIBLE_DEVICES"]="-1" 3 | 4 | from bottle import run, get, post, route, hook, request, response, static_file 5 | import json 6 | 7 | from more_itertools import unique_everseen 8 | import sys 9 | port = int(sys.argv[1]) 10 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 11 | 12 | from server_interface import * 13 | 14 | ############################################################### 15 | # CORS 16 | 17 | @route('/<:re:.*>', method='OPTIONS') 18 | def enable_cors_generic_route(): 19 | """ 20 | This route takes priority over all others. So any request with an OPTIONS 21 | method will be handled by this function. 22 | 23 | See: https://github.com/bottlepy/bottle/issues/402 24 | 25 | NOTE: This means we won't 404 any invalid path that is an OPTIONS request. 26 | """ 27 | add_cors_headers() 28 | 29 | @hook('after_request') 30 | def enable_cors_after_request_hook(): 31 | """ 32 | This executes after every route. We use it to attach CORS headers when 33 | applicable. 
34 | """ 35 | add_cors_headers() 36 | 37 | def add_cors_headers(): 38 | try: 39 | response.headers['Access-Control-Allow-Origin'] = '*' 40 | response.headers['Access-Control-Allow-Methods'] = 'GET, POST, PUT, OPTIONS' 41 | response.headers['Access-Control-Allow-Headers'] = 'Origin, Accept, Content-Type, X-Requested-With, X-CSRF-Token' 42 | except Exception as e: 43 | print('Error:',e) 44 | 45 | def get_from_cache(cache, key, build_fn): 46 | if key not in cache: 47 | cache[key] = json.dumps(build_fn()) 48 | return cache[key] 49 | 50 | ############################################################### 51 | # API - Question Answerer 52 | 53 | ANSWERS_CACHE = {} 54 | @get('/answer') 55 | def get_answer(): 56 | response.content_type = 'application/json' 57 | # question = request.forms.get('question') # post 58 | question = request.query.get('question') 59 | def build_fn(): 60 | print('Answering..') 61 | # print(question) 62 | return get_question_answer_dict( 63 | [question], 64 | options={ 65 | 'answer_pertinence_threshold': 0.35, 66 | 'keep_the_n_most_similar_concepts': 1, 67 | 'query_concept_similarity_threshold': 0.55, 68 | 'add_external_definitions': True, 69 | 'add_clustered_triples': False, 70 | 'include_super_concepts_graph': False, 71 | 'include_sub_concepts_graph': True, 72 | 'consider_incoming_relations': True, 73 | 'tfidf_importance': 1/2, 74 | } 75 | ) 76 | return get_from_cache(ANSWERS_CACHE, question, build_fn) 77 | 78 | OVERVIEW_CACHE = {} 79 | @get('/overview') 80 | def get_overview(): 81 | response.content_type = 'application/json' 82 | concept_uri = request.query.get('concept_uri') 83 | # concept_uri = concept_uri.lower().strip() 84 | # query_template_list = json.loads(request.query.get('query_template_list')) 85 | def build_fn(): 86 | print('Answering..') 87 | query_template_list = [ 88 | ##### Causal + Justificatory 89 | 'Why?', 90 | ##### Theleological 91 | 'What for?', 92 | ##### Spatial 93 | 'Where?', 94 | ##### Temporal 95 | 'When?', 96 | ##### Descriptive 97 | 'What?', 98 | ##### Expository 99 | # 'How?', 100 | ##### Extra 101 | # 'Who?', 102 | # 'Who by?', 103 | # 'Why not?', 104 | ] 105 | question_answer_dict = get_concept_overview( 106 | query_template_list, 107 | concept_uri, 108 | options={ 109 | 'answer_pertinence_threshold': 0.02, 110 | 'add_external_definitions': True, 111 | 'add_clustered_triples': False, 112 | 'include_super_concepts_graph': False, 113 | 'include_sub_concepts_graph': True, 114 | 'consider_incoming_relations': True, 115 | 'tfidf_importance': 0, # questions are not compatible with TF-IDF 116 | } 117 | ) 118 | print('Summarising..') 119 | if question_answer_dict: 120 | question_summary_tree = get_summarised_question_answer_dict( 121 | question_answer_dict, 122 | options={ 123 | 'ignore_non_grounded_answers': False, 124 | 'use_abstracts': False, 125 | 'summary_horizon': 3, 126 | 'tree_arity': 3, 127 | # 'cut_factor': 2, 128 | # 'depth': 1, 129 | 'remove_duplicates': True, 130 | 'min_size_for_summarising': 50, 131 | } 132 | ) 133 | else: 134 | question_summary_tree = None 135 | print('Getting taxonomical view..') 136 | taxonomical_view = get_taxonomical_view(concept_uri, depth=0) 137 | print('Annotating..') 138 | annotation_iter = unique_everseen(annotate_question_summary_tree(question_summary_tree) + annotate_taxonomical_view(taxonomical_view)) 139 | equivalent_concept_uri_set = get_equivalent_concepts(concept_uri) 140 | equivalent_concept_uri_set.add(concept_uri) 141 | annotation_iter = filter(lambda x: x['annotation'] not in 
equivalent_concept_uri_set, annotation_iter) 142 | return { 143 | 'question_summary_tree': question_summary_tree, 144 | 'taxonomical_view': taxonomical_view, 145 | 'annotation_list': list(annotation_iter), 146 | } 147 | return get_from_cache(OVERVIEW_CACHE, concept_uri, build_fn) 148 | 149 | if __name__ == "__main__": 150 | run(host='0.0.0.0', port=port+2, debug=True) 151 | 152 | -------------------------------------------------------------------------------- /web_app/oke/core/models/classification/concept_classifier.py: -------------------------------------------------------------------------------- 1 | from misc.doc_reader import DocParser 2 | from models.classification.sentence_classifier import SentenceClassifier 3 | from models.knowledge_extraction.concept_extractor import ConceptExtractor 4 | from more_itertools import unique_everseen 5 | from collections import Counter 6 | import nltk 7 | from nltk.corpus import stopwords 8 | import re 9 | import json 10 | import itertools 11 | 12 | class ConceptClassifier(SentenceClassifier): 13 | def __init__(self, model_options): 14 | # nltk.download('stopwords') 15 | super().__init__(model_options) 16 | self.concept_extractor = ConceptExtractor(model_options) 17 | 18 | def set_concept_description_dict(self, concept_description_dict): 19 | id_doc_list = tuple(unique_everseen(( 20 | (key,description) 21 | for key, value in concept_description_dict.items() 22 | for description in value 23 | ))) 24 | self.set_documents(id_doc_list) 25 | return self 26 | 27 | def lemmatize_spacy_document(self, doc): 28 | return [ 29 | token.lemma_.casefold().strip() 30 | for token in doc 31 | if not token.is_punct and token.lemma_.lower() != '-pron-' 32 | # Remove stop tokens: 33 | #if not (token.is_punct or token.pos_ in ['PART','DET','ADP','CONJ','SCONJ']) 34 | ] 35 | 36 | def get_concept_dict(self, doc_parser: DocParser, concept_counter_dict=None, similarity_threshold=None, with_numbers=True, size=None, remove_stopwords=True, lemmatized=True, tfidf_importance=None): 37 | if concept_counter_dict is None: 38 | concept_counter_dict = {} 39 | # Extract concept dict list 40 | concept_dict_list = self.concept_extractor.get_concept_list(doc_parser) 41 | get_concept_label = lambda x: x['concept']['lemma' if lemmatized else 'text'] 42 | # Remove unwanted concepts 43 | filter_empty_fn = lambda x: x 44 | filter_stopwords_fn = lambda x: x not in stopwords.words('english') 45 | filter_numbers_fn = lambda x: re.search(r'\d', x) is None 46 | if remove_stopwords and with_numbers: 47 | filter_fn = lambda x: filter_empty_fn(x) and filter_stopwords_fn(x) and filter_numbers_fn(x) 48 | elif remove_stopwords: 49 | filter_fn = lambda x: filter_empty_fn(x) and filter_stopwords_fn(x) 50 | elif with_numbers: 51 | filter_fn = lambda x: filter_empty_fn(x) and filter_numbers_fn(x) 52 | else: 53 | filter_fn = lambda x: filter_empty_fn(x) 54 | concept_dict_list = list(filter(lambda x: filter_fn(get_concept_label(x)), concept_dict_list)) 55 | # Extract concept_counter 56 | concept_iter = map(get_concept_label, concept_dict_list) 57 | concept_list = tuple(concept_iter) 58 | concept_counter = Counter(concept_list) 59 | # Merge input concept_counter_dict with concept_counter 60 | for concept,count in concept_counter.items(): 61 | if concept not in concept_counter_dict: 62 | concept_counter_dict[concept] = { 63 | 'count': count, 64 | 'source_list': [], 65 | 'similar_to': [] 66 | } 67 | else: 68 | concept_counter_dict[concept]['count'] += count 69 | # Add sources 70 | for concept, cdict in 
zip(concept_list, concept_dict_list): 71 | concept_counter_dict[concept]['span'] = cdict['concept']['span'] 72 | concept_counter_dict[concept]['source_list'].append(cdict['source']) 73 | # Add similarities 74 | if not concept_counter_dict: 75 | return {} 76 | text_list, cdict_list = zip(*concept_counter_dict.items()) 77 | formatted_text_list = tuple(map(lambda x: x['span'], cdict_list)) 78 | index_of_most_similar_documents_list = self.get_index_of_most_similar_documents( 79 | self.get_formatted_query_similarity(text_list, formatted_text_list, tfidf_importance=tfidf_importance), 80 | similarity_threshold= similarity_threshold, 81 | similarity_type= 'weighted', 82 | ) 83 | for concept, index_of_most_similar_documents in zip(text_list, index_of_most_similar_documents_list): 84 | concept_counter_dict[concept]['similar_to'] = tuple(itertools.islice(index_of_most_similar_documents, size)) 85 | concept_counter_dict[concept]['source_list'] = tuple(unique_everseen(concept_counter_dict[concept]['source_list'], key=lambda x:x['sentence_text'])) 86 | return concept_counter_dict 87 | 88 | @staticmethod 89 | def get_missing_concepts_counter(concept_dict): 90 | return { 91 | concept: value['count'] 92 | for concept, value in concept_dict.items() 93 | if len(value['similar_to'])==0 94 | } 95 | 96 | def annotate(self, doc_parser: DocParser, similarity_threshold=None, max_concepts_per_alignment=1, tfidf_importance=None): 97 | concept_dict = self.get_concept_dict( 98 | doc_parser, 99 | similarity_threshold= similarity_threshold, 100 | with_numbers= True, 101 | lemmatized= False, 102 | remove_stopwords= True, 103 | size= max_concepts_per_alignment, 104 | tfidf_importance= tfidf_importance, 105 | ) 106 | annotation_iter = ( 107 | { 108 | 'text': concept_label, 109 | 'annotation': concept_uri_dict['id'], 110 | 'similarity': concept_uri_dict['similarity'], 111 | 'syntactic_similarity': concept_uri_dict['syntactic_similarity'], 112 | 'semantic_similarity': concept_uri_dict['semantic_similarity'], 113 | } 114 | for concept_label,similarity_dict in concept_dict.items() 115 | for concept_uri_dict in similarity_dict['similar_to'] 116 | ) 117 | annotation_iter = unique_everseen(annotation_iter, key=lambda x: x['text']) 118 | annotation_list = list(annotation_iter) 119 | # print(json.dumps(annotation_list, indent=4)) 120 | return annotation_list 121 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/adjacency_matrix.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | class AdjacencyMatrix(): 4 | 5 | def __init__(self, graph, equivalence_relation_set, is_sorted=False): # Build the adjacency matrix, for both incoming and outcoming edges 6 | self.graph = graph 7 | self.equivalence_matrix = {} 8 | self.adjacency_matrix = {} 9 | for s,p,o in graph: 10 | if o not in self.adjacency_matrix: 11 | self.adjacency_matrix[o] = {'in': [], 'out': []} 12 | if s not in self.adjacency_matrix: 13 | self.adjacency_matrix[s] = {'in': [], 'out': []} 14 | if p not in equivalence_relation_set: 15 | continue 16 | if s not in self.equivalence_matrix: 17 | self.equivalence_matrix[s] = set() 18 | if o not in self.equivalence_matrix: 19 | self.equivalence_matrix[o] = set() 20 | self.equivalence_matrix[s].add(o) 21 | for e in self.equivalence_matrix[s]: 22 | if e == o: 23 | continue 24 | self.equivalence_matrix[e].add(o) 25 | self.equivalence_matrix[o].add(s) 26 | for e in self.equivalence_matrix[o]: 27 | if e == s: 28 | continue 29 | 
self.equivalence_matrix[e].add(s) 30 | # print(json.dumps(dict(map(lambda x:(x[0],list(x[1])), self.equivalence_matrix.items())), indent=4)) 31 | for s,p,o in graph: 32 | self.adjacency_matrix[s]['out'].append((p,o)) 33 | for e in self.equivalence_matrix.get(s,[]): 34 | self.adjacency_matrix[e]['out'].append((p,o)) 35 | self.adjacency_matrix[o]['in'].append((p,s)) 36 | for e in self.equivalence_matrix.get(o,[]): 37 | self.adjacency_matrix[e]['in'].append((p,s)) 38 | # print(json.dumps(self.adjacency_matrix['my:cem'], indent=4)) 39 | if is_sorted: 40 | for concept in self.adjacency_matrix.values(): 41 | concept['in'].sort() 42 | concept['out'].sort() 43 | 44 | def get_incoming_edges_matrix(self, concept): 45 | adjacency_list = self.adjacency_matrix.get(concept,None) 46 | return list(adjacency_list['in']) if adjacency_list else [] 47 | 48 | def get_outcoming_edges_matrix(self, concept): 49 | adjacency_list = self.adjacency_matrix.get(concept,None) 50 | return list(adjacency_list['out']) if adjacency_list else [] 51 | 52 | def get_equivalent_concepts(self, concept): 53 | return set(self.equivalence_matrix.get(concept,[])) 54 | 55 | def get_nodes(self): 56 | return self.adjacency_matrix.keys() 57 | 58 | def get_predicate_chain(self, concept_set, direction_set, predicate_filter_fn=None, depth=None, already_explored_concepts_set=None): # This function returns the related concepts of a given concept set for a given type of relations (e.g. if the relation is rdfs:subclassof, then it returns the super- and/or sub-classes), exploting an adjacency matrix 59 | if depth: 60 | depth -= 1 61 | if not already_explored_concepts_set: 62 | already_explored_concepts_set = set() 63 | joint_set = set() 64 | already_explored_concepts_set |= concept_set 65 | for c in concept_set: 66 | for direction in direction_set: 67 | adjacency_list = self.adjacency_matrix.get(c,None) 68 | if adjacency_list: 69 | adjacency_iter = filter(lambda x: x[-1] not in already_explored_concepts_set, adjacency_list[direction]) 70 | if predicate_filter_fn: 71 | adjacency_iter = filter(lambda x: predicate_filter_fn(x[0]), adjacency_iter) 72 | joint_set |= set(map(lambda y: y[-1], adjacency_iter)) 73 | if len(joint_set) == 0: 74 | return set(concept_set) 75 | elif depth and depth <= 0: 76 | return joint_set.union(concept_set) 77 | return concept_set.union(self.get_predicate_chain( 78 | joint_set, 79 | direction_set, 80 | predicate_filter_fn=predicate_filter_fn, 81 | depth=depth, 82 | already_explored_concepts_set=already_explored_concepts_set, 83 | )) 84 | 85 | # Tarjan's algorithm (single DFS) for finding strongly connected components in a given directed graph 86 | def SCC(self): # Complexity : O(V+E) 87 | '''A recursive function that finds and prints strongly connected 88 | components using DFS traversal 89 | u --> The vertex to be visited next 90 | disc[] --> Stores discovery times of visited vertices 91 | low[] -- >> earliest visited vertex (the vertex with minimum 92 | discovery time) that can be reached from subtree 93 | rooted with current vertex 94 | st -- >> To store all the connected ancestors (could be part 95 | of SCC) 96 | stackMember[] --> bit/index array for faster check whether 97 | a node is in stack 98 | ''' 99 | def helper(clique_list, u, low, disc, stackMember, st, Time=0): 100 | # Initialize discovery time and low value 101 | disc[u] = Time 102 | low[u] = Time 103 | Time += 1 104 | stackMember[u] = True 105 | st.append(u) 106 | 107 | # Go through all vertices adjacent to this 108 | for _,v in 
self.adjacency_matrix[u]['in']: 109 | 110 | # If v is not visited yet, then recur for it 111 | if disc[v] == -1: 112 | Time = helper(clique_list, v, low, disc, stackMember, st, Time) 113 | 114 | # Check if the subtree rooted with v has a connection to 115 | # one of the ancestors of u 116 | # Case 1 (per above discussion on Disc and Low value) 117 | low[u] = min(low[u], low[v]) 118 | 119 | elif stackMember[v] == True: 120 | 121 | '''Update low value of 'u' only if 'v' is still in stack 122 | (i.e. it's a back edge, not cross edge). 123 | Case 2 (per above discussion on Disc and Low value) ''' 124 | low[u] = min(low[u], disc[v]) 125 | 126 | # head node found, pop the stack and print an SCC 127 | w = -1 #To store stack extracted vertices 128 | if low[u] == disc[u]: 129 | clique = [] 130 | while w != u: 131 | w = st.pop() 132 | clique.append(w) 133 | stackMember[w] = False 134 | clique_list.append(clique) 135 | return Time 136 | # Mark all the vertices as not visited 137 | # and Initialize parent and visited, 138 | # and ap(articulation point) arrays 139 | disc = {k:-1 for k in self.adjacency_matrix.keys()} 140 | low = {k:-1 for k in self.adjacency_matrix.keys()} 141 | stackMember = {k:False for k in self.adjacency_matrix.keys()} 142 | st =[] 143 | 144 | 145 | # Call the recursive helper function 146 | # to find articulation points 147 | # in DFS tree rooted with vertex 'i' 148 | clique_list = [] 149 | Time = 0 150 | for i in self.adjacency_matrix.keys(): 151 | if disc[i] == -1: 152 | Time = helper(clique_list, i, low, disc, stackMember, st, Time) 153 | return clique_list 154 | -------------------------------------------------------------------------------- /kg_hinge/kg_hinge.graphml: -------------------------------------------------------------------------------- [GraphML source not preserved: the XML markup of this ~185-line file was stripped in the text export, leaving only its edge labels, all of which read rdf:type.] -------------------------------------------------------------------------------- /web_app/yai/static/js/stage_builder/api_lib.js: -------------------------------------------------------------------------------- 1 | const API_SERVER_URL = location.protocol+'//'+location.hostname+(location.port ?
':'+location.port: '')+'/'; 2 | 3 | function expand_information(information_uri, known_entity_dict, language='en', use_linksum=true, retry=true) 4 | { 5 | if (information_uri in known_entity_dict) 6 | return Object.assign({},known_entity_dict[information_uri]); 7 | 8 | var query = '' 9 | if (use_linksum) // LinkSUM: Using Link Analysis to Summarize Entity Data 10 | { 11 | query = [ 12 | PREFIX_MAP_STRING, 13 | "SELECT ?v ?predicate ?object", 14 | "FROM ", 15 | "FROM ", 16 | "WHERE {", 17 | "{", // show abstract, label and type first 18 | "SELECT ?predicate ?object WHERE {", 19 | "<"+information_uri+"> ?predicate ?object.", 20 | "FILTER(?predicate = dbo:abstract || ?predicate = rdfs:label || ?predicate = rdf:type).", 21 | "FILTER(!isLiteral(?object) || lang(?object) = '' || langMatches(lang(?object), '"+language+"')).", 22 | "}", 23 | "ORDER BY DESC(?predicate)", 24 | "} UNION {", // show subclasses 25 | "SELECT (my:superClassOf AS ?predicate) ?object WHERE {", 26 | "?object rdfs:subClassOf <"+information_uri+">.", 27 | "FILTER(!isLiteral(?object) || lang(?object) = '' || langMatches(lang(?object), '"+language+"')).", 28 | "}", 29 | "} UNION {", // show other properties second, ranked by importance 30 | "SELECT ?predicate ?object ?v WHERE {", 31 | "<"+information_uri+"> ?predicate ?object.", 32 | "FILTER(!isLiteral(?object) || lang(?object) = '' || langMatches(lang(?object), '"+language+"')).", 33 | "FILTER(?predicate != rdfs:comment && ?predicate != vrank:hasRank && ?predicate != dbo:abstract && ?predicate != rdfs:label && ?predicate != rdf:type).", 34 | "OPTIONAL {?object vrank:hasRank ?r. ?r vrank:rankValue ?v}.", 35 | "}", 36 | "ORDER BY DESC(?v), DESC(?predicate)", 37 | "}", 38 | "}", 39 | ].join("\n") 40 | } 41 | else 42 | { 43 | query = [ 44 | "SELECT DISTINCT ?predicate ?object WHERE {", 45 | "<"+information_uri+"> ?predicate ?object.", 46 | "FILTER(!isLiteral(?object) || lang(?object) = '' || langMatches(lang(?object), '"+language+"')).", 47 | "}", 48 | ].join("\n"); 49 | } 50 | var query_result = query_sparql_endpoint(DBPEDIA_ENDPOINT, query) 51 | if (!query_result || !query_result.results || query_result.results.bindings.length==0) 52 | { 53 | if (!retry) 54 | return null; 55 | return expand_information(PREFIX_MAP['dbr']+getPath(information_uri), known_entity_dict, language, use_linksum, false); 56 | } 57 | // console.log(query_result) 58 | // Build subject map 59 | var jsonld_graph = {'@id':information_uri} 60 | for (tuple of query_result.results.bindings) 61 | { 62 | var pred = tuple.predicate?String(tuple.predicate.value):''; 63 | if (pred == '') 64 | continue 65 | var obj = tuple.object?String(tuple.object.value):''; 66 | if (pred in jsonld_graph) 67 | { 68 | if (!isArray(jsonld_graph[pred])) 69 | jsonld_graph[pred] = [jsonld_graph[pred]] 70 | jsonld_graph[pred].push(obj) 71 | } 72 | else 73 | jsonld_graph[pred] = obj 74 | } 75 | var ground = { 76 | '@type': 'JSON', 77 | '@value': JSON.stringify(query_result, null, 2) 78 | } 79 | var formatted_jsonld_graph = format_jsonld(jsonld_graph, ground, query); 80 | return formatted_jsonld_graph; 81 | } 82 | 83 | function expand_link(link, show_expansion_fn, known_entity_dict) 84 | { 85 | try { 86 | if (!isURL(link)) // query wikipedia 87 | query_wikipedia_by_title(link, show_expansion_fn) 88 | else 89 | { // query dbpedia 90 | console.log('Expanding '+format_link(link)+' on DBPedia.'); 91 | var response = expand_information(link, known_entity_dict); 92 | if (response) 93 | response['@id'] = link; 94 | show_expansion_fn(response); 95 | 
} 96 | } catch (ex) { 97 | if (DEBUG) 98 | console.error(ex); 99 | } 100 | } 101 | 102 | function generate_counterfactual(information_dict) 103 | { 104 | var api = information_dict['api']; 105 | var input = information_dict['input'].map(x=>parseInt(x,10)); 106 | var output = null; 107 | try { 108 | $.ajax({ 109 | url:api, 110 | // async: false, 111 | method:'POST', 112 | async: false, 113 | data: JSON.stringify({'sample_value': input}), 114 | contentType: "application/json; charset=utf-8", 115 | success: x => output=x, 116 | }); 117 | } catch(e) { 118 | if (DEBUG) 119 | console.error(e); 120 | } 121 | return output; 122 | } 123 | 124 | function get_counterfactual(current, api, process_graph) 125 | { 126 | var process_input_dict = get_process_input_dict_from_formatted_jsonld({'my:processList':process_graph}); 127 | var input_list = [].concat(...Object.values(process_input_dict).filter(x=>x[0][COUNTERFACTUAL_API_URI] == api)); 128 | var input_values = $(".counterfactual").toArray().sort((a, b) => get_DOM_element_distance(a,current) - get_DOM_element_distance(b,current)); 129 | input_values = input_values.concat(input_list.map(x=>{ 130 | return {'id':x[FEATURE_ORDER_URI],'value':x[VALUE_URI]} 131 | })); 132 | input_values = get_unique_elements(input_values, x=>x.id); 133 | return input_values.sort((a, b) => parseInt(a.id) - parseFloat(b.id)).map(x=>x.value); 134 | } 135 | 136 | function get_typed_entity_dict_from_jsonld(jsonld) 137 | { 138 | var minimal_entity_graph = build_minimal_entity_graph(jsonld); 139 | var minimal_subclass_graph = build_minimal_type_graph(minimal_entity_graph, SUBCLASSOF_URI, HAS_SUBCLASS_URI); 140 | var minimal_type_graph = build_minimal_type_graph(minimal_entity_graph, TYPE_URI, HAS_ENTITY_URI); 141 | return get_entity_dict(minimal_entity_graph.concat(minimal_type_graph).concat(minimal_subclass_graph)); 142 | } 143 | 144 | function get_entity_dict_from_jsonld(jsonld) 145 | { 146 | return get_entity_dict(build_minimal_entity_graph(jsonld)); 147 | } 148 | 149 | function format_dataset(data, id=null) 150 | { 151 | var dataset = {}; 152 | var fragments_count = isArray(data)?data.length:1; 153 | console.log('RDF Fragments count:', fragments_count); 154 | if (fragments_count==0) 155 | return dataset; 156 | 157 | if (isDict(data) && '@id' in data) 158 | dataset['@id'] = data['@id']; 159 | if (id) 160 | { 161 | dataset[LABEL_URI] = build_RDF_item(id); 162 | console.log('Formatting dataset:', id); 163 | } 164 | else 165 | console.log('Formatting dataset:', dataset['@id']); 166 | 167 | // Get entity-centered graph 168 | var minimal_entity_graph = build_minimal_entity_graph(data); 169 | console.log('Entity count:', data.length) 170 | // Get class-centerd graph 171 | var minimal_type_graph = build_minimal_type_graph(minimal_entity_graph) 172 | console.log('Class count:', minimal_type_graph.length) 173 | dataset[STATEMENT_COUNT_URI] = build_RDF_item(count_graph_statements(minimal_entity_graph)) 174 | dataset[ENTITY_COUNT_URI] = build_RDF_item(minimal_entity_graph.length) 175 | dataset[CLASS_COUNT_URI] = build_RDF_item(minimal_type_graph.length) 176 | dataset[CLASS_LIST_URI] = minimal_type_graph 177 | return dataset 178 | } 179 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/jsonld_lib.py: -------------------------------------------------------------------------------- 1 | from more_itertools import unique_everseen 2 | import json 3 | import re 4 | 5 | class hashabledict(dict): 6 | def __hash__(self): 7 | return 
hash(tuple(sorted(self.items()))) 8 | 9 | CONCEPT_PREFIX = 'my:' 10 | DOC_PREFIX = 'myfile:' 11 | ANONYMOUS_PREFIX = '_:' 12 | WORDNET_PREFIX = 'wn:' 13 | 14 | DOC_ID_PREDICATE = 'my:docID' 15 | HAS_IDX_PREDICATE = 'my:hasIDX' 16 | HAS_SOURCE_PREDICATE = 'my:hasSource' 17 | HAS_LABEL_PREDICATE = 'rdfs:label' 18 | SUBCLASSOF_PREDICATE = 'rdfs:subClassOf' 19 | HAS_TYPE_PREDICATE = 'rdf:type' 20 | CAN_BE_PREDICATE = 'my:canBe' 21 | IN_SYNSET_PREDICATE = 'my:inSynset' 22 | HAS_DEFINITION_PREDICATE = 'dbo:abstract' 23 | CONTENT_PREDICATE = 'my:content' 24 | SPECIAL_PREDICATE_LIST = [ 25 | DOC_ID_PREDICATE, 26 | HAS_IDX_PREDICATE, 27 | HAS_SOURCE_PREDICATE, 28 | HAS_LABEL_PREDICATE, 29 | SUBCLASSOF_PREDICATE, 30 | HAS_TYPE_PREDICATE, 31 | CAN_BE_PREDICATE, 32 | IN_SYNSET_PREDICATE, 33 | HAS_DEFINITION_PREDICATE, 34 | CONTENT_PREDICATE 35 | ] 36 | 37 | def explode_concept_key(key): 38 | if not key: 39 | return '' 40 | key = re.sub(r"[_-]", " ", key) 41 | key = key.split(':')[-1] 42 | if not key: 43 | return '' 44 | key = key[0].upper() + key[1:] 45 | splitted_key = re.findall('[A-Z][^A-Z]*', key) 46 | 47 | # join upper case letters 48 | i = 0 49 | j = 1 50 | while j < len(splitted_key): 51 | if len(splitted_key[j]) == 1: 52 | splitted_key[i] += splitted_key[j] 53 | splitted_key[j] = '' 54 | j += 1 55 | else: 56 | i = j 57 | j = i+1 58 | 59 | exploded_key = ' '.join(splitted_key) 60 | exploded_key = re.sub(r" +", r" ", exploded_key).strip() 61 | return exploded_key 62 | 63 | def urify(str): 64 | return str.casefold().strip().replace(' ','_') 65 | 66 | def is_html(str): 67 | html_pattern = r"<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>" 68 | return re.match(html_pattern, str, re.IGNORECASE) is not None 69 | 70 | def is_url(str): 71 | if is_rdf_item(str): 72 | str = str['@value'] 73 | str = str.casefold().strip() 74 | if not str: 75 | return False 76 | if str.startswith('../') or str.startswith('./'): 77 | return True 78 | if re.match(r'\w+:', str, re.IGNORECASE) is not None: 79 | return True 80 | url_pattern = r'(http[s]?:)?//(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' 81 | return re.match(url_pattern, str, re.IGNORECASE) is not None 82 | 83 | def is_rdf_item(v): 84 | return isinstance(v, dict) and '@value' in v 85 | 86 | def is_dict(v): 87 | return isinstance(v, dict) and '@value' not in v 88 | 89 | def is_array(v): 90 | return isinstance(v, (list,tuple)) 91 | 92 | def get_jsonld_id(jsonld, default=None): 93 | if is_dict(jsonld): 94 | return [jsonld.get('@id',default)] 95 | if is_array(jsonld): 96 | return sum(map(lambda x: get_jsonld_id(x,default),jsonld),[]) 97 | if is_rdf_item(jsonld): 98 | return [jsonld['@value']] 99 | return [jsonld] 100 | 101 | def get_string_from_triple(triple): 102 | def format_element(element,predicate): 103 | if is_rdf_item(element): 104 | element = element['@value'] 105 | if not is_array(element): 106 | element = [element] 107 | element = filter(lambda x: not isinstance(x, str) or not x.startswith(CONCEPT_PREFIX), element) 108 | element = map(lambda x: explode_concept_key(' '.join(x[3:].split('.')[:-2]) if x.startswith(WORDNET_PREFIX) else x) if isinstance(x, str) else x, element) 109 | element = filter(lambda x: x, unique_everseen(element)) 110 | element = list(element) 111 | filtered_element = ( 112 | a.strip('.') 113 | for a in element 114 | if next(filter(lambda x: a in x and a != x, element), None) is None 115 | ) 116 | if predicate in [DOC_ID_PREDICATE,HAS_IDX_PREDICATE,HAS_SOURCE_PREDICATE,HAS_LABEL_PREDICATE]: 117 | filtered_element = 
map(lambda x: f'«{x}»', filtered_element) 118 | filtered_element = sorted(filtered_element, key=lambda x:len(x)) 119 | if len(filtered_element) == 0: 120 | return '' 121 | formatted_triple = filtered_element[0] 122 | if len(filtered_element) > 1: 123 | formatted_triple += f' (or: {", ".join(filtered_element[1:])})' 124 | return formatted_triple 125 | subj,pred,obj = triple 126 | subj = format_element(subj,pred)#.lower().strip() 127 | obj = format_element(obj,pred)#.lower().strip() 128 | if subj == '' or obj == '': 129 | return '' 130 | # Get special predicates templates 131 | if pred == DOC_ID_PREDICATE: 132 | pred = '{subj} has been found in document {obj}' 133 | elif pred == HAS_IDX_PREDICATE: 134 | pred = '{subj} starts at offset {obj} of its document' 135 | elif pred == HAS_SOURCE_PREDICATE: 136 | pred = '{subj} is in the sentence {obj}' 137 | elif pred == HAS_LABEL_PREDICATE: 138 | pred = '{subj} is called {obj}' 139 | elif pred == SUBCLASSOF_PREDICATE: 140 | pred = '{subj} is {obj}' 141 | elif pred == HAS_TYPE_PREDICATE: 142 | pred = '{subj} is {obj}' 143 | elif pred == CAN_BE_PREDICATE: 144 | pred = '{subj} can be {obj}' 145 | elif pred == IN_SYNSET_PREDICATE: 146 | pred = '{subj} is the same of {obj}' 147 | elif pred == HAS_DEFINITION_PREDICATE: 148 | pred = '{subj} is: {obj}' 149 | # Converts a rdf triple into a string, by executing the predicate template on subject and object. 150 | triple_str = pred#.lower().strip() 151 | triple_str = triple_str.replace('{subj}',subj) if '{subj}' in triple_str else ' '.join([subj,triple_str]) 152 | triple_str = triple_str.replace('{obj}',obj) if '{obj}' in triple_str else ' '.join([triple_str,obj]) 153 | triple_str = triple_str.replace('(be)','is') 154 | triple_str = re.sub(r' +([,;.])',r'\1',triple_str) # remove unneeded whitespaces 155 | triple_str = re.sub(r' +/ +',r'/',triple_str) # remove unneeded whitespaces 156 | triple_str = triple_str.replace(' )',')').replace('( ','(') # remove unneeded whitespaces 157 | triple_str = re.sub(r'^: ','',triple_str).replace(' : ',': ').replace('::',':').replace('.,',';') 158 | return triple_str#.replace(',','') 159 | 160 | def jsonld_to_triples(jsonld, base_id=None): 161 | def helper(j, default_subj_id=None, uid=0): 162 | triples = [] 163 | if not default_subj_id: 164 | default_subj_id = f'{ANONYMOUS_PREFIX}{base_id}_{uid}' 165 | if is_array(j): 166 | for x in j: 167 | new_triples, uid = helper(x, default_subj_id, uid) 168 | triples += new_triples 169 | elif is_dict(j): 170 | subj_id = get_jsonld_id(j, default_subj_id)[0] 171 | if not subj_id: 172 | raise ValueError('A subject is required.') 173 | # subj_id = subj_id.lower().strip() 174 | for pred,obj in j.items(): 175 | if pred == '@id': 176 | continue 177 | # pred = pred.casefold().strip() 178 | # if is_rdf_item(obj): 179 | # triples.append((subj_id,pred,hashabledict(obj))) 180 | # continue 181 | for obj_id in get_jsonld_id(obj): 182 | if not obj_id: # new uid, increase the old one 183 | uid += 1 184 | obj_id = f'{ANONYMOUS_PREFIX}{base_id}_{uid}' 185 | # if is_url(obj_id): 186 | # obj_id = obj_id.lower().strip() 187 | triples.append(( 188 | subj_id, 189 | pred, 190 | obj_id, 191 | )) 192 | new_triples, uid = helper(obj, obj_id, uid) 193 | triples += new_triples 194 | return triples, uid 195 | return helper(jsonld)[0] 196 | -------------------------------------------------------------------------------- /web_app/oke/server_interface.py: -------------------------------------------------------------------------------- 1 | import sched, time 2 | import json 
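# Usage sketch (illustrative): the public helpers defined further down in this module are
# what the web server is expected to call. The import alias, the example question and the
# example concept URI below are hypothetical, chosen only to show the call sequence.
#
#   import server_interface as oke
#   qa_dict = oke.get_question_answer_dict(['Which law applies to non-contractual obligations?'])
#   summary_tree = oke.get_summarised_question_answer_dict(qa_dict)
#   annotated_tree = oke.annotate_question_summary_tree(summary_tree)
#   taxonomy = oke.get_taxonomical_view('my:obligation', depth=1)
#   oke.store_cache()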
3 | from os import mkdir, path as os_path 4 | import sys 5 | base_path = os_path.dirname(os_path.abspath(__file__)) 6 | cache_path = os_path.join(base_path,'cache') 7 | document_path = os_path.join(base_path,'documents') 8 | sys.path.append(os_path.join(base_path,'core')) 9 | 10 | from models.knowledge_extraction.ontology_builder import OntologyBuilder as OB 11 | from models.reasoning.question_answerer import QuestionAnswerer 12 | from misc.doc_reader import load_or_create_cache, DocParser 13 | from misc.graph_builder import get_concept_description_dict 14 | from misc.graph_builder import save_graphml 15 | 16 | ################ Configuration ################ 17 | 18 | QA_DEFAULT_OPTIONS = { 19 | 'log': False, 20 | } 21 | 22 | ONTOLOGY_BUILDER_DEFAULT_OPTIONS = { 23 | 'max_syntagma_length': None, 24 | 'add_source': True, 25 | 'add_label': True, 26 | 'lemmatize_label': False, 27 | 28 | 'default_similarity_threshold': 0.75, 29 | 'tf_model': { 30 | 'url': 'https://tfhub.dev/google/universal-sentence-encoder-large/5', # Transformer 31 | # 'url': 'https://tfhub.dev/google/universal-sentence-encoder/4', # DAN 32 | # 'cache_dir': '/Users/toor/Documents/Software/DLModels/tf_cache_dir/', 33 | }, 34 | 'with_centered_similarity': True, 35 | } 36 | 37 | CONCEPT_CLASSIFIER_DEFAULT_OPTIONS = { 38 | 'tf_model': { 39 | 'url': 'https://tfhub.dev/google/universal-sentence-encoder-large/5', # Transformer 40 | # 'url': 'https://tfhub.dev/google/universal-sentence-encoder/4', # DAN 41 | # 'cache_dir': '/Users/toor/Documents/Software/DLModels/tf_cache_dir/', 42 | }, 43 | 'with_centered_similarity': True, 44 | 'default_similarity_threshold': 0.75, 45 | # 'default_tfidf_importance': 3/4, 46 | } 47 | 48 | SENTENCE_CLASSIFIER_DEFAULT_OPTIONS = { 49 | 'tf_model': { 50 | # 'url': 'https://tfhub.dev/google/universal-sentence-encoder-qa/3', # English QA 51 | 'url': 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3', # Multilingual QA # 16 languages (Arabic, Chinese-simplified, Chinese-traditional, English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish, Russian) 52 | # 'url': 'https://tfhub.dev/google/LAReQA/mBERT_En_En/1', 53 | # 'cache_dir': '/Users/toor/Documents/Software/DLModels/tf_cache_dir/', 54 | }, 55 | 'with_centered_similarity': False, 56 | 'with_topic_scaling': False, 57 | 'with_stemmed_tfidf': False, 58 | 'default_tfidf_importance': 1/4, 59 | } 60 | 61 | SUMMARISER_DEFAULT_OPTIONS = { 62 | 'hf_model': { 63 | # 'url': 't5-base', 64 | 'url': 'facebook/bart-large-cnn', # baseline 65 | # 'url': 'sshleifer/distilbart-cnn-12-6', # speedup (over the baseline): 1.24 66 | # 'url': 'sshleifer/distilbart-cnn-12-3', # speedup (over the baseline): 1.78 67 | # 'url': 'sshleifer/distilbart-cnn-6-6', # speedup (over the baseline): 2.09 68 | # 'cache_dir': '/Users/toor/Documents/Software/DLModels/hf_cache_dir/', 69 | 'framework': 'pt', 70 | }, 71 | } 72 | 73 | ################ Initialise data structures ################ 74 | graph_cache = os_path.join(cache_path,f"OB_cache_lemma-{ONTOLOGY_BUILDER_DEFAULT_OPTIONS['lemmatize_label']}.pkl") 75 | concept_classifier_cache = os_path.join(cache_path,'QA_concept_classifier.pkl') 76 | sentence_classifier_cache = os_path.join(cache_path,'QA_sentence_classifier.pkl') 77 | sentence_summariser_cache = os_path.join(cache_path,'QA_sentence_summariser.pkl') 78 | ######################################################################## 79 | print('Building Ontology Edge List..') 80 | graph = load_or_create_cache( 81 | 
graph_cache, 82 | lambda: OB(ONTOLOGY_BUILDER_DEFAULT_OPTIONS).set_documents_path(document_path).build() 83 | ) 84 | save_graphml(graph, 'ontology') 85 | print('Ontology size:', len(graph)) 86 | ######################################################################## 87 | print('Building Question Answerer..') 88 | qa = QuestionAnswerer( 89 | graph= graph, 90 | model_options= QA_DEFAULT_OPTIONS, 91 | query_concept_classifier_options= CONCEPT_CLASSIFIER_DEFAULT_OPTIONS, 92 | answer_classifier_options= SENTENCE_CLASSIFIER_DEFAULT_OPTIONS, 93 | answer_summariser_options= SUMMARISER_DEFAULT_OPTIONS, 94 | ) 95 | qa.sentence_classifier.load_cache(sentence_classifier_cache) 96 | qa.concept_classifier.load_cache(concept_classifier_cache) 97 | qa.sentence_summariser.load_cache(sentence_summariser_cache) 98 | 99 | 100 | ################ Define methods ################ 101 | def get_question_answer_dict(question_list, options=None): 102 | if not options: 103 | options = {} 104 | question_answer_dict = qa.ask(question_list, **options) 105 | # print('######## Question Answers ########') 106 | # print(json.dumps(question_answer_dict, indent=4)) 107 | return question_answer_dict 108 | 109 | def get_question_answer_dict_quality(question_answer_dict, top=5): 110 | return qa.get_question_answer_dict_quality(question_answer_dict, top=top) 111 | 112 | def get_summarised_question_answer_dict(question_answer_dict, options=None): 113 | if not options: 114 | options = {} 115 | question_summary_tree = qa.summarise_question_answer_dict(question_answer_dict, **options) 116 | # print('######## Summarised Question Answers ########') 117 | # print(json.dumps(question_summarised_answer_dict, indent=4)) 118 | # qa.sentence_summariser.store_cache(sentence_summariser_cache) 119 | return question_summary_tree 120 | 121 | def get_concept_overview(query_template_list, concept_uri, options=None): 122 | if not options: 123 | options = {} 124 | # set consider_incoming_relations to False with concept-centred generic questions (e.g. 
what is it?), otherwise the answers won't be the sought ones 125 | question_answer_dict = qa.get_concept_overview( 126 | query_template_list = query_template_list, 127 | concept_uri = concept_uri, 128 | **options 129 | ) 130 | # print('######## Concept Overview ########') 131 | # print(concept_uri, json.dumps(question_summarised_answer_dict, indent=4)) 132 | # store_cache() 133 | return question_answer_dict 134 | 135 | def annotate_text(sentence, similarity_threshold=None, max_concepts_per_alignment=1): 136 | return qa.important_concept_classifier.annotate( 137 | DocParser().set_content_list([sentence]), 138 | similarity_threshold=similarity_threshold, 139 | max_concepts_per_alignment=max_concepts_per_alignment 140 | ) 141 | 142 | def annotate_question_summary_tree(question_summary_tree, similarity_threshold=None, max_concepts_per_alignment=1): 143 | return qa.annotate_question_summary_tree(question_summary_tree, similarity_threshold=similarity_threshold, max_concepts_per_alignment=max_concepts_per_alignment) 144 | 145 | def get_taxonomical_view(concept_uri, depth=0): 146 | return qa.get_taxonomical_view(concept_uri, depth=depth) 147 | 148 | def annotate_taxonomical_view(taxonomical_view, similarity_threshold=None, max_concepts_per_alignment=1): 149 | return qa.annotate_taxonomical_view(taxonomical_view, similarity_threshold=similarity_threshold, max_concepts_per_alignment=max_concepts_per_alignment) 150 | 151 | def get_equivalent_concepts(concept_uri): 152 | return qa.adjacency_matrix.get_equivalent_concepts(concept_uri) 153 | 154 | def store_cache(): 155 | qa.sentence_summariser.store_cache(sentence_summariser_cache) 156 | qa.concept_classifier.store_cache(concept_classifier_cache) 157 | qa.sentence_classifier.store_cache(sentence_classifier_cache) 158 | 159 | # ############### Cache scheduler ############### 160 | SCHEDULING_TIMER = 15*60 # 15 minutes 161 | from threading import Timer 162 | def my_task(is_first=False): 163 | if not is_first: 164 | store_cache() 165 | Timer(SCHEDULING_TIMER, my_task).start() 166 | # start your scheduler 167 | my_task(is_first=True) 168 | -------------------------------------------------------------------------------- /web_app/oke/core/models/knowledge_extraction/lattice_builder.py: -------------------------------------------------------------------------------- 1 | from more_itertools import unique_everseen 2 | from concepts import Context 3 | 4 | class LatticeBuilder(): 5 | def __init__(self, templatize=True): 6 | self.formal_concept_context_list = [] 7 | self.templatize = templatize 8 | 9 | @staticmethod 10 | def deanonymize_graph(edge_list, name_fn=lambda c:c, key_fn=lambda c:c): 11 | def build_edge_dict(edge_list, key_fn=lambda x: x): 12 | edge_dict = {} 13 | for edge in edge_list: 14 | subj,_,_ = edge 15 | subj_key = key_fn(subj) 16 | if subj_key not in edge_dict: 17 | edge_dict[subj_key] = [] 18 | edge_dict[subj_key].append(edge) 19 | return edge_dict 20 | def get_named_predicates(root_concept, edge_dict): 21 | named_predicates = [] 22 | concept_checked = set() 23 | concept_to_check = [root_concept] 24 | while len(concept_to_check) > 0: # iterative version 25 | concept = concept_to_check.pop() 26 | concept_key = key_fn(concept) 27 | concept_checked.add(concept_key) 28 | for _,_,obj in edge_dict.get(concept_key,[]): 29 | if name_fn(obj): 30 | named_predicates.append(obj) 31 | elif key_fn(obj) not in concept_checked: 32 | concept_to_check.append(obj) 33 | return list(unique_everseen(named_predicates)) 34 | 35 | edge_dict = build_edge_dict(edge_list, 
key_fn=key_fn) 36 | new_edge_list = [] 37 | for edge in edge_list: 38 | subj,pred,obj = edge 39 | if not name_fn(subj): 40 | continue 41 | if name_fn(obj): 42 | new_edge_list.append(edge) 43 | else: 44 | new_edge_list.extend( 45 | (subj,pred,o) 46 | for o in get_named_predicates(obj, edge_dict) 47 | ) 48 | return new_edge_list 49 | 50 | def build_concept_relation_dict(self, edge_list): 51 | assert False, 'Not implemented' 52 | 53 | def build_lattice(self, edge_list, stringify=False): 54 | edge_list = list(edge_list) 55 | concept_relation_dict = self.build_concept_relation_dict(edge_list) 56 | properties = set(relation for relation_set in concept_relation_dict.values() for relation in relation_set) 57 | objects = set(concept_relation_dict.keys()) 58 | if len(properties)==0 or len(objects)==0: 59 | return [] 60 | bools = [ 61 | [ 62 | p in concept_relation_dict[object] 63 | for p in properties 64 | ] 65 | for object in objects 66 | ] 67 | 68 | formal_concept_context = Context(objects, properties, bools) 69 | self.formal_concept_context_list.append(formal_concept_context) 70 | concept_lattice = formal_concept_context.lattice 71 | 72 | #print(concept_lattice['person',]) 73 | #print(concept_lattice['employer',]) 74 | #concept_lattice.graphviz(view=True) 75 | #for extent, intent in concept_lattice: 76 | # print('%r %r' % (extent, intent)) 77 | 78 | for concept in concept_lattice._concepts: 79 | concept.objects = sorted(concept.objects) 80 | concept.properties = sorted(concept.properties) 81 | get_concept_obj = lambda x: x.objects if len(x.objects)>0 else x.index 82 | 83 | lattice_edge_list = [] 84 | for concept in concept_lattice._concepts: 85 | concept_obj = get_concept_obj(concept) 86 | lattice_edge_list.extend( 87 | ( 88 | concept_obj, 89 | neighbor.properties, 90 | get_concept_obj(neighbor), 91 | ) 92 | for neighbor in concept.lower_neighbors 93 | ) 94 | ''' 95 | non_root_concept_set = set(get_concept_key(obj) for _,_,obj in lattice_edge_list) 96 | lattice_edge_list.extend( 97 | ( 98 | '', 99 | concept.properties, 100 | get_concept_obj(concept), 101 | ) 102 | for concept in concept_lattice._concepts 103 | if get_concept_key(concept) not in non_root_concept_set 104 | ) 105 | ''' 106 | is_iter = lambda element: isinstance(element, (list,tuple)) 107 | lattice_edge_list = self.deanonymize_graph(lattice_edge_list, name_fn=lambda x: is_iter(x), key_fn=lambda x: ','.join(x) if is_iter(x) else x) 108 | #lattice_edge_list = list(map(lambda x: (x[-1],x[-2],x[-3]), lattice_edge_list)) 109 | if stringify: 110 | list_to_string = lambda x: ', '.join(x) 111 | lattice_edge_list = map(lambda x: (list_to_string(x[0]), list_to_string(x[1]), list_to_string(x[2])), lattice_edge_list) 112 | return list(lattice_edge_list) 113 | 114 | class ActivePredicateTypingLatticeBuilder(LatticeBuilder): 115 | def build_concept_relation_dict(self, edge_list): 116 | concept_relation_dict = {} 117 | for subj,pred,obj in edge_list: 118 | if subj not in concept_relation_dict: 119 | concept_relation_dict[subj] = set() 120 | concept_relation_dict[subj].add(f'that can {pred} is' if self.templatize else pred) # objects and properties cannot overlap 121 | return concept_relation_dict 122 | 123 | class PassivePredicateTypingLatticeBuilder(LatticeBuilder): 124 | def build_concept_relation_dict(self, edge_list): 125 | concept_relation_dict = {} 126 | for subj,pred,obj in edge_list: 127 | if obj not in concept_relation_dict: 128 | concept_relation_dict[obj] = set() 129 | concept_relation_dict[obj].add(f'that can be {pred}-ed is' if 
self.templatize else pred) # objects and properties cannot overlap 130 | return concept_relation_dict 131 | 132 | class PassiveActivePredicateTypingLatticeBuilder(LatticeBuilder): 133 | def __init__(self, templatize=True): 134 | super().__init__(templatize) 135 | self.active_lb = ActivePredicateTypingLatticeBuilder() 136 | self.passive_lb = PassivePredicateTypingLatticeBuilder() 137 | 138 | def build_concept_relation_dict(self, edge_list): 139 | concept_relation_dict = self.active_lb.build_concept_relation_dict(edge_list) 140 | for key,value in self.passive_lb.build_concept_relation_dict(edge_list).items(): 141 | if key not in concept_relation_dict: 142 | concept_relation_dict[key] = set() 143 | concept_relation_dict[key] |= value 144 | return concept_relation_dict 145 | 146 | class ActiveActionTypingLatticeBuilder(LatticeBuilder): 147 | def build_concept_relation_dict(self, edge_list): 148 | concept_relation_dict = {} 149 | for subj,pred,obj in edge_list: 150 | if subj not in concept_relation_dict: 151 | concept_relation_dict[subj] = set() 152 | concept_relation_dict[subj].add(f'that can {pred} {obj} is' if self.templatize else (pred,obj)) # objects and properties cannot overlap 153 | return concept_relation_dict 154 | 155 | def build_lattice(self, edge_list, stringify=False): 156 | edge_list = list(edge_list) 157 | predicate_dict = {} 158 | for edge in edge_list: 159 | subj,pred,obj = edge 160 | if pred not in predicate_dict: 161 | predicate_dict[pred] = [] 162 | predicate_dict[pred].append(edge) 163 | 164 | global_lattice_edge_list = [] 165 | for pred, new_edge_list in predicate_dict.items(): 166 | global_lattice_edge_list += super().build_lattice(new_edge_list, stringify) 167 | return global_lattice_edge_list 168 | 169 | class PassiveActionTypingLatticeBuilder(ActiveActionTypingLatticeBuilder): 170 | def build_concept_relation_dict(self, edge_list): 171 | concept_relation_dict = {} 172 | for subj,pred,obj in edge_list: 173 | if obj not in concept_relation_dict: 174 | concept_relation_dict[obj] = set() 175 | concept_relation_dict[obj].add(f'that can be {pred}-ed by {subj} is' if self.templatize else (pred,subj)) # objects and properties cannot overlap 176 | return concept_relation_dict 177 | 178 | class PassiveActiveActionTypingLatticeBuilder(ActiveActionTypingLatticeBuilder): 179 | def __init__(self, templatize=True): 180 | super().__init__(templatize) 181 | self.active_lb = ActiveActionTypingLatticeBuilder() 182 | self.passive_lb = PassiveActionTypingLatticeBuilder() 183 | 184 | def build_concept_relation_dict(self, edge_list): 185 | concept_relation_dict = self.active_lb.build_concept_relation_dict(edge_list) 186 | for key,value in self.passive_lb.build_concept_relation_dict(edge_list).items(): 187 | if key not in concept_relation_dict: 188 | concept_relation_dict[key] = set() 189 | concept_relation_dict[key] |= value 190 | return concept_relation_dict 191 | -------------------------------------------------------------------------------- /web_app/yai/static/js/template/jsonld_handler.js: -------------------------------------------------------------------------------- 1 | function get_predicate_templatized_label(ancestor, predicate, data) 2 | { 3 | var predicate_data = {}; 4 | predicate_data[predicate] = data; 5 | var templatized_text_list = get_template_list(predicate_data, ancestor); 6 | if (templatized_text_list.length > 0) 7 | return templatized_text_list[0]; 8 | return get_default_predicate_template(predicate, data, ancestor); 9 | } 10 | 11 | function 
annotate_nestedlist(nested_tree, annotation_list, annotation_fn) { 12 | if (isArray(nested_tree)) 13 | nested_tree.map(c=>annotate_nestedlist(c, annotation_list, annotation_fn)); 14 | else 15 | { 16 | nested_tree.text = annotate_hmtl(nested_tree.text, annotation_list, annotation_fn); 17 | // console.log(nested_tree.text); 18 | if (nested_tree.children && nested_tree.children.length) 19 | nested_tree.children.map(c=>annotate_nestedlist(c, annotation_list, annotation_fn)); 20 | } 21 | return nested_tree 22 | } 23 | 24 | function nest_jsonld(data, uri_dict, ignore_set=null, max_depth=null, parent_set=null, depth=0) 25 | { 26 | parent_set = new Set(parent_set); 27 | ignore_set = new Set(ignore_set); 28 | // Replace using uri dict 29 | if (isRDFItem(data)) 30 | { 31 | var desc = get_RDFItem_description(data); 32 | if (ignore_set.has(desc)) 33 | return data; 34 | const is_url = isURL(desc); 35 | if (is_url && parent_set.has(desc)) 36 | return data; 37 | if (!(desc in uri_dict)) 38 | return data; 39 | data = uri_dict[desc]; 40 | if (is_url && isDict(data)) 41 | { 42 | const data_uri = get_description(data, false); 43 | const data_label = get_description(data); 44 | data = {'@id': build_RDF_item(data_uri)} 45 | if (data_uri!=data_label) 46 | data[LABEL_URI] = build_RDF_item(data_label); 47 | return data; 48 | } 49 | depth += 1; 50 | if (max_depth && depth > max_depth) 51 | return data; 52 | } 53 | if (isArray(data)) 54 | return data.map(x=>nest_jsonld(x, uri_dict, ignore_set, max_depth, parent_set, depth)); 55 | if (isDict(data)) 56 | { 57 | var new_data = {} 58 | for (var [k,v] of Object.entries(data)) 59 | { 60 | if (k == '@id') 61 | { 62 | parent_set.add(get_RDFItem_description(v)); 63 | new_data[k] = v; 64 | continue; 65 | } 66 | const v_desc = get_description(v, false); 67 | if (ignore_set.has(v_desc)) 68 | new_data[k] = build_RDF_item(v_desc); 69 | else 70 | new_data[k] = nest_jsonld(v, uri_dict, ignore_set, max_depth, parent_set, depth); 71 | } 72 | return new_data; 73 | } 74 | return data; 75 | } 76 | 77 | function jsonld_to_nestedlist(data, depth=0, predicate=null, ancestor=null) // Display JSON-LD data as a HTML tree view 78 | { 79 | var node_id = depth + 1; 80 | var is_first = depth==0; 81 | var current_depth = depth; 82 | // Define routine/goto to get a fragment 83 | function get_fragment(o, p, ancestor_p) { // local function, keep it local 84 | node_id = depth + 1; 85 | var sub_tree_dict = null; 86 | [sub_tree_dict, depth] = jsonld_to_nestedlist(o, node_id, p, ancestor_p); 87 | return sub_tree_dict 88 | } 89 | // Avoid useless nesting 90 | if (isArray(data) && data.length==1) 91 | data = data[0]; 92 | // // Get predicate 93 | // if (predicate === null) 94 | // predicate = 'Document'; 95 | 96 | // Get tree text and predicate link 97 | var tree_dict = get_predicate_templatized_label(ancestor, predicate, data); 98 | // Build html fragment 99 | if (isDict(data)) 100 | { 101 | // add children 102 | var child_list = []; 103 | var already_processed_predicates = []; 104 | // get templatized text list 105 | var template_list = get_template_list(data, ancestor); 106 | for (var template_dict of template_list) 107 | { 108 | already_processed_predicates = already_processed_predicates.concat(template_dict['predicate_list']); 109 | // create new child 110 | var new_child = template_dict; 111 | // add children 112 | if (!template_dict['hide_descendants']) 113 | { 114 | var sub_child_list = []; 115 | for (var p of template_dict['predicate_list']) 116 | { 117 | if (p=='@id') 118 | continue; 119 | var 
object = data[p]; 120 | 121 | var sub_fragment = get_fragment(object, p, p); 122 | if (template_dict['keys_to_hide_as_child'].includes(p) || template_dict['predicate_list'].length==1) 123 | { // this predicate has been requested to be removed, by the template, save its children 124 | if ('children' in sub_fragment) 125 | sub_child_list = sub_child_list.concat(sub_fragment['children']); 126 | } 127 | else 128 | sub_child_list.push(sub_fragment); 129 | } 130 | if (sub_child_list.length > 0) 131 | new_child['children'] = sub_child_list; 132 | } 133 | // push child into child list 134 | child_list.push(new_child); 135 | } 136 | // process the remaining properties 137 | for (var [p,o] of Object.entries(data)) 138 | { 139 | if (already_processed_predicates.includes(p)) 140 | continue; 141 | if (p=='@id') 142 | continue; 143 | if (isArray(o) && o.length==0) 144 | continue; 145 | // add fragment 146 | child_list.push(get_fragment(o, p, predicate)); 147 | } 148 | if (child_list.length > 0) 149 | tree_dict["children"] = child_list; 150 | } 151 | else if (isArray(data)) 152 | { 153 | var child_list = []; 154 | for (var i in data) // add fragment 155 | child_list.push(get_fragment(data[i], predicate, predicate)); 156 | 157 | if (child_list.length > 0) 158 | tree_dict["children"] = child_list; 159 | } 160 | if (!is_first) 161 | return [tree_dict, depth]; 162 | 163 | if (!('children' in tree_dict)) 164 | return []; 165 | 166 | return flatten_single_childed_trees(clean_tree(tree_dict['children'])); 167 | } 168 | 169 | function flatten_single_childed_trees(tree_dict_list) 170 | { 171 | for (var tree_dict of tree_dict_list) 172 | { 173 | if (!('children' in tree_dict)) 174 | continue 175 | if (tree_dict['children'].length == 1 && !tree_dict['is_in_array']) 176 | { 177 | var child = tree_dict['children'][0]; 178 | if ('children' in child) 179 | { 180 | var last_char = tree_dict['text'].slice(-1); 181 | if (last_char == ':') 182 | tree_dict['text'] = tree_dict['text'].slice(0,-1)+'.'; 183 | else if (last_char != '.') 184 | tree_dict['text'] += '.'; 185 | tree_dict['text'] += ' '+child['text']; 186 | tree_dict['children'] = child['children']; 187 | } 188 | } 189 | tree_dict['children'] = flatten_single_childed_trees(tree_dict['children']); 190 | } 191 | return tree_dict_list 192 | } 193 | 194 | function clean_tree(tree_dict_list) 195 | { // remove trees with empty text 196 | var new_tree_dict_list = []; 197 | for (var tree_dict of tree_dict_list) 198 | { 199 | tree_dict['text'] = tree_dict['text'].trim(); 200 | if (tree_dict['children'] && tree_dict['children'].length>0) 201 | { 202 | // // array template 203 | // if (tree_dict['is_in_array']) 204 | // { 205 | // var alternative_text_list = tree_dict['children'].filter(x=>x['predicate_list'].includes('@id')).sort((a,b)=>b['predicate_list'].length-a['predicate_list'].length).map(x=>x['text']); 206 | // if (alternative_text_list.length > 0) 207 | // tree_dict['text'] = alternative_text_list[0]; 208 | // } 209 | // recursive call 210 | tree_dict['children'] = clean_tree(tree_dict['children']); 211 | // remove blank nodes 212 | if (!tree_dict['text']) 213 | { 214 | // for (var c of tree_dict['children']) 215 | // c['label'] = tree_dict['label'] 216 | new_tree_dict_list = new_tree_dict_list.concat(tree_dict['children']); 217 | } 218 | else 219 | { 220 | // remove children having the same text of the parent 221 | var new_children = []; 222 | const children = tree_dict['children']; 223 | for (var child of children) 224 | { 225 | if (child['text']==tree_dict['text']) 
226 | new_children = new_children.concat(child['children']); 227 | else 228 | new_children.push(child); 229 | } 230 | // if (new_children.length == 1 && 'children' in new_children[0]) 231 | // new_children = new_children[0]['children']; 232 | tree_dict['children'] = new_children; 233 | new_tree_dict_list.push(tree_dict); 234 | } 235 | } 236 | else if (tree_dict['text']) 237 | new_tree_dict_list.push(tree_dict); 238 | } 239 | return new_tree_dict_list; 240 | } 241 | -------------------------------------------------------------------------------- /web_app/yai/static/js/stage_builder/item_stage_builder.js: -------------------------------------------------------------------------------- 1 | SCHEMA = 'https://schema.org/' 2 | 3 | function extract_entity_by_type_list(type_list, graph) 4 | { 5 | if (isArray(graph)) 6 | { 7 | var process_list = []; 8 | var new_graph = []; 9 | for (var g of graph) 10 | { 11 | var [extracted_processes, new_g] = extract_entity_by_type_list(type_list, g); 12 | if (extracted_processes.length > 0) 13 | process_list = process_list.concat(extracted_processes); 14 | if (new_g) 15 | new_graph.push(new_g); 16 | } 17 | if (new_graph.length == 0) 18 | new_graph = null; 19 | return [process_list,new_graph]; 20 | } 21 | else if (isDict(graph)) 22 | { 23 | if (TYPE_URI in graph && type_list.includes(graph[TYPE_URI]['@value'])) 24 | return [[graph],null]; 25 | 26 | var process_list = []; 27 | var new_graph = {}; 28 | for (var [k,g] of Object.entries(graph)) 29 | { 30 | var [extracted_processes, new_g] = extract_entity_by_type_list(type_list,g); 31 | process_list = process_list.concat(extracted_processes); 32 | new_graph[k] = new_g?new_g:g['@id']; 33 | } 34 | return [process_list,new_graph]; 35 | } 36 | return [[],Object.assign({},graph)]; 37 | } 38 | 39 | function get_URI_graph(graph, key, collector, recursion=true, is_first=true, class_ground=new Map()) 40 | { 41 | function merge_URI_dict(a,b) { 42 | for (var [class_name, graph_list] of Object.entries(b)) 43 | { 44 | if (!(class_name in a)) 45 | a[class_name] = [] 46 | a[class_name] = a[class_name].concat(b[class_name]) 47 | } 48 | return a 49 | } 50 | function add_unknown_graph_to_dict(n,d) { 51 | if (!(UNKNOWN_TYPE_URI in d)) 52 | d[UNKNOWN_TYPE_URI] = [] 53 | d[UNKNOWN_TYPE_URI].push(n) 54 | } 55 | var class_dict = {} 56 | if (isArray(graph)) 57 | { 58 | for (var i in graph) 59 | { 60 | new_class_dict = get_URI_graph(graph[i], key, collector, recursion, false, class_ground) 61 | class_dict = merge_URI_dict(class_dict, new_class_dict) 62 | if (is_first && Object.keys(new_class_dict).length==0) 63 | add_unknown_graph_to_dict(graph[i],class_dict) 64 | } 65 | } 66 | else if (isDict(graph)) 67 | { 68 | if (key in graph) 69 | { 70 | var graph_key_list = (!isArray(graph[key]))?[graph[key]]:graph[key] 71 | for (var i in graph_key_list) 72 | { 73 | var class_name = graph_key_list[i]['@value'] 74 | if (!isURL(class_name)) 75 | { 76 | var context = ('@context' in graph)?graph['@context']['@value']:SCHEMA 77 | //console.log(graph['@context'],graph_key_list[i]) 78 | if (context!='') 79 | class_name = pathJoin([context,class_name]) 80 | } 81 | if (!(class_name in class_dict)) 82 | class_dict[class_name] = [] 83 | graph = Object.assign({},graph) // deep copy 84 | //delete graph[key] 85 | class_dict[class_name].push(graph) 86 | class_ground.set(class_name, graph_key_list[i]['@ground']) 87 | } 88 | } 89 | else if (is_first) 90 | add_unknown_graph_to_dict(graph,class_dict) 91 | 92 | if (recursion) 93 | { 94 | for (var tuple of 
Object.entries(graph)) 95 | { 96 | if (tuple[0] == key) 97 | continue 98 | var object_list = tuple[1] 99 | //console.log(object_list, get_URI_graph(object_list, key, collector, recursion, false)) 100 | class_dict = merge_URI_dict(class_dict, get_URI_graph(object_list, key, collector, recursion, false, class_ground)) 101 | } 102 | } 103 | } 104 | if (!is_first) 105 | return class_dict 106 | 107 | var class_list = [] 108 | for (var [class_name, type_graph] of Object.entries(class_dict)) 109 | { 110 | var class_dict = {} 111 | class_dict['@id'] = build_RDF_item(class_name, class_ground.get(class_name)) 112 | class_dict[collector] = type_graph 113 | class_list.push(class_dict) 114 | } 115 | return class_list 116 | } 117 | 118 | function assign_unique_ids_to_graph(graph, anonymous_node_count=0, id_base='') 119 | { 120 | if (isArray(graph)) 121 | { 122 | for (var i in graph) 123 | anonymous_node_count = assign_unique_ids_to_graph(graph[i], anonymous_node_count, id_base); 124 | } 125 | else if (isDict(graph)) 126 | { 127 | if (!('@id' in graph)) 128 | { 129 | graph['@id'] = build_RDF_item(`my:AnonymousEntity_${id_base}_${anonymous_node_count}`); 130 | anonymous_node_count += 1; 131 | } 132 | for (var [key,value] of Object.entries(graph)) 133 | anonymous_node_count = assign_unique_ids_to_graph(value, anonymous_node_count, id_base); 134 | } 135 | return anonymous_node_count; 136 | } 137 | 138 | function build_minimal_entity_graph(graph) 139 | { 140 | // graph = format_jsonld(graph) 141 | var new_graph_list = [] 142 | // Assign unique ids 143 | assign_unique_ids_to_graph(graph); 144 | // Get entity dict 145 | var graph_list = get_URI_graph(graph, '@id', HAS_ENTITY_URI) 146 | // Merge graphs with same entity 147 | for (var ent_idx in graph_list) 148 | { 149 | var entity_id = graph_list[ent_idx]['@id'] 150 | var entity_graph_list = graph_list[ent_idx][HAS_ENTITY_URI] 151 | // keep the biggest graph as central graph and merge the others, this would help keeping grounds as less redundant as possible 152 | entity_graph_list.sort(function(a, b){ 153 | // ASC -> a.length - b.length 154 | // DESC -> b.length - a.length 155 | return b.length - a.length; 156 | }); 157 | 158 | var entity_graph = entity_graph_list[0] // keep the biggest graph as central graph and merge the others 159 | for (var i=1; i [key, graph[key]]); 257 | // Sort the array based on the second element 258 | items = items.sort(function(a, b){ 259 | var a=isURL(a[0])?getPath(a[0]):a[0], b=isURL(b[0])?getPath(b[0]):b[0] 260 | // put special elements first 261 | special_comparison = abstract_comparison(is_special,a,b) 262 | if (special_comparison !== null) 263 | return special_comparison 264 | // put relevant elements second 265 | relevant_comparison = abstract_comparison(is_relevant,a,b) 266 | if (relevant_comparison !== null) 267 | return relevant_comparison 268 | // put remaining elements last 269 | if(a < b) 270 | return -1; 271 | if(a > b) 272 | return 1; 273 | return 0; 274 | }); 275 | //console.log(items) 276 | var ordered_graph = {} 277 | for (var [k,v] of items) 278 | ordered_graph[k]=sort_graph(v) 279 | graph = ordered_graph 280 | } 281 | else if (isArray(graph)) 282 | { 283 | //var items = Object.keys(graph).map(e => getPath(e['@id'])); 284 | for (var i in graph) 285 | graph[i] = sort_graph(graph[i]) 286 | } 287 | return graph 288 | } 289 | -------------------------------------------------------------------------------- /web_app/yai/static/js/template/template_lib.js: 
-------------------------------------------------------------------------------- 1 | var KNOWN_ENTITY_DICT = {}; 2 | 3 | function get_known_label(id) 4 | { 5 | id = prefixed_string_to_uri(id); 6 | if (id in KNOWN_ENTITY_DICT) 7 | { 8 | const desc = get_dict_description(KNOWN_ENTITY_DICT[id]); 9 | return isURL(desc)?format_link(desc, false):desc; 10 | } 11 | return null; 12 | } 13 | 14 | function get_formatted_tuple_template(is_in_array, value, single_element_fn, multiple_elements_fn, array_item_fn=null, max_length=null) 15 | { 16 | if (isArray(value) && value.length == 1) 17 | value = value[0] 18 | var original_value = value; 19 | var is_dict = isDict(original_value); 20 | var is_rdf = isRDFItem(original_value); 21 | // if (is_in_array) 22 | // console.log(is_dict, value); 23 | 24 | if (is_dict) 25 | value = get_dict_description(value, as_label=false) 26 | else if (is_rdf) 27 | value = get_RDFItem_description(value); 28 | if (!isArray(value)) 29 | { 30 | const value_is_url = isURL(value); 31 | const known_label = titlefy(get_known_label(value)); 32 | if (value_is_url) 33 | value = linkify(value, known_label); 34 | else if (known_label) 35 | value = known_label; 36 | value = clip_text(value,max_length); 37 | var template = ''; 38 | if (is_in_array) 39 | { 40 | if (array_item_fn == null) 41 | { 42 | if (value_is_url) 43 | array_item_fn = x=>x; 44 | else 45 | array_item_fn = x=>'«'+x+'»'; 46 | } 47 | template = value?array_item_fn(value):''; 48 | } 49 | else 50 | template = single_element_fn(value); 51 | if (!template) 52 | return ''; 53 | // if (is_dict) 54 | // template += ':'; 55 | return template; 56 | } 57 | // value is Array 58 | return value.length==0?'':(multiple_elements_fn(value) + ':'); 59 | } 60 | 61 | function clip_text(text, max_length) 62 | { 63 | if (max_length && text.length>max_length) 64 | text = text.slice(0,max_length) + '[...]'; 65 | return text; 66 | } 67 | 68 | function get_default_predicate_template(predicate, object, ancestor_predicate) 69 | { 70 | var is_in_array = is_array_element(predicate, ancestor_predicate); 71 | // console.log(is_in_array, predicate, ancestor_predicate) 72 | if (isURL(predicate)) 73 | predicate = linkify(predicate); 74 | templatized_text = get_formatted_tuple_template(is_in_array, object, 75 | x=>'The '+predicate+' of this resource is «'+x+'»', 76 | x=>'The '+predicate+' of this resource are', 77 | ); 78 | return { 79 | 'is_in_array': is_in_array, 80 | 'predicate_list': [predicate], 81 | 'text': templatized_text, 82 | }; 83 | } 84 | 85 | function get_table(heads, values_list) 86 | { 87 | var head_row = ''; 88 | head_row += '' 89 | for (var h of heads) 90 | head_row += `${h}`; 91 | head_row += '' 92 | 93 | var value_row = ''; 94 | for (var values of values_list) 95 | { 96 | value_row += '' 97 | for (var v of values) 98 | value_row += `${v}` 99 | value_row += '' 100 | } 101 | return `${head_row}${value_row}
`; 102 | } 103 | 104 | function get_known_concepts_from_annotated_sentences(annotated_sentence_list, related_concepts_limit=null) 105 | { 106 | var annotation_list_uri = prefixed_string_to_uri('my:annotationList'); 107 | var word_annotation_list_uri = prefixed_string_to_uri('my:wordLevelAnnotationList'); 108 | var related_to_uri = prefixed_string_to_uri('my:relatedTo'); 109 | var relation_list = []; 110 | for (var annotated_sentence of annotated_sentence_list) 111 | { 112 | var annotation_list = get(annotated_sentence,annotation_list_uri,[]); 113 | if (word_annotation_list_uri in annotated_sentence) 114 | { 115 | var word_annotation_list = annotated_sentence[word_annotation_list_uri]; 116 | for (var word_annotation of word_annotation_list) 117 | annotation_list = annotation_list.concat(word_annotation[annotation_list_uri]); 118 | } 119 | for (var annotation of annotation_list) 120 | { 121 | if (related_to_uri in annotation) 122 | relation_list.push(get_RDFItem_description(annotation[related_to_uri])); 123 | } 124 | } 125 | // display only unique relations 126 | relation_list = [...new Set(relation_list)]; 127 | // keep only the first 3 elements 128 | if (related_concepts_limit) 129 | relation_list = relation_list.slice(0, related_concepts_limit); 130 | var result = relation_list.map(x=>linkify(x)).join(', '); 131 | if (relation_list.length >= related_concepts_limit) 132 | result += ', etc..'; 133 | return result; 134 | } 135 | 136 | function try_to_apply_dict_to_template(ancestor, dict, template) 137 | { 138 | var predicate_list = []; 139 | var value_list = []; 140 | var keys = template['keys'].map(prefixed_string_to_uri); 141 | var optional_keys = new Set(get(template, 'optional_keys', []).map(prefixed_string_to_uri)); 142 | for (var predicate of keys) 143 | { 144 | var predicate_found = predicate in dict; 145 | if (!predicate_found) 146 | { 147 | var predicate_is_optional = optional_keys.has(predicate); 148 | if (!predicate_is_optional) 149 | return null; 150 | value_list.push(null); 151 | } 152 | else 153 | { 154 | var object = dict[predicate]; 155 | if (isRDFItem(object)) 156 | object = get_RDFItem_description(object); 157 | // else if (isArray(object) && object.length==0) 158 | // return null; 159 | value_list.push(object); 160 | predicate_list.push(predicate); 161 | } 162 | } 163 | if (value_list.filter(x=>x != null).length == 0) 164 | return null; 165 | var is_in_array = false; 166 | if (predicate_list.length==1) 167 | { 168 | for (var k of keys) // this should handle grouped lists with different predicates (for which is_in_array is clearly true) 169 | { 170 | if (is_array_element(k, ancestor)) 171 | { 172 | is_in_array = true; 173 | break; 174 | } 175 | } 176 | } 177 | return Object.assign({}, template, { 178 | 'is_in_array': is_in_array, 179 | 'predicate_list': predicate_list, 180 | 'keys_to_hide_as_child': get(template, 'keys_to_hide_as_child', []).map(prefixed_string_to_uri), 181 | 'label': get(template, 'label', null), 182 | 'text': template['template_fn'](is_in_array, ancestor, value_list), 183 | }); 184 | } 185 | 186 | function is_array_element(prefixed_predicate, ancestor) 187 | { 188 | return prefixed_string_to_uri(prefixed_predicate)==prefixed_string_to_uri(ancestor); 189 | } 190 | 191 | function get_template_list(dict, ancestor=null) 192 | { 193 | var graph_templates = []; 194 | for (var t in TEMPLATE_LIST) 195 | { 196 | var template = TEMPLATE_LIST[t]; 197 | var applied_template = try_to_apply_dict_to_template(ancestor, dict, template); 198 | if (applied_template) 199 | 
{ 200 | applied_template['position'] = t; 201 | graph_templates.push(applied_template); 202 | } 203 | } 204 | // some templates may overlap, remove all the redundant templates (that are those for which all the parameters are contained into another template with a greater or equal number of parameters) 205 | var filtered_graph_templates = []; 206 | // sort by predicate list length 207 | graph_templates = graph_templates.sort((a, b) => a.predicate_list.length-b.predicate_list.length); // ascending order 208 | for (var k=0; k < graph_templates.length; k++) 209 | { 210 | var template_dict = graph_templates[k]; 211 | var overlap = false; 212 | for (var i=k+1; !overlap && i a.position-b.position); // ascending order 232 | return filtered_graph_templates; 233 | } 234 | 235 | function to_external_link(link, name=null) 236 | { 237 | link = String(link).replace(/<|>|"/gi,''); 238 | if (!name) 239 | name = isURL(link) ? format_link(link) : link; 240 | 241 | return `${name}`; 242 | } 243 | 244 | function linkify(link, name=null) 245 | { 246 | link = String(link).replace(/<|>|"/gi,''); 247 | if (!name) 248 | name = isURL(link) ? format_link(link) : link; 249 | 250 | return template_expand(name,link); 251 | } 252 | 253 | function template_expand(name,topic=null) 254 | { 255 | if (!topic) 256 | topic = name 257 | return `${name}`; 261 | } 262 | 263 | function counterfactual_input(counterfactual_api_url, feature_order, value) 264 | { 265 | return ``; 272 | } 273 | -------------------------------------------------------------------------------- /web_app/oke/core/models/knowledge_extraction/ontology_builder.py: -------------------------------------------------------------------------------- 1 | from models.knowledge_extraction.knowledge_graph_builder import KnowledgeGraphBuilder 2 | from models.knowledge_extraction.lattice_builder import ActiveActionTypingLatticeBuilder 3 | from misc.graph_builder import get_root_set, get_concept_set, get_predicate_set, get_object_set, get_connected_graph_list, get_ancestors, filter_graph_by_root_set, tuplefy 4 | from misc.graph_builder import save_graph 5 | from misc.jsonld_lib import * 6 | 7 | from more_itertools import unique_everseen 8 | import re 9 | 10 | import nltk 11 | from nltk.corpus import wordnet as wn 12 | 13 | class OntologyBuilder(KnowledgeGraphBuilder): 14 | WORDNET_TO_PATTERN_MAP = { 15 | 'pro:RoleInTime': ['causal_agent\.n\.01'], 16 | # 'foaf:Person': ['person\.n\.01'], 17 | 'foaf:Organization': ['organization\.n\.01'], 18 | 'ti:TimeInterval': ['time_period\.n\.01'], 19 | 'pro:InformationObject': ['information','written_communication'], 20 | 'pro:Place': ['location\.n\.01'], 21 | # 'owl:Thing': ['object'], 22 | # 'pro:Role': ['role\.n\.01'], 23 | 'pwo:Action': ['action\.n\.01'], 24 | # 'pwo:Workflow': ['process\.n\.06'], 25 | 'pro:Obligation': ['obligation\.n'], 26 | } 27 | 28 | KNOWN_ONTO_PATTERN_EDGE_LIST = [ 29 | # Agent-Role pattern 30 | ('pro:RoleInTime', SUBCLASSOF_PREDICATE, 'pro:Role'), 31 | ('lkif:Agent', 'pro:holdsRoleInTime', 'pro:RoleInTime'), 32 | ('pro:RoleInTime', 'pro:withRole', 'pro:Role'), 33 | ('pro:RoleInTime', 'tvc:atTime', 'ti:TimeInterval'), 34 | ('pro:RoleInTime', 'pro:relatesToDocument', 'foaf:Document'), 35 | ('pro:RoleInTime', 'pro:relatesToPerson', 'foaf:Person'), 36 | ('pro:RoleInTime', 'pro:relatesToOrganization', 'foaf:Organization'), 37 | # TVC pattern 38 | ('pro:ValueInTime', 'pro:withValue', 'owl:Thing'), 39 | ('owl:Thing', 'pro:hasValue', 'pro:ValueInTime'), 40 | ('pro:ValueInTime', 'pro:withContext', 'owl:Thing'), 41 | 
('pro:ValueInTime', 'tvc:atTime', 'pro:Instant'), 42 | ('pro:ValueInTime', 'pro:atTime', 'ti:TimeInterval'), 43 | ('owl:Thing', CAN_BE_PREDICATE, 'lkif:Jurisdiction'), 44 | ('owl:Thing', CAN_BE_PREDICATE, 'pro:Place'), 45 | # Process pattern 46 | ('pwo:WorkflowExecution', 'pwo:executes', 'pwo:Workflow'), 47 | ('pwo:WorkflowExecution', 'pwo:involvesAction', 'pwo:Action'), 48 | ('pwo:Workflow', 'pwo:hasStep', 'pwo:Step'), 49 | ('pwo:Workflow', 'pwo:hasFirstStep', 'pwo:Step'), 50 | ('pwo:Step', 'pwo:hasNextStep', 'pwo:Step'), 51 | ('pwo:Step', 'pwo:produces', 'owl:Thing'), 52 | ('pwo:Step', 'pwo:needs', 'owl:Thing'), 53 | ('pwo:Action', 'tisit:atTime', 'ti:TimeInterval'), 54 | ('pwo:Step', 'taskex:isExecutedIn', 'owl:Thing'), 55 | ('pwo:Step', 'parameter:hasParameter', 'time:DurationDescription'), 56 | # Deontic Ontology 57 | ('pro:DeonticSpecification', 'pro:hasPointed', 'pro:AuxiliaryParty'), 58 | ('pro:DeonticSpecification', 'pro:isHeld', 'pro:Interval'), 59 | ('pro:Bearer', 'pro:setsUp', 'pro:DeonticSpecification'), 60 | ('pro:DeonticSpecification', 'pro:componentOf', 'pro:LegalRule'), 61 | ('pro:Permission', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 62 | ('pro:Right', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 63 | ('pro:Permission', 'pro:foundedOn', 'pro:Right'), 64 | ('pro:Compliance', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 65 | ('pro:Obligation', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 66 | ('pro:Compliance', 'pro:complies', 'pro:Obligation'), 67 | ('pro:Right', 'pro:generates', 'pro:Obligation'), 68 | ('pro:Violation', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 69 | ('pro:Prohibition', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 70 | ('pro:Violation', 'pro:foundedOn', 'pro:Obligation'), 71 | ('pro:Violation', 'pro:foundedOn', 'pro:Prohibition'), 72 | ('pro:PrescriptiveRule', SUBCLASSOF_PREDICATE, 'pro:LegalRule'), 73 | ('pro:ConstitutiveRule', SUBCLASSOF_PREDICATE, 'pro:LegalRule'), 74 | ('pro:Penalty', 'pro:repairs', 'pro:PrescriptiveRule'), 75 | # PrOnto Ontology 76 | ('pwo:Action', 'pro:generateRevision', 'allot:FRBRExpression'), 77 | ('pwo:Action', 'pro:isActedBy', 'lkif:Agent'), 78 | ('pwo:Action', 'taskex:executesTask', 'pwo:Step'), 79 | ('pwo:Workflow', 'pwo:hasStep', 'pwo:Step'), 80 | ('pwo:Step', 'pwo:needs', 'pro:InformationObject'), 81 | ('pwo:Step', 'pro:hasStepType', 'pro:StepType'), 82 | ('pwo:Step', 'pwo:produces', 'pro:InformationObject'), 83 | ('pwo:Step', 'pwo:needs', 'pro:InformationObject'), 84 | ('pwo:Step', 'pro:commits', 'pro:LegalRule'), 85 | ('pro:LegalRule', SUBCLASSOF_PREDICATE, 'pro:PrescriptiveRule'), 86 | ('pro:LegalRule', SUBCLASSOF_PREDICATE, 'pro:ConstitutiveRule'), 87 | ('pro:DeonticSpecification', 'pro:componentOf', 'pro:LegalRule'), 88 | ('pro:DeonticSpecification', 'pro:isHeld', 'ti:TimeInterval'), 89 | ('pro:ValueInTimeAndContext', 'tvc:atTime', 'ti:TimeInterval'), 90 | ('pro:ValueInTimeAndContext', 'tvc:withinContext', 'pro:Place'), 91 | ('pro:ValueInTimeAndContext', 'tvc:withinContext', 'lkif:Jurisdiction'), 92 | ('pro:ValueInTimeAndContext', 'tvc:withValue', 'pro:Value'), 93 | ] 94 | 95 | def __init__(self, model_options): 96 | # nltk.download('wordnet') 97 | self.max_syntagma_length = model_options.get('max_syntagma_length', 5) 98 | self.add_source = model_options.get('add_source', False) 99 | self.add_label = model_options.get('add_label', True) 100 | self.lemmatize_label = model_options.get('lemmatize_label', False) 101 | self.lattice_builder = 
ActiveActionTypingLatticeBuilder(templatize=False) 102 | super().__init__(model_options) 103 | 104 | @staticmethod 105 | def print_graph(edge_iter, file_name): 106 | edge_iter = filter(lambda x: '{obj}' not in x[1], edge_iter) 107 | edge_list = list(edge_iter) 108 | print(f'Printing {file_name} with {len(edge_list)} triples') 109 | save_graph(edge_list, file_name, max(min(256,len(edge_list)/2),32)) 110 | 111 | def build_edge_list(self): 112 | edge_list = super().build( 113 | max_syntagma_length=self.max_syntagma_length, 114 | add_subclasses=True, 115 | use_wordnet=True, 116 | add_source=self.add_source, 117 | add_label=self.add_label, 118 | lemmatize_label=self.lemmatize_label, 119 | to_rdf=True, 120 | ) 121 | edge_list = tuplefy(edge_list) 122 | return edge_list 123 | 124 | @staticmethod 125 | def get_hypernym_edge_list(concept_set): 126 | hyper = lambda s: s.hypernyms() 127 | # hypo = lambda s: s.hyponyms() 128 | concept_hypernyms_dict = {} 129 | for concept in concept_set: 130 | if not isinstance(concept, str) or not concept.startswith(WORDNET_PREFIX): 131 | continue 132 | synset = wn.synset(concept[3:]) # remove string WORDNET_PREFIX, 3 chars 133 | concept_hypernyms_dict[concept] = set(synset.closure(hyper)).union((synset,)) 134 | 135 | hypernym_edge_list = [ 136 | (concept, SUBCLASSOF_PREDICATE, WORDNET_PREFIX+hypernym.name()) 137 | for concept, hypernym_set in concept_hypernyms_dict.items() 138 | for hypernym in hypernym_set 139 | ] 140 | return hypernym_edge_list 141 | 142 | def extract_minimal_taxonomy(self, main_edge_list): 143 | concept_set = get_concept_set(main_edge_list) 144 | hypernym_edge_list = self.get_hypernym_edge_list(concept_set) 145 | hypernym_edge_list = self.lattice_builder.build_lattice(hypernym_edge_list) 146 | return list(unique_everseen(hypernym_edge_list)) 147 | 148 | @staticmethod 149 | def format_taxonomy(hypernym_edge_list): 150 | # hypernym_edge_list to RDF 151 | taxonomy_edge_list = [] 152 | for subj_list, pred_list, obj_list in hypernym_edge_list: 153 | for subj in subj_list: 154 | for _,pred in pred_list: 155 | for obj in obj_list: 156 | taxonomy_edge_list.append((pred,SUBCLASSOF_PREDICATE,subj)) 157 | if obj != pred: 158 | taxonomy_edge_list.append((obj,SUBCLASSOF_PREDICATE,pred)) 159 | return taxonomy_edge_list 160 | 161 | def connect_taxonomy_to_patterns(self, hypernym_edge_list): 162 | # get sorted concept set by moving parents on top of children 163 | hypernym_concept_set = get_concept_set(hypernym_edge_list) 164 | hypernym_concept_ancestors_list = [ 165 | (c, get_ancestors(c, hypernym_edge_list)) 166 | for c in hypernym_concept_set 167 | ] 168 | hypernym_concept_ancestors_list.sort(key=lambda x: len(x[-1])) 169 | 170 | pattern_edge_list = [] 171 | type_set_dict = {} 172 | for root,_ in hypernym_concept_ancestors_list: 173 | for key,value_list in self.WORDNET_TO_PATTERN_MAP.items(): 174 | for value in value_list: 175 | if re.search(re.compile(value), root) is not None: 176 | pattern_edge_list.append((root,'rdf:type',key)) 177 | if key not in type_set_dict: 178 | type_set_dict[key] = set() 179 | type_set_dict[key].add(root) 180 | 181 | for root,root_ancestors in hypernym_concept_ancestors_list: 182 | root_intension = set( 183 | predicate 184 | for fcc in self.lattice_builder.formal_concept_context_list 185 | for _,predicate in fcc.intension([root.strip()]) 186 | ) 187 | for intension in root_intension: 188 | for key,value_list in self.WORDNET_TO_PATTERN_MAP.items(): 189 | for value in value_list: 190 | if re.search(re.compile(value), intension) is 
not None: 191 | if ( key not in type_set_dict ) or ( next(filter(lambda a: a in type_set_dict[key], root_ancestors), None) is None ): 192 | pattern_edge_list.append((root,'rdf:type',key)) 193 | if key not in type_set_dict: 194 | type_set_dict[key] = set() 195 | type_set_dict[key].add(root) 196 | return unique_everseen(pattern_edge_list) 197 | 198 | def build(self): 199 | print('Building knowledge graph..') 200 | edge_list = self.build_edge_list() 201 | 202 | print('Extracting minimal taxonomy via FCA..') 203 | hypernym_edge_list = self.extract_minimal_taxonomy(edge_list) 204 | hypernym_concept_set = get_concept_set(hypernym_edge_list) 205 | 206 | print('Connecting known ontology patterns to concept taxonomy..') 207 | pattern_hinge_graph = self.connect_taxonomy_to_patterns(hypernym_edge_list) 208 | # self.print_graph(pattern_hinge_graph, 'kg_hinge') 209 | 210 | print('Creating taxonomy graph..') 211 | taxonomy_graph = self.format_taxonomy(hypernym_edge_list) 212 | # self.print_graph(taxonomy_graph, 'kg_taxonomy') 213 | taxonomy_graph += pattern_hinge_graph 214 | # self.print_graph(taxonomy_graph, 'kg_hinged_taxonomy') 215 | 216 | return edge_list + taxonomy_graph 217 | -------------------------------------------------------------------------------- /web_app/oke/core/models/knowledge_extraction/couple_abstractor.py: -------------------------------------------------------------------------------- 1 | from models.knowledge_extraction.couple_extractor import CoupleExtractor 2 | from misc.jsonld_lib import * 3 | import nltk 4 | from nltk.corpus import framenet as fn 5 | 6 | class CoupleAbstractor(CoupleExtractor): 7 | def abstract_couple_list(self, concept_dict_list): 8 | assert False, 'Not implemented' 9 | 10 | class WordnetAbstractor(CoupleAbstractor): 11 | 12 | # def __init__(self, model_options): 13 | # nltk.download('punkt') 14 | # nltk.download('averaged_perceptron_tagger') 15 | # nltk.download('wordnet') 16 | # super().__init__(model_options) 17 | 18 | ''' 19 | Firstly, relatedness and similarity are easily confused; the distinction is subtle but worth noting. 20 | 21 | Semantic relatedness measures how related two concepts are, using any kind of relation; algorithms: 22 | * Lexical Chains (Hirst and St-Onge, 1998) 23 | * Adapted/Extended Sense Overlaps algorithm (Banerjee and Pedersen, 2002/2003) 24 | * Vectorized Sense Overlaps (Patwardhan, 2003) 25 | 26 | Semantic similarity only considers the IS-A relation (i.e. hypernymy / hyponymy); algorithms: 27 | * Wu-Palmer measure (Wu and Palmer 1994) 28 | * Resnik measure (Resnik 1995) 29 | * Jiang-Conrath measure (Jiang and Conrath 1997) 30 | * Leacock-Chodorow measure (Leacock and Chodorow 1998) 31 | * Lin measure (Lin 1998) 32 | Resnik, Jiang-Conrath and Lin measures are based on information content. The information content of a synset is the negative log of the sum of all probabilities (computed from corpus frequencies) of all words in that synset (Resnik, 1995). 33 | Wu-Palmer and Leacock-Chodorow are based on path length; the similarity between two concepts/synsets depends on the number of nodes along the shortest path between them. 34 | 35 | The list given above is not exhaustive but, historically, pure similarity measures have become somewhat outdated, since relatedness algorithms consider more relation types and should theoretically give more disambiguating power when comparing concepts. 
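For a concrete illustration of the path-based measures listed above (this snippet is not part of the pipeline and assumes the NLTK WordNet corpus has been downloaded), the scores can be inspected directly:
>>> from nltk.corpus import wordnet as wn
>>> dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
>>> dog.path_similarity(cat)  # shortest-path measure
0.2
>>> dog.wup_similarity(cat)  # Wu-Palmer measure
0.8571428571428571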
36 | ''' 37 | def abstract_couple_list(self, concept_dict_list): 38 | from pywsd import disambiguate 39 | from pywsd.similarity import max_similarity 40 | from pywsd.lesk import simple_lesk, adapted_lesk, cosine_lesk 41 | 42 | concept_dict_list = [ 43 | self.get_couple_from_concept(concept_dict) 44 | if 'predicate' not in concept_dict else 45 | concept_dict 46 | for concept_dict in concept_dict_list 47 | ] 48 | disambiguation_cache = {} 49 | for couple in concept_dict_list: 50 | sentence_text = couple['source']['sentence_text'] 51 | if sentence_text not in disambiguation_cache: 52 | sentence_disambiguation = disambiguate( 53 | sentence_text, 54 | algorithm=cosine_lesk, 55 | #similarity_option='wu-palmer', 56 | ) 57 | disambiguation_cache[sentence_text] = {k.lower():v for k,v in sentence_disambiguation} 58 | synset_dict = disambiguation_cache[sentence_text] 59 | couple['concept']['synset'] = synset_dict.get(couple['concept']['text'], None) 60 | for concept_core_dict in couple['concept_core']: 61 | concept_core_dict['synset'] = synset_dict.get(concept_core_dict['text'], None) 62 | couple['predicate_core']['synset'] = synset_dict.get(couple['predicate_core']['text'], None) 63 | return concept_dict_list 64 | 65 | class FramenetAbstractor(CoupleAbstractor): 66 | FRAME_GF_CACHE = {} 67 | FE_IN_LU_BY_DEP_CACHE = {} 68 | LU_LIST = [lu for lu in fn.lus() if lu.name.split('.')[1] == 'v'] 69 | LU_KEY_LIST = [explode_concept_key(lu.name.split('.')[0]) for lu in LU_LIST] 70 | 71 | def __init__(self, model_options): 72 | # nltk.download('punkt') 73 | # nltk.download('averaged_perceptron_tagger') 74 | # nltk.download('framenet_v17') 75 | fn.propagate_semtypes() 76 | super().__init__(model_options) 77 | self.debug = model_options.get('debug', False) 78 | #self.with_frame_annotation = model_options.get('with_frame_annotation', True) 79 | self.lu_confidence_threshold = model_options.get('lu_confidence_threshold', 2/3) 80 | self.concept_confidence_threshold = model_options.get('concept_confidence_threshold', 1/2) 81 | 82 | @staticmethod 83 | def get_FE_and_GF_by_active_LU_annotation_list(lu_annotation_list): 84 | fe_dict = {} 85 | for annotation_dict in lu_annotation_list: 86 | is_passive_LU_embodiement = annotation_dict['is_passive_LU_embodiement'] 87 | for fe_tuple, gf_tuple in zip(annotation_dict['frame_element'],annotation_dict['grammatical_function']): 88 | gf = gf_tuple[-1] 89 | #print(fe_tuple[-1], gf) 90 | if gf not in ['Ext','Obj']: 91 | continue 92 | if is_passive_LU_embodiement: # get gf in active form 93 | gf = 'Ext' if gf == 'Obj' else 'Obj' 94 | fe = fe_tuple[-1] # every lexical unit has only one frame, so we can use the frame element has unique key 95 | if fe not in fe_dict: 96 | fe_dict[fe] = set() 97 | fe_dict[fe].add(gf) 98 | return fe_dict 99 | 100 | def is_passive_LU_embodiement(self, text, lexical_unit_offset): 101 | for token in self.nlp(text): 102 | #print(token, token.idx) 103 | if token.idx == lexical_unit_offset[0]: 104 | predicate_dict = self.get_predicate_dict(token) 105 | if predicate_dict is not None: 106 | return self.is_passive(predicate_dict['predicate']['span']) 107 | break 108 | return False 109 | 110 | def get_LU_annotation_list(self, lu): 111 | return [ 112 | { # dict_keys(['cDate', 'status', 'ID', '_type', 'layer', '_ascii', 'Target', 'FE', 'GF', 'PT', 'Other', 'Sent', 'Verb', 'sent', 'text', 'LU', 'frame']) 113 | #'text': annotation['text'], 114 | #'id': annotation['ID'], 115 | #'lexical_unit': annotation['LU'].name, 116 | 'frame_element': annotation['FE'][0], 117 | 
'grammatical_function': annotation['GF'], 118 | 'phrase_type': annotation['PT'], 119 | #'lexical_unit_offset': annotation['Target'], 120 | 'is_passive_LU_embodiement': self.is_passive_LU_embodiement(annotation['text'], annotation['Target'][0]), 121 | } 122 | for sub_corpus in lu.subCorpus 123 | for sentence in sub_corpus.sentence 124 | for annotation in sentence.annotationSet 125 | if annotation.get('GF',None) is not None 126 | and annotation.get('Target',None) is not None 127 | #and len(annotation['Target']) == 1 # Target 'rule out' has two separated offsets, even if it is a single target 128 | ] 129 | 130 | def get_possible_FE_in_LU_by_dependency(self, lu, dependency): 131 | lu_name = lu.name 132 | cache_key = '{0}.{1}'.format(lu_name, dependency) 133 | if cache_key not in self.FE_IN_LU_BY_DEP_CACHE: 134 | if lu_name not in self.FRAME_GF_CACHE: 135 | lu_annotation_list = self.get_LU_annotation_list(lu) 136 | frame_element_active_grammatical_functions = self.get_FE_and_GF_by_active_LU_annotation_list(lu_annotation_list) 137 | self.FRAME_GF_CACHE[lu_name] = frame_element_active_grammatical_functions 138 | else: 139 | frame_element_active_grammatical_functions = self.FRAME_GF_CACHE[lu_name] 140 | 141 | related_frame = lu.frame 142 | if self.debug: 143 | print('Frame:', str(related_frame.name)) 144 | 145 | abstract_couple_list = [] 146 | for abstract_concept, fe in related_frame.FE.items(): 147 | #if fe.coreType != 'Core': 148 | # continue 149 | 150 | fe_name = fe.name 151 | active_grammatical_functions = frame_element_active_grammatical_functions.get(fe_name,None) 152 | if active_grammatical_functions is None: 153 | continue 154 | valid_gf = 'Obj' if 'obj' in dependency else 'Ext' 155 | if valid_gf not in active_grammatical_functions: 156 | continue 157 | 158 | semantic_type = fe.semType.name if fe.semType is not None else None 159 | abstract_couple_list.append({'frame_element':abstract_concept, 'semantic_type': semantic_type}) 160 | if self.debug: 161 | print('Element:', {'fe': fe_name, 'active_gf': active_grammatical_functions}) 162 | self.FE_IN_LU_BY_DEP_CACHE[cache_key] = abstract_couple_list 163 | else: 164 | abstract_couple_list = self.FE_IN_LU_BY_DEP_CACHE[cache_key] 165 | return abstract_couple_list 166 | 167 | @staticmethod 168 | def stringify_couple(concept, predicate, dependency): 169 | subject = concept if 'subj' in dependency else 'x' 170 | object = concept if 'obj' in dependency else 'x' 171 | str = f'{subject} {predicate} {object}' 172 | #str = str[0].upper() + str[1:] + '.' 
173 | return str 174 | 175 | def abstract_couple(self, couple): 176 | if self.debug: 177 | print('###############################') 178 | print('Couple:', couple) 179 | is_passive_couple = couple['is_passive'] 180 | couple_dependency = 'subj' if ('subj' in couple['dependency'] and not is_passive_couple) or ('obj' in couple['dependency'] and is_passive_couple) else 'obj' 181 | couple_predicate = couple['predicate']['lemma'] 182 | fragment = self.stringify_couple(couple['concept']['lemma'], couple_predicate, couple_dependency) 183 | if self.debug: 184 | print('Fragment:', fragment) 185 | print('Is passive:', is_passive_couple) 186 | 187 | lu_argmax, lu_confidence = self.find_most_similar(couple_predicate, self.LU_KEY_LIST, cached=True) 188 | if lu_confidence < self.lu_confidence_threshold: 189 | return 190 | lu = self.LU_LIST[lu_argmax] 191 | lu_name = self.LU_KEY_LIST[lu_argmax] 192 | if self.debug: 193 | print('Most Similar LU:', lu_name) 194 | print('LU Confidence:', lu_confidence) 195 | 196 | # Find all possible frame elements 197 | abstract_couple_list = self.get_possible_FE_in_LU_by_dependency(lu, couple_dependency) 198 | if len(abstract_couple_list) == 0: 199 | return 200 | 201 | target_list = [ 202 | self.stringify_couple( 203 | explode_concept_key(abstract_concept['frame_element']).lower().strip(), 204 | couple_predicate, 205 | couple_dependency 206 | ) 207 | for abstract_concept in abstract_couple_list 208 | ] 209 | 210 | concept_argmax, concept_confidence = self.find_most_similar(fragment, target_list, cached=True) 211 | if concept_confidence < self.concept_confidence_threshold: 212 | return 213 | most_similar_concept = abstract_couple_list[concept_argmax] 214 | if self.debug: 215 | print('Abstract Concept:', most_similar_concept) 216 | print('Confidence:', concept_confidence) 217 | 218 | # Update couples 219 | couple['concept_annotation'] = { 220 | #'embodiment': couple['concept']['text'], 221 | 'confidence': concept_confidence, 222 | } 223 | couple['concept_annotation'].update(most_similar_concept) 224 | couple['predicate_annotation'] = { 225 | 'lexical_unit': lu_name, 226 | 'frame': lu.frame.name, 227 | 'confidence': lu_confidence, 228 | } 229 | 230 | def abstract_couple_list(self, concept_dict_list): 231 | concept_dict_list = [ 232 | self.get_couple_from_concept(concept_dict) 233 | if 'predicate' not in concept_dict else 234 | concept_dict 235 | for concept_dict in concept_dict_list 236 | ] 237 | for couple in concept_dict_list: 238 | self.abstract_couple(couple) 239 | return concept_dict_list 240 | -------------------------------------------------------------------------------- /web_app/oke/core/models/model_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | # os.environ["CUDA_VISIBLE_DEVICES"]="-1" 3 | import multiprocessing 4 | import types 5 | import spacy # for natural language processing 6 | # import neuralcoref # for Coreference Resolution 7 | # python3 -m spacy download en_core_web_md 8 | from sklearn.preprocessing import normalize 9 | import numpy as np 10 | import tensorflow as tf 11 | # import tensorflow.compat.v1 as tf 12 | # tf.disable_v2_behavior() # use 1.X API 13 | tf.get_logger().setLevel('ERROR') # Reduce logging output. 
14 | gpu_devices = tf.config.experimental.list_physical_devices('GPU') 15 | for dev in gpu_devices: 16 | tf.config.experimental.set_memory_growth(dev, True) 17 | import tensorflow_hub as hub 18 | import tensorflow_text 19 | from pathlib import Path 20 | from transformers import AutoConfig, AutoTokenizer, AutoModelWithLMHead, pipeline 21 | import torch 22 | 23 | from misc.doc_reader import load_or_create_cache, create_cache, load_cache 24 | 25 | import warnings 26 | warnings.filterwarnings('ignore') 27 | 28 | def get_best_gpu(): 29 | if torch.cuda.device_count() == 0: 30 | return -1 31 | return min( 32 | ( 33 | (i,torch.cuda.memory_allocated(i)) 34 | for i in range(torch.cuda.device_count()) 35 | ), 36 | key = lambda x:x[-1] 37 | )[0] 38 | 39 | is_listable = lambda x: type(x) in (list,tuple) 40 | 41 | class ModelManager(): 42 | # static members 43 | __nlp_models = {} 44 | __tf_embedders = {} 45 | __hf_embedders = {} 46 | 47 | def __init__(self, model_options=None): 48 | if not model_options: 49 | model_options = {} 50 | self.model_options = model_options 51 | self.disable_spacy_component = [] 52 | self.__batch_size = model_options.get('batch_size', 100) 53 | 54 | self.__spacy_cache = {} 55 | self.__tf_cache = {} 56 | self.__hf_cache = {} 57 | 58 | self.__spacy_model = model_options.get('spacy_model', 'en_core_web_md') 59 | self.__tf_model = model_options.get('tf_model', {}) 60 | self.__hf_model = model_options.get('hf_model', {}) 61 | 62 | def store_cache(self, cache_name): 63 | cache_dict = { 64 | 'tf_cache': self.__tf_cache, 65 | 'spacy_cache': self.__spacy_cache, 66 | 'hf_cache': self.__hf_cache, 67 | } 68 | create_cache(cache_name, lambda: cache_dict) 69 | 70 | def load_cache(self, cache_name): 71 | loaded_cache = load_cache(cache_name) 72 | if loaded_cache: 73 | 74 | tf_cache = loaded_cache.get('tf_cache',None) 75 | if tf_cache: 76 | self.__tf_cache = tf_cache 77 | 78 | hf_cache = loaded_cache.get('hf_cache',None) 79 | if hf_cache: 80 | self.__hf_cache = hf_cache 81 | 82 | spacy_cache = loaded_cache.get('spacy_cache',None) 83 | if spacy_cache: 84 | self.__spacy_cache = spacy_cache 85 | 86 | @staticmethod 87 | def get_cached_values(value_list, cache, fetch_fn, key_fn=lambda x:x): 88 | missing_values = [q for q in value_list if key_fn(q) not in cache] 89 | if len(missing_values) > 0: 90 | new_values = fetch_fn(missing_values) 91 | cache.update({key_fn(q):v for q,v in zip(missing_values, new_values)}) 92 | return [cache[key_fn(q)] for q in value_list] 93 | 94 | @staticmethod 95 | def load_nlp_model(spacy_model): 96 | print('## Loading Spacy model <{}>...'.format(spacy_model)) 97 | # go here for more information about Language Processing Pipeline (tokenizer, tagger, parser, etc..) 
98 | nlp = spacy.load(spacy_model) 99 | # nlp.add_pipe(nlp.create_pipe("merge_noun_chunks")) 100 | # nlp.add_pipe(nlp.create_pipe("merge_entities")) 101 | # nlp.add_pipe(nlp.create_pipe("merge_subtokens")) 102 | ################################# 103 | # nlp.add_pipe(neuralcoref.NeuralCoref(nlp.vocab), name='neuralcoref', last=True) # load NeuralCoref and add it to the pipe of SpaCy's model 104 | # def remove_unserializable_results(doc): # Workaround for serialising NeuralCoref's clusters 105 | # def cluster_as_doc(c): 106 | # c.main = c.main.as_doc() 107 | # c.mentions = [ 108 | # m.as_doc() 109 | # for m in c.mentions 110 | # ] 111 | # # doc.user_data = {} 112 | # if not getattr(doc,'_',None): 113 | # return doc 114 | # if not getattr(doc._,'coref_clusters',None): 115 | # return doc 116 | # for cluster in doc._.coref_clusters: 117 | # cluster_as_doc(cluster) 118 | # for token in doc: 119 | # for cluster in token._.coref_clusters: 120 | # cluster_as_doc(cluster) 121 | # return doc 122 | # nlp.add_pipe(remove_unserializable_results, last=True) 123 | print('## Spacy model loaded') 124 | return nlp 125 | 126 | @staticmethod 127 | def load_tf_model(tf_model): 128 | cache_dir = tf_model.get('cache_dir',None) 129 | if cache_dir: 130 | Path(cache_dir).mkdir(parents=True, exist_ok=True) 131 | os.environ["TFHUB_CACHE_DIR"] = cache_dir 132 | 133 | model_url = tf_model['url'] 134 | is_qa_model = 'qa' in model_url.lower() 135 | if is_qa_model: 136 | print(f'## Loading TF model <{model_url}> for QA...') 137 | else: 138 | print(f'## Loading TF model <{model_url}>...') 139 | module = hub.load(model_url) 140 | get_input = lambda y: tf.constant(tuple(map(lambda x: x[0] if is_listable(x) else x, y))) 141 | if is_qa_model: 142 | get_context = lambda y: tf.constant(tuple(map(lambda x: x[1] if is_listable(x) else '', y))) 143 | q_label = "query_encoder" if 'query_encoder' in module.signatures else 'question_encoder' 144 | q_module = lambda doc: module.signatures[q_label](input=get_input(doc))['outputs'].numpy() # The default signature is identical with the question_encoder signature. 145 | a_module = lambda doc: module.signatures['response_encoder'](input=get_input(doc), context=get_context(doc))['outputs'].numpy() 146 | else: 147 | q_module = a_module = lambda doc: module(get_input(doc)).numpy() 148 | print('## TF model loaded') 149 | return { 150 | 'question': q_module, 151 | 'answer': a_module 152 | } 153 | 154 | @staticmethod 155 | def load_hf_model(hf_model): 156 | model_name = hf_model['url'] 157 | model_type = hf_model['type'] 158 | model_framework = hf_model.get('framework', 'pt') 159 | cache_dir = hf_model.get('cache_dir',None) 160 | if cache_dir: 161 | model_path = os.path.join(cache_dir, model_name.replace('/','-')) 162 | if not os.path.isdir(model_path): 163 | os.mkdir(model_path) 164 | else: 165 | model_path = None 166 | print(f'###### Loading {model_type} model <{model_name}> for {model_framework} ######') 167 | config = AutoConfig.from_pretrained(model_name, cache_dir=model_path) # Download configuration from S3 and cache. 
168 | model = AutoModelWithLMHead.from_pretrained(model_name, cache_dir=model_path) 169 | tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_path) 170 | print(f'###### <{model_name}> loaded ######') 171 | return { 172 | 'pipeline': pipeline(model_type, model=model, tokenizer=tokenizer, framework=model_framework, device=get_best_gpu()), 173 | 'tokenizer': tokenizer, 174 | 'model': model, 175 | 'config': config, 176 | } 177 | 178 | def get_nlp_model(self): 179 | if ModelManager.__nlp_models.get(self.__spacy_model, None) is None: 180 | ModelManager.__nlp_models[self.__spacy_model] = ModelManager.load_nlp_model(self.__spacy_model) 181 | return ModelManager.__nlp_models[self.__spacy_model] 182 | 183 | def get_tf_model(self): 184 | model_key = self.__tf_model['url'] 185 | if ModelManager.__tf_embedders.get(model_key, None) is None: 186 | ModelManager.__tf_embedders[model_key] = ModelManager.load_tf_model(self.__tf_model) 187 | return ModelManager.__tf_embedders[model_key] 188 | 189 | def get_hf_model(self): 190 | model_key = (self.__hf_model['url'],self.__hf_model['type']) 191 | if ModelManager.__hf_embedders.get(model_key, None) is None: 192 | ModelManager.__hf_embedders[model_key] = ModelManager.load_hf_model(self.__hf_model) 193 | return ModelManager.__hf_embedders[model_key] 194 | 195 | def nlp(self, text_list, disable=None, n_threads=None, batch_size=None): 196 | if not disable: 197 | disable = self.disable_spacy_component 198 | if not n_threads: # real multi-processing: https://git.readerbench.com/eit/prepdoc/blob/f8e93b6d0a346e9a53dac2e70e5f1712d40d6e1e/examples/parallel_parse.py 199 | n_threads = multiprocessing.cpu_count() 200 | if not batch_size: 201 | batch_size = self.__batch_size 202 | def fetch_fn(missing_text): 203 | return self.get_nlp_model().pipe( 204 | missing_text, 205 | disable=disable, 206 | batch_size=min(batch_size, int(np.ceil(len(missing_text)/n_threads))), 207 | n_process=min(n_threads, len(missing_text)), # The keyword argument n_threads on the .pipe methods is now deprecated, as the v2.x models cannot release the global interpreter lock. (Future versions may introduce a n_process argument for parallel inference via multiprocessing.) 
- https://spacy.io/usage/v2-1#incompat 208 | ) 209 | return self.get_cached_values(text_list, self.__spacy_cache, fetch_fn) 210 | 211 | def run_tf_embedding(self, doc_list, norm=None, as_question=False): 212 | def fetch_fn(missing_queries): 213 | # print(missing_queries) 214 | tf_model = self.get_tf_model() 215 | # Feed missing_queries into current tf graph 216 | batch_list = ( 217 | missing_queries[i*self.__batch_size:(i+1)*self.__batch_size] 218 | for i in range(np.int(np.ceil(len(missing_queries)/self.__batch_size))) 219 | ) 220 | encoder = tf_model['question' if as_question else 'answer'] 221 | batched_embeddings = tuple(map(encoder, batch_list)) 222 | embeddings = np.concatenate(batched_embeddings, 0) 223 | # Normalize the embeddings, if required 224 | if norm is not None: 225 | embeddings = normalize(embeddings, norm=norm) 226 | return embeddings 227 | return np.array(self.get_cached_values(doc_list, self.__tf_cache, fetch_fn, key_fn=lambda x:(x,as_question))) 228 | 229 | def run_hf_task(self, inputs, **kwargs): 230 | def fetch_fn(missing_inputs): 231 | hf_model = self.get_hf_model() 232 | return [hf_model['pipeline'](i, **kwargs) for i in missing_inputs] 233 | cache_key = '.'.join(map(lambda x: '='.join(map(str,x)), sorted(kwargs.items(), key=lambda x:x[0]))) 234 | return self.get_cached_values(inputs, self.__hf_cache, fetch_fn, key_fn=lambda x: '.'.join((cache_key,x))) 235 | 236 | def get_similarity_vector(self, source_text_list, target_text_list, similarity_fn=np.inner, as_question=False): 237 | source_embedding = self.run_tf_embedding(doc_list=source_text_list, as_question=as_question) 238 | target_embeddings = self.run_tf_embedding(doc_list=target_text_list, as_question=False) 239 | return similarity_fn(source_embedding,target_embeddings) 240 | 241 | # np.inner == lambda x,y: np.matmul(x,np.transpose(y)) 242 | def find_most_similar(self, source_text, target_text_list, similarity_fn=np.inner, as_question=False): 243 | similarity_vec = self.get_similarity_vector( 244 | source_text=source_text, 245 | target_text_list=target_text_list, 246 | similarity_fn=similarity_fn, 247 | as_question=as_question, 248 | ) 249 | argmax = np.argmax(similarity_vec) 250 | return argmax, similarity_vec[argmax] 251 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/graph_builder.py: -------------------------------------------------------------------------------- 1 | from more_itertools import unique_everseen 2 | from matplotlib import pyplot as plt 3 | import re 4 | import networkx as nx 5 | try: 6 | import pygraphviz 7 | from networkx.drawing.nx_agraph import graphviz_layout 8 | except ImportError: 9 | try: 10 | import pydotplus 11 | from networkx.drawing.nx_pydot import graphviz_layout 12 | except ImportError: 13 | raise ImportError("This example needs Graphviz and either PyGraphviz or PyDotPlus") 14 | 15 | import networkx as nx 16 | 17 | 18 | def get_betweenness_centrality(edge_list): 19 | # Betweenness centrality quantifies the number of times a node acts as a bridge along the shortest path between two other nodes. 
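# Toy illustration (hypothetical data, not taken from this project): in the chain a -> b -> c only 'b'
# bridges a shortest path, so nx.betweenness_centrality(nx.DiGraph([('a','b'),('b','c')]))
# returns roughly {'a': 0.0, 'b': 0.5, 'c': 0.0} under the default normalisation.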
20 | di_graph = nx.DiGraph() 21 | di_graph.add_edges_from(map(lambda x: (x[0],x[-1]), edge_list)) 22 | return nx.betweenness_centrality(di_graph) 23 | 24 | def get_concept_description_dict(graph, label_predicate, valid_concept_filter_fn=None): 25 | if valid_concept_filter_fn: 26 | concept_set = get_concept_set(filter(valid_concept_filter_fn, graph)) 27 | graph = filter(lambda x: x[0] in concept_set, graph) 28 | # print('Unique concepts:', len(concept_set)) 29 | uri_dict = {} # concept_description_dict 30 | for uri,_,label in filter(lambda x: x[1] == label_predicate, graph): 31 | if uri not in uri_dict: 32 | uri_dict[uri] = [] 33 | uri_dict[uri].append(label) 34 | return uri_dict 35 | 36 | def get_tuple_element_set(tuple_list, element_idx): 37 | tuple_element_set = set() 38 | element_iter = map(lambda x: x[element_idx], tuple_list) 39 | for element in element_iter: 40 | if isinstance(element, (list,tuple)): 41 | for e in element: 42 | tuple_element_set.add(e) 43 | else: 44 | tuple_element_set.add(element) 45 | return tuple_element_set 46 | 47 | def get_subject_set(edge_list): 48 | return get_tuple_element_set(edge_list, 0) 49 | 50 | def get_predicate_set(edge_list): 51 | return get_tuple_element_set(edge_list, 1) 52 | 53 | def get_object_set(edge_list): 54 | return get_tuple_element_set(edge_list, -1) 55 | 56 | def get_concept_set(edge_list): 57 | edge_list = list(edge_list) 58 | return get_subject_set(edge_list).union(get_object_set(edge_list)) 59 | 60 | def get_root_set(edge_list): 61 | edge_list = list(edge_list) 62 | return get_subject_set(edge_list).difference(get_object_set(edge_list)) 63 | 64 | def get_leaf_set(edge_list): 65 | edge_list = list(edge_list) 66 | return get_object_set(edge_list).difference(get_subject_set(edge_list)) 67 | 68 | def reverse_order(edge_list): 69 | return map(lambda edge: (edge[-1],edge[-2],edge[-3]), edge_list) 70 | 71 | def get_ancestors(node, edge_list): 72 | return get_object_set(filter_graph_by_root_set(list(reverse_order(edge_list)), [node])) 73 | 74 | def tuplefy(edge_list): 75 | def to_tuple(x): 76 | if type(x) is dict: 77 | return tuple(x.values()) 78 | if type(x) is list: 79 | return tuple(x) 80 | return x 81 | return [ 82 | tuple(map(to_tuple, edge)) 83 | for edge in edge_list 84 | ] 85 | 86 | def build_edge_dict(edge_list, key_fn=lambda x: x): 87 | edge_dict = {} 88 | for edge in edge_list: 89 | for subj in get_subject_set([edge]): 90 | subj_key = key_fn(subj) 91 | if subj_key not in edge_dict: 92 | edge_dict[subj_key] = [] 93 | edge_dict[subj_key].append(edge) 94 | return edge_dict 95 | 96 | def extract_rooted_edge_list(root, edge_dict): 97 | valid_edge_list = [] 98 | if root not in edge_dict: 99 | return valid_edge_list 100 | valid_edge_list += edge_dict[root] 101 | obj_to_explore = get_object_set(edge_dict[root]) 102 | del edge_dict[root] 103 | while len(obj_to_explore) > 0: 104 | obj = obj_to_explore.pop() 105 | if obj in edge_dict: 106 | valid_edge_list += edge_dict[obj] 107 | obj_to_explore |= get_object_set(edge_dict[obj]) 108 | del edge_dict[obj] 109 | valid_edge_list = list(unique_everseen(valid_edge_list)) 110 | return valid_edge_list 111 | 112 | def filter_graph_by_root_set(edge_list, root_set): 113 | edge_dict = build_edge_dict(edge_list) 114 | rooted_edge_list_iter = (extract_rooted_edge_list(root, edge_dict) for root in root_set) 115 | rooted_edge_list = sum(rooted_edge_list_iter, []) 116 | return rooted_edge_list 117 | 118 | def remove_leaves(edge_list, edge_to_remove_fn=lambda x:x): 119 | edge_list = list(edge_list) 120 | 
leaf_to_exclude_set = get_leaf_set(edge_list).intersection(get_object_set(filter(edge_to_remove_fn, edge_list))) 121 | edge_to_exclude_iter = filter(lambda x: len(get_object_set([x]).intersection(leaf_to_exclude_set))==0, edge_list) 122 | return list(edge_to_exclude_iter) 123 | 124 | def get_connected_graph_list(edge_list): 125 | edge_list = list(edge_list) 126 | edge_dict = build_edge_dict(edge_list) 127 | graph_list = [ 128 | extract_rooted_edge_list(root, edge_dict) 129 | for root in get_subject_set(edge_list) 130 | ] 131 | graph_list.sort(key=lambda x: len(x), reverse=True) 132 | 133 | for i,graph in enumerate(graph_list): 134 | if len(graph)==0: 135 | continue 136 | graph_concept_set = get_concept_set(graph) 137 | for j,other_graph in enumerate(graph_list): 138 | if i==j: 139 | continue 140 | if len(other_graph)==0: 141 | continue 142 | other_graph_concept_set = get_concept_set(other_graph) 143 | if len(graph_concept_set.intersection(other_graph_concept_set)) > 0: 144 | graph.extend(other_graph) 145 | graph_concept_set |= other_graph_concept_set 146 | other_graph.clear() 147 | graph_list = [ 148 | list(unique_everseen(graph)) 149 | for graph in filter(lambda x: len(x)>0, graph_list) 150 | ] 151 | return graph_list 152 | 153 | def get_biggest_connected_graph(edge_list): 154 | return max(get_connected_graph_list(edge_list), key=lambda x: len(x)) 155 | 156 | def save_graphml(edge_list, file_name): 157 | edge_list = list(edge_list) 158 | 159 | # Build graph 160 | graph=nx.DiGraph() # directed graph 161 | for subject, predicate, object in edge_list: 162 | graph.add_edge(subject, object, r=predicate) 163 | 164 | nx.write_graphml(graph, file_name+".graphml", prettyprint=True) 165 | 166 | graphml = ''' 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | ''' 182 | 183 | concept_set = get_concept_set(edge_list) 184 | for concept in concept_set: 185 | graphml += ''' 186 | 187 | 188 | 189 | 190 | 191 | {0} 192 | 193 | 194 | 195 | '''.format(concept) + '\n' 196 | 197 | for subj,pred,obj in edge_list: 198 | graphml += ''' 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | {2} 207 | 208 | 209 | 210 | 211 | '''.format(subj,obj,pred) + '\n' 212 | 213 | graphml += ''' 214 | 215 | 216 | 217 | ''' 218 | 219 | path = file_name+"_yEd.graphml" 220 | with open(path, 'w') as content_file: 221 | content_file.write(graphml) 222 | 223 | MAX_LABEL_LENGTH = 128 224 | def save_graph(edge_list, file_name, size=None): 225 | def stringify(x): 226 | if isinstance(x, (list,tuple)): 227 | if len(x)==0: 228 | return '' 229 | if len(x)==1: 230 | x = x[0] 231 | return str(x) 232 | edge_list = [tuple(map(stringify,edge)) for edge in edge_list] 233 | # Build graph 234 | save_graphml(edge_list, file_name) 235 | 236 | if size is None: 237 | return 238 | graph=nx.DiGraph() # directed graph 239 | format_str = lambda x: x[:MAX_LABEL_LENGTH].replace(':','.')#.replace('.',' ') 240 | for subject, predicate, object in map(lambda x: map(format_str,x),edge_list): 241 | graph.add_edge(subject, object, r=predicate) 242 | 243 | #initialze Figure 244 | plt.figure(num=None, figsize=(size, size)) 245 | plt.axis('off') 246 | fig = plt.figure(1) 247 | 248 | pos=graphviz_layout(graph,prog='twopi') 249 | nx.draw( 250 | graph, 251 | pos, 252 | font_size=16, 253 | with_labels=False, 254 | arrowstyle='wedge', 255 | ) 256 | nx.draw_networkx_labels( 257 | graph, 258 | pos, 259 | bbox=dict(boxstyle='square', fc="w", ec="k") 260 | ) 261 | #edge_labels={('A','B'):'AB',('B','C'):'BC',('B','D'):'BD'} 262 | 
nx.draw_networkx_edge_labels( 263 | graph, 264 | pos, 265 | edge_labels=nx.get_edge_attributes(graph,'r'), 266 | font_color='red' 267 | ) 268 | 269 | plt.savefig(file_name+'.png', bbox_inches="tight") 270 | plt.clf() 271 | del fig 272 | -------------------------------------------------------------------------------- /web_app/oke/core/models/classification/sentence_classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np # for fast array ops 4 | import misc.tfidf_lib as tfidf_lib 5 | from models.model_manager import ModelManager 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | from nltk.stem.snowball import SnowballStemmer # 8 | 9 | from misc.doc_reader import load_or_create_cache, create_cache, load_cache 10 | 11 | class SentenceClassifier(ModelManager): 12 | stemmer = SnowballStemmer("english") 13 | 14 | def __init__(self, model_options): 15 | super().__init__(model_options) 16 | self.disable_spacy_component = ["tagger", "ner", "textcat", "neuralcoref"] 17 | # Read options from input 18 | self.log = model_options.get('log', False) 19 | self.use_tf_model = model_options.get('tf_model', None) is not None 20 | self.with_topic_scaling = model_options.get('with_topic_scaling', False) 21 | self.use_combined_wordvec = self.with_topic_scaling or not self.use_tf_model 22 | self.with_document_log_length_scaling = model_options.get('with_document_log_length_scaling', False) 23 | self.with_centered_similarity = model_options.get('with_centered_similarity', False) 24 | # TF-IDF 25 | self.default_tfidf_importance = model_options.get('default_tfidf_importance', 1/2) # number in [0,1] 26 | self.default_tfidf_importance = np.clip(self.default_tfidf_importance, 0,1) 27 | self.with_stemmed_tfidf = model_options.get('with_stemmed_tfidf', False) 28 | self.very_big_corpus = model_options.get('very_big_corpus', False) 29 | self.query_cache = {} 30 | self.default_similarity_threshold = self.model_options.get('default_similarity_threshold', 0) 31 | 32 | if self.log: 33 | print('Initialising SentenceClassifier:') 34 | print(' with_stemmed_tfidf',self.with_stemmed_tfidf) 35 | print(' with_topic_scaling',self.with_topic_scaling) 36 | print(' with_document_log_length_scaling',self.with_document_log_length_scaling) 37 | print(' default_tfidf_importance',self.default_tfidf_importance) 38 | print(' use_combined_wordvec',self.use_combined_wordvec) 39 | 40 | def set_documents(self, id_doc_list, context_list=None): 41 | self.ids, self.documents = zip(*id_doc_list) 42 | self.contexts = context_list if context_list else list(self.documents) 43 | self.contextualised_documents = list(zip(self.documents,self.contexts)) 44 | if self.use_tf_model: 45 | self.contextualised_documents_embeddings = self.run_tf_embedding(doc_list=self.contextualised_documents, as_question=False) 46 | self.target_size = len(id_doc_list) 47 | # TF-IDF 48 | self.tfidf_prepared = False 49 | self.spacy_prepared = False 50 | return self 51 | 52 | def get_stemmed_token_list(self, token_list): 53 | return list(map(self.stemmer.stem, token_list)) 54 | 55 | def prepare_tfidf(self): 56 | # Get lemmatized documents 57 | lemmatized_document_iter = map(self.lemmatize_spacy_document, self.spacy_documents) 58 | if self.with_stemmed_tfidf: 59 | stemmed_documents = [ 60 | self.get_stemmed_token_list(token_list) 61 | for token_list in lemmatized_document_iter 62 | ] 63 | if self.log: 64 | print('stemmed_documents', stemmed_documents) 65 | words_vector = 
stemmed_documents 66 | else: 67 | words_vector = list(lemmatized_document_iter) 68 | # Build tf-idf model and similarities 69 | dictionary, tfidf_model, tfidf_corpus_similarities = tfidf_lib.build_tfidf(words_vector=words_vector, very_big_corpus=self.very_big_corpus) 70 | if self.log: 71 | print("Number of words in dictionary:",len(dictionary)) 72 | self.dictionary, self.tfidf_model, self.tfidf_corpus_similarities = dictionary, tfidf_model, tfidf_corpus_similarities 73 | self.tfidf_prepared = True 74 | 75 | def prepare_spacy(self): 76 | if not self.spacy_prepared: 77 | self.spacy_documents = self.nlp(self.contexts) 78 | self.spacy_prepared = True 79 | 80 | def lemmatize_spacy_document(self, doc): 81 | return [ 82 | token.lemma_.casefold().strip() 83 | for token in doc 84 | if not (token.is_stop or token.is_punct) and token.lemma_.lower() != '-pron-' 85 | ] 86 | 87 | def get_weighted_similarity(self, similarity_dict, query_length, tfidf_importance): 88 | semantic_similarity = similarity_dict.get('docvec' if self.use_tf_model else 'combined_wordvec', 0) 89 | syntactic_similarity = similarity_dict.get('tfidf', 0) 90 | # Build combined similarity 91 | if self.log: 92 | print('tfidf_importance', tfidf_importance) 93 | weighted_similarity = tfidf_importance*syntactic_similarity+(1-tfidf_importance)*semantic_similarity 94 | 95 | if self.with_topic_scaling: 96 | # Get the topic weight 97 | corpus_similarity = similarity_dict['corpus'] 98 | topic_weight = np.power(corpus_similarity,2) 99 | # Compute the weighted similarity for every sub-corpus 100 | # syntactic_similarity is high for a document when the query words and the document words are similar, but syntactic_similarity may be lower when we use words in the synsets 101 | # in order to address the aforementioned synset-words problem we sum the syntactic_similarity with the corpus_similarity before scaling it by the semantic_weight 102 | # we scale by the semantic_weight in order to give significantly more similarity to the documents semantically more closer to the query 103 | weighted_similarity *= topic_weight 104 | 105 | if self.with_document_log_length_scaling: 106 | # the bigger the sentence, the (smoothly) lower the weighted_similarity 107 | # thus we scale the weighted_similarity by the log of the query length 108 | weighted_similarity *= np.array(query_length)/np.max(query_length) # sum 1 to avoid similarity zeroing 109 | 110 | return weighted_similarity 111 | 112 | def classify(self, query_list, similarity_type, similarity_threshold=None, as_question=False, tfidf_importance=None): 113 | return self.get_index_of_most_similar_documents( 114 | self.get_query_similarity(query_list, as_question=as_question, tfidf_importance=tfidf_importance), 115 | similarity_threshold=similarity_threshold, 116 | similarity_type=similarity_type, 117 | ) 118 | 119 | def get_query_similarity(self, query_list, as_question=False, tfidf_importance=None): 120 | return self.get_formatted_query_similarity( 121 | query_list, # original query 122 | self.nlp(query_list), # Get the filtered query (Document object built using lemmas) 123 | as_question=as_question, 124 | tfidf_importance=tfidf_importance, 125 | ) 126 | 127 | def get_formatted_query_similarity(self, text_list, formatted_query_list, as_question=False, tfidf_importance=None): 128 | if tfidf_importance is None: 129 | tfidf_importance = self.default_tfidf_importance 130 | # Prepare spacy docs if they are not ready yet 131 | with_syntactic_similarity = tfidf_importance > 0 132 | with_semantic_similarity = 
tfidf_importance < 1 133 | if with_syntactic_similarity: 134 | self.prepare_spacy() 135 | self.prepare_tfidf() 136 | elif self.use_combined_wordvec: 137 | self.prepare_spacy() 138 | ################################################################################# 139 | # Build similarity dict 140 | similarity_dict = {} 141 | if with_syntactic_similarity: 142 | # Get the lemmatized query 143 | formatted_query_list = tuple(map(self.lemmatize_spacy_document, formatted_query_list)) 144 | if self.log: 145 | print('lemmatized_query', formatted_query_list) 146 | # Get the stemmed query for tf-idf 147 | if self.with_stemmed_tfidf: 148 | formatted_query_list = tuple(map(self.get_stemmed_token_list, formatted_query_list)) 149 | if self.log: 150 | print('stemmed_query', formatted_query_list) 151 | # Get tf-idf and docvec similarities 152 | similarity_dict['tfidf'] = np.array([ 153 | tfidf_lib.get_query_tfidf_similarity( 154 | formatted_query, 155 | self.dictionary, 156 | self.tfidf_model, 157 | self.tfidf_corpus_similarities 158 | ) 159 | for formatted_query in formatted_query_list 160 | ]) 161 | if with_semantic_similarity: 162 | if self.use_tf_model: 163 | # Get docvec similarity 164 | similarity_dict['docvec'] = np.inner( 165 | self.run_tf_embedding(doc_list=text_list, as_question=as_question), 166 | self.contextualised_documents_embeddings 167 | ) 168 | if self.use_combined_wordvec: 169 | get_avg_wordvec_similarity = lambda x: np.mean([q.vector for q in x], axis=0) 170 | # Get averaged wordvec similarity 171 | similarity_dict['combined_wordvec'] = cosine_similarity( 172 | list(map(get_avg_wordvec_similarity, formatted_query_list)), 173 | list(map(get_avg_wordvec_similarity, self.spacy_documents)) 174 | ) 175 | if self.with_topic_scaling: 176 | # Get the corpus similarity for every sub-corpus, by averaging the docvec similarities of every sub-corpus 177 | similarity_dict['corpus'] = np.mean(similarity_dict['combined_wordvec'],-1) 178 | similarity_dict['corpus'] = np.expand_dims(similarity_dict['corpus'], -1) # expand_dims because we have sub-corpus 179 | # Get the weighted similarity 180 | similarity_dict['weighted'] = self.get_weighted_similarity(similarity_dict=similarity_dict, query_length=np.array(list(map(len,formatted_query_list))), tfidf_importance=tfidf_importance) 181 | # Sum the weighted similarity across sub-corpus 182 | # similarity_dict['weighted'] = np.sum(similarity_dict['weighted'], 0) 183 | # Center the weighted_similarity vector 184 | if self.with_centered_similarity: 185 | # Center the weighted_similarity vector: Remove the average weighted_similarity 186 | similarity_dict['weighted'] -= np.mean(similarity_dict['weighted']) 187 | # Remove negative components, they are useless for the task 188 | similarity_dict['weighted'] = np.maximum(similarity_dict['weighted'], 0) 189 | return similarity_dict 190 | 191 | def get_index_of_most_similar_documents(self, similarity_dict, similarity_type, similarity_threshold=None): 192 | if similarity_threshold is None: 193 | similarity_threshold = self.default_similarity_threshold 194 | def get_similarity_dict_generator(i, similarity_ranking): 195 | # print('#'*100) 196 | similarity = similarity_dict[similarity_type][i] 197 | syntactic_similarity = similarity_dict['tfidf'][i] if 'tfidf' in similarity_dict else None 198 | semantic_similarity = similarity_dict['docvec'][i] if 'docvec' in similarity_dict else None 199 | for best in similarity_ranking: 200 | if similarity_threshold is not None and similarity[best] < similarity_threshold: 201 | return 202 | sim_dict = { 203 | 
'id':self.ids[best], 204 | 'doc':self.documents[best], 205 | 'index':int(best), 206 | 'similarity':float(similarity[best]), 207 | 'syntactic_similarity':float(syntactic_similarity[best]) if syntactic_similarity is not None else 0, 208 | 'semantic_similarity':float(semantic_similarity[best]) if semantic_similarity is not None else 0, 209 | } 210 | # print(best, sim_dict) 211 | if self.contexts: 212 | sim_dict['context'] = self.contexts[best] 213 | yield sim_dict 214 | similarity_list = similarity_dict[similarity_type] 215 | similarity_ranking_list = np.argsort(similarity_list, kind='stable', axis=-1) 216 | return [ 217 | get_similarity_dict_generator(i, similarity_ranking[::-1]) 218 | for i,similarity_ranking in enumerate(similarity_ranking_list) 219 | ] 220 | -------------------------------------------------------------------------------- /web_app/yai/static/js/stage_builder/domain_stage_builder.js: -------------------------------------------------------------------------------- 1 | // create a graph class 2 | class DirectedGraph { 3 | // defining vertex array and 4 | // adjacent list 5 | constructor() 6 | { 7 | this.AdjacencyList = new Map(); 8 | this.InverseAdjacencyList = new Map(); 9 | } 10 | 11 | // add edge to the graph 12 | addEdge(v, w) 13 | { 14 | if (!this.AdjacencyList.has(v)) // initialize the adjacent list with an empty array 15 | this.AdjacencyList.set(v, []) 16 | this.AdjacencyList.get(v).push(w); 17 | 18 | if (!this.InverseAdjacencyList.has(w)) // initialize the adjacent list with an empty array 19 | this.InverseAdjacencyList.set(w, []) 20 | this.InverseAdjacencyList.get(w).push(v); 21 | } 22 | 23 | getRoots() 24 | { 25 | var root_set = new Set(Array.from(this.AdjacencyList.keys())) 26 | for (var [source,target_list] of this.AdjacencyList.entries()) 27 | { 28 | for (var target of target_list) 29 | if (source != target) 30 | root_set.delete(target) 31 | } 32 | return Array.from(root_set) 33 | } 34 | 35 | getLeaves() 36 | { 37 | var root_set = new Set(Array.from(this.InverseAdjacencyList.keys())) 38 | for (var [source,target_list] of this.InverseAdjacencyList.entries()) 39 | { 40 | for (var target of target_list) 41 | { 42 | if (source != target) 43 | root_set.delete(target) 44 | } 45 | } 46 | return Array.from(root_set) 47 | } 48 | 49 | // Prints the vertex and adjacency list 50 | printGraph() 51 | { 52 | // get all the vertices 53 | var get_keys = this.AdjacencyList.keys(); 54 | 55 | // iterate over the vertices 56 | for (var i of get_keys) { 57 | // great the corresponding adjacency list 58 | // for the vertex 59 | var get_values = this.AdjacencyList.get(i); 60 | var conc = ""; 61 | 62 | // iterate over the adjacency list 63 | // concatenate the values into a string 64 | for (var j of get_values) 65 | conc += j + " "; 66 | 67 | // print the vertex and its adjacency list 68 | console.log(i + " -> " + conc); 69 | } 70 | } 71 | } 72 | 73 | function get_taxonomy_information(information_uri) 74 | { 75 | //console.log(information_uri) 76 | var query = [ 77 | "SELECT DISTINCT ?subject ?predicate ?object WHERE {", 78 | "<"+information_uri+"> rdfs:subClassOf* ?subject.", 79 | "?subject ?predicate ?object.", 80 | "}", 81 | ].join("\n"); 82 | //console.log(query) 83 | var query_result = query_sparql_endpoint(DBPEDIA_ENDPOINT, query) 84 | if (!query_result || !query_result.results || query_result.results.bindings.length==0) 85 | return null 86 | var tuple_list = query_result.results.bindings 87 | // Build subject map 88 | var subj_map = new Map() 89 | for (tuple of tuple_list) 90 | { 
91 | var subj = tuple.subject.value, pred = tuple.predicate.value, obj = tuple.object.value 92 | if (!subj_map.has(subj)) 93 | subj_map.set(subj, {'@id': subj}) 94 | subj_map.get(subj)[tuple.predicate.value] = tuple.object.value 95 | } 96 | function recursive_graph_building(subj) { 97 | var jsonld_graph = Object.assign({}, subj_map.get(subj)); 98 | for (var [key,value] of Object.entries(jsonld_graph)) 99 | { 100 | if (key=='@id') 101 | continue 102 | if (subj_map.has(value) && value!=subj) 103 | jsonld_graph[key] = recursive_graph_building(value) 104 | } 105 | return jsonld_graph 106 | } 107 | var jsonld_graph = recursive_graph_building(information_uri) 108 | // console.log(jsonld_graph) 109 | var ground = { 110 | '@type': 'JSON', 111 | '@value': JSON.stringify(query_result, null, 2) 112 | } 113 | jsonld_graph = format_jsonld(jsonld_graph, ground, query) 114 | return jsonld_graph 115 | } 116 | //console.log(get_taxonomy_information('http://dbpedia.org/class/yago/WikicatNeuralNetworks')) 117 | 118 | function get_typeset_hierarchy_leaves_from_dbpedia(type_list) 119 | { 120 | if (type_list.length == 1) 121 | return type_list 122 | var type_query = [] 123 | for (var type of type_list) 124 | { 125 | if (type_query.length > 0) 126 | type_query.push('UNION') 127 | type_query.push( 128 | [ 129 | "{", 130 | "SELECT DISTINCT ?class ?superclass WHERE {", 131 | "<"+type+"> rdfs:subClassOf* ?class.", 132 | "?class rdfs:subClassOf? ?superclass.", 133 | "}", 134 | "}" 135 | ].join("\n") 136 | ) 137 | } 138 | // console.log(PREFIX_MAP_STRING) 139 | var query = [ 140 | PREFIX_MAP_STRING, 141 | "SELECT DISTINCT ?class ?superclass WHERE {", 142 | type_query.join("\n"), 143 | "}" 144 | ].join("\n"); 145 | // console.log(query) 146 | 147 | function get_leaves(_data) 148 | { 149 | // console.log(_data) 150 | var results = _data.results.bindings; 151 | var class_hierarchy = new DirectedGraph() 152 | for (var i in results) 153 | { 154 | var row = results[i] 155 | var super_class = row['superclass'].value 156 | var sub_class = row['class'].value; 157 | class_hierarchy.addEdge(super_class, sub_class); 158 | } 159 | // console.log(class_hierarchy.getLeaves()) 160 | return class_hierarchy.getLeaves() 161 | } 162 | return get_leaves(query_sparql_endpoint(DBPEDIA_ENDPOINT, query)); 163 | } 164 | 165 | function build_minimal_type_graph(minimal_entity_graph, predicate=TYPE_URI, collector=HAS_ENTITY_URI) 166 | { 167 | // minimal_entity_graph = format_jsonld(minimal_entity_graph); 168 | var entity_dict = get_entity_dict(minimal_entity_graph); 169 | var tot_statement_count = count_graph_statements(minimal_entity_graph); 170 | minimal_entity_graph = get_URI_graph(minimal_entity_graph, predicate, collector, recursion=false); 171 | // Create the entity_type_map, in order to easily keep track of entities and their types 172 | var type_rdfitem_map = new Map(); 173 | var entity_type_map = new Map(); 174 | for (var sub_graph of minimal_entity_graph) 175 | { 176 | var type = sub_graph['@id']['@value']; 177 | type_rdfitem_map.set(type, sub_graph['@id']); 178 | var sub_graph_list = isArray(sub_graph[collector]) ? 
sub_graph[collector] : [sub_graph[collector]]; 179 | for (var g of sub_graph_list) 180 | { 181 | if (jQuery.isEmptyObject(g)) 182 | continue; 183 | var entity_id = g['@id']['@value'] 184 | if (!entity_type_map.has(entity_id)) 185 | { 186 | entity_type_map.set(entity_id, {}) 187 | entity_type_map.get(entity_id)[TYPESET_URI] = new Set() 188 | entity_type_map.get(entity_id)[collector] = g 189 | } 190 | entity_type_map.get(entity_id)[TYPESET_URI].add(type) 191 | } 192 | } 193 | // Remove redundant types from the entity_type_map. 194 | // Redundant types are those types that can be inferred from the other types, following superclass relations in dbpedia. 195 | // for (var [entity_id, type_dict] of entity_type_map.entries()) 196 | // { 197 | // var redundant_type_set = new Set(type_dict[TYPESET_URI]) 198 | // var type_hierarchy_leaves = get_typeset_hierarchy_leaves_from_dbpedia(Array.from(redundant_type_set)) 199 | // if (type_hierarchy_leaves.length == 0) // entity not found in dbpedia 200 | // continue 201 | // for (var minimal_type of type_hierarchy_leaves) 202 | // redundant_type_set.delete(minimal_type) 203 | // //console.log(entity_id, redundant_type_set) 204 | // for (var redundant_type of redundant_type_set) 205 | // type_dict[TYPESET_URI].delete(redundant_type) 206 | // } 207 | // Create the type_entity_map, merging similar type groups 208 | var type_entity_map = new Map(); 209 | var current_type_group_id = 0; 210 | for (var [entity_id, type_dict] of entity_type_map.entries()) 211 | { 212 | var minimal_type_list = Array.from(type_dict[TYPESET_URI]) 213 | if (minimal_type_list.length==0) 214 | continue 215 | var type_id = minimal_type_list.sort().join(' ') 216 | minimal_type_list = minimal_type_list.map(x => type_rdfitem_map.get(x)) // recover grounds 217 | if (!type_entity_map.has(type_id)) 218 | { 219 | if (minimal_type_list.length==1) 220 | { 221 | var type_uri_item = minimal_type_list[0]; 222 | var type_graph = {'@id': type_uri_item}; 223 | // type_graph[ENTITY_PERCENTAGE_URI] = build_RDF_item(0); 224 | type_graph[STATEMENT_COUNT_URI] = build_RDF_item(0); 225 | // type_graph[ENTITY_COUNT_URI] = build_RDF_item(0); 226 | // var class_information = get_taxonomy_information(type_uri_item['@value']); 227 | // if (class_information!==null) // build domain stage 228 | // type_graph[PROPERTY_LIST_URI] = class_information; 229 | type_entity_map.set(type_id, type_graph); 230 | } 231 | else 232 | { 233 | var class_set = [] 234 | for (var type_uri_item of minimal_type_list) 235 | { 236 | class_set.push(Object.assign({}, 237 | {'@id': type_uri_item}, 238 | entity_dict[type_uri_item['@value']], 239 | // get_taxonomy_information(type_uri_item['@value']) 240 | )); 241 | } 242 | if (class_set.length > 0) 243 | class_set = class_set.filter(x=>x['@id']['@value']!=PREFIX_MAP['owl']+'Thing'); 244 | var type_entity_dict = {}; 245 | type_entity_dict['@id'] = build_RDF_item('my:CompositeClass'+current_type_group_id); 246 | type_entity_dict[IS_COMPOSITE_CLASS_BOOL_URI] = build_RDF_item(true); 247 | // type_entity_dict[ENTITY_PERCENTAGE_URI] = build_RDF_item(0); 248 | type_entity_dict[STATEMENT_COUNT_URI] = build_RDF_item(0); 249 | // type_entity_dict[ENTITY_COUNT_URI] = build_RDF_item(0); 250 | // type_entity_dict[CLASS_COUNT_URI] = build_RDF_item(class_set.length); 251 | type_entity_dict[COMPOSITE_CLASS_SET_URI] = class_set; 252 | type_entity_map.set(type_id, type_entity_dict); 253 | current_type_group_id += 1; 254 | } 255 | type_entity_map.get(type_id)[collector] = []; 256 | } 257 | 
type_entity_map.get(type_id)[collector].push(type_dict[collector]); 258 | } 259 | // Return a new minimized type graph 260 | var type_entity_dict_list = Array.from(type_entity_map.values()); 261 | for (var i in type_entity_dict_list) 262 | { 263 | var type_entity_dict = type_entity_dict_list[i]; 264 | var type_uri = type_entity_dict['@id']['@value']; 265 | var type_statement_count = count_graph_statements(type_entity_dict[collector]); 266 | // type_entity_dict[ENTITY_PERCENTAGE_URI] = build_RDF_item(String((100*type_statement_count/tot_statement_count).toFixed(2))+'%'); 267 | type_entity_dict[STATEMENT_COUNT_URI] = build_RDF_item(type_statement_count); 268 | // type_entity_dict[ENTITY_COUNT_URI] = build_RDF_item(type_entity_dict[collector].length); 269 | if (type_uri in entity_dict) 270 | type_entity_dict_list[i] = Object.assign({}, type_entity_dict, entity_dict[type_uri]); 271 | } 272 | // Sort by entity count 273 | var minimal_type_graph = type_entity_dict_list.sort((a,b)=>b[STATEMENT_COUNT_URI]['@value']-a[STATEMENT_COUNT_URI]['@value']); 274 | // Add type info to entities that are also types 275 | for (var type_dict of minimal_type_graph) 276 | { 277 | if (!(LABEL_URI in type_dict)) 278 | type_dict[LABEL_URI] = format_link(get_dict_description(type_dict), false); 279 | // console.log(type_dict, (collector in type_dict)) 280 | if (!(collector in type_dict)) 281 | continue; 282 | for (var ent_dict of type_dict[collector]) 283 | { 284 | var ent_id = ent_dict['@id']['@value']; 285 | if (!type_entity_map.has(ent_id)) 286 | continue; 287 | // console.log(ent_id); 288 | for (var [k,v] of Object.entries(type_entity_map.get(ent_id))) 289 | ent_dict[k] = v; 290 | } 291 | } 292 | return minimal_type_graph; 293 | } -------------------------------------------------------------------------------- /web_app/oke/core/misc/doc_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import re 4 | import json 5 | from bs4 import BeautifulSoup 6 | from tika import parser 7 | import unicodedata 8 | from more_itertools import unique_everseen 9 | from misc.jsonld_lib import * 10 | import html 11 | 12 | get_bs_text = lambda x: re.sub(r'[ \n\t]+',' ',html.unescape(x.text)).strip() if x else None 13 | 14 | def normalize_string(content): 15 | content = unicodedata.normalize("NFKC", content) # normalize content 16 | content = re.sub(r'\r\n', '\n', content, flags=re.UNICODE) # normalize new lines 17 | content = re.sub(r'[\r\f\v]', '\n', content, flags=re.UNICODE) # normalize new lines 18 | content = re.sub(r'[-\x2D\xAD\x58A\x1806\xFE63\xFF0D\xE002D]\n+', '', content, flags=re.UNICODE) # remove word-breaks (hyphens) 19 | content = re.sub(r'[\x2010\x2011\x2027\x2043]\n+', ' ', content, flags=re.UNICODE) # remove line-breaks (hyphens) 20 | content = re.sub(r'([^\n.])\n+([^\n])', r'\1 \2', content, flags=re.UNICODE) # remove line-breaks 21 | content = re.sub(r'[ \t]+', ' ', content, flags=re.UNICODE) # normalize whitespaces 22 | # workarounds 23 | content = re.sub(r' - ', ' ', content, flags=re.UNICODE) # remove all hyphens 24 | content = re.sub(r'- *', '', content, flags=re.UNICODE) # remove all hyphens 25 | return content.strip() 26 | 27 | def get_document_list(directory): 28 | doc_list = [] 29 | for obj in os.listdir(directory): 30 | obj_path = os.path.join(directory, obj) 31 | if os.path.isfile(obj_path): 32 | doc_list.append(obj_path) 33 | elif os.path.isdir(obj_path): 34 | doc_list.extend(get_document_list(obj_path)) 35 | return doc_list 36 | 
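# Illustrative usage of the helpers above (a sketch, not part of the original module;
# the directory below is a hypothetical example path):
#
#   doc_list = get_document_list('documents/yai4law')    # recursively lists every file under the directory
#   normalize_string('the ob-\nligations arising')        # -> 'the obligations arising' (hyphenated line break removed)
#   DocParser().set_documents_path('documents/yai4law')   # wraps get_document_list + get_content_list (defined below)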
37 | def get_all_paths_to_leaf(root, element_set): 38 | if not root: 39 | return [[]] 40 | if root.name in element_set: 41 | return [[root]] 42 | children = list(root.findChildren(recursive=False)) 43 | if not children: 44 | return [[]] 45 | path_list = [ 46 | path 47 | for child in children 48 | for path in get_all_paths_to_leaf(child, element_set) 49 | ] 50 | merged_path_list = [] 51 | i = 0 52 | while i < len(path_list): 53 | child_path = [] 54 | while i < len(path_list) and len(path_list[i]) == 1: 55 | child_path += path_list[i] 56 | i+=1 57 | if i < len(path_list): 58 | child_path += path_list[i] 59 | i+=1 60 | if child_path: 61 | merged_path_list.append(child_path) 62 | return merged_path_list 63 | 64 | def get_next_siblings(e, name_set): 65 | next_siblings = [] 66 | sibling = e.find_next_sibling() 67 | while sibling and sibling.name in name_set: 68 | next_siblings.append(sibling) 69 | sibling = sibling.find_next_sibling() 70 | return next_siblings 71 | 72 | def read_jsonld_file(filename): 73 | file_id = os.path.basename(filename).replace(' ','_') 74 | # read file 75 | with open(f'{filename}.json', 'r') as f: 76 | data=f.read() 77 | # parse file 78 | obj = json.loads(data) 79 | triple_list = jsonld_to_triples(obj, file_id) 80 | # print(json.dumps(triple_list, indent=4)) 81 | annotated_text_list = [ 82 | { 83 | 'text': o if not is_rdf_item(o) else o['@value'], 84 | 'id': file_id, 85 | } 86 | for s,p,o in triple_list 87 | if not is_url(o) and p != HAS_LABEL_PREDICATE 88 | ] + [{ 89 | 'graph': triple_list 90 | }] 91 | return annotated_text_list 92 | 93 | def read_html_file(filename, short_extension=False): 94 | file_id = os.path.basename(filename).replace(' ','_') 95 | with open(filename+('.htm' if short_extension else '.html'), 'r', encoding='utf8', errors='ignore') as file: 96 | file_content = file.read() 97 | doc = BeautifulSoup(file_content, features="lxml") 98 | for script in doc(["script", "style"]): # remove all javascript and stylesheet code 99 | script.extract() 100 | annotated_text_list = [] 101 | p_to_ignore = set() 102 | elements_to_merge = set(['table','ul','ol']) 103 | for i,p in enumerate(doc.findAll("p")): 104 | p_text = get_bs_text(p) 105 | if p_text in p_to_ignore: 106 | continue 107 | p_to_ignore.add(p_text) 108 | # p_set = [p] + get_next_siblings(p,['p']) 109 | # p_to_ignore |= set(p_set) 110 | # p = p_set[-1] 111 | siblings_to_merge = get_next_siblings(p,elements_to_merge) 112 | if not siblings_to_merge: 113 | annotated_text_list.append({ 114 | 'text': p_text, 115 | # 'text': ' '.join(map(get_bs_text,p_set)), 116 | 'id': file_id, 117 | }) 118 | else: 119 | for sibling in siblings_to_merge: 120 | path_list = get_all_paths_to_leaf(sibling, ['p']) 121 | annotated_text_list += [ 122 | { 123 | 'text': ' '.join(map(get_bs_text, [p]+path)), 124 | 'id': file_id, 125 | } 126 | for path in path_list 127 | ] 128 | p_to_ignore |= set(map(get_bs_text, sum(path_list,[]))) 129 | # print(json.dumps(annotated_text_list, indent=4)) 130 | return list(unique_everseen(annotated_text_list, key=lambda x: x['text'])) 131 | 132 | def read_pdf_file(filename): # https://unicodelookup.com 133 | file_id = os.path.basename(filename).replace(' ','_') 134 | raw = parser.from_file(filename+'.pdf') 135 | return [ 136 | { 137 | 'text': paragraph.strip(), 138 | 'id': file_id 139 | } 140 | for paragraph in raw['content'].split('\n\n') 141 | if paragraph 142 | ] 143 | 144 | def read_akn_file(filename): 145 | file_id = os.path.basename(filename).replace(' ','_') 146 | doc_id = 
urify(os.path.basename(filename)) 147 | def get_num_jsonld(e): 148 | num = get_bs_text(e.num) 149 | if not num: 150 | return None 151 | return { 152 | '@id': doc_id+':'+e['eid'], 153 | HAS_LABEL_PREDICATE: num 154 | } 155 | def get_heading_jsonld(e): 156 | heading = get_bs_text(e.heading) 157 | jsonld = get_num_jsonld(e) 158 | if heading: 159 | if jsonld: 160 | jsonld['my:heading'] = heading 161 | else: 162 | return { 163 | '@id': doc_id+':'+e['eid'], 164 | 'my:heading': heading 165 | } 166 | return jsonld 167 | 168 | with open(filename+'.akn') as f: 169 | file_content = f.read() 170 | 171 | doc = BeautifulSoup(file_content, features="lxml") 172 | 173 | annotated_text_list = [] 174 | for i,p in enumerate(doc.findAll("p")): 175 | text = get_bs_text(p) 176 | # Get annotations 177 | text_annotation = {} 178 | # # Get parent list 179 | # parent_list = [{ 180 | # 'name': p.name, 181 | # 'attrs': p.attrs 182 | # }] 183 | # for parent in p.find_parents(): 184 | # if parent.name == 'akomantoso': # Ignore the remaining parents 185 | # break 186 | # parent_list.append({ 187 | # 'name': parent.name, 188 | # 'attrs': parent.attrs 189 | # }) 190 | # text_annotation['@id'] = doc_id+':'+json.dumps(parent_list) 191 | # Get block 192 | block_list = p.find_parent('blocklist') 193 | if block_list: 194 | list_introduction = block_list.find('listintroduction') 195 | if list_introduction: 196 | text = ' '.join((get_bs_text(list_introduction), text)) 197 | item = p.find_parent('item') 198 | item_num = get_num_jsonld(item) 199 | if item_num: 200 | text_annotation['my:block_id'] = item_num 201 | else: 202 | intro = p.find_parent('intro') 203 | if intro: 204 | continue 205 | list = p.find_parent('list') 206 | if list and list.intro: 207 | text = ' '.join((get_bs_text(list.intro.p), text)) 208 | # Get paragraph 209 | paragraph = p.find_parent('paragraph') 210 | if paragraph: 211 | paragraph_num = get_num_jsonld(paragraph) 212 | if paragraph_num: 213 | text_annotation['my:paragraph_id'] = paragraph_num 214 | # Get article 215 | article = p.find_parent('article') 216 | if article: 217 | article_num = get_num_jsonld(article) 218 | if article_num: 219 | text_annotation['my:article_id'] = article_num 220 | # Get section 221 | section = p.find_parent('section') 222 | if section: 223 | section_heading = get_heading_jsonld(section) 224 | if section_heading: 225 | text_annotation['my:section_id'] = section_heading 226 | # Get chapter 227 | chapter = p.find_parent('chapter') 228 | if chapter: 229 | chapter_heading = get_heading_jsonld(chapter) 230 | if chapter_heading: 231 | text_annotation['my:chapter_id'] = chapter_heading 232 | # Get references 233 | text_annotation['my:reference_id'] = [ 234 | { 235 | '@id': doc_id+':'+ref['href'], 236 | HAS_LABEL_PREDICATE: get_bs_text(ref), 237 | } 238 | for ref in p.findAll('ref', recursive=False) 239 | ] 240 | base_id = f'{file_id}_{i}' 241 | annotated_text_list.append({ 242 | 'text': text, 243 | 'id': file_id, 244 | 'annotation': { 245 | 'root': f'{ANONYMOUS_PREFIX}{base_id}_0', 246 | 'content': jsonld_to_triples(text_annotation, base_id), 247 | }, 248 | }) 249 | return annotated_text_list 250 | 251 | def get_content_list(doc_list): 252 | file_name = lambda x: os.path.splitext(x)[0] 253 | doc_set = set(doc_list) 254 | name_iter = unique_everseen(map(file_name, doc_list)) 255 | content_list = [] 256 | for obj_name in name_iter: 257 | if obj_name+'.akn' in doc_set: 258 | print('Parsing AKN:', obj_name) 259 | content_list += read_akn_file(obj_name) 260 | elif obj_name+'.html' in 
doc_set: 261 | print('Parsing HTML:', obj_name) 262 | content_list += read_html_file(obj_name) 263 | elif obj_name+'.htm' in doc_set: 264 | print('Parsing HTM:', obj_name) 265 | content_list += read_html_file(obj_name, True) 266 | elif obj_name+'.pdf' in doc_set: 267 | print('Parsing PDF:', obj_name) 268 | content_list += read_pdf_file(obj_name) 269 | elif obj_name+'.json' in doc_set: 270 | print('Parsing JSON-LD:', obj_name) 271 | content_list += read_jsonld_file(obj_name) 272 | return content_list 273 | 274 | def create_cache(file_name, create_fn): 275 | print(f'Creating cache <{file_name}>..') 276 | result = create_fn() 277 | with open(file_name, 'wb') as f: 278 | pickle.dump(result, f) 279 | return result 280 | 281 | def load_cache(file_name): 282 | if os.path.isfile(file_name): 283 | print(f'Loading cache <{file_name}>..') 284 | with open(file_name,'rb') as f: 285 | return pickle.load(f) 286 | return None 287 | 288 | def load_or_create_cache(file_name, create_fn): 289 | result = load_cache(file_name) 290 | if not result: 291 | result = create_cache(file_name, create_fn) 292 | return result 293 | 294 | class DocParser(): 295 | 296 | # def __init__(self, model_options): 297 | # super().__init__(model_options) 298 | 299 | def set_documents_path(self, doc_path): 300 | self.content_list = get_content_list(get_document_list(doc_path)) 301 | self.process_content_list() 302 | return self 303 | 304 | def set_document_list(self, doc_list): 305 | self.content_list = get_content_list(doc_list) 306 | self.process_content_list() 307 | return self 308 | 309 | def set_content_list(self, content_list): 310 | self.content_list = tuple(map(lambda x: x if isinstance(x,dict) else {'text':x,'id':x}, content_list)) 311 | self.process_content_list() 312 | return self 313 | 314 | def process_content_list(self): 315 | self.graph_list = tuple(filter(lambda x: x, map(lambda x: x.get('graph', None), self.content_list))) 316 | self.content_list = tuple(filter(lambda x: 'text' in x, self.content_list)) 317 | for doc_dict in self.content_list: 318 | doc_dict['normalised_text'] = normalize_string(doc_dict['text']) 319 | 320 | def get_doc_iter(self): 321 | for doc_dict in self.content_list: 322 | yield doc_dict['id'] 323 | 324 | def get_annotation_iter(self): 325 | for doc_dict in self.content_list: 326 | yield doc_dict.get('annotation',None) 327 | 328 | def get_graph_iter(self): 329 | return self.graph_list 330 | 331 | def get_content_iter(self, normalised=True): 332 | for doc_dict in self.content_list: 333 | yield doc_dict['normalised_text' if normalised else 'text'] 334 | -------------------------------------------------------------------------------- /web_app/oke/core/models/knowledge_extraction/couple_extractor.py: -------------------------------------------------------------------------------- 1 | from misc.doc_reader import DocParser 2 | from models.knowledge_extraction.concept_extractor import ConceptExtractor as CE 3 | import re 4 | # import json 5 | 6 | class CoupleExtractor(CE): 7 | # PREDICATE_COMPONENT = [ # https://universaldependencies.org/u/dep/all.html 8 | # 'prt', # particle 9 | # 'neg', # negation modifier 10 | # 'auxpass', # auxiliary (passive) 11 | # 'advcl', # adverbial clause modifier 12 | # 'agent', # agent 13 | # 'acomp', # adjectival complement 14 | # 'xcomp', # open clausal complement 15 | # 'pcomp', # complement of preposition 16 | # 'ccomp', # clausal complement 17 | # 'prep', # prepositional modifier 18 | # ] 19 | # HIDDEN_PREDICATE_COMPONENT = [ 20 | # 'aux', # auxiliaries 21 | # 'mark', 
# marker - https://universaldependencies.org/docs/en/dep/mark.html 22 | # 'advmod', # adverbial modifier 23 | # 'cc', # coordinating conjunction 24 | # ] 25 | # PREDICATE_REGEXP = re.compile('|'.join(PREDICATE_COMPONENT+HIDDEN_PREDICATE_COMPONENT)) 26 | CC_FILTER_FN = lambda x: x.pos_=='PUNCT' or x.dep_=='cc' # punctuation and conjunctions 27 | 28 | @staticmethod 29 | def is_passive(span): # return true if the sentence is passive - at the moment a sentence is assumed to be passive if it has an auxpass verb 30 | for token in span: 31 | if CE.get_token_dependency(token) == "auxpass": 32 | return True 33 | return False 34 | 35 | @staticmethod 36 | def is_verbal(span): # return true if the sentence is passive - at the moment a sentence is assumed to be passive if it has an auxpass verb 37 | for token in span: 38 | if token.pos_ == "VERB": 39 | return True 40 | return False 41 | 42 | @staticmethod 43 | def is_at_core(concept): 44 | concept_span = concept['concept']['span'] 45 | return len(concept_span)==1 and len(concept['concept_core'])==1 and concept['concept_core'][0]['span'][0] == concept_span[0] 46 | 47 | @staticmethod 48 | def get_couple_uid(couple): 49 | return (CE.get_concept_dict_uid(couple['concept']), CE.get_concept_dict_uid(couple['predicate']), couple['dependency']) 50 | 51 | @staticmethod 52 | def is_in_predicate(x,predicate_span): 53 | return x.idx > predicate_span[0].idx and x.idx < predicate_span[-1].idx 54 | 55 | @staticmethod 56 | def trim_noise(token_list): 57 | forbidden_dep = set(['cc', 'prep', 'punct']) 58 | return CE.trim(token_list, lambda x: CE.get_token_dependency(x) in forbidden_dep) 59 | 60 | @staticmethod 61 | def expand_predicate_core(predicate_set, subj_obj_set): # enrich predicate set with details, adding hidden related concepts (n-ary relations) 62 | hidden_related_concept_set = set(( 63 | hidden_related_concept 64 | for predicate_element in predicate_set 65 | for hidden_related_concept in CE.get_token_descendants(predicate_element, lambda x: x not in subj_obj_set and x not in predicate_set) 66 | )) 67 | return predicate_set | hidden_related_concept_set #| hidden_related_concept_detail_set 68 | 69 | @staticmethod 70 | def get_grammatical_connection(core, other_core, core_set): # can be one per core in core_set 71 | subj_obj_set = set((core,other_core)) 72 | # Search for indirect connections with other concepts 73 | core_super_set = set(core.ancestors) # do not use CE.get_token_ancestors here, it messes up with conjunctions 74 | core_super_set.add(core) 75 | other_core_super_set = set(other_core.ancestors) 76 | other_core_super_set.add(other_core) 77 | inter = core_super_set.intersection(other_core_super_set) 78 | if len(inter)==0: # core and other_core are not connected, continue 79 | return None 80 | # get paths connecting cores to each other 81 | core_path_to_inter,core_junction = CE.find_path_to_closest_in_set(core,inter) 82 | if core_junction: 83 | core_path_to_inter.add(core_junction) 84 | core_path_to_inter = core_path_to_inter.difference(subj_obj_set) 85 | if len(core_path_to_inter.intersection(core_set)) > 0: # avoid jumps 86 | return None 87 | other_core_path_to_inter,other_core_junction = CE.find_path_to_closest_in_set(other_core,inter) 88 | if other_core_junction: 89 | other_core_path_to_inter.add(other_core_junction) 90 | other_core_path_to_inter = other_core_path_to_inter.difference(subj_obj_set) 91 | if len(other_core_path_to_inter.intersection(core_set)) > 0: # avoid jumps 92 | return None 93 | # Get predicate set 94 | predicate_core_set = 
core_path_to_inter.union(other_core_path_to_inter) 95 | if len(predicate_core_set)==0: 96 | return None 97 | # Enrich predicate set with details, adding hidden related concepts (n-ary relations) 98 | predicate_set = CoupleExtractor.expand_predicate_core(predicate_core_set, subj_obj_set=subj_obj_set) 99 | # Add missing conjunctions 100 | if core in other_core.children: 101 | predicate_set |= set(filter(CoupleExtractor.CC_FILTER_FN, other_core.children)) 102 | elif other_core in core.children: 103 | predicate_set |= set(filter(CoupleExtractor.CC_FILTER_FN, core.children)) 104 | # Get predicate spans 105 | predicate_span = sorted(predicate_set, key=lambda x: x.idx) 106 | predicate_core_span = sorted(predicate_core_set, key=lambda x: x.idx) 107 | # # Remove consecutive punctuations 108 | # non_consecutive_puncts = set([',',';']) 109 | # predicate_span = [ 110 | # v 111 | # for i, v in enumerate(predicate_span) 112 | # if i == 0 113 | # or v.pos_ != 'PUNCT' 114 | # or v.pos_ != predicate_span[i-1].pos_ 115 | # or v.text not in non_consecutive_puncts 116 | # or predicate_span[i-1].text not in non_consecutive_puncts 117 | # ] 118 | return { 119 | 'predicate_span':predicate_span, 120 | 'predicate_core_span': predicate_core_span, 121 | 'cores_couple': (core,other_core), 122 | } 123 | 124 | @staticmethod 125 | def grammatical_connections_to_graph(caotic_triple_list): 126 | triple_list = [] 127 | for triple_dict in caotic_triple_list: 128 | predicate_span = triple_dict['predicate_span'] 129 | assert len(predicate_span) > 0, f'predicate_span is empty' 130 | predicate_core_span = triple_dict['predicate_core_span'] 131 | assert len(predicate_core_span) > 0, f'predicate_core_span is empty' 132 | core, other_core = triple_dict['cores_couple'] 133 | core_is_obj = re.search(CE.OBJ_REGEXP, CE.get_token_dependency(core)) is not None 134 | other_core_is_obj = re.search(CE.OBJ_REGEXP, CE.get_token_dependency(other_core)) is not None 135 | core_is_subj = re.search(CE.SUBJ_REGEXP, CE.get_token_dependency(core)) is not None 136 | other_core_is_subj = re.search(CE.SUBJ_REGEXP, CE.get_token_dependency(other_core)) is not None 137 | # Handle ambiguous dependencies 138 | # ambiguous_dep = core_is_obj != other_core_is_subj or core_is_subj != other_core_is_obj or core_is_obj == core_is_subj # or other_core_is_obj == other_core_is_subj 139 | if core_is_obj!=other_core_is_obj: 140 | if core_is_obj: 141 | subj = other_core 142 | obj = core 143 | else: 144 | subj = core 145 | obj = other_core 146 | elif core_is_subj!=other_core_is_subj: 147 | if core_is_subj: 148 | subj = core 149 | obj = other_core 150 | else: 151 | subj = other_core 152 | obj = core 153 | else: # position-based decision 154 | if core.idx < other_core.idx: 155 | subj = core 156 | obj = other_core 157 | else: 158 | subj = other_core 159 | obj = core 160 | triple = { 161 | 'subj': subj, 162 | 'obj': obj, 163 | 'predicate_span': predicate_span, # add predicate components 164 | # 'predicate_set': set(predicate_span), 165 | 'predicate_core_span': predicate_core_span, 166 | # 'predicate_core_set': set(predicate_core_span), 167 | } 168 | # print(triple) 169 | triple_list.append(triple) 170 | return triple_list 171 | 172 | @staticmethod 173 | def get_core_predicate_dict(core_set): 174 | # find the paths that connect the core concepts each other 175 | core_list = list(core_set) 176 | grammatical_connection_list = list(filter(lambda x: x is not None,( 177 | CoupleExtractor.get_grammatical_connection(core, other_core, core_set) 178 | for i,core in 
enumerate(core_list) 179 | for other_core in core_list[i+1:] 180 | ))) 181 | # print(grammatical_connection_list) 182 | directed_concept_graph = CoupleExtractor.grammatical_connections_to_graph(grammatical_connection_list) 183 | # print(directed_concept_graph) 184 | # create core predicate dict 185 | core_predicate_dict = {} 186 | for edge in directed_concept_graph: 187 | predicate_span = edge['predicate_span'] 188 | subj = edge['subj'] 189 | obj = edge['obj'] 190 | # print(subj,predicate_span,obj) 191 | 192 | get_concept_dict = lambda span: CoupleExtractor.get_concept_dict_from_span(span)#, hidden_dep_list=CoupleExtractor.HIDDEN_PREDICATE_COMPONENT) 193 | # get predicate_dict 194 | predicate_dict = get_concept_dict(predicate_span) 195 | # templatize predicate_dict 196 | triple_span = predicate_span + [subj,obj] 197 | triple_span = CoupleExtractor.trim_noise(sorted(triple_span, key=lambda x:x.idx)) 198 | subj_pos = triple_span.index(subj) 199 | obj_pos = triple_span.index(obj) 200 | if subj_pos < obj_pos: 201 | left_pivot = subj_pos 202 | right_pivot = obj_pos 203 | left_is_subj = True 204 | else: 205 | left_pivot = obj_pos 206 | right_pivot = subj_pos 207 | left_is_subj = False 208 | templatized_lemma = [] 209 | templatized_text = [] 210 | if left_pivot > 0: 211 | left_pdict = get_concept_dict(triple_span[:left_pivot]) 212 | templatized_lemma.append(left_pdict['lemma']) 213 | templatized_text.append(left_pdict['text']) 214 | templatized_lemma.append('{subj}' if left_is_subj else '{obj}') 215 | templatized_text.append('{subj}' if left_is_subj else '{obj}') 216 | if right_pivot > left_pivot+1: 217 | middle_pdict = get_concept_dict(triple_span[left_pivot+1:right_pivot]) 218 | templatized_lemma.append(middle_pdict['lemma']) 219 | templatized_text.append(middle_pdict['text']) 220 | templatized_lemma.append('{obj}' if left_is_subj else '{subj}') 221 | templatized_text.append('{obj}' if left_is_subj else '{subj}') 222 | if right_pivot < len(triple_span)-1: 223 | right_pdict = get_concept_dict(triple_span[right_pivot+1:]) 224 | templatized_lemma.append(right_pdict['lemma']) 225 | templatized_text.append(right_pdict['text']) 226 | predicate_dict['text'] = ' '.join(templatized_text) 227 | predicate_dict['lemma'] = ' '.join(templatized_lemma) 228 | # get predicate_core_dict 229 | predicate_core_dict = get_concept_dict(edge['predicate_core_span']) 230 | 231 | # populate core_predicate_dict 232 | if subj not in core_predicate_dict: 233 | core_predicate_dict[subj] = [] 234 | core_predicate_dict[subj].append({ 235 | 'dependency': 'subj', 236 | 'predicate': predicate_dict, 237 | 'predicate_core': predicate_core_dict, 238 | # 'missing_passivant': subj.i > predicate_span[0].i, 239 | # 'related_concepts_count': related_concepts_count, # if related_concepts_count > 2, then n-ary relation 240 | }) 241 | if obj not in core_predicate_dict: 242 | core_predicate_dict[obj] = [] 243 | core_predicate_dict[obj].append({ 244 | 'dependency': 'obj', 245 | 'predicate': predicate_dict, 246 | 'predicate_core': predicate_core_dict, 247 | # 'missing_passivant': False, 248 | # 'related_concepts_count': related_concepts_count, # if related_concepts_count > 2, then n-ary relation 249 | }) 250 | return core_predicate_dict 251 | 252 | def get_couple_list(self, doc_parser: DocParser): 253 | concept_list = self.get_concept_list(doc_parser) 254 | core_concept_dict = {} 255 | for concept in concept_list: 256 | core = concept['concept_core'][-1]['span'][0] 257 | if core not in core_concept_dict: 258 | core_concept_dict[core] = [] 259 
| core_concept_dict[core].append(concept) 260 | # print(core_concept_dict) 261 | 262 | core_predicate_dict = self.get_core_predicate_dict(set(core_concept_dict.keys())) 263 | # print(core_predicate_dict) 264 | couple_list = [] 265 | for core, core_concepts in core_concept_dict.items(): 266 | if core not in core_predicate_dict: 267 | # print(f'"{core}" not in core_predicate_dict') 268 | continue 269 | for concept_dict in core_concepts: 270 | concept_span_set = set(concept_dict['concept']['span']) 271 | is_at_core = self.is_at_core(concept_dict) 272 | for predicate_dict in core_predicate_dict[core]: 273 | if len(concept_span_set.intersection(predicate_dict['predicate']['span'])) > 0: 274 | # print(f'Discarding concept "{concept_dict["concept"]["text"]}", because it intersects its predicate: "{predicate_dict["predicate"]["text"]}".') 275 | continue 276 | couple_dict = { 277 | 'is_at_core': is_at_core, 278 | } 279 | couple_dict.update(concept_dict) 280 | couple_dict.update(predicate_dict) 281 | couple_list.append(couple_dict) 282 | # print([(c['dependency'],c['concept']['text'],c['predicate']['text']) for c in couple_list]) 283 | return couple_list 284 | -------------------------------------------------------------------------------- /web_app/yai/static/js/vue_component/explanation_components.js: -------------------------------------------------------------------------------- 1 | OVERVIEW_CACHE = {}; 2 | TAXONOMICAL_VIEW_CACHE = {}; 3 | ANNOTATION_CACHE = {}; 4 | ANNOTATED_HTML_CACHE = {}; 5 | KNOWN_KNOWLEDGE_GRAPH = []; 6 | 7 | Vue.component("template_tree", { 8 | template: ` 9 |
		<!-- [template markup not preserved in this listing] node body: the annotated item text,
		     a "[{{ isOpen ? 'Less..' : 'More..' }}]" toggle bound to isOpen, and, for parent
		     nodes, a nested list with one template_tree child per entry of item.children -->
20 | `, 21 | props: { 22 | item: Object, 23 | annotation_list: Array, 24 | }, 25 | data: function() { 26 | return { 27 | isOpen: this.item.expanded 28 | }; 29 | }, 30 | computed: { 31 | isParent: function() { 32 | return this.item.children && this.item.children.length; 33 | }, 34 | annotatedText: function() { 35 | var txt = this.item.text; 36 | if (txt in ANNOTATED_HTML_CACHE) 37 | return ANNOTATED_HTML_CACHE[txt]; 38 | // console.log('Annotating with:', this.annotation_list); 39 | return ANNOTATED_HTML_CACHE[txt] = annotate_html(txt, this.annotation_list, linkify); 40 | }, 41 | }, 42 | methods: { 43 | toggle: function() { 44 | this.isOpen = !this.isOpen; 45 | }, 46 | } 47 | }); 48 | 49 | Vue.component("overview", { 50 | template: ` 51 | 52 | 55 |
		<!-- [template markup not preserved in this listing] overview card body, in order:
		     the "{{ label }}" header, a "Loading overview, please wait a while.." notice,
		     a "No overview available." notice, an "{{error_message}}" alert, a nested
		     template_tree list for the taxonomical view, and one "{{question?question:'Extra'}}"
		     section per question, each with its own nested template_tree list -->
86 | `, 87 | props: { 88 | uri: String, 89 | label: String, 90 | active_fn: { 91 | type: Function, 92 | default: function () {} 93 | }, 94 | close_fn: { 95 | type: Function, 96 | default: function () {} 97 | }, 98 | onload_fn: { 99 | type: Function, 100 | default: function () {} 101 | }, 102 | }, 103 | data: function() { 104 | return { 105 | loading: true, 106 | empty: false, 107 | show_error_alert: false, 108 | error_message: '', 109 | 110 | question_overview_tree: {}, 111 | taxonomical_view: [], 112 | annotation_list: [], 113 | }; 114 | }, 115 | // methods: { 116 | // format_label: function(label) { 117 | // return tokenise(label).filter(x=>x!='').join(' '); 118 | // } 119 | // }, 120 | created: function() { 121 | var self = this; 122 | // self.uri = self.uri.toLowerCase(); 123 | if (self.uri in OVERVIEW_CACHE) 124 | { 125 | self.question_overview_tree = OVERVIEW_CACHE[self.uri] 126 | self.taxonomical_view = TAXONOMICAL_VIEW_CACHE[self.uri]; 127 | self.annotation_list = ANNOTATION_CACHE[self.uri]; 128 | self.loading = false; 129 | if (!self.question_overview_tree) 130 | self.empty = true; 131 | return; 132 | } 133 | console.log('Shifting towards topic:', self.uri, self.label); 134 | self.loading = true; 135 | $.ajax({ 136 | type: "GET", 137 | url: GET_OVERVIEW_API, 138 | responseType:'application/json', 139 | data: { 140 | 'concept_uri': self.uri, 141 | }, 142 | success: function (result) { 143 | console.log('Processing overview', result); 144 | self.show_error_alert = false; 145 | self.loading = false; 146 | self.onload_fn(); 147 | // Check cache 148 | if (!result) 149 | { 150 | self.empty = true; 151 | OVERVIEW_CACHE[self.uri] = null; 152 | ANNOTATION_CACHE[self.uri] = null; 153 | TAXONOMICAL_VIEW_CACHE[self.uri] = null; 154 | return; 155 | } 156 | self.empty = false; 157 | // Setup KNOWN_ENTITY_DICT 158 | var taxonomical_view = tuple_list_to_formatted_jsonld(result.taxonomical_view); 159 | // Update the known entity dict (cache) 160 | KNOWN_KNOWLEDGE_GRAPH = KNOWN_KNOWLEDGE_GRAPH.concat(taxonomical_view); 161 | KNOWN_ENTITY_DICT = get_typed_entity_dict_from_jsonld(KNOWN_KNOWLEDGE_GRAPH); 162 | // Setup and annotate question summary tree 163 | var annotation_list = result.annotation_list; 164 | // IMPORTANT: filter out all the annotations referring to the exact concept in overview. 
165 | // annotation_list = annotation_list.filter(x => x.annotation != self.uri); 166 | // Populate the question_overview_tree 167 | var question_summary_tree = result.question_summary_tree; 168 | if (question_summary_tree) 169 | { 170 | for (var [question,summary_tree] of Object.entries(question_summary_tree)) 171 | { 172 | if (!summary_tree.summary) 173 | continue; 174 | summary_tree = summary_tree_to_jsonld(summary_tree); 175 | summary_tree = format_jsonld(summary_tree); 176 | summary_tree = jsonld_to_nestedlist(summary_tree); 177 | self.question_overview_tree[question] = summary_tree; 178 | } 179 | } 180 | // Set taxonomical_view 181 | const prefixed_string = prefixed_string_to_uri(self.uri); 182 | self.taxonomical_view = jsonld_to_nestedlist(nest_jsonld(KNOWN_ENTITY_DICT[prefixed_string], KNOWN_ENTITY_DICT, [prefixed_string], 2)); 183 | self.annotation_list = annotation_list; 184 | // Cache question summary tree 185 | OVERVIEW_CACHE[self.uri] = self.question_overview_tree; 186 | ANNOTATION_CACHE[self.uri] = self.annotation_list; 187 | TAXONOMICAL_VIEW_CACHE[self.uri] = self.taxonomical_view; 188 | }, 189 | error: function(result) { 190 | const prefixed_string = prefixed_string_to_uri(self.uri); 191 | self.loading = false; 192 | if (self.uri in ANNOTATION_CACHE) 193 | { 194 | self.taxonomical_view = TAXONOMICAL_VIEW_CACHE[self.uri]; 195 | self.annotation_list = ANNOTATION_CACHE[self.uri]; 196 | } 197 | else 198 | { 199 | self.error_message = result; 200 | self.show_error_alert = true; 201 | // expand_link( 202 | // prefixed_string_to_uri(self.uri), 203 | // x=>{ 204 | // console.log(x); 205 | // }, 206 | // KNOWN_ENTITY_DICT 207 | // ); 208 | } 209 | }, 210 | }); 211 | }, 212 | }); 213 | 214 | Vue.component("answer", { 215 | template: ` 216 |
		<!-- [template markup not preserved in this listing] answer card body: a question input
		     handled by ask(), a "Loading answers, please wait a while.." notice, a "No answers
		     found." notice, "{{error_message}}" and "{{warning_message}}" alerts, the
		     "Question: {{ question_text }}" line, and an "Answer:" section rendered as nested
		     template_tree lists for the answer tree and its quality -->
252 | `, 253 | data: function() { 254 | return { 255 | show_error_alert: false, 256 | error_message: '', 257 | 258 | show_warning_alert: false, 259 | warning_message: '', 260 | 261 | empty_answers: false, 262 | loading_answers: false, 263 | question_text: '', 264 | answer_tree: null, 265 | answer_annotation_list: [], 266 | answer_quality: null, 267 | }; 268 | }, 269 | methods: { 270 | ask: function(event) { 271 | // console.log(event); 272 | var self = this; 273 | self.loading_answers = true; 274 | self.empty_answers = false; 275 | self.show_warning_alert = false; 276 | self.show_error_alert = false; 277 | 278 | var x = titlefy(event.target.value.replace(/(\r\n|\n|\r)/gm, "").trim()); 279 | console.log('Sending question:',x); 280 | $.ajax({ 281 | type: "GET", 282 | url: GET_ANSWER_API, 283 | responseType:'application/json', 284 | data: {'question': x}, 285 | success: function (result) { 286 | console.log('Processing answer'); 287 | // console.log('Getting answer:',JSON.stringify(result)); 288 | self.loading_answers = false; 289 | if (!result) 290 | { 291 | self.empty_answers = true; 292 | return; 293 | } 294 | const annotation_list = result.annotation_list; 295 | var question_summary_tree = result.question_summary_tree; 296 | const question = Object.keys(question_summary_tree)[0]; 297 | var summary_tree = summary_tree_to_jsonld(question_summary_tree[question]); 298 | const answer_quality = result.quality[question]; 299 | 300 | self.show_error_alert = false; 301 | self.empty_answers = false; 302 | self.question_text = question; 303 | self.answer_tree = jsonld_to_nestedlist(format_jsonld(summary_tree)); 304 | self.answer_annotation_list = annotation_list; 305 | self.answer_quality = jsonld_to_nestedlist(format_jsonld({'my:answer_quality': pydict_to_jsonld(answer_quality)})); 306 | 307 | // Show answer quality 308 | console.log('Answer quality:', answer_quality); 309 | if (answer_quality.semantic_similarity < 0.5) 310 | { 311 | self.warning_message = 'The following answers can be very imprecise. 
We struggled to extract them from data, maybe because this question cannot be properly answered using the available information.'; 312 | self.show_warning_alert = true; 313 | } 314 | }, 315 | error: function(result) { 316 | self.error_message = result; 317 | self.show_error_alert = true; 318 | }, 319 | }); 320 | }, 321 | } 322 | }); 323 | 324 | function summary_tree_to_jsonld(summary_tree) { 325 | var jsonld = {}; 326 | for (var [key,value] of Object.entries(summary_tree)) 327 | { 328 | if (key == 'children') 329 | continue; 330 | if (key == 'annotation') 331 | { 332 | if (value) 333 | { 334 | var source_id = prefixed_string_to_uri(summary_tree['source_id']); 335 | var jsonld_value = tuple_list_to_formatted_jsonld(value); 336 | var entity_dict = get_entity_dict_from_jsonld(jsonld_value); 337 | jsonld['my:hasSource'] = nest_jsonld(entity_dict[source_id], entity_dict, [source_id], 2); 338 | } 339 | } 340 | else 341 | jsonld['my:'+key] = value; 342 | } 343 | if (summary_tree.children && summary_tree.children.length) 344 | jsonld['my:sub_summary_list'] = summary_tree.children.map(summary_tree_to_jsonld); 345 | return jsonld; 346 | } 347 | 348 | function pydict_to_jsonld(pydict) { 349 | if (isDict(pydict)) 350 | { 351 | var jsonld = {}; 352 | for (var [key,value] of Object.entries(pydict)) 353 | jsonld['my:'+key] = pydict_to_jsonld(value); 354 | return jsonld; 355 | } 356 | if (isArray(pydict)) 357 | return pydict.map(pydict_to_jsonld); 358 | return pydict; 359 | } 360 | 361 | $(document).on('click', '.link', function(e) { 362 | var topic = e.target.dataset['topic'] || ""; 363 | topic = uri_to_prefixed_string(topic); 364 | // var is_first = (e.target.dataset['is_first'] == 'true'); 365 | var label = e.target.innerText; 366 | app.cards.push({ 367 | 'uri':topic, 368 | 'label':titlefy(label), 369 | 'deleted':false, 370 | }); 371 | if (!app.show_overview_modal) 372 | app.current_card_index = app.cards.length-1; 373 | app.show_overview_modal = true; 374 | }); 375 | --------------------------------------------------------------------------------