├── web_app ├── oke │ ├── core │ │ ├── __init__.py │ │ ├── models │ │ │ ├── summarisation │ │ │ │ ├── word_graph_summariser │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── LICENSE.md │ │ │ │ │ ├── resources │ │ │ │ │ │ ├── stopwords.en.dat │ │ │ │ │ │ └── stopwords.fr.dat │ │ │ │ │ └── README.md │ │ │ │ ├── multi_sentence_compressor.py │ │ │ │ └── neural_sentence_summariser.py │ │ │ ├── classification │ │ │ │ ├── eurovoc_classifier.py │ │ │ │ ├── concept_classifier.py │ │ │ │ └── sentence_classifier.py │ │ │ ├── knowledge_extraction │ │ │ │ ├── lattice_builder.py │ │ │ │ ├── ontology_builder.py │ │ │ │ ├── couple_abstractor.py │ │ │ │ └── couple_extractor.py │ │ │ └── model_manager.py │ │ └── misc │ │ │ ├── levenshtein_lib.py │ │ │ ├── onto_reader.py │ │ │ ├── tfidf_lib.py │ │ │ ├── tree_cluster_builder.py │ │ │ ├── adjacency_matrix.py │ │ │ ├── jsonld_lib.py │ │ │ ├── graph_builder.py │ │ │ └── doc_reader.py │ ├── documents │ │ └── yai4law │ │ │ ├── Rome II_EN.pdf │ │ │ ├── Rome I_EN.pdf │ │ │ └── BrusselsReg_EN_1215-20212.pdf │ ├── requirements.txt │ ├── server.py │ └── server_interface.py ├── yai │ ├── requirements.txt │ ├── static │ │ ├── css │ │ │ ├── style.css │ │ │ └── tree.css │ │ ├── img │ │ │ └── favicon.ico │ │ ├── html │ │ │ └── index.html │ │ └── js │ │ │ ├── app.js │ │ │ ├── stage_builder │ │ │ ├── api_lib.js │ │ │ ├── item_stage_builder.js │ │ │ └── domain_stage_builder.js │ │ │ ├── template │ │ │ ├── jsonld_handler.js │ │ │ └── template_lib.js │ │ │ └── vue_component │ │ │ └── explanation_components.js │ └── server.py ├── server.sh └── setup.sh ├── .gitignore ├── kg_hinge ├── kg_hinge.png └── kg_hinge.graphml ├── kg_taxonomy └── kg_taxonomy.png └── README.md /web_app/oke/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web_app/yai/requirements.txt: -------------------------------------------------------------------------------- 1 | bottle 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/__init__.py: -------------------------------------------------------------------------------- 1 | from .takahe import * 2 | -------------------------------------------------------------------------------- /kg_hinge/kg_hinge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/kg_hinge/kg_hinge.png -------------------------------------------------------------------------------- /web_app/yai/static/css/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-size: 18px; 3 | } 4 | 5 | .detail_btn { 6 | color: blue; 7 | text-decoration: underline; 8 | cursor: pointer; 9 | } -------------------------------------------------------------------------------- /kg_taxonomy/kg_taxonomy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/kg_taxonomy/kg_taxonomy.png 
-------------------------------------------------------------------------------- /web_app/yai/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/web_app/yai/static/img/favicon.ico -------------------------------------------------------------------------------- /web_app/oke/documents/yai4law/Rome II_EN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/web_app/oke/documents/yai4law/Rome II_EN.pdf -------------------------------------------------------------------------------- /web_app/oke/documents/yai4law/Rome I_EN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/web_app/oke/documents/yai4law/Rome I_EN.pdf -------------------------------------------------------------------------------- /web_app/oke/documents/yai4law/BrusselsReg_EN_1215-20212.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Francesco-Sovrano/Legal-Knowledge-Extraction-for-Knowledge-Graph-Based-Question-Answering/HEAD/web_app/oke/documents/yai4law/BrusselsReg_EN_1215-20212.pdf -------------------------------------------------------------------------------- /web_app/server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MY_DIR="`python -c "import os; print(os.path.split(os.path.realpath('$0'))[0])"`" 4 | cd $MY_DIR 5 | 6 | # PyClean 7 | (find ./ -name __pycache__ -type d | xargs rm -r) && (find ./ -name *.pyc -type f | xargs rm -r) 8 | 9 | # Run OKE Server 10 | cd oke 11 | echo 'Running OKE server..' 12 | source .env/bin/activate 13 | python3 server.py $1 &> server.log & 14 | disown 15 | cd .. 16 | 17 | # Run YAI Server 18 | cd yai 19 | echo 'Running YAI server..' 20 | source .env/bin/activate 21 | python3 server.py $1 &> server.log & 22 | disown 23 | cd .. 24 | -------------------------------------------------------------------------------- /web_app/oke/requirements.txt: -------------------------------------------------------------------------------- 1 | bottle 2 | 3 | tensorflow==2.1.0 # machine learning 4 | tensorflow_hub==0.8.0 # pre-trained models 5 | tensorflow_text==2.1.1 # question answering 6 | #torch==1.4.0 # machine learning 7 | torchvision==0.5.0 8 | keras==2.2.4 9 | spacy==2.3.2 # NLP 10 | gensim==3.8.3 # TFIDF 11 | nltk==3.5 # stemming, wordnet, etc.. 
12 | concepts==0.9.1 # formal concept analysis 13 | pywsd==1.2.4 # word sense disambiguation 14 | wikipedia==1.4.0 # definition extraction 15 | transformers==3.0.2 # text summarization and other NLP tasks 16 | 17 | more_itertools==8.2.0 18 | sortedcontainers==2.1.0 19 | python-Levenshtein # levenshtein string distance 20 | bs4 # xml to txt 21 | scikit-learn 22 | scipy==1.4.1 23 | wordcloud==1.7.0 # graphics 24 | tika # pdf to txt 25 | matplotlib # graphics 26 | pydotplus==2.0.2 # graphs 27 | networkx # graphs 28 | pandas 29 | lxml 30 | -------------------------------------------------------------------------------- /web_app/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MY_DIR="`python -c "import os; print(os.path.realpath('$1'))"`" 4 | cd $MY_DIR 5 | 6 | # Run YAI server 7 | cd yai 8 | echo 'Setting up YAI server..' 9 | python3 -m venv .env 10 | source .env/bin/activate 11 | pip install -U pip setuptools wheel twine 12 | pip install -r requirements.txt 13 | cd .. 14 | 15 | # Run OKE Server 16 | cd oke 17 | echo 'Setting up OKE server..' 18 | python3 -m venv .env 19 | source .env/bin/activate 20 | pip install -U pip setuptools wheel twine 21 | # cd .env/lib 22 | # git clone https://github.com/huggingface/neuralcoref.git 23 | # cd neuralcoref 24 | # pip install -r requirements.txt 25 | # pip install -e . 26 | # cd .. 27 | # cd ../.. 28 | pip install -r requirements.txt 29 | python3 -m spacy download en_core_web_md 30 | # python3 -m spacy download en_core_web_sm 31 | python3 -m nltk.downloader stopwords punkt averaged_perceptron_tagger framenet_v17 wordnet brown 32 | cd .. 33 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/levenshtein_lib.py: -------------------------------------------------------------------------------- 1 | import Levenshtein 2 | 3 | def remove_similar_labels(tuple_list, threshold=0.3): 4 | fetch_value = lambda x: x[0] if isinstance(x, (list,tuple)) else x 5 | new_tuple_list = [] 6 | for t in tuple_list: 7 | is_unique = True 8 | for other_t in new_tuple_list: 9 | if labels_are_similar(fetch_value(t),fetch_value(other_t),threshold): 10 | is_unique = False 11 | break 12 | if is_unique: 13 | new_tuple_list.append(t) 14 | return new_tuple_list 15 | 16 | def get_normalized_sintactic_distance(a,b): 17 | return Levenshtein.distance(a,b)/max(len(a),len(b)) 18 | 19 | def labels_are_similar(a,b, threshold=0.3): 20 | return get_normalized_sintactic_distance(a,b) < threshold 21 | 22 | def get_most_similar_label(label,other_label_list): 23 | distance, most_similar_label = min(map(lambda x: (Levenshtein.distance(label,x),x), other_label_list), key=lambda x:x[0]) 24 | return most_similar_label# if min(1.,distance/len(label)) < 0.2 else label 25 | -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Florian Boudin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to 
the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /web_app/oke/core/models/classification/eurovoc_classifier.py: -------------------------------------------------------------------------------- 1 | from models.classification.concept_classifier import ConceptClassifier 2 | from more_itertools import unique_everseen 3 | import os 4 | import pandas as pd 5 | 6 | class EuroVocClassifier(ConceptClassifier): 7 | EUROVOC_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)),'data/eurovoc.csv') 8 | DEFAULT_OPTIONS = { 9 | 'spacy_model': 'en_core_web_md', 10 | 'tf_model':'USE_Transformer', 11 | 'with_semantic_shifting':True, 12 | 'with_centered_similarity':True, 13 | 'tfidf_importance': 3/4, 14 | 'default_similarity_threshold': 0.8, 15 | } 16 | 17 | def __init__(self, model_options=DEFAULT_OPTIONS): 18 | super().__init__(model_options) 19 | eurovoc_df = pd.read_csv(self.EUROVOC_PATH, sep=';') 20 | unique_term_list = tuple(unique_everseen(eurovoc_df['TERMS (PT-NPT)'].values)) 21 | concept_description_dict = {t:[t] for t in unique_term_list} 22 | self.set_concept_description_dict(concept_description_dict) 23 | 24 | def get_concept_dict(self, concept_counter_dict={}, similarity_threshold=None, with_numbers=True, size=1): 25 | return super().get_concept_dict(concept_counter_dict=concept_counter_dict, similarity_threshold=similarity_threshold, with_numbers=with_numbers, size=size) 26 | -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/resources/stopwords.en.dat: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | after 5 | again 6 | against 7 | all 8 | am 9 | an 10 | and 11 | any 12 | as 13 | at 14 | be 15 | because 16 | been 17 | before 18 | being 19 | below 20 | between 21 | both 22 | but 23 | by 24 | cannot 25 | could 26 | did 27 | do 28 | does 29 | doing 30 | down 31 | during 32 | each 33 | few 34 | for 35 | from 36 | further 37 | had 38 | he 39 | her 40 | here 41 | hers 42 | herself 43 | him 44 | himself 45 | his 46 | how 47 | how's 48 | i 49 | if 50 | in 51 | into 52 | is 53 | it 54 | its 55 | itself 56 | me 57 | more 58 | most 59 | my 60 | myself 61 | no 62 | nor 63 | not 64 | of 65 | off 66 | on 67 | once 68 | only 69 | or 70 | other 71 | ought 72 | our 73 | ours 74 | ourselves 75 | out 76 | over 77 | own 78 | same 79 | she 80 | so 81 | some 82 | such 83 | than 84 | that 85 | the 86 | their 87 | theirs 88 | them 89 | themselves 90 | then 91 | there 92 | these 93 | they 94 | this 95 | those 96 | through 97 | to 98 | too 99 | under 100 | until 101 | up 102 | very 103 | was 104 | we 105 | what 106 | when 107 | where 108 | which 109 | while 110 | who 111 | whom 112 | why 113 | with 114 | would 115 | you 116 | your 117 | yours 118 | yourself 119 | 
yourselves 120 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/onto_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import pandas as pd 5 | from misc.jsonld_lib import * 6 | from misc.doc_reader import get_document_list 7 | 8 | def get_dataframe_dict(ontology_dir): 9 | doc_list = get_document_list(ontology_dir) 10 | dataframe_dict = {} 11 | for obj_path in doc_list: 12 | if obj_path.endswith(('.csv',)): 13 | print('Parsing:', obj_path) 14 | _, filename = os.path.split(obj_path) 15 | class_name = filename.split('.')[0] 16 | dataframe_dict[class_name] = pd.read_csv(obj_path, sep=';') 17 | return dataframe_dict 18 | 19 | def get_concept_description_dict(ontology_dir): 20 | dataframe_dict = get_dataframe_dict(ontology_dir) 21 | 22 | concept_dict = {} 23 | for concept, df in dataframe_dict.items(): 24 | concept_dict[concept] = [explode_concept_key(concept).lower().strip()] 25 | sub_classes = df['SubClasses'].values.tolist() 26 | concept_dict.update({ 27 | sc: [explode_concept_key(sc).lower().strip()] 28 | for sc in sub_classes 29 | }) 30 | return concept_dict 31 | 32 | def get_concept_description_dict_from_jsonld(ontology_path, key): 33 | with open(ontology_path,'r') as f: 34 | graph = json.load(f) 35 | 36 | return { 37 | sub_graph['@id']: [explode_concept_key(sub_graph[key]).lower().strip()] 38 | for sub_graph in graph 39 | if key in sub_graph 40 | } 41 | 42 | ''' 43 | import sys 44 | _, ontology_path, skos_path = sys.argv 45 | 46 | print(get_concept_description_dict(ontology_path, skos_path)) 47 | ''' -------------------------------------------------------------------------------- /web_app/oke/core/misc/tfidf_lib.py: -------------------------------------------------------------------------------- 1 | import gensim # for the tf-idf model 2 | from gensim.test.utils import get_tmpfile 3 | 4 | def build_tfidf(words_vector, very_big_corpus=False): 5 | # The code in the following block comes from: https://www.oreilly.com/learning/how-do-i-compare-document-similarity-using-python 6 | ########################## START BLOCK ########################## 7 | # Build word dictionary 8 | dictionary = gensim.corpora.Dictionary(words_vector) 9 | # Build the Bag-of-Words corpus from lemmatized documents 10 | corpus = [dictionary.doc2bow(gen_doc) for gen_doc in words_vector] 11 | # Build the tf-idf model from the corpus 12 | tfidf_model = gensim.models.TfidfModel(corpus) 13 | # Build similarities cache 14 | # Similarity with cache into temporary file is slower than MatrixSimilarity but it can handle bigger corpus 15 | if very_big_corpus: 16 | tfidf_corpus_similarities = gensim.similarities.Similarity(get_tmpfile("index"), tfidf_model[corpus], num_features=len(dictionary)) 17 | else: 18 | tfidf_corpus_similarities = gensim.similarities.MatrixSimilarity(tfidf_model[corpus], num_features=len(dictionary)) 19 | ########################## END BLOCK ########################## 20 | return dictionary, tfidf_model, tfidf_corpus_similarities 21 | 22 | def get_query_tfidf_similarity(words_vector, dictionary, tfidf_model, tfidf_corpus_similarities): 23 | # Get query BoW (Bag of Words) 24 | query_bow = dictionary.doc2bow(words_vector) 25 | # Get query tf-idf 26 | query_tfidf = tfidf_model[query_bow] 27 | # Get query similarity vector 28 | return tfidf_corpus_similarities[query_tfidf] 29 | -------------------------------------------------------------------------------- 
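The two helpers in misc/tfidf_lib.py are meant to be used together: build_tfidf indexes a tokenised corpus once, and get_query_tfidf_similarity then scores a tokenised query against every indexed document. Below is a minimal usage sketch, not part of the repository: it assumes the oke/core directory is on the Python path (so that misc.tfidf_lib is importable) and uses a made-up toy corpus purely for illustration.

```python
# Hypothetical usage of misc/tfidf_lib.py; the corpus below is illustrative only.
from misc.tfidf_lib import build_tfidf, get_query_tfidf_similarity

# Each document is a list of (already lemmatised, lowercased) tokens.
corpus_tokens = [
    ['contract', 'law', 'applicable', 'obligation'],
    ['non', 'contractual', 'obligation', 'damage', 'tort'],
    ['jurisdiction', 'court', 'member', 'state'],
]

# Build the dictionary, the tf-idf model and the similarity index once.
dictionary, tfidf_model, index = build_tfidf(corpus_tokens, very_big_corpus=False)

# Score a tokenised query against all indexed documents.
query_tokens = ['applicable', 'law', 'contract']
scores = get_query_tfidf_similarity(query_tokens, dictionary, tfidf_model, index)

# Print documents from most to least similar.
for doc_id, score in sorted(enumerate(scores), key=lambda x: -x[1]):
    print(doc_id, round(float(score), 3))
```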
/README.md: -------------------------------------------------------------------------------- 1 | Legal Knowledge Extraction for Knowledge Graph Based Question-Answering 2 | ========== 3 | 4 | Extra documentation (knowledge graph, images, etc.) for the paper "Legal Knowledge Extraction for Knowledge Graph Based Question-Answering". 5 | 6 | ## Usage and Installation 7 | This project has been tested on Debian 9 and macOS Mojave 10.14 with Python 3.7.9. 8 | The [web_app](web_app) folder contains the code of the answer retriever. The [web_app/setup.sh](web_app/setup.sh) script installs the software. To run the web app, execute ```./web_app/server.sh port_num```, where port_num is the port to listen on. Once "server.sh" is running, the web app is reachable from your browser at http://localhost:port_num (replace port_num with the port number you chose). 9 | 10 | **N.B.** Before running setup.sh, install: virtualenv, python3-dev, python3-pip and make. 11 | 12 | ## Files 13 | 14 | * [The whole Knowledge Graph in graphml format](graph.graphml) 15 | * [The Taxonomy within the Knowledge Graph](kg_taxonomy) 16 | * [The ontological hinge obtained by aligning the Ontology Design Patterns to the Taxonomy](kg_hinge) 17 | 18 | ## Sources 19 | 20 | The Knowledge Graph has been extracted from: 21 | * [Rome I Regulation EC 593/2008](https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32008R0593&from=EN) 22 | * [Rome II Regulation EC 864/2007](https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32007R0864&from=EN) 23 | * [Brussels I bis Regulation EU 1215/2012](https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32012R1215&from=EN) 24 | 25 | ## KG's Taxonomy 26 | ![KG's Taxonomy](kg_taxonomy/kg_taxonomy.png) 27 | 28 | ## Taxonomical Hinge 29 | ![Taxonomical Hinge](kg_hinge/kg_hinge.png) 30 | 31 | ## Contact 32 | 33 | To report issues, use GitHub Issues.
34 | For other queries, contact Francesco Sovrano: 35 | * 36 | * -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/resources/stopwords.fr.dat: -------------------------------------------------------------------------------- 1 | alors 2 | alors 3 | après 4 | au 5 | au 6 | aucun 7 | aujourd'hui 8 | aussi 9 | autre 10 | autre 11 | avant 12 | avant 13 | avec 14 | avec 15 | avoir 16 | avoir 17 | bien 18 | bon 19 | bon 20 | c' 21 | car 22 | car 23 | ce 24 | ce 25 | cela 26 | ces 27 | ces 28 | cette 29 | ceux 30 | chaque 31 | chez 32 | ci 33 | comme 34 | comme 35 | comment 36 | comment 37 | d' 38 | dans 39 | dans 40 | de 41 | de 42 | dedans 43 | dehors 44 | demain 45 | depuis 46 | des 47 | des 48 | deux 49 | deux 50 | devrait 51 | dire 52 | dit 53 | doit 54 | donc 55 | donner 56 | dont 57 | dos 58 | droite 59 | du 60 | du 61 | début 62 | elle 63 | elle 64 | elles 65 | elles 66 | en 67 | en 68 | encore 69 | encore 70 | enfant 71 | ensuite 72 | entre 73 | essai 74 | et 75 | et 76 | eu 77 | eux 78 | faire 79 | fait 80 | fait 81 | faites 82 | femme 83 | fois 84 | font 85 | force 86 | grand 87 | haut 88 | hier 89 | homme 90 | hors 91 | ici 92 | ici 93 | il 94 | il 95 | ils 96 | ils 97 | j' 98 | jamais 99 | je 100 | je 101 | juste 102 | l' 103 | la 104 | le 105 | les 106 | leur 107 | lui 108 | là 109 | m' 110 | ma 111 | maintenant 112 | mais 113 | me 114 | mes 115 | mine 116 | moi 117 | moins 118 | mon 119 | même 120 | ne 121 | ni 122 | n' 123 | non 124 | nos 125 | notre 126 | nous 127 | on 128 | ou 129 | où 130 | par 131 | parce 132 | pas 133 | petit 134 | peu 135 | plupart 136 | plus 137 | pour 138 | pourquoi 139 | près 140 | puis 141 | quand 142 | que 143 | quel 144 | quelle 145 | quelles 146 | quels 147 | qui 148 | quoi 149 | s' 150 | sa 151 | sans 152 | se 153 | ses 154 | seulement 155 | si 156 | sien 157 | soi 158 | son 159 | sous 160 | sujet 161 | sur 162 | ta 163 | tandis 164 | tard 165 | te 166 | tellement 167 | tels 168 | tes 169 | toi 170 | ton 171 | toujours 172 | tous 173 | tout 174 | toute 175 | trop 176 | très 177 | tu 178 | tôt 179 | un 180 | une 181 | valeur 182 | voici 183 | vos 184 | votre 185 | vous 186 | vu 187 | y 188 | à 189 | ça 190 | être 191 | lundi 192 | mardi 193 | mercredi 194 | jeudi 195 | vendredi 196 | samedi 197 | dimanche 198 | soir 199 | matin 200 | midi 201 | demain 202 | aujourd'hui 203 | hier -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/multi_sentence_compressor.py: -------------------------------------------------------------------------------- 1 | from models.model_manager import ModelManager 2 | import models.summarisation.word_graph_summariser as wgs 3 | 4 | class MultiSentenceCompressor(ModelManager): 5 | 6 | def __init__(self, model_options): 7 | super().__init__(model_options) 8 | self.disable_spacy_component = ["ner", "textcat", "neuralcoref"] 9 | 10 | def summarise_sentence_list(self, sentence_list, n=1, min_words_n=20, candidates_horizon=1000, ranking_strategy='boudin-morin', cached=True): 11 | taggedsentences=[ 12 | ' '.join(( 13 | token.text+"/"+(token.tag_ if token.pos_ != 'PUNCT' else 'PUNCT') 14 | for token in doc 15 | )).strip() 16 | for doc in self.nlp([sentence]) 17 | ] 18 | # print(taggedsentences) 19 | 20 | # Create a word graph from the set of sentences with parameters : 21 | # - minimal number of words in the compression : 6 22 | # - language of the input sentences : en (english) 
23 | # - POS tag for punctuation marks : PUNCT 24 | compresser = wgs.word_graph(taggedsentences, nb_words=min_words_n, lang='en', punct_tag="PUNCT") 25 | 26 | # Get the 50 best paths 27 | candidates = compresser.get_compression(candidates_horizon) 28 | 29 | if ranking_strategy == 'boudin-morin': 30 | # 2. Rerank compressions by keyphrases (Boudin and Morin's method) 31 | reranker = wgs.keyphrase_reranker(taggedsentences, candidates, lang = 'en') 32 | reranked_candidates = reranker.rerank_nbest_compressions() 33 | best_candidates = sorted(( # Normalize path score by path length 34 | { 35 | 'score': score, 36 | 'text': ' '.join([u[0] for u in path]) 37 | } 38 | for score, path in reranked_candidates 39 | ), key=lambda x: x['score'], reverse=True) 40 | else: #if ranking_strategy == 'filippova': 41 | # 1. Rerank compressions by path length (Filippova's method) 42 | best_candidates = sorted(( # Normalize path score by path length 43 | { 44 | 'score': cummulative_score/len(path), 45 | 'text': ' '.join([u[0] for u in path]) 46 | } 47 | for cummulative_score, path in candidates 48 | ), key=lambda x: x['score'], reverse=True) 49 | 50 | if len(best_candidates) == 0: 51 | best_candidates = [{'text':sentence}] 52 | elif n: 53 | best_candidates = best_candidates[:n] 54 | return tuple(map(lambda x:x['text'], best_candidates)) 55 | -------------------------------------------------------------------------------- /web_app/yai/server.py: -------------------------------------------------------------------------------- 1 | from bottle import run, get, post, route, hook, request, response, static_file 2 | import sys 3 | port = int(sys.argv[1]) 4 | 5 | ############################################################### 6 | # CORS 7 | 8 | @route('/<:re:.*>', method='OPTIONS') 9 | def enable_cors_generic_route(): 10 | """ 11 | This route takes priority over all others. So any request with an OPTIONS 12 | method will be handled by this function. 13 | 14 | See: https://github.com/bottlepy/bottle/issues/402 15 | 16 | NOTE: This means we won't 404 any invalid path that is an OPTIONS request. 17 | """ 18 | add_cors_headers() 19 | 20 | @hook('after_request') 21 | def enable_cors_after_request_hook(): 22 | """ 23 | This executes after every route. We use it to attach CORS headers when 24 | applicable. 
25 | """ 26 | add_cors_headers() 27 | 28 | def add_cors_headers(): 29 | try: 30 | response.headers['Access-Control-Allow-Origin'] = '*' 31 | response.headers['Access-Control-Allow-Methods'] = 'GET, POST, PUT, OPTIONS' 32 | response.headers['Access-Control-Allow-Headers'] = 'Origin, Accept, Content-Type, X-Requested-With, X-CSRF-Token' 33 | except Exception as e: 34 | print('Error:',e) 35 | 36 | ############################################################### 37 | # Static Routes 38 | 39 | @get("/favicon.ico") 40 | def favicon(): 41 | return static_file("favicon.ico", root="static/img/") 42 | 43 | @get("/resources/static/") 44 | def css(filepath): 45 | return static_file(filepath, root="static/css/") 46 | 47 | @get("/resources/static/") 48 | def font(filepath): 49 | return static_file(filepath, root="static/css/") 50 | 51 | @get("/resources/static/") 52 | def img(filepath): 53 | return static_file(filepath, root="static/img/") 54 | 55 | @get("/resources/static/") 56 | def js(filepath): 57 | return static_file(filepath, root="static/js/") 58 | 59 | @get("/resources/static/") 60 | def js(filepath): 61 | return static_file(filepath, root="static/json/") 62 | 63 | @get("/documents/") 64 | def docs(filepath): 65 | return static_file(filepath, root="../oke/documents/") 66 | 67 | @get("/") 68 | def html(filepath): 69 | print(filepath) 70 | return static_file(filepath, root="static/html/") 71 | 72 | @get("/") 73 | def home(): 74 | return static_file('index.html', root="static/html/") 75 | 76 | if __name__ == "__main__": 77 | run(host='0.0.0.0', port=port, debug=True) 78 | -------------------------------------------------------------------------------- /web_app/yai/static/css/tree.css: -------------------------------------------------------------------------------- 1 | #tree,#expand{ 2 | font-size: medium; 3 | overflow-y:auto; 4 | overflow-x: hidden; 5 | /*border:1px solid silver; */ 6 | /*min-height:100px; */ 7 | max-height: 400px; 8 | /*width: 770px; */ 9 | margin: 1rem 1rem 0 1rem; 10 | background: white; 11 | /*padding: 0 1rem;*/ 12 | } 13 | 14 | .card { 15 | /*margin: 0.25rem 0 !important;*/ 16 | margin: none !important; 17 | /*enable wrapping*/ 18 | white-space : normal !important; 19 | /*ensure lower nodes move down*/ 20 | height : auto !important; 21 | /*border-style: solid dashed;*/ 22 | border-bottom: none; 23 | border-right: none; 24 | border-radius: 0 !important; 25 | } 26 | 27 | .card-header { 28 | border-top: none; 29 | border-left: none; 30 | border-right: none; 31 | padding: 0.5rem !important; 32 | background-color: rgba(0,0,0,.03) !important; 33 | } 34 | 35 | .card-leaf { 36 | padding: 0.5rem !important; 37 | } 38 | 39 | .card-body { 40 | padding: 0.5rem 0 0.5rem 1.25rem !important; 41 | } 42 | 43 | .btn { 44 | text-align: left !important; 45 | padding: 0 !important; 46 | } 47 | 48 | .link { 49 | /*color: darkblue; */ 50 | text-decoration: underline; 51 | } 52 | 53 | /*.link:after { 54 | font-family: FontAwesome; 55 | content: "\f05a"; 56 | display:inline-block; 57 | position: relative; 58 | top:-5px; 59 | font-size: 75%; 60 | }*/ 61 | 62 | .link:hover { 63 | /*font-weight: bold;*/ 64 | color: blue !important; 65 | cursor: pointer; 66 | } 67 | 68 | .link .link { 69 | color: blue !important; 70 | text-decoration: underline !important; 71 | } 72 | 73 | .card input[type=number] { 74 | /*for absolutely positioning spinners*/ 75 | position: relative; 76 | padding: 0.5rem; 77 | /*padding-right: 2.5rem;*/ 78 | width: 4rem; 79 | } 80 | 81 | .card 
input[type=number]::-webkit-inner-spin-button, 82 | .card input[type=number]::-webkit-outer-spin-button { 83 | opacity: 1; 84 | } 85 | 86 | .card input[type=number]::-webkit-outer-spin-button, 87 | .card input[type=number]::-webkit-inner-spin-button { 88 | -webkit-appearance: inner-spin-button !important; 89 | /*width: 25px;*/ 90 | position: absolute; 91 | top: 0; 92 | right: 0; 93 | height: 100%; 94 | } 95 | 96 | .initial_explanans { 97 | padding: 4%; 98 | background: blanchedalmond; 99 | } 100 | 101 | .h-divider { 102 | margin: auto; 103 | position: relative; 104 | } 105 | 106 | .h-divider .shadow { 107 | overflow: hidden; 108 | height: 20px; 109 | box-shadow: none !important; 110 | } 111 | 112 | .h-divider .shadow:after { 113 | content: ''; 114 | display: block; 115 | margin: -25px auto 0; 116 | width: 100%; 117 | height: 25px; 118 | border-radius: 125px/12px; 119 | box-shadow: 0 0 8px black; 120 | } 121 | -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/word_graph_summariser/README.md: -------------------------------------------------------------------------------- 1 | # takahe 2 | 3 | takahe is a multi-sentence compression module. Given a set of redundant sentences, a word-graph is constructed by iteratively adding sentences to it. The best compression is obtained by finding the shortest path in the word graph. The original algorithm was published and described in: 4 | 5 | * Katja Filippova, Multi-Sentence Compression: Finding Shortest Paths in Word Graphs, *Proceedings of the 23rd International Conference on Computational Linguistics (Coling 2010)*, pages 322-330, 2010. 6 | 7 | A keyphrase-based reranking method can be applied to generate more informative compressions. The reranking method is described in: 8 | 9 | * Florian Boudin and Emmanuel Morin, Keyphrase Extraction for N-best Reranking in Multi-Sentence Compression, *Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2013)*, 2013. 10 | 11 | 12 | ## Dependancies 13 | 14 | As of today, takahe is built for Python 2. 15 | 16 | You may need to install the following libraries : 17 | 18 | - [networkx](http://networkx.github.io/) (installation guide is available [here](http://networkx.github.io/documentation/latest/install.html)) 19 | - [graphviz](http://www.graphviz.org/) and graphviz-dev 20 | - [pygraphviz](http://pygraphviz.github.io/documentation/latest/install.html) 21 | 22 | 23 | 24 | ## Example 25 | A typical usage of this module is: 26 | 27 | import takahe 28 | 29 | # Create a word graph from the set of sentences with parameters : 30 | # - minimal number of words in the compression : 6 31 | # - language of the input sentences : en (english) 32 | # - POS tag for punctuation marks : PUNCT 33 | compresser = takahe.word_graph( sentences, 34 | nb_words = 6, 35 | lang = 'en', 36 | punct_tag = "PUNCT" ) 37 | 38 | # Get the 50 best paths 39 | candidates = compresser.get_compression(50) 40 | 41 | # 1. Rerank compressions by path length (Filippova's method) 42 | for cummulative_score, path in candidates: 43 | 44 | # Normalize path score by path length 45 | normalized_score = cummulative_score / len(path) 46 | 47 | # Print normalized score and compression 48 | print round(normalized_score, 3), ' '.join([u[0] for u in path]) 49 | 50 | # Write the word graph in the dot format 51 | compresser.write_dot('test.dot') 52 | 53 | # 2. 
Rerank compressions by keyphrases (Boudin and Morin's method) 54 | reranker = takahe.keyphrase_reranker( sentences, 55 | candidates, 56 | lang = 'en' ) 57 | 58 | reranked_candidates = reranker.rerank_nbest_compressions() 59 | 60 | # Loop over the best reranked candidates 61 | for score, path in reranked_candidates: 62 | 63 | # Print the best reranked candidates 64 | print round(score, 3), ' '.join([u[0] for u in path]) 65 | -------------------------------------------------------------------------------- /web_app/yai/static/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
[HTML tags lost in extraction; only the visible text of the index.html Vue template is recoverable]
Title/brand: "QA4Law"
Prompt: "Here you can ask any English question concerning the content of the following documents:" (followed by the document list, whose markup did not survive)
Loading state: "Loading answers, please wait a while.."
Results view: "Question: {{ question_text }}" above a table with the columns Pertinence | Source | Document, bound to {{ answer.confidence }}, «{{ answer.sentence }}» and {{ answer.document.name }}
Empty state: "No answer found."
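Behind this page, static/js/app.js sends the typed question to the OKE back-end's /answer endpoint, which (per server.sh and oke/server.py) listens on the YAI port plus two. The sketch below shows the same request made directly from Python; the port number and the question text are illustrative assumptions, not values taken from the repository.

```python
# Sketch of querying the OKE question-answering API directly.
# Assumes the servers were started with `./web_app/server.sh 8000`,
# so the YAI front-end listens on 8000 and the OKE API on 8002.
import json
import urllib.parse
import urllib.request

question = 'Which law applies to a contractual obligation?'  # illustrative question
url = 'http://localhost:8002/answer?' + urllib.parse.urlencode({'question': question})

with urllib.request.urlopen(url) as response:
    # The endpoint returns a JSON object mapping the question to its retrieved answers.
    answers = json.loads(response.read().decode('utf-8'))

print(json.dumps(answers, indent=2))
```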
84 | 85 | -------------------------------------------------------------------------------- /web_app/oke/core/models/summarisation/neural_sentence_summariser.py: -------------------------------------------------------------------------------- 1 | import math 2 | import json 3 | from models.model_manager import ModelManager 4 | 5 | class NeuralSentenceSummariser(ModelManager): 6 | 7 | def __init__(self, model_options): 8 | model_options['hf_model']['type'] = 'summarization' 9 | super().__init__(model_options) 10 | self.debug = model_options.get('debug',False) 11 | self.max_input_token_count = self.get_hf_model()['config'].max_position_embeddings 12 | 13 | @staticmethod 14 | def sentify(s): 15 | return ' '.join(( 16 | p[0].upper() + p[1:] + ('.' if p[-1] != '.' else '') 17 | for p in s.split(' . ') 18 | if p 19 | )) 20 | 21 | def summarise_sentence(self, sentence, sentence_id=None, n=1, options=None, min_size=None): 22 | # if len(sentence) < 100: 23 | # return (sentence,) 24 | if not options: 25 | options = {} 26 | # Format sentence 27 | tokenizer = self.get_hf_model()['tokenizer'] 28 | tokenized_sentence = tokenizer.convert_ids_to_tokens(tokenizer.encode(sentence)) 29 | if min_size and len(tokenized_sentence) < min_size: 30 | return (sentence,) 31 | tokenized_sentence = tokenized_sentence[:self.max_input_token_count-3] # the 1st and last token are a BoS (Begin of String) and a EoS (End of String), furthermore a task token is added to the beginning of the sentence 32 | formatted_sentence = tokenizer.convert_tokens_to_string(tokenized_sentence) 33 | # print(formatted_sentence) 34 | # sentence = ' '.join(sentence.split(' ')[:self.max_input_token_count]) 35 | summary_ids = self.run_hf_task( 36 | [formatted_sentence], 37 | min_length=3, 38 | max_length=self.max_input_token_count, 39 | num_return_sequences=n, # default 1 40 | **options 41 | # do_sample=True, # default False 42 | )[0] 43 | return tuple(map(lambda x: x['summary_text'], summary_ids)) 44 | 45 | @staticmethod 46 | def integrate_summary_tree_list(integration_map, summary_tree_list): 47 | for summary_tree in summary_tree_list: 48 | sentence = summary_tree.get('sentence',None) 49 | if sentence: 50 | integration = integration_map.get(sentence, None) 51 | if integration: 52 | summary_tree.update(integration) 53 | if 'children' in summary_tree: 54 | NeuralSentenceSummariser.integrate_summary_tree_list(integration_map, summary_tree['children']) 55 | 56 | def summarise_sentence_list(self, sentence_list, tree_arity=2, cut_factor=1, depth=None, options=None, min_size=None): 57 | def get_elements_to_merge(slist, merge_size): 58 | # slist = tuple(filter(lambda x: x[-1]=='.', slist)) 59 | return tuple( 60 | slist[i:i+merge_size] 61 | for i in range(0,len(slist),merge_size) 62 | ) 63 | 64 | root_set = tuple(map( 65 | lambda s: self.summarise_sentence(s, n=1, options=options, min_size=min_size)[0], 66 | sentence_list 67 | )) 68 | root_set = tuple(map( 69 | self.sentify, 70 | root_set 71 | )) 72 | summary_tree = [ 73 | { 74 | 'summary': k, 75 | 'children': [{'sentence':v}] 76 | } 77 | for k,v in zip(root_set, sentence_list) 78 | ] 79 | limit = 1 if not depth else math.ceil(len(root_set)/(tree_arity**depth)) 80 | while len(root_set) > limit: 81 | root_set = tuple( 82 | self.summarise_sentence( 83 | ' '.join(map(self.sentify, etm)), 84 | n=1, 85 | options=options, 86 | min_size=min_size 87 | )[0] if len(etm) > 1 else etm[0] 88 | for etm in get_elements_to_merge(root_set, tree_arity) 89 | ) 90 | root_set = tuple(map(self.sentify, root_set)) 91 | 
summary_tree = [ 92 | { 93 | 'summary': summary, 94 | 'children': summary_tree[i*tree_arity:(i+1)*tree_arity] 95 | } 96 | for i,summary in enumerate(root_set) 97 | ] 98 | if cut_factor > 1 and len(root_set) > 2: 99 | root_set = root_set[:math.ceil(len(root_set)/cut_factor)] 100 | # print(json.dumps(summary_tree, indent=4)) 101 | return summary_tree 102 | -------------------------------------------------------------------------------- /web_app/yai/static/js/app.js: -------------------------------------------------------------------------------- 1 | const OKE_SERVER_URL = location.protocol+'//'+location.hostname+(location.port ? ':'+(parseInt(location.port,10)+2): '')+'/'; 2 | console.log('OKE_SERVER_URL:', OKE_SERVER_URL); 3 | const GET_OVERVIEW_API = OKE_SERVER_URL+"overview"; 4 | const GET_ANSWER_API = OKE_SERVER_URL+"answer"; 5 | const GET_ANNOTATION_API = OKE_SERVER_URL+"annotation"; 6 | 7 | var app = new Vue({ 8 | el: '#app', 9 | data: { 10 | answer_list: [], 11 | empty_answers: true, 12 | loading_answers: false, 13 | question_text: '', 14 | important_answer_list: [], 15 | summary_answer: '', 16 | show_details: false, 17 | single_answer_details: [], 18 | documents: { 19 | 'myfile:BrusselsReg_EN_1215-20212': { 20 | url: 'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32012R1215&from=EN', // 'documents/BrusselsReg_EN_1215-20212.pdf', 21 | name: 'Brussels I bis Regulation EU 1215/2012', 22 | }, 23 | 'myfile:Rome_I_EN': { 24 | url: 'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32008R0593&from=EN', // 'documents/Rome I_EN.pdf', 25 | name: 'Rome I Regulation EC 593/2008', 26 | }, 27 | 'myfile:Rome_II_EN': { 28 | url: 'https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32007R0864&from=EN', // 'documents/Rome II_EN.pdf', 29 | name: 'Rome II Regulation EC 864/2007', 30 | } 31 | } 32 | }, 33 | methods: { 34 | question: function(event) { 35 | // console.log(event); 36 | var self = this; 37 | self.loading_answers = true; 38 | self.answer_list = []; 39 | self.show_details = false; 40 | 41 | var x = event.target.value.replace(/(\r\n|\n|\r)/gm, "").trim(); 42 | x = x.charAt(0).toUpperCase() + x.slice(1); 43 | console.log('Sending question:',x); 44 | $.ajax({ 45 | type: "GET", 46 | url: GET_ANSWER_API, 47 | responseType:'application/json', 48 | data: { 49 | 'question': x, 50 | // 'summarised': true 51 | }, 52 | success: function (result) { 53 | // console.log('Getting answer:',JSON.stringify(result)); 54 | self.loading_answers = false; 55 | if (!result) 56 | { 57 | self.empty_answers = true; 58 | return; 59 | } 60 | var question = Object.keys(result)[0]; 61 | var important_answer_list = result[question]; 62 | self.empty_answers = false; 63 | self.question_text = question; 64 | self.important_answer_list = []; 65 | self.single_answer_details = []; 66 | // console.log('Getting answer:',JSON.stringify(important_answer_list)); 67 | console.log('Getting answer..'); 68 | for (var answer of important_answer_list) { 69 | answer.confidence = (answer.confidence*100).toFixed(2).toString()+'%'; 70 | if (answer.annotation) 71 | { 72 | var jsonld = build_minimal_entity_graph(tuple_list_to_formatted_jsonld(answer.annotation)); 73 | KNOWN_ENTITY_DICT = get_entity_dict(jsonld); 74 | var source_dict = jsonld[0]; // the biggest 75 | var doc_id = get_description(source_dict[prefixed_string_to_uri('my:docID')]); 76 | if (!doc_id) 77 | { 78 | answer.document = ''; 79 | continue; 80 | } 81 | answer.document = self.documents[uri_to_prefixed_string(doc_id)]; 82 | 83 | var article_id 
= get_description(source_dict[prefixed_string_to_uri('my:article_id')]); 84 | if (article_id) 85 | { 86 | var article = [article_id]; 87 | var paragraph_id = get_description(source_dict[prefixed_string_to_uri('my:paragraph_id')]); 88 | if (paragraph_id) 89 | article.push(paragraph_id); 90 | answer.article = article.map(get_known_label).join('.'); 91 | } 92 | } 93 | else 94 | { 95 | answer.document = ''; 96 | continue; 97 | } 98 | self.important_answer_list.push(answer); 99 | } 100 | if (self.important_answer_list.length == 0) 101 | self.empty_answers = true; 102 | }, 103 | }); 104 | } 105 | } 106 | }) 107 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/tree_cluster_builder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.cluster.hierarchy as hierarchy 3 | from scipy.spatial import distance 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | 6 | def build_hierarchical_cluster(elements, labels, method='centroid', metric='euclidean', optimal_ordering=False): 7 | cluster = hierarchy.linkage( 8 | elements, 9 | method=method, 10 | metric=metric, 11 | optimal_ordering=optimal_ordering 12 | ) 13 | 14 | cophentic_correlation_distance, cophenetic_distance_matrix = hierarchy.cophenet(cluster, distance.pdist(elements)) 15 | #print('Cophenetic Distance Matrix', cophenetic_distance_matrix) 16 | #print('Cophentic Correlation Distance:', cophentic_correlation_distance) 17 | 18 | clusters_dict = {} 19 | for i, merge in enumerate(cluster): 20 | # if it is an original point read it from the centers array # other wise read the cluster that has been created 21 | a = int(merge[0]) if merge[0] <= len(cluster) else clusters_dict[int(merge[0])] 22 | b = int(merge[1]) if merge[1] <= len(cluster) else clusters_dict[int(merge[1])] 23 | # the clusters_dict are 1-indexed by scipy 24 | clusters_dict[1 + i + len(cluster)] = [a,b] 25 | 26 | cluster_nested_list = clusters_dict[1 + i + len(cluster)] 27 | 28 | def flatten(container): # iterative flatten, this way we avoid 29 | list_to_flat = [container] 30 | while len(list_to_flat) > 0: 31 | current_list_to_flat = list_to_flat.pop() 32 | for element in current_list_to_flat: 33 | if isinstance(element, (list,tuple)): 34 | list_to_flat.append(element) 35 | else: 36 | yield element 37 | 38 | def build_centroid_tree(nested_list): 39 | # lazy building, this way we avoid 40 | def let_centroid_tree(nested_list): 41 | if isinstance(nested_list, (list,tuple)): 42 | centroid = np.average([elements[e] for e in flatten(nested_list)], 0) 43 | return {'centroid': centroid, 'sub_tree': (let_centroid_tree(l) for l in nested_list)} 44 | return {'label': labels[nested_list], 'value': elements[nested_list], 'idx': nested_list} 45 | centroid_tree = let_centroid_tree(nested_list) 46 | # eager building 47 | tree_to_build = [centroid_tree] 48 | while len(tree_to_build) > 0: 49 | current_tree_to_build = tree_to_build.pop() 50 | if 'sub_tree' in current_tree_to_build: 51 | current_tree_to_build['sub_tree'] = tuple(current_tree_to_build['sub_tree']) 52 | tree_to_build.extend(current_tree_to_build['sub_tree']) 53 | return centroid_tree 54 | 55 | return build_centroid_tree(cluster_nested_list), cophentic_correlation_distance 56 | 57 | def get_most_similar_leaf(dendrogram, entity_embedding): 58 | # iterative version to avoid 59 | leaf_list = [] 60 | tree_to_look = [dendrogram] 61 | while len(tree_to_look) > 0: 62 | current_tree_to_look = tree_to_look.pop() 63 | 
if 'sub_tree' in current_tree_to_look: 64 | tree_to_look.extend(current_tree_to_look['sub_tree']) 65 | elif 'value' in current_tree_to_look: 66 | leaf_list.append(current_tree_to_look) 67 | if len(leaf_list) == 0: 68 | return None 69 | value_list = [ 70 | leaf['value'] 71 | for leaf in leaf_list 72 | #if leaf['label'] not in centroid_set 73 | ] 74 | similarity_vec = cosine_similarity([entity_embedding], value_list) 75 | best_idx = np.argmax(similarity_vec) 76 | return leaf_list[best_idx]['label'] 77 | 78 | def build_edge_list(root_dendrogram): # iterative version to avoid 79 | centroid_set = set() 80 | edge_list = [] 81 | tree_to_build = [(None,root_dendrogram)] 82 | while len(tree_to_build) > 0: 83 | parent_label, dendrogram = tree_to_build.pop() 84 | if 'centroid' in dendrogram: 85 | centroid_embedding = dendrogram['centroid'] 86 | centroid_label = get_most_similar_leaf(dendrogram, centroid_embedding) 87 | if centroid_label is not None: 88 | centroid_set.add(centroid_label) 89 | if parent_label is not None: 90 | edge_list.append((parent_label, 'related_to', centroid_label)) 91 | tree_to_build.extend((centroid_label,sub_d) for sub_d in dendrogram['sub_tree']) 92 | elif 'value' in dendrogram: 93 | if dendrogram['label'] not in centroid_set: # centroids cannot be leaves 94 | edge_list.append((parent_label, 'related_to', dendrogram['label'])) 95 | return edge_list 96 | -------------------------------------------------------------------------------- /web_app/oke/server.py: -------------------------------------------------------------------------------- 1 | import os 2 | # os.environ["CUDA_VISIBLE_DEVICES"]="-1" 3 | 4 | from bottle import run, get, post, route, hook, request, response, static_file 5 | import json 6 | 7 | from more_itertools import unique_everseen 8 | import sys 9 | port = int(sys.argv[1]) 10 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 11 | 12 | from server_interface import * 13 | 14 | ############################################################### 15 | # CORS 16 | 17 | @route('/<:re:.*>', method='OPTIONS') 18 | def enable_cors_generic_route(): 19 | """ 20 | This route takes priority over all others. So any request with an OPTIONS 21 | method will be handled by this function. 22 | 23 | See: https://github.com/bottlepy/bottle/issues/402 24 | 25 | NOTE: This means we won't 404 any invalid path that is an OPTIONS request. 26 | """ 27 | add_cors_headers() 28 | 29 | @hook('after_request') 30 | def enable_cors_after_request_hook(): 31 | """ 32 | This executes after every route. We use it to attach CORS headers when 33 | applicable. 
34 | """ 35 | add_cors_headers() 36 | 37 | def add_cors_headers(): 38 | try: 39 | response.headers['Access-Control-Allow-Origin'] = '*' 40 | response.headers['Access-Control-Allow-Methods'] = 'GET, POST, PUT, OPTIONS' 41 | response.headers['Access-Control-Allow-Headers'] = 'Origin, Accept, Content-Type, X-Requested-With, X-CSRF-Token' 42 | except Exception as e: 43 | print('Error:',e) 44 | 45 | def get_from_cache(cache, key, build_fn): 46 | if key not in cache: 47 | cache[key] = json.dumps(build_fn()) 48 | return cache[key] 49 | 50 | ############################################################### 51 | # API - Question Answerer 52 | 53 | ANSWERS_CACHE = {} 54 | @get('/answer') 55 | def get_answer(): 56 | response.content_type = 'application/json' 57 | # question = request.forms.get('question') # post 58 | question = request.query.get('question') 59 | def build_fn(): 60 | print('Answering..') 61 | # print(question) 62 | return get_question_answer_dict( 63 | [question], 64 | options={ 65 | 'answer_pertinence_threshold': 0.35, 66 | 'keep_the_n_most_similar_concepts': 1, 67 | 'query_concept_similarity_threshold': 0.55, 68 | 'add_external_definitions': True, 69 | 'add_clustered_triples': False, 70 | 'include_super_concepts_graph': False, 71 | 'include_sub_concepts_graph': True, 72 | 'consider_incoming_relations': True, 73 | 'tfidf_importance': 1/2, 74 | } 75 | ) 76 | return get_from_cache(ANSWERS_CACHE, question, build_fn) 77 | 78 | OVERVIEW_CACHE = {} 79 | @get('/overview') 80 | def get_overview(): 81 | response.content_type = 'application/json' 82 | concept_uri = request.query.get('concept_uri') 83 | # concept_uri = concept_uri.lower().strip() 84 | # query_template_list = json.loads(request.query.get('query_template_list')) 85 | def build_fn(): 86 | print('Answering..') 87 | query_template_list = [ 88 | ##### Causal + Justificatory 89 | 'Why?', 90 | ##### Theleological 91 | 'What for?', 92 | ##### Spatial 93 | 'Where?', 94 | ##### Temporal 95 | 'When?', 96 | ##### Descriptive 97 | 'What?', 98 | ##### Expository 99 | # 'How?', 100 | ##### Extra 101 | # 'Who?', 102 | # 'Who by?', 103 | # 'Why not?', 104 | ] 105 | question_answer_dict = get_concept_overview( 106 | query_template_list, 107 | concept_uri, 108 | options={ 109 | 'answer_pertinence_threshold': 0.02, 110 | 'add_external_definitions': True, 111 | 'add_clustered_triples': False, 112 | 'include_super_concepts_graph': False, 113 | 'include_sub_concepts_graph': True, 114 | 'consider_incoming_relations': True, 115 | 'tfidf_importance': 0, # questions are not compatible with TF-IDF 116 | } 117 | ) 118 | print('Summarising..') 119 | if question_answer_dict: 120 | question_summary_tree = get_summarised_question_answer_dict( 121 | question_answer_dict, 122 | options={ 123 | 'ignore_non_grounded_answers': False, 124 | 'use_abstracts': False, 125 | 'summary_horizon': 3, 126 | 'tree_arity': 3, 127 | # 'cut_factor': 2, 128 | # 'depth': 1, 129 | 'remove_duplicates': True, 130 | 'min_size_for_summarising': 50, 131 | } 132 | ) 133 | else: 134 | question_summary_tree = None 135 | print('Getting taxonomical view..') 136 | taxonomical_view = get_taxonomical_view(concept_uri, depth=0) 137 | print('Annotating..') 138 | annotation_iter = unique_everseen(annotate_question_summary_tree(question_summary_tree) + annotate_taxonomical_view(taxonomical_view)) 139 | equivalent_concept_uri_set = get_equivalent_concepts(concept_uri) 140 | equivalent_concept_uri_set.add(concept_uri) 141 | annotation_iter = filter(lambda x: x['annotation'] not in 
equivalent_concept_uri_set, annotation_iter) 142 | return { 143 | 'question_summary_tree': question_summary_tree, 144 | 'taxonomical_view': taxonomical_view, 145 | 'annotation_list': list(annotation_iter), 146 | } 147 | return get_from_cache(OVERVIEW_CACHE, concept_uri, build_fn) 148 | 149 | if __name__ == "__main__": 150 | run(host='0.0.0.0', port=port+2, debug=True) 151 | 152 | -------------------------------------------------------------------------------- /web_app/oke/core/models/classification/concept_classifier.py: -------------------------------------------------------------------------------- 1 | from misc.doc_reader import DocParser 2 | from models.classification.sentence_classifier import SentenceClassifier 3 | from models.knowledge_extraction.concept_extractor import ConceptExtractor 4 | from more_itertools import unique_everseen 5 | from collections import Counter 6 | import nltk 7 | from nltk.corpus import stopwords 8 | import re 9 | import json 10 | import itertools 11 | 12 | class ConceptClassifier(SentenceClassifier): 13 | def __init__(self, model_options): 14 | # nltk.download('stopwords') 15 | super().__init__(model_options) 16 | self.concept_extractor = ConceptExtractor(model_options) 17 | 18 | def set_concept_description_dict(self, concept_description_dict): 19 | id_doc_list = tuple(unique_everseen(( 20 | (key,description) 21 | for key, value in concept_description_dict.items() 22 | for description in value 23 | ))) 24 | self.set_documents(id_doc_list) 25 | return self 26 | 27 | def lemmatize_spacy_document(self, doc): 28 | return [ 29 | token.lemma_.casefold().strip() 30 | for token in doc 31 | if not token.is_punct and token.lemma_.lower() != '-pron-' 32 | # Remove stop tokens: 33 | #if not (token.is_punct or token.pos_ in ['PART','DET','ADP','CONJ','SCONJ']) 34 | ] 35 | 36 | def get_concept_dict(self, doc_parser: DocParser, concept_counter_dict=None, similarity_threshold=None, with_numbers=True, size=None, remove_stopwords=True, lemmatized=True, tfidf_importance=None): 37 | if concept_counter_dict is None: 38 | concept_counter_dict = {} 39 | # Extract concept dict list 40 | concept_dict_list = self.concept_extractor.get_concept_list(doc_parser) 41 | get_concept_label = lambda x: x['concept']['lemma' if lemmatized else 'text'] 42 | # Remove unwanted concepts 43 | filter_empty_fn = lambda x: x 44 | filter_stopwords_fn = lambda x: x not in stopwords.words('english') 45 | filter_numbers_fn = lambda x: re.search(r'\d', x) is None 46 | if remove_stopwords and with_numbers: 47 | filter_fn = lambda x: filter_empty_fn(x) and filter_stopwords_fn(x) and filter_numbers_fn(x) 48 | elif remove_stopwords: 49 | filter_fn = lambda x: filter_empty_fn(x) and filter_stopwords_fn(x) 50 | elif with_numbers: 51 | filter_fn = lambda x: filter_empty_fn(x) and filter_numbers_fn(x) 52 | else: 53 | filter_fn = lambda x: filter_empty_fn(x) 54 | concept_dict_list = list(filter(lambda x: filter_fn(get_concept_label(x)), concept_dict_list)) 55 | # Extract concept_counter 56 | concept_iter = map(get_concept_label, concept_dict_list) 57 | concept_list = tuple(concept_iter) 58 | concept_counter = Counter(concept_list) 59 | # Merge input concept_counter_dict with concept_counter 60 | for concept,count in concept_counter.items(): 61 | if concept not in concept_counter_dict: 62 | concept_counter_dict[concept] = { 63 | 'count': count, 64 | 'source_list': [], 65 | 'similar_to': [] 66 | } 67 | else: 68 | concept_counter_dict[concept]['count'] += count 69 | # Add sources 70 | for concept, cdict in 
zip(concept_list, concept_dict_list): 71 | concept_counter_dict[concept]['span'] = cdict['concept']['span'] 72 | concept_counter_dict[concept]['source_list'].append(cdict['source']) 73 | # Add similarities 74 | if not concept_counter_dict: 75 | return {} 76 | text_list, cdict_list = zip(*concept_counter_dict.items()) 77 | formatted_text_list = tuple(map(lambda x: x['span'], cdict_list)) 78 | index_of_most_similar_documents_list = self.get_index_of_most_similar_documents( 79 | self.get_formatted_query_similarity(text_list, formatted_text_list, tfidf_importance=tfidf_importance), 80 | similarity_threshold= similarity_threshold, 81 | similarity_type= 'weighted', 82 | ) 83 | for concept, index_of_most_similar_documents in zip(text_list, index_of_most_similar_documents_list): 84 | concept_counter_dict[concept]['similar_to'] = tuple(itertools.islice(index_of_most_similar_documents, size)) 85 | concept_counter_dict[concept]['source_list'] = tuple(unique_everseen(concept_counter_dict[concept]['source_list'], key=lambda x:x['sentence_text'])) 86 | return concept_counter_dict 87 | 88 | @staticmethod 89 | def get_missing_concepts_counter(concept_dict): 90 | return { 91 | concept: value['count'] 92 | for concept, value in concept_dict.items() 93 | if len(value['similar_to'])==0 94 | } 95 | 96 | def annotate(self, doc_parser: DocParser, similarity_threshold=None, max_concepts_per_alignment=1, tfidf_importance=None): 97 | concept_dict = self.get_concept_dict( 98 | doc_parser, 99 | similarity_threshold= similarity_threshold, 100 | with_numbers= True, 101 | lemmatized= False, 102 | remove_stopwords= True, 103 | size= max_concepts_per_alignment, 104 | tfidf_importance= tfidf_importance, 105 | ) 106 | annotation_iter = ( 107 | { 108 | 'text': concept_label, 109 | 'annotation': concept_uri_dict['id'], 110 | 'similarity': concept_uri_dict['similarity'], 111 | 'syntactic_similarity': concept_uri_dict['syntactic_similarity'], 112 | 'semantic_similarity': concept_uri_dict['semantic_similarity'], 113 | } 114 | for concept_label,similarity_dict in concept_dict.items() 115 | for concept_uri_dict in similarity_dict['similar_to'] 116 | ) 117 | annotation_iter = unique_everseen(annotation_iter, key=lambda x: x['text']) 118 | annotation_list = list(annotation_iter) 119 | # print(json.dumps(annotation_list, indent=4)) 120 | return annotation_list 121 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/adjacency_matrix.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | class AdjacencyMatrix(): 4 | 5 | def __init__(self, graph, equivalence_relation_set, is_sorted=False): # Build the adjacency matrix, for both incoming and outcoming edges 6 | self.graph = graph 7 | self.equivalence_matrix = {} 8 | self.adjacency_matrix = {} 9 | for s,p,o in graph: 10 | if o not in self.adjacency_matrix: 11 | self.adjacency_matrix[o] = {'in': [], 'out': []} 12 | if s not in self.adjacency_matrix: 13 | self.adjacency_matrix[s] = {'in': [], 'out': []} 14 | if p not in equivalence_relation_set: 15 | continue 16 | if s not in self.equivalence_matrix: 17 | self.equivalence_matrix[s] = set() 18 | if o not in self.equivalence_matrix: 19 | self.equivalence_matrix[o] = set() 20 | self.equivalence_matrix[s].add(o) 21 | for e in self.equivalence_matrix[s]: 22 | if e == o: 23 | continue 24 | self.equivalence_matrix[e].add(o) 25 | self.equivalence_matrix[o].add(s) 26 | for e in self.equivalence_matrix[o]: 27 | if e == s: 28 | continue 29 | 
self.equivalence_matrix[e].add(s) 30 | # print(json.dumps(dict(map(lambda x:(x[0],list(x[1])), self.equivalence_matrix.items())), indent=4)) 31 | for s,p,o in graph: 32 | self.adjacency_matrix[s]['out'].append((p,o)) 33 | for e in self.equivalence_matrix.get(s,[]): 34 | self.adjacency_matrix[e]['out'].append((p,o)) 35 | self.adjacency_matrix[o]['in'].append((p,s)) 36 | for e in self.equivalence_matrix.get(o,[]): 37 | self.adjacency_matrix[e]['in'].append((p,s)) 38 | # print(json.dumps(self.adjacency_matrix['my:cem'], indent=4)) 39 | if is_sorted: 40 | for concept in self.adjacency_matrix.values(): 41 | concept['in'].sort() 42 | concept['out'].sort() 43 | 44 | def get_incoming_edges_matrix(self, concept): 45 | adjacency_list = self.adjacency_matrix.get(concept,None) 46 | return list(adjacency_list['in']) if adjacency_list else [] 47 | 48 | def get_outcoming_edges_matrix(self, concept): 49 | adjacency_list = self.adjacency_matrix.get(concept,None) 50 | return list(adjacency_list['out']) if adjacency_list else [] 51 | 52 | def get_equivalent_concepts(self, concept): 53 | return set(self.equivalence_matrix.get(concept,[])) 54 | 55 | def get_nodes(self): 56 | return self.adjacency_matrix.keys() 57 | 58 | def get_predicate_chain(self, concept_set, direction_set, predicate_filter_fn=None, depth=None, already_explored_concepts_set=None): # This function returns the related concepts of a given concept set for a given type of relations (e.g. if the relation is rdfs:subclassof, then it returns the super- and/or sub-classes), exploting an adjacency matrix 59 | if depth: 60 | depth -= 1 61 | if not already_explored_concepts_set: 62 | already_explored_concepts_set = set() 63 | joint_set = set() 64 | already_explored_concepts_set |= concept_set 65 | for c in concept_set: 66 | for direction in direction_set: 67 | adjacency_list = self.adjacency_matrix.get(c,None) 68 | if adjacency_list: 69 | adjacency_iter = filter(lambda x: x[-1] not in already_explored_concepts_set, adjacency_list[direction]) 70 | if predicate_filter_fn: 71 | adjacency_iter = filter(lambda x: predicate_filter_fn(x[0]), adjacency_iter) 72 | joint_set |= set(map(lambda y: y[-1], adjacency_iter)) 73 | if len(joint_set) == 0: 74 | return set(concept_set) 75 | elif depth and depth <= 0: 76 | return joint_set.union(concept_set) 77 | return concept_set.union(self.get_predicate_chain( 78 | joint_set, 79 | direction_set, 80 | predicate_filter_fn=predicate_filter_fn, 81 | depth=depth, 82 | already_explored_concepts_set=already_explored_concepts_set, 83 | )) 84 | 85 | # Tarjan's algorithm (single DFS) for finding strongly connected components in a given directed graph 86 | def SCC(self): # Complexity : O(V+E) 87 | '''A recursive function that finds and prints strongly connected 88 | components using DFS traversal 89 | u --> The vertex to be visited next 90 | disc[] --> Stores discovery times of visited vertices 91 | low[] -- >> earliest visited vertex (the vertex with minimum 92 | discovery time) that can be reached from subtree 93 | rooted with current vertex 94 | st -- >> To store all the connected ancestors (could be part 95 | of SCC) 96 | stackMember[] --> bit/index array for faster check whether 97 | a node is in stack 98 | ''' 99 | def helper(clique_list, u, low, disc, stackMember, st, Time=0): 100 | # Initialize discovery time and low value 101 | disc[u] = Time 102 | low[u] = Time 103 | Time += 1 104 | stackMember[u] = True 105 | st.append(u) 106 | 107 | # Go through all vertices adjacent to this 108 | for _,v in 
self.adjacency_matrix[u]['in']: 109 | 110 | # If v is not visited yet, then recur for it 111 | if disc[v] == -1: 112 | Time = helper(clique_list, v, low, disc, stackMember, st, Time) 113 | 114 | # Check if the subtree rooted with v has a connection to 115 | # one of the ancestors of u 116 | # Case 1 (per above discussion on Disc and Low value) 117 | low[u] = min(low[u], low[v]) 118 | 119 | elif stackMember[v] == True: 120 | 121 | '''Update low value of 'u' only if 'v' is still in stack 122 | (i.e. it's a back edge, not cross edge). 123 | Case 2 (per above discussion on Disc and Low value) ''' 124 | low[u] = min(low[u], disc[v]) 125 | 126 | # head node found, pop the stack and print an SCC 127 | w = -1 #To store stack extracted vertices 128 | if low[u] == disc[u]: 129 | clique = [] 130 | while w != u: 131 | w = st.pop() 132 | clique.append(w) 133 | stackMember[w] = False 134 | clique_list.append(clique) 135 | return Time 136 | # Mark all the vertices as not visited 137 | # and Initialize parent and visited, 138 | # and ap(articulation point) arrays 139 | disc = {k:-1 for k in self.adjacency_matrix.keys()} 140 | low = {k:-1 for k in self.adjacency_matrix.keys()} 141 | stackMember = {k:False for k in self.adjacency_matrix.keys()} 142 | st =[] 143 | 144 | 145 | # Call the recursive helper function 146 | # to find articulation points 147 | # in DFS tree rooted with vertex 'i' 148 | clique_list = [] 149 | Time = 0 150 | for i in self.adjacency_matrix.keys(): 151 | if disc[i] == -1: 152 | Time = helper(clique_list, i, low, disc, stackMember, st, Time) 153 | return clique_list 154 | -------------------------------------------------------------------------------- /kg_hinge/kg_hinge.graphml: -------------------------------------------------------------------------------- [GraphML source not preserved: the XML markup of this ~185-line file was stripped in the text export, leaving only its edge labels, all of which read rdf:type.] -------------------------------------------------------------------------------- /web_app/yai/static/js/stage_builder/api_lib.js: -------------------------------------------------------------------------------- 1 | const API_SERVER_URL = location.protocol+'//'+location.hostname+(location.port ?
':'+location.port: '')+'/'; 2 | 3 | function expand_information(information_uri, known_entity_dict, language='en', use_linksum=true, retry=true) 4 | { 5 | if (information_uri in known_entity_dict) 6 | return Object.assign({},known_entity_dict[information_uri]); 7 | 8 | var query = '' 9 | if (use_linksum) // LinkSUM: Using Link Analysis to Summarize Entity Data 10 | { 11 | query = [ 12 | PREFIX_MAP_STRING, 13 | "SELECT ?v ?predicate ?object", 14 | "FROM ", 15 | "FROM ", 16 | "WHERE {", 17 | "{", // show abstract, label and type first 18 | "SELECT ?predicate ?object WHERE {", 19 | "<"+information_uri+"> ?predicate ?object.", 20 | "FILTER(?predicate = dbo:abstract || ?predicate = rdfs:label || ?predicate = rdf:type).", 21 | "FILTER(!isLiteral(?object) || lang(?object) = '' || langMatches(lang(?object), '"+language+"')).", 22 | "}", 23 | "ORDER BY DESC(?predicate)", 24 | "} UNION {", // show subclasses 25 | "SELECT (my:superClassOf AS ?predicate) ?object WHERE {", 26 | "?object rdfs:subClassOf <"+information_uri+">.", 27 | "FILTER(!isLiteral(?object) || lang(?object) = '' || langMatches(lang(?object), '"+language+"')).", 28 | "}", 29 | "} UNION {", // show other properties second, ranked by importance 30 | "SELECT ?predicate ?object ?v WHERE {", 31 | "<"+information_uri+"> ?predicate ?object.", 32 | "FILTER(!isLiteral(?object) || lang(?object) = '' || langMatches(lang(?object), '"+language+"')).", 33 | "FILTER(?predicate != rdfs:comment && ?predicate != vrank:hasRank && ?predicate != dbo:abstract && ?predicate != rdfs:label && ?predicate != rdf:type).", 34 | "OPTIONAL {?object vrank:hasRank ?r. ?r vrank:rankValue ?v}.", 35 | "}", 36 | "ORDER BY DESC(?v), DESC(?predicate)", 37 | "}", 38 | "}", 39 | ].join("\n") 40 | } 41 | else 42 | { 43 | query = [ 44 | "SELECT DISTINCT ?predicate ?object WHERE {", 45 | "<"+information_uri+"> ?predicate ?object.", 46 | "FILTER(!isLiteral(?object) || lang(?object) = '' || langMatches(lang(?object), '"+language+"')).", 47 | "}", 48 | ].join("\n"); 49 | } 50 | var query_result = query_sparql_endpoint(DBPEDIA_ENDPOINT, query) 51 | if (!query_result || !query_result.results || query_result.results.bindings.length==0) 52 | { 53 | if (!retry) 54 | return null; 55 | return expand_information(PREFIX_MAP['dbr']+getPath(information_uri), known_entity_dict, language, use_linksum, false); 56 | } 57 | // console.log(query_result) 58 | // Build subject map 59 | var jsonld_graph = {'@id':information_uri} 60 | for (tuple of query_result.results.bindings) 61 | { 62 | var pred = tuple.predicate?String(tuple.predicate.value):''; 63 | if (pred == '') 64 | continue 65 | var obj = tuple.object?String(tuple.object.value):''; 66 | if (pred in jsonld_graph) 67 | { 68 | if (!isArray(jsonld_graph[pred])) 69 | jsonld_graph[pred] = [jsonld_graph[pred]] 70 | jsonld_graph[pred].push(obj) 71 | } 72 | else 73 | jsonld_graph[pred] = obj 74 | } 75 | var ground = { 76 | '@type': 'JSON', 77 | '@value': JSON.stringify(query_result, null, 2) 78 | } 79 | var formatted_jsonld_graph = format_jsonld(jsonld_graph, ground, query); 80 | return formatted_jsonld_graph; 81 | } 82 | 83 | function expand_link(link, show_expansion_fn, known_entity_dict) 84 | { 85 | try { 86 | if (!isURL(link)) // query wikipedia 87 | query_wikipedia_by_title(link, show_expansion_fn) 88 | else 89 | { // query dbpedia 90 | console.log('Expanding '+format_link(link)+' on DBPedia.'); 91 | var response = expand_information(link, known_entity_dict); 92 | if (response) 93 | response['@id'] = link; 94 | show_expansion_fn(response); 95 | 
} 96 | } catch (ex) { 97 | if (DEBUG) 98 | console.error(ex); 99 | } 100 | } 101 | 102 | function generate_counterfactual(information_dict) 103 | { 104 | var api = information_dict['api']; 105 | var input = information_dict['input'].map(x=>parseInt(x,10)); 106 | var output = null; 107 | try { 108 | $.ajax({ 109 | url:api, 110 | // async: false, 111 | method:'POST', 112 | async: false, 113 | data: JSON.stringify({'sample_value': input}), 114 | contentType: "application/json; charset=utf-8", 115 | success: x => output=x, 116 | }); 117 | } catch(e) { 118 | if (DEBUG) 119 | console.error(e); 120 | } 121 | return output; 122 | } 123 | 124 | function get_counterfactual(current, api, process_graph) 125 | { 126 | var process_input_dict = get_process_input_dict_from_formatted_jsonld({'my:processList':process_graph}); 127 | var input_list = [].concat(...Object.values(process_input_dict).filter(x=>x[0][COUNTERFACTUAL_API_URI] == api)); 128 | var input_values = $(".counterfactual").toArray().sort((a, b) => get_DOM_element_distance(a,current) - get_DOM_element_distance(b,current)); 129 | input_values = input_values.concat(input_list.map(x=>{ 130 | return {'id':x[FEATURE_ORDER_URI],'value':x[VALUE_URI]} 131 | })); 132 | input_values = get_unique_elements(input_values, x=>x.id); 133 | return input_values.sort((a, b) => parseInt(a.id) - parseFloat(b.id)).map(x=>x.value); 134 | } 135 | 136 | function get_typed_entity_dict_from_jsonld(jsonld) 137 | { 138 | var minimal_entity_graph = build_minimal_entity_graph(jsonld); 139 | var minimal_subclass_graph = build_minimal_type_graph(minimal_entity_graph, SUBCLASSOF_URI, HAS_SUBCLASS_URI); 140 | var minimal_type_graph = build_minimal_type_graph(minimal_entity_graph, TYPE_URI, HAS_ENTITY_URI); 141 | return get_entity_dict(minimal_entity_graph.concat(minimal_type_graph).concat(minimal_subclass_graph)); 142 | } 143 | 144 | function get_entity_dict_from_jsonld(jsonld) 145 | { 146 | return get_entity_dict(build_minimal_entity_graph(jsonld)); 147 | } 148 | 149 | function format_dataset(data, id=null) 150 | { 151 | var dataset = {}; 152 | var fragments_count = isArray(data)?data.length:1; 153 | console.log('RDF Fragments count:', fragments_count); 154 | if (fragments_count==0) 155 | return dataset; 156 | 157 | if (isDict(data) && '@id' in data) 158 | dataset['@id'] = data['@id']; 159 | if (id) 160 | { 161 | dataset[LABEL_URI] = build_RDF_item(id); 162 | console.log('Formatting dataset:', id); 163 | } 164 | else 165 | console.log('Formatting dataset:', dataset['@id']); 166 | 167 | // Get entity-centered graph 168 | var minimal_entity_graph = build_minimal_entity_graph(data); 169 | console.log('Entity count:', data.length) 170 | // Get class-centerd graph 171 | var minimal_type_graph = build_minimal_type_graph(minimal_entity_graph) 172 | console.log('Class count:', minimal_type_graph.length) 173 | dataset[STATEMENT_COUNT_URI] = build_RDF_item(count_graph_statements(minimal_entity_graph)) 174 | dataset[ENTITY_COUNT_URI] = build_RDF_item(minimal_entity_graph.length) 175 | dataset[CLASS_COUNT_URI] = build_RDF_item(minimal_type_graph.length) 176 | dataset[CLASS_LIST_URI] = minimal_type_graph 177 | return dataset 178 | } 179 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/jsonld_lib.py: -------------------------------------------------------------------------------- 1 | from more_itertools import unique_everseen 2 | import json 3 | import re 4 | 5 | class hashabledict(dict): 6 | def __hash__(self): 7 | return 
hash(tuple(sorted(self.items()))) 8 | 9 | CONCEPT_PREFIX = 'my:' 10 | DOC_PREFIX = 'myfile:' 11 | ANONYMOUS_PREFIX = '_:' 12 | WORDNET_PREFIX = 'wn:' 13 | 14 | DOC_ID_PREDICATE = 'my:docID' 15 | HAS_IDX_PREDICATE = 'my:hasIDX' 16 | HAS_SOURCE_PREDICATE = 'my:hasSource' 17 | HAS_LABEL_PREDICATE = 'rdfs:label' 18 | SUBCLASSOF_PREDICATE = 'rdfs:subClassOf' 19 | HAS_TYPE_PREDICATE = 'rdf:type' 20 | CAN_BE_PREDICATE = 'my:canBe' 21 | IN_SYNSET_PREDICATE = 'my:inSynset' 22 | HAS_DEFINITION_PREDICATE = 'dbo:abstract' 23 | CONTENT_PREDICATE = 'my:content' 24 | SPECIAL_PREDICATE_LIST = [ 25 | DOC_ID_PREDICATE, 26 | HAS_IDX_PREDICATE, 27 | HAS_SOURCE_PREDICATE, 28 | HAS_LABEL_PREDICATE, 29 | SUBCLASSOF_PREDICATE, 30 | HAS_TYPE_PREDICATE, 31 | CAN_BE_PREDICATE, 32 | IN_SYNSET_PREDICATE, 33 | HAS_DEFINITION_PREDICATE, 34 | CONTENT_PREDICATE 35 | ] 36 | 37 | def explode_concept_key(key): 38 | if not key: 39 | return '' 40 | key = re.sub(r"[_-]", " ", key) 41 | key = key.split(':')[-1] 42 | if not key: 43 | return '' 44 | key = key[0].upper() + key[1:] 45 | splitted_key = re.findall('[A-Z][^A-Z]*', key) 46 | 47 | # join upper case letters 48 | i = 0 49 | j = 1 50 | while j < len(splitted_key): 51 | if len(splitted_key[j]) == 1: 52 | splitted_key[i] += splitted_key[j] 53 | splitted_key[j] = '' 54 | j += 1 55 | else: 56 | i = j 57 | j = i+1 58 | 59 | exploded_key = ' '.join(splitted_key) 60 | exploded_key = re.sub(r" +", r" ", exploded_key).strip() 61 | return exploded_key 62 | 63 | def urify(str): 64 | return str.casefold().strip().replace(' ','_') 65 | 66 | def is_html(str): 67 | html_pattern = r"<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>" 68 | return re.match(html_pattern, str, re.IGNORECASE) is not None 69 | 70 | def is_url(str): 71 | if is_rdf_item(str): 72 | str = str['@value'] 73 | str = str.casefold().strip() 74 | if not str: 75 | return False 76 | if str.startswith('../') or str.startswith('./'): 77 | return True 78 | if re.match(r'\w+:', str, re.IGNORECASE) is not None: 79 | return True 80 | url_pattern = r'(http[s]?:)?//(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' 81 | return re.match(url_pattern, str, re.IGNORECASE) is not None 82 | 83 | def is_rdf_item(v): 84 | return isinstance(v, dict) and '@value' in v 85 | 86 | def is_dict(v): 87 | return isinstance(v, dict) and '@value' not in v 88 | 89 | def is_array(v): 90 | return isinstance(v, (list,tuple)) 91 | 92 | def get_jsonld_id(jsonld, default=None): 93 | if is_dict(jsonld): 94 | return [jsonld.get('@id',default)] 95 | if is_array(jsonld): 96 | return sum(map(lambda x: get_jsonld_id(x,default),jsonld),[]) 97 | if is_rdf_item(jsonld): 98 | return [jsonld['@value']] 99 | return [jsonld] 100 | 101 | def get_string_from_triple(triple): 102 | def format_element(element,predicate): 103 | if is_rdf_item(element): 104 | element = element['@value'] 105 | if not is_array(element): 106 | element = [element] 107 | element = filter(lambda x: not isinstance(x, str) or not x.startswith(CONCEPT_PREFIX), element) 108 | element = map(lambda x: explode_concept_key(' '.join(x[3:].split('.')[:-2]) if x.startswith(WORDNET_PREFIX) else x) if isinstance(x, str) else x, element) 109 | element = filter(lambda x: x, unique_everseen(element)) 110 | element = list(element) 111 | filtered_element = ( 112 | a.strip('.') 113 | for a in element 114 | if next(filter(lambda x: a in x and a != x, element), None) is None 115 | ) 116 | if predicate in [DOC_ID_PREDICATE,HAS_IDX_PREDICATE,HAS_SOURCE_PREDICATE,HAS_LABEL_PREDICATE]: 117 | filtered_element = 
map(lambda x: f'«{x}»', filtered_element) 118 | filtered_element = sorted(filtered_element, key=lambda x:len(x)) 119 | if len(filtered_element) == 0: 120 | return '' 121 | formatted_triple = filtered_element[0] 122 | if len(filtered_element) > 1: 123 | formatted_triple += f' (or: {", ".join(filtered_element[1:])})' 124 | return formatted_triple 125 | subj,pred,obj = triple 126 | subj = format_element(subj,pred)#.lower().strip() 127 | obj = format_element(obj,pred)#.lower().strip() 128 | if subj == '' or obj == '': 129 | return '' 130 | # Get special predicates templates 131 | if pred == DOC_ID_PREDICATE: 132 | pred = '{subj} has been found in document {obj}' 133 | elif pred == HAS_IDX_PREDICATE: 134 | pred = '{subj} starts at offset {obj} of its document' 135 | elif pred == HAS_SOURCE_PREDICATE: 136 | pred = '{subj} is in the sentence {obj}' 137 | elif pred == HAS_LABEL_PREDICATE: 138 | pred = '{subj} is called {obj}' 139 | elif pred == SUBCLASSOF_PREDICATE: 140 | pred = '{subj} is {obj}' 141 | elif pred == HAS_TYPE_PREDICATE: 142 | pred = '{subj} is {obj}' 143 | elif pred == CAN_BE_PREDICATE: 144 | pred = '{subj} can be {obj}' 145 | elif pred == IN_SYNSET_PREDICATE: 146 | pred = '{subj} is the same of {obj}' 147 | elif pred == HAS_DEFINITION_PREDICATE: 148 | pred = '{subj} is: {obj}' 149 | # Converts a rdf triple into a string, by executing the predicate template on subject and object. 150 | triple_str = pred#.lower().strip() 151 | triple_str = triple_str.replace('{subj}',subj) if '{subj}' in triple_str else ' '.join([subj,triple_str]) 152 | triple_str = triple_str.replace('{obj}',obj) if '{obj}' in triple_str else ' '.join([triple_str,obj]) 153 | triple_str = triple_str.replace('(be)','is') 154 | triple_str = re.sub(r' +([,;.])',r'\1',triple_str) # remove unneeded whitespaces 155 | triple_str = re.sub(r' +/ +',r'/',triple_str) # remove unneeded whitespaces 156 | triple_str = triple_str.replace(' )',')').replace('( ','(') # remove unneeded whitespaces 157 | triple_str = re.sub(r'^: ','',triple_str).replace(' : ',': ').replace('::',':').replace('.,',';') 158 | return triple_str#.replace(',','') 159 | 160 | def jsonld_to_triples(jsonld, base_id=None): 161 | def helper(j, default_subj_id=None, uid=0): 162 | triples = [] 163 | if not default_subj_id: 164 | default_subj_id = f'{ANONYMOUS_PREFIX}{base_id}_{uid}' 165 | if is_array(j): 166 | for x in j: 167 | new_triples, uid = helper(x, default_subj_id, uid) 168 | triples += new_triples 169 | elif is_dict(j): 170 | subj_id = get_jsonld_id(j, default_subj_id)[0] 171 | if not subj_id: 172 | raise ValueError('A subject is required.') 173 | # subj_id = subj_id.lower().strip() 174 | for pred,obj in j.items(): 175 | if pred == '@id': 176 | continue 177 | # pred = pred.casefold().strip() 178 | # if is_rdf_item(obj): 179 | # triples.append((subj_id,pred,hashabledict(obj))) 180 | # continue 181 | for obj_id in get_jsonld_id(obj): 182 | if not obj_id: # new uid, increase the old one 183 | uid += 1 184 | obj_id = f'{ANONYMOUS_PREFIX}{base_id}_{uid}' 185 | # if is_url(obj_id): 186 | # obj_id = obj_id.lower().strip() 187 | triples.append(( 188 | subj_id, 189 | pred, 190 | obj_id, 191 | )) 192 | new_triples, uid = helper(obj, obj_id, uid) 193 | triples += new_triples 194 | return triples, uid 195 | return helper(jsonld)[0] 196 | -------------------------------------------------------------------------------- /web_app/oke/server_interface.py: -------------------------------------------------------------------------------- 1 | import sched, time 2 | import json 
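# Usage sketch (illustrative): the public helpers defined further down in this module are
# what the web server is expected to call. The import alias, the example question and the
# example concept URI below are hypothetical, chosen only to show the call sequence.
#
#   import server_interface as oke
#   qa_dict = oke.get_question_answer_dict(['Which law applies to non-contractual obligations?'])
#   summary_tree = oke.get_summarised_question_answer_dict(qa_dict)
#   annotated_tree = oke.annotate_question_summary_tree(summary_tree)
#   taxonomy = oke.get_taxonomical_view('my:obligation', depth=1)
#   oke.store_cache()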
3 | from os import mkdir, path as os_path 4 | import sys 5 | base_path = os_path.dirname(os_path.abspath(__file__)) 6 | cache_path = os_path.join(base_path,'cache') 7 | document_path = os_path.join(base_path,'documents') 8 | sys.path.append(os_path.join(base_path,'core')) 9 | 10 | from models.knowledge_extraction.ontology_builder import OntologyBuilder as OB 11 | from models.reasoning.question_answerer import QuestionAnswerer 12 | from misc.doc_reader import load_or_create_cache, DocParser 13 | from misc.graph_builder import get_concept_description_dict 14 | from misc.graph_builder import save_graphml 15 | 16 | ################ Configuration ################ 17 | 18 | QA_DEFAULT_OPTIONS = { 19 | 'log': False, 20 | } 21 | 22 | ONTOLOGY_BUILDER_DEFAULT_OPTIONS = { 23 | 'max_syntagma_length': None, 24 | 'add_source': True, 25 | 'add_label': True, 26 | 'lemmatize_label': False, 27 | 28 | 'default_similarity_threshold': 0.75, 29 | 'tf_model': { 30 | 'url': 'https://tfhub.dev/google/universal-sentence-encoder-large/5', # Transformer 31 | # 'url': 'https://tfhub.dev/google/universal-sentence-encoder/4', # DAN 32 | # 'cache_dir': '/Users/toor/Documents/Software/DLModels/tf_cache_dir/', 33 | }, 34 | 'with_centered_similarity': True, 35 | } 36 | 37 | CONCEPT_CLASSIFIER_DEFAULT_OPTIONS = { 38 | 'tf_model': { 39 | 'url': 'https://tfhub.dev/google/universal-sentence-encoder-large/5', # Transformer 40 | # 'url': 'https://tfhub.dev/google/universal-sentence-encoder/4', # DAN 41 | # 'cache_dir': '/Users/toor/Documents/Software/DLModels/tf_cache_dir/', 42 | }, 43 | 'with_centered_similarity': True, 44 | 'default_similarity_threshold': 0.75, 45 | # 'default_tfidf_importance': 3/4, 46 | } 47 | 48 | SENTENCE_CLASSIFIER_DEFAULT_OPTIONS = { 49 | 'tf_model': { 50 | # 'url': 'https://tfhub.dev/google/universal-sentence-encoder-qa/3', # English QA 51 | 'url': 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3', # Multilingual QA # 16 languages (Arabic, Chinese-simplified, Chinese-traditional, English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish, Russian) 52 | # 'url': 'https://tfhub.dev/google/LAReQA/mBERT_En_En/1', 53 | # 'cache_dir': '/Users/toor/Documents/Software/DLModels/tf_cache_dir/', 54 | }, 55 | 'with_centered_similarity': False, 56 | 'with_topic_scaling': False, 57 | 'with_stemmed_tfidf': False, 58 | 'default_tfidf_importance': 1/4, 59 | } 60 | 61 | SUMMARISER_DEFAULT_OPTIONS = { 62 | 'hf_model': { 63 | # 'url': 't5-base', 64 | 'url': 'facebook/bart-large-cnn', # baseline 65 | # 'url': 'sshleifer/distilbart-cnn-12-6', # speedup (over the baseline): 1.24 66 | # 'url': 'sshleifer/distilbart-cnn-12-3', # speedup (over the baseline): 1.78 67 | # 'url': 'sshleifer/distilbart-cnn-6-6', # speedup (over the baseline): 2.09 68 | # 'cache_dir': '/Users/toor/Documents/Software/DLModels/hf_cache_dir/', 69 | 'framework': 'pt', 70 | }, 71 | } 72 | 73 | ################ Initialise data structures ################ 74 | graph_cache = os_path.join(cache_path,f"OB_cache_lemma-{ONTOLOGY_BUILDER_DEFAULT_OPTIONS['lemmatize_label']}.pkl") 75 | concept_classifier_cache = os_path.join(cache_path,'QA_concept_classifier.pkl') 76 | sentence_classifier_cache = os_path.join(cache_path,'QA_sentence_classifier.pkl') 77 | sentence_summariser_cache = os_path.join(cache_path,'QA_sentence_summariser.pkl') 78 | ######################################################################## 79 | print('Building Ontology Edge List..') 80 | graph = load_or_create_cache( 81 | 
graph_cache, 82 | lambda: OB(ONTOLOGY_BUILDER_DEFAULT_OPTIONS).set_documents_path(document_path).build() 83 | ) 84 | save_graphml(graph, 'ontology') 85 | print('Ontology size:', len(graph)) 86 | ######################################################################## 87 | print('Building Question Answerer..') 88 | qa = QuestionAnswerer( 89 | graph= graph, 90 | model_options= QA_DEFAULT_OPTIONS, 91 | query_concept_classifier_options= CONCEPT_CLASSIFIER_DEFAULT_OPTIONS, 92 | answer_classifier_options= SENTENCE_CLASSIFIER_DEFAULT_OPTIONS, 93 | answer_summariser_options= SUMMARISER_DEFAULT_OPTIONS, 94 | ) 95 | qa.sentence_classifier.load_cache(sentence_classifier_cache) 96 | qa.concept_classifier.load_cache(concept_classifier_cache) 97 | qa.sentence_summariser.load_cache(sentence_summariser_cache) 98 | 99 | 100 | ################ Define methods ################ 101 | def get_question_answer_dict(question_list, options=None): 102 | if not options: 103 | options = {} 104 | question_answer_dict = qa.ask(question_list, **options) 105 | # print('######## Question Answers ########') 106 | # print(json.dumps(question_answer_dict, indent=4)) 107 | return question_answer_dict 108 | 109 | def get_question_answer_dict_quality(question_answer_dict, top=5): 110 | return qa.get_question_answer_dict_quality(question_answer_dict, top=top) 111 | 112 | def get_summarised_question_answer_dict(question_answer_dict, options=None): 113 | if not options: 114 | options = {} 115 | question_summary_tree = qa.summarise_question_answer_dict(question_answer_dict, **options) 116 | # print('######## Summarised Question Answers ########') 117 | # print(json.dumps(question_summarised_answer_dict, indent=4)) 118 | # qa.sentence_summariser.store_cache(sentence_summariser_cache) 119 | return question_summary_tree 120 | 121 | def get_concept_overview(query_template_list, concept_uri, options=None): 122 | if not options: 123 | options = {} 124 | # set consider_incoming_relations to False with concept-centred generic questions (e.g. 
what is it?), otherwise the answers won't be the sought ones 125 | question_answer_dict = qa.get_concept_overview( 126 | query_template_list = query_template_list, 127 | concept_uri = concept_uri, 128 | **options 129 | ) 130 | # print('######## Concept Overview ########') 131 | # print(concept_uri, json.dumps(question_summarised_answer_dict, indent=4)) 132 | # store_cache() 133 | return question_answer_dict 134 | 135 | def annotate_text(sentence, similarity_threshold=None, max_concepts_per_alignment=1): 136 | return qa.important_concept_classifier.annotate( 137 | DocParser().set_content_list([sentence]), 138 | similarity_threshold=similarity_threshold, 139 | max_concepts_per_alignment=max_concepts_per_alignment 140 | ) 141 | 142 | def annotate_question_summary_tree(question_summary_tree, similarity_threshold=None, max_concepts_per_alignment=1): 143 | return qa.annotate_question_summary_tree(question_summary_tree, similarity_threshold=similarity_threshold, max_concepts_per_alignment=max_concepts_per_alignment) 144 | 145 | def get_taxonomical_view(concept_uri, depth=0): 146 | return qa.get_taxonomical_view(concept_uri, depth=depth) 147 | 148 | def annotate_taxonomical_view(taxonomical_view, similarity_threshold=None, max_concepts_per_alignment=1): 149 | return qa.annotate_taxonomical_view(taxonomical_view, similarity_threshold=similarity_threshold, max_concepts_per_alignment=max_concepts_per_alignment) 150 | 151 | def get_equivalent_concepts(concept_uri): 152 | return qa.adjacency_matrix.get_equivalent_concepts(concept_uri) 153 | 154 | def store_cache(): 155 | qa.sentence_summariser.store_cache(sentence_summariser_cache) 156 | qa.concept_classifier.store_cache(concept_classifier_cache) 157 | qa.sentence_classifier.store_cache(sentence_classifier_cache) 158 | 159 | # ############### Cache scheduler ############### 160 | SCHEDULING_TIMER = 15*60 # 15 minutes 161 | from threading import Timer 162 | def my_task(is_first=False): 163 | if not is_first: 164 | store_cache() 165 | Timer(SCHEDULING_TIMER, my_task).start() 166 | # start your scheduler 167 | my_task(is_first=True) 168 | -------------------------------------------------------------------------------- /web_app/oke/core/models/knowledge_extraction/lattice_builder.py: -------------------------------------------------------------------------------- 1 | from more_itertools import unique_everseen 2 | from concepts import Context 3 | 4 | class LatticeBuilder(): 5 | def __init__(self, templatize=True): 6 | self.formal_concept_context_list = [] 7 | self.templatize = templatize 8 | 9 | @staticmethod 10 | def deanonymize_graph(edge_list, name_fn=lambda c:c, key_fn=lambda c:c): 11 | def build_edge_dict(edge_list, key_fn=lambda x: x): 12 | edge_dict = {} 13 | for edge in edge_list: 14 | subj,_,_ = edge 15 | subj_key = key_fn(subj) 16 | if subj_key not in edge_dict: 17 | edge_dict[subj_key] = [] 18 | edge_dict[subj_key].append(edge) 19 | return edge_dict 20 | def get_named_predicates(root_concept, edge_dict): 21 | named_predicates = [] 22 | concept_checked = set() 23 | concept_to_check = [root_concept] 24 | while len(concept_to_check) > 0: # iterative version 25 | concept = concept_to_check.pop() 26 | concept_key = key_fn(concept) 27 | concept_checked.add(concept_key) 28 | for _,_,obj in edge_dict.get(concept_key,[]): 29 | if name_fn(obj): 30 | named_predicates.append(obj) 31 | elif key_fn(obj) not in concept_checked: 32 | concept_to_check.append(obj) 33 | return list(unique_everseen(named_predicates)) 34 | 35 | edge_dict = build_edge_dict(edge_list, 
key_fn=key_fn) 36 | new_edge_list = [] 37 | for edge in edge_list: 38 | subj,pred,obj = edge 39 | if not name_fn(subj): 40 | continue 41 | if name_fn(obj): 42 | new_edge_list.append(edge) 43 | else: 44 | new_edge_list.extend( 45 | (subj,pred,o) 46 | for o in get_named_predicates(obj, edge_dict) 47 | ) 48 | return new_edge_list 49 | 50 | def build_concept_relation_dict(self, edge_list): 51 | assert False, 'Not implemented' 52 | 53 | def build_lattice(self, edge_list, stringify=False): 54 | edge_list = list(edge_list) 55 | concept_relation_dict = self.build_concept_relation_dict(edge_list) 56 | properties = set(relation for relation_set in concept_relation_dict.values() for relation in relation_set) 57 | objects = set(concept_relation_dict.keys()) 58 | if len(properties)==0 or len(objects)==0: 59 | return [] 60 | bools = [ 61 | [ 62 | p in concept_relation_dict[object] 63 | for p in properties 64 | ] 65 | for object in objects 66 | ] 67 | 68 | formal_concept_context = Context(objects, properties, bools) 69 | self.formal_concept_context_list.append(formal_concept_context) 70 | concept_lattice = formal_concept_context.lattice 71 | 72 | #print(concept_lattice['person',]) 73 | #print(concept_lattice['employer',]) 74 | #concept_lattice.graphviz(view=True) 75 | #for extent, intent in concept_lattice: 76 | # print('%r %r' % (extent, intent)) 77 | 78 | for concept in concept_lattice._concepts: 79 | concept.objects = sorted(concept.objects) 80 | concept.properties = sorted(concept.properties) 81 | get_concept_obj = lambda x: x.objects if len(x.objects)>0 else x.index 82 | 83 | lattice_edge_list = [] 84 | for concept in concept_lattice._concepts: 85 | concept_obj = get_concept_obj(concept) 86 | lattice_edge_list.extend( 87 | ( 88 | concept_obj, 89 | neighbor.properties, 90 | get_concept_obj(neighbor), 91 | ) 92 | for neighbor in concept.lower_neighbors 93 | ) 94 | ''' 95 | non_root_concept_set = set(get_concept_key(obj) for _,_,obj in lattice_edge_list) 96 | lattice_edge_list.extend( 97 | ( 98 | '', 99 | concept.properties, 100 | get_concept_obj(concept), 101 | ) 102 | for concept in concept_lattice._concepts 103 | if get_concept_key(concept) not in non_root_concept_set 104 | ) 105 | ''' 106 | is_iter = lambda element: isinstance(element, (list,tuple)) 107 | lattice_edge_list = self.deanonymize_graph(lattice_edge_list, name_fn=lambda x: is_iter(x), key_fn=lambda x: ','.join(x) if is_iter(x) else x) 108 | #lattice_edge_list = list(map(lambda x: (x[-1],x[-2],x[-3]), lattice_edge_list)) 109 | if stringify: 110 | list_to_string = lambda x: ', '.join(x) 111 | lattice_edge_list = map(lambda x: (list_to_string(x[0]), list_to_string(x[1]), list_to_string(x[2])), lattice_edge_list) 112 | return list(lattice_edge_list) 113 | 114 | class ActivePredicateTypingLatticeBuilder(LatticeBuilder): 115 | def build_concept_relation_dict(self, edge_list): 116 | concept_relation_dict = {} 117 | for subj,pred,obj in edge_list: 118 | if subj not in concept_relation_dict: 119 | concept_relation_dict[subj] = set() 120 | concept_relation_dict[subj].add(f'that can {pred} is' if self.templatize else pred) # objects and properties cannot overlap 121 | return concept_relation_dict 122 | 123 | class PassivePredicateTypingLatticeBuilder(LatticeBuilder): 124 | def build_concept_relation_dict(self, edge_list): 125 | concept_relation_dict = {} 126 | for subj,pred,obj in edge_list: 127 | if obj not in concept_relation_dict: 128 | concept_relation_dict[obj] = set() 129 | concept_relation_dict[obj].add(f'that can be {pred}-ed is' if 
self.templatize else pred) # objects and properties cannot overlap 130 | return concept_relation_dict 131 | 132 | class PassiveActivePredicateTypingLatticeBuilder(LatticeBuilder): 133 | def __init__(self, templatize=True): 134 | super().__init__(templatize) 135 | self.active_lb = ActivePredicateTypingLatticeBuilder() 136 | self.passive_lb = PassivePredicateTypingLatticeBuilder() 137 | 138 | def build_concept_relation_dict(self, edge_list): 139 | concept_relation_dict = self.active_lb.build_concept_relation_dict(edge_list) 140 | for key,value in self.passive_lb.build_concept_relation_dict(edge_list).items(): 141 | if key not in concept_relation_dict: 142 | concept_relation_dict[key] = set() 143 | concept_relation_dict[key] |= value 144 | return concept_relation_dict 145 | 146 | class ActiveActionTypingLatticeBuilder(LatticeBuilder): 147 | def build_concept_relation_dict(self, edge_list): 148 | concept_relation_dict = {} 149 | for subj,pred,obj in edge_list: 150 | if subj not in concept_relation_dict: 151 | concept_relation_dict[subj] = set() 152 | concept_relation_dict[subj].add(f'that can {pred} {obj} is' if self.templatize else (pred,obj)) # objects and properties cannot overlap 153 | return concept_relation_dict 154 | 155 | def build_lattice(self, edge_list, stringify=False): 156 | edge_list = list(edge_list) 157 | predicate_dict = {} 158 | for edge in edge_list: 159 | subj,pred,obj = edge 160 | if pred not in predicate_dict: 161 | predicate_dict[pred] = [] 162 | predicate_dict[pred].append(edge) 163 | 164 | global_lattice_edge_list = [] 165 | for pred, new_edge_list in predicate_dict.items(): 166 | global_lattice_edge_list += super().build_lattice(new_edge_list, stringify) 167 | return global_lattice_edge_list 168 | 169 | class PassiveActionTypingLatticeBuilder(ActiveActionTypingLatticeBuilder): 170 | def build_concept_relation_dict(self, edge_list): 171 | concept_relation_dict = {} 172 | for subj,pred,obj in edge_list: 173 | if obj not in concept_relation_dict: 174 | concept_relation_dict[obj] = set() 175 | concept_relation_dict[obj].add(f'that can be {pred}-ed by {subj} is' if self.templatize else (pred,subj)) # objects and properties cannot overlap 176 | return concept_relation_dict 177 | 178 | class PassiveActiveActionTypingLatticeBuilder(ActiveActionTypingLatticeBuilder): 179 | def __init__(self, templatize=True): 180 | super().__init__(templatize) 181 | self.active_lb = ActiveActionTypingLatticeBuilder() 182 | self.passive_lb = PassiveActionTypingLatticeBuilder() 183 | 184 | def build_concept_relation_dict(self, edge_list): 185 | concept_relation_dict = self.active_lb.build_concept_relation_dict(edge_list) 186 | for key,value in self.passive_lb.build_concept_relation_dict(edge_list).items(): 187 | if key not in concept_relation_dict: 188 | concept_relation_dict[key] = set() 189 | concept_relation_dict[key] |= value 190 | return concept_relation_dict 191 | -------------------------------------------------------------------------------- /web_app/yai/static/js/template/jsonld_handler.js: -------------------------------------------------------------------------------- 1 | function get_predicate_templatized_label(ancestor, predicate, data) 2 | { 3 | var predicate_data = {}; 4 | predicate_data[predicate] = data; 5 | var templatized_text_list = get_template_list(predicate_data, ancestor); 6 | if (templatized_text_list.length > 0) 7 | return templatized_text_list[0]; 8 | return get_default_predicate_template(predicate, data, ancestor); 9 | } 10 | 11 | function 
annotate_nestedlist(nested_tree, annotation_list, annotation_fn) { 12 | if (isArray(nested_tree)) 13 | nested_tree.map(c=>annotate_nestedlist(c, annotation_list, annotation_fn)); 14 | else 15 | { 16 | nested_tree.text = annotate_hmtl(nested_tree.text, annotation_list, annotation_fn); 17 | // console.log(nested_tree.text); 18 | if (nested_tree.children && nested_tree.children.length) 19 | nested_tree.children.map(c=>annotate_nestedlist(c, annotation_list, annotation_fn)); 20 | } 21 | return nested_tree 22 | } 23 | 24 | function nest_jsonld(data, uri_dict, ignore_set=null, max_depth=null, parent_set=null, depth=0) 25 | { 26 | parent_set = new Set(parent_set); 27 | ignore_set = new Set(ignore_set); 28 | // Replace using uri dict 29 | if (isRDFItem(data)) 30 | { 31 | var desc = get_RDFItem_description(data); 32 | if (ignore_set.has(desc)) 33 | return data; 34 | const is_url = isURL(desc); 35 | if (is_url && parent_set.has(desc)) 36 | return data; 37 | if (!(desc in uri_dict)) 38 | return data; 39 | data = uri_dict[desc]; 40 | if (is_url && isDict(data)) 41 | { 42 | const data_uri = get_description(data, false); 43 | const data_label = get_description(data); 44 | data = {'@id': build_RDF_item(data_uri)} 45 | if (data_uri!=data_label) 46 | data[LABEL_URI] = build_RDF_item(data_label); 47 | return data; 48 | } 49 | depth += 1; 50 | if (max_depth && depth > max_depth) 51 | return data; 52 | } 53 | if (isArray(data)) 54 | return data.map(x=>nest_jsonld(x, uri_dict, ignore_set, max_depth, parent_set, depth)); 55 | if (isDict(data)) 56 | { 57 | var new_data = {} 58 | for (var [k,v] of Object.entries(data)) 59 | { 60 | if (k == '@id') 61 | { 62 | parent_set.add(get_RDFItem_description(v)); 63 | new_data[k] = v; 64 | continue; 65 | } 66 | const v_desc = get_description(v, false); 67 | if (ignore_set.has(v_desc)) 68 | new_data[k] = build_RDF_item(v_desc); 69 | else 70 | new_data[k] = nest_jsonld(v, uri_dict, ignore_set, max_depth, parent_set, depth); 71 | } 72 | return new_data; 73 | } 74 | return data; 75 | } 76 | 77 | function jsonld_to_nestedlist(data, depth=0, predicate=null, ancestor=null) // Display JSON-LD data as a HTML tree view 78 | { 79 | var node_id = depth + 1; 80 | var is_first = depth==0; 81 | var current_depth = depth; 82 | // Define routine/goto to get a fragment 83 | function get_fragment(o, p, ancestor_p) { // local function, keep it local 84 | node_id = depth + 1; 85 | var sub_tree_dict = null; 86 | [sub_tree_dict, depth] = jsonld_to_nestedlist(o, node_id, p, ancestor_p); 87 | return sub_tree_dict 88 | } 89 | // Avoid useless nesting 90 | if (isArray(data) && data.length==1) 91 | data = data[0]; 92 | // // Get predicate 93 | // if (predicate === null) 94 | // predicate = 'Document'; 95 | 96 | // Get tree text and predicate link 97 | var tree_dict = get_predicate_templatized_label(ancestor, predicate, data); 98 | // Build html fragment 99 | if (isDict(data)) 100 | { 101 | // add children 102 | var child_list = []; 103 | var already_processed_predicates = []; 104 | // get templatized text list 105 | var template_list = get_template_list(data, ancestor); 106 | for (var template_dict of template_list) 107 | { 108 | already_processed_predicates = already_processed_predicates.concat(template_dict['predicate_list']); 109 | // create new child 110 | var new_child = template_dict; 111 | // add children 112 | if (!template_dict['hide_descendants']) 113 | { 114 | var sub_child_list = []; 115 | for (var p of template_dict['predicate_list']) 116 | { 117 | if (p=='@id') 118 | continue; 119 | var 
object = data[p]; 120 | 121 | var sub_fragment = get_fragment(object, p, p); 122 | if (template_dict['keys_to_hide_as_child'].includes(p) || template_dict['predicate_list'].length==1) 123 | { // this predicate has been requested to be removed, by the template, save its children 124 | if ('children' in sub_fragment) 125 | sub_child_list = sub_child_list.concat(sub_fragment['children']); 126 | } 127 | else 128 | sub_child_list.push(sub_fragment); 129 | } 130 | if (sub_child_list.length > 0) 131 | new_child['children'] = sub_child_list; 132 | } 133 | // push child into child list 134 | child_list.push(new_child); 135 | } 136 | // process the remaining properties 137 | for (var [p,o] of Object.entries(data)) 138 | { 139 | if (already_processed_predicates.includes(p)) 140 | continue; 141 | if (p=='@id') 142 | continue; 143 | if (isArray(o) && o.length==0) 144 | continue; 145 | // add fragment 146 | child_list.push(get_fragment(o, p, predicate)); 147 | } 148 | if (child_list.length > 0) 149 | tree_dict["children"] = child_list; 150 | } 151 | else if (isArray(data)) 152 | { 153 | var child_list = []; 154 | for (var i in data) // add fragment 155 | child_list.push(get_fragment(data[i], predicate, predicate)); 156 | 157 | if (child_list.length > 0) 158 | tree_dict["children"] = child_list; 159 | } 160 | if (!is_first) 161 | return [tree_dict, depth]; 162 | 163 | if (!('children' in tree_dict)) 164 | return []; 165 | 166 | return flatten_single_childed_trees(clean_tree(tree_dict['children'])); 167 | } 168 | 169 | function flatten_single_childed_trees(tree_dict_list) 170 | { 171 | for (var tree_dict of tree_dict_list) 172 | { 173 | if (!('children' in tree_dict)) 174 | continue 175 | if (tree_dict['children'].length == 1 && !tree_dict['is_in_array']) 176 | { 177 | var child = tree_dict['children'][0]; 178 | if ('children' in child) 179 | { 180 | var last_char = tree_dict['text'].slice(-1); 181 | if (last_char == ':') 182 | tree_dict['text'] = tree_dict['text'].slice(0,-1)+'.'; 183 | else if (last_char != '.') 184 | tree_dict['text'] += '.'; 185 | tree_dict['text'] += ' '+child['text']; 186 | tree_dict['children'] = child['children']; 187 | } 188 | } 189 | tree_dict['children'] = flatten_single_childed_trees(tree_dict['children']); 190 | } 191 | return tree_dict_list 192 | } 193 | 194 | function clean_tree(tree_dict_list) 195 | { // remove trees with empty text 196 | var new_tree_dict_list = []; 197 | for (var tree_dict of tree_dict_list) 198 | { 199 | tree_dict['text'] = tree_dict['text'].trim(); 200 | if (tree_dict['children'] && tree_dict['children'].length>0) 201 | { 202 | // // array template 203 | // if (tree_dict['is_in_array']) 204 | // { 205 | // var alternative_text_list = tree_dict['children'].filter(x=>x['predicate_list'].includes('@id')).sort((a,b)=>b['predicate_list'].length-a['predicate_list'].length).map(x=>x['text']); 206 | // if (alternative_text_list.length > 0) 207 | // tree_dict['text'] = alternative_text_list[0]; 208 | // } 209 | // recursive call 210 | tree_dict['children'] = clean_tree(tree_dict['children']); 211 | // remove blank nodes 212 | if (!tree_dict['text']) 213 | { 214 | // for (var c of tree_dict['children']) 215 | // c['label'] = tree_dict['label'] 216 | new_tree_dict_list = new_tree_dict_list.concat(tree_dict['children']); 217 | } 218 | else 219 | { 220 | // remove children having the same text of the parent 221 | var new_children = []; 222 | const children = tree_dict['children']; 223 | for (var child of children) 224 | { 225 | if (child['text']==tree_dict['text']) 
226 | new_children = new_children.concat(child['children']); 227 | else 228 | new_children.push(child); 229 | } 230 | // if (new_children.length == 1 && 'children' in new_children[0]) 231 | // new_children = new_children[0]['children']; 232 | tree_dict['children'] = new_children; 233 | new_tree_dict_list.push(tree_dict); 234 | } 235 | } 236 | else if (tree_dict['text']) 237 | new_tree_dict_list.push(tree_dict); 238 | } 239 | return new_tree_dict_list; 240 | } 241 | -------------------------------------------------------------------------------- /web_app/yai/static/js/stage_builder/item_stage_builder.js: -------------------------------------------------------------------------------- 1 | SCHEMA = 'https://schema.org/' 2 | 3 | function extract_entity_by_type_list(type_list, graph) 4 | { 5 | if (isArray(graph)) 6 | { 7 | var process_list = []; 8 | var new_graph = []; 9 | for (var g of graph) 10 | { 11 | var [extracted_processes, new_g] = extract_entity_by_type_list(type_list, g); 12 | if (extracted_processes.length > 0) 13 | process_list = process_list.concat(extracted_processes); 14 | if (new_g) 15 | new_graph.push(new_g); 16 | } 17 | if (new_graph.length == 0) 18 | new_graph = null; 19 | return [process_list,new_graph]; 20 | } 21 | else if (isDict(graph)) 22 | { 23 | if (TYPE_URI in graph && type_list.includes(graph[TYPE_URI]['@value'])) 24 | return [[graph],null]; 25 | 26 | var process_list = []; 27 | var new_graph = {}; 28 | for (var [k,g] of Object.entries(graph)) 29 | { 30 | var [extracted_processes, new_g] = extract_entity_by_type_list(type_list,g); 31 | process_list = process_list.concat(extracted_processes); 32 | new_graph[k] = new_g?new_g:g['@id']; 33 | } 34 | return [process_list,new_graph]; 35 | } 36 | return [[],Object.assign({},graph)]; 37 | } 38 | 39 | function get_URI_graph(graph, key, collector, recursion=true, is_first=true, class_ground=new Map()) 40 | { 41 | function merge_URI_dict(a,b) { 42 | for (var [class_name, graph_list] of Object.entries(b)) 43 | { 44 | if (!(class_name in a)) 45 | a[class_name] = [] 46 | a[class_name] = a[class_name].concat(b[class_name]) 47 | } 48 | return a 49 | } 50 | function add_unknown_graph_to_dict(n,d) { 51 | if (!(UNKNOWN_TYPE_URI in d)) 52 | d[UNKNOWN_TYPE_URI] = [] 53 | d[UNKNOWN_TYPE_URI].push(n) 54 | } 55 | var class_dict = {} 56 | if (isArray(graph)) 57 | { 58 | for (var i in graph) 59 | { 60 | new_class_dict = get_URI_graph(graph[i], key, collector, recursion, false, class_ground) 61 | class_dict = merge_URI_dict(class_dict, new_class_dict) 62 | if (is_first && Object.keys(new_class_dict).length==0) 63 | add_unknown_graph_to_dict(graph[i],class_dict) 64 | } 65 | } 66 | else if (isDict(graph)) 67 | { 68 | if (key in graph) 69 | { 70 | var graph_key_list = (!isArray(graph[key]))?[graph[key]]:graph[key] 71 | for (var i in graph_key_list) 72 | { 73 | var class_name = graph_key_list[i]['@value'] 74 | if (!isURL(class_name)) 75 | { 76 | var context = ('@context' in graph)?graph['@context']['@value']:SCHEMA 77 | //console.log(graph['@context'],graph_key_list[i]) 78 | if (context!='') 79 | class_name = pathJoin([context,class_name]) 80 | } 81 | if (!(class_name in class_dict)) 82 | class_dict[class_name] = [] 83 | graph = Object.assign({},graph) // deep copy 84 | //delete graph[key] 85 | class_dict[class_name].push(graph) 86 | class_ground.set(class_name, graph_key_list[i]['@ground']) 87 | } 88 | } 89 | else if (is_first) 90 | add_unknown_graph_to_dict(graph,class_dict) 91 | 92 | if (recursion) 93 | { 94 | for (var tuple of 
Object.entries(graph)) 95 | { 96 | if (tuple[0] == key) 97 | continue 98 | var object_list = tuple[1] 99 | //console.log(object_list, get_URI_graph(object_list, key, collector, recursion, false)) 100 | class_dict = merge_URI_dict(class_dict, get_URI_graph(object_list, key, collector, recursion, false, class_ground)) 101 | } 102 | } 103 | } 104 | if (!is_first) 105 | return class_dict 106 | 107 | var class_list = [] 108 | for (var [class_name, type_graph] of Object.entries(class_dict)) 109 | { 110 | var class_dict = {} 111 | class_dict['@id'] = build_RDF_item(class_name, class_ground.get(class_name)) 112 | class_dict[collector] = type_graph 113 | class_list.push(class_dict) 114 | } 115 | return class_list 116 | } 117 | 118 | function assign_unique_ids_to_graph(graph, anonymous_node_count=0, id_base='') 119 | { 120 | if (isArray(graph)) 121 | { 122 | for (var i in graph) 123 | anonymous_node_count = assign_unique_ids_to_graph(graph[i], anonymous_node_count, id_base); 124 | } 125 | else if (isDict(graph)) 126 | { 127 | if (!('@id' in graph)) 128 | { 129 | graph['@id'] = build_RDF_item(`my:AnonymousEntity_${id_base}_${anonymous_node_count}`); 130 | anonymous_node_count += 1; 131 | } 132 | for (var [key,value] of Object.entries(graph)) 133 | anonymous_node_count = assign_unique_ids_to_graph(value, anonymous_node_count, id_base); 134 | } 135 | return anonymous_node_count; 136 | } 137 | 138 | function build_minimal_entity_graph(graph) 139 | { 140 | // graph = format_jsonld(graph) 141 | var new_graph_list = [] 142 | // Assign unique ids 143 | assign_unique_ids_to_graph(graph); 144 | // Get entity dict 145 | var graph_list = get_URI_graph(graph, '@id', HAS_ENTITY_URI) 146 | // Merge graphs with same entity 147 | for (var ent_idx in graph_list) 148 | { 149 | var entity_id = graph_list[ent_idx]['@id'] 150 | var entity_graph_list = graph_list[ent_idx][HAS_ENTITY_URI] 151 | // keep the biggest graph as central graph and merge the others, this would help keeping grounds as less redundant as possible 152 | entity_graph_list.sort(function(a, b){ 153 | // ASC -> a.length - b.length 154 | // DESC -> b.length - a.length 155 | return b.length - a.length; 156 | }); 157 | 158 | var entity_graph = entity_graph_list[0] // keep the biggest graph as central graph and merge the others 159 | for (var i=1; i [key, graph[key]]); 257 | // Sort the array based on the second element 258 | items = items.sort(function(a, b){ 259 | var a=isURL(a[0])?getPath(a[0]):a[0], b=isURL(b[0])?getPath(b[0]):b[0] 260 | // put special elements first 261 | special_comparison = abstract_comparison(is_special,a,b) 262 | if (special_comparison !== null) 263 | return special_comparison 264 | // put relevant elements second 265 | relevant_comparison = abstract_comparison(is_relevant,a,b) 266 | if (relevant_comparison !== null) 267 | return relevant_comparison 268 | // put remaining elements last 269 | if(a < b) 270 | return -1; 271 | if(a > b) 272 | return 1; 273 | return 0; 274 | }); 275 | //console.log(items) 276 | var ordered_graph = {} 277 | for (var [k,v] of items) 278 | ordered_graph[k]=sort_graph(v) 279 | graph = ordered_graph 280 | } 281 | else if (isArray(graph)) 282 | { 283 | //var items = Object.keys(graph).map(e => getPath(e['@id'])); 284 | for (var i in graph) 285 | graph[i] = sort_graph(graph[i]) 286 | } 287 | return graph 288 | } 289 | -------------------------------------------------------------------------------- /web_app/yai/static/js/template/template_lib.js: 
-------------------------------------------------------------------------------- 1 | var KNOWN_ENTITY_DICT = {}; 2 | 3 | function get_known_label(id) 4 | { 5 | id = prefixed_string_to_uri(id); 6 | if (id in KNOWN_ENTITY_DICT) 7 | { 8 | const desc = get_dict_description(KNOWN_ENTITY_DICT[id]); 9 | return isURL(desc)?format_link(desc, false):desc; 10 | } 11 | return null; 12 | } 13 | 14 | function get_formatted_tuple_template(is_in_array, value, single_element_fn, multiple_elements_fn, array_item_fn=null, max_length=null) 15 | { 16 | if (isArray(value) && value.length == 1) 17 | value = value[0] 18 | var original_value = value; 19 | var is_dict = isDict(original_value); 20 | var is_rdf = isRDFItem(original_value); 21 | // if (is_in_array) 22 | // console.log(is_dict, value); 23 | 24 | if (is_dict) 25 | value = get_dict_description(value, as_label=false) 26 | else if (is_rdf) 27 | value = get_RDFItem_description(value); 28 | if (!isArray(value)) 29 | { 30 | const value_is_url = isURL(value); 31 | const known_label = titlefy(get_known_label(value)); 32 | if (value_is_url) 33 | value = linkify(value, known_label); 34 | else if (known_label) 35 | value = known_label; 36 | value = clip_text(value,max_length); 37 | var template = ''; 38 | if (is_in_array) 39 | { 40 | if (array_item_fn == null) 41 | { 42 | if (value_is_url) 43 | array_item_fn = x=>x; 44 | else 45 | array_item_fn = x=>'«'+x+'»'; 46 | } 47 | template = value?array_item_fn(value):''; 48 | } 49 | else 50 | template = single_element_fn(value); 51 | if (!template) 52 | return ''; 53 | // if (is_dict) 54 | // template += ':'; 55 | return template; 56 | } 57 | // value is Array 58 | return value.length==0?'':(multiple_elements_fn(value) + ':'); 59 | } 60 | 61 | function clip_text(text, max_length) 62 | { 63 | if (max_length && text.length>max_length) 64 | text = text.slice(0,max_length) + '[...]'; 65 | return text; 66 | } 67 | 68 | function get_default_predicate_template(predicate, object, ancestor_predicate) 69 | { 70 | var is_in_array = is_array_element(predicate, ancestor_predicate); 71 | // console.log(is_in_array, predicate, ancestor_predicate) 72 | if (isURL(predicate)) 73 | predicate = linkify(predicate); 74 | templatized_text = get_formatted_tuple_template(is_in_array, object, 75 | x=>'The '+predicate+' of this resource is «'+x+'»', 76 | x=>'The '+predicate+' of this resource are', 77 | ); 78 | return { 79 | 'is_in_array': is_in_array, 80 | 'predicate_list': [predicate], 81 | 'text': templatized_text, 82 | }; 83 | } 84 | 85 | function get_table(heads, values_list) 86 | { 87 | var head_row = ''; 88 | head_row += '' 89 | for (var h of heads) 90 | head_row += `${h}`; 91 | head_row += '' 92 | 93 | var value_row = ''; 94 | for (var values of values_list) 95 | { 96 | value_row += '' 97 | for (var v of values) 98 | value_row += `${v}` 99 | value_row += '' 100 | } 101 | return `${head_row}${value_row}
`; 102 | } 103 | 104 | function get_known_concepts_from_annotated_sentences(annotated_sentence_list, related_concepts_limit=null) 105 | { 106 | var annotation_list_uri = prefixed_string_to_uri('my:annotationList'); 107 | var word_annotation_list_uri = prefixed_string_to_uri('my:wordLevelAnnotationList'); 108 | var related_to_uri = prefixed_string_to_uri('my:relatedTo'); 109 | var relation_list = []; 110 | for (var annotated_sentence of annotated_sentence_list) 111 | { 112 | var annotation_list = get(annotated_sentence,annotation_list_uri,[]); 113 | if (word_annotation_list_uri in annotated_sentence) 114 | { 115 | var word_annotation_list = annotated_sentence[word_annotation_list_uri]; 116 | for (var word_annotation of word_annotation_list) 117 | annotation_list = annotation_list.concat(word_annotation[annotation_list_uri]); 118 | } 119 | for (var annotation of annotation_list) 120 | { 121 | if (related_to_uri in annotation) 122 | relation_list.push(get_RDFItem_description(annotation[related_to_uri])); 123 | } 124 | } 125 | // display only unique relations 126 | relation_list = [...new Set(relation_list)]; 127 | // keep only the first 3 elements 128 | if (related_concepts_limit) 129 | relation_list = relation_list.slice(0, related_concepts_limit); 130 | var result = relation_list.map(x=>linkify(x)).join(', '); 131 | if (relation_list.length >= related_concepts_limit) 132 | result += ', etc..'; 133 | return result; 134 | } 135 | 136 | function try_to_apply_dict_to_template(ancestor, dict, template) 137 | { 138 | var predicate_list = []; 139 | var value_list = []; 140 | var keys = template['keys'].map(prefixed_string_to_uri); 141 | var optional_keys = new Set(get(template, 'optional_keys', []).map(prefixed_string_to_uri)); 142 | for (var predicate of keys) 143 | { 144 | var predicate_found = predicate in dict; 145 | if (!predicate_found) 146 | { 147 | var predicate_is_optional = optional_keys.has(predicate); 148 | if (!predicate_is_optional) 149 | return null; 150 | value_list.push(null); 151 | } 152 | else 153 | { 154 | var object = dict[predicate]; 155 | if (isRDFItem(object)) 156 | object = get_RDFItem_description(object); 157 | // else if (isArray(object) && object.length==0) 158 | // return null; 159 | value_list.push(object); 160 | predicate_list.push(predicate); 161 | } 162 | } 163 | if (value_list.filter(x=>x != null).length == 0) 164 | return null; 165 | var is_in_array = false; 166 | if (predicate_list.length==1) 167 | { 168 | for (var k of keys) // this should handle grouped lists with different predicates (for which is_in_array is clearly true) 169 | { 170 | if (is_array_element(k, ancestor)) 171 | { 172 | is_in_array = true; 173 | break; 174 | } 175 | } 176 | } 177 | return Object.assign({}, template, { 178 | 'is_in_array': is_in_array, 179 | 'predicate_list': predicate_list, 180 | 'keys_to_hide_as_child': get(template, 'keys_to_hide_as_child', []).map(prefixed_string_to_uri), 181 | 'label': get(template, 'label', null), 182 | 'text': template['template_fn'](is_in_array, ancestor, value_list), 183 | }); 184 | } 185 | 186 | function is_array_element(prefixed_predicate, ancestor) 187 | { 188 | return prefixed_string_to_uri(prefixed_predicate)==prefixed_string_to_uri(ancestor); 189 | } 190 | 191 | function get_template_list(dict, ancestor=null) 192 | { 193 | var graph_templates = []; 194 | for (var t in TEMPLATE_LIST) 195 | { 196 | var template = TEMPLATE_LIST[t]; 197 | var applied_template = try_to_apply_dict_to_template(ancestor, dict, template); 198 | if (applied_template) 199 | 
{ 200 | applied_template['position'] = t; 201 | graph_templates.push(applied_template); 202 | } 203 | } 204 | // some templates may overlap, remove all the redundant templates (that are those for which all the parameters are contained into another template with a greater or equal number of parameters) 205 | var filtered_graph_templates = []; 206 | // sort by predicate list length 207 | graph_templates = graph_templates.sort((a, b) => a.predicate_list.length-b.predicate_list.length); // ascending order 208 | for (var k=0; k < graph_templates.length; k++) 209 | { 210 | var template_dict = graph_templates[k]; 211 | var overlap = false; 212 | for (var i=k+1; !overlap && i a.position-b.position); // ascending order 232 | return filtered_graph_templates; 233 | } 234 | 235 | function to_external_link(link, name=null) 236 | { 237 | link = String(link).replace(/<|>|"/gi,''); 238 | if (!name) 239 | name = isURL(link) ? format_link(link) : link; 240 | 241 | return `${name}`; 242 | } 243 | 244 | function linkify(link, name=null) 245 | { 246 | link = String(link).replace(/<|>|"/gi,''); 247 | if (!name) 248 | name = isURL(link) ? format_link(link) : link; 249 | 250 | return template_expand(name,link); 251 | } 252 | 253 | function template_expand(name,topic=null) 254 | { 255 | if (!topic) 256 | topic = name 257 | return `${name}`; 261 | } 262 | 263 | function counterfactual_input(counterfactual_api_url, feature_order, value) 264 | { 265 | return ``; 272 | } 273 | -------------------------------------------------------------------------------- /web_app/oke/core/models/knowledge_extraction/ontology_builder.py: -------------------------------------------------------------------------------- 1 | from models.knowledge_extraction.knowledge_graph_builder import KnowledgeGraphBuilder 2 | from models.knowledge_extraction.lattice_builder import ActiveActionTypingLatticeBuilder 3 | from misc.graph_builder import get_root_set, get_concept_set, get_predicate_set, get_object_set, get_connected_graph_list, get_ancestors, filter_graph_by_root_set, tuplefy 4 | from misc.graph_builder import save_graph 5 | from misc.jsonld_lib import * 6 | 7 | from more_itertools import unique_everseen 8 | import re 9 | 10 | import nltk 11 | from nltk.corpus import wordnet as wn 12 | 13 | class OntologyBuilder(KnowledgeGraphBuilder): 14 | WORDNET_TO_PATTERN_MAP = { 15 | 'pro:RoleInTime': ['causal_agent\.n\.01'], 16 | # 'foaf:Person': ['person\.n\.01'], 17 | 'foaf:Organization': ['organization\.n\.01'], 18 | 'ti:TimeInterval': ['time_period\.n\.01'], 19 | 'pro:InformationObject': ['information','written_communication'], 20 | 'pro:Place': ['location\.n\.01'], 21 | # 'owl:Thing': ['object'], 22 | # 'pro:Role': ['role\.n\.01'], 23 | 'pwo:Action': ['action\.n\.01'], 24 | # 'pwo:Workflow': ['process\.n\.06'], 25 | 'pro:Obligation': ['obligation\.n'], 26 | } 27 | 28 | KNOWN_ONTO_PATTERN_EDGE_LIST = [ 29 | # Agent-Role pattern 30 | ('pro:RoleInTime', SUBCLASSOF_PREDICATE, 'pro:Role'), 31 | ('lkif:Agent', 'pro:holdsRoleInTime', 'pro:RoleInTime'), 32 | ('pro:RoleInTime', 'pro:withRole', 'pro:Role'), 33 | ('pro:RoleInTime', 'tvc:atTime', 'ti:TimeInterval'), 34 | ('pro:RoleInTime', 'pro:relatesToDocument', 'foaf:Document'), 35 | ('pro:RoleInTime', 'pro:relatesToPerson', 'foaf:Person'), 36 | ('pro:RoleInTime', 'pro:relatesToOrganization', 'foaf:Organization'), 37 | # TVC pattern 38 | ('pro:ValueInTime', 'pro:withValue', 'owl:Thing'), 39 | ('owl:Thing', 'pro:hasValue', 'pro:ValueInTime'), 40 | ('pro:ValueInTime', 'pro:withContext', 'owl:Thing'), 41 | 
('pro:ValueInTime', 'tvc:atTime', 'pro:Instant'), 42 | ('pro:ValueInTime', 'pro:atTime', 'ti:TimeInterval'), 43 | ('owl:Thing', CAN_BE_PREDICATE, 'lkif:Jurisdiction'), 44 | ('owl:Thing', CAN_BE_PREDICATE, 'pro:Place'), 45 | # Process pattern 46 | ('pwo:WorkflowExecution', 'pwo:executes', 'pwo:Workflow'), 47 | ('pwo:WorkflowExecution', 'pwo:involvesAction', 'pwo:Action'), 48 | ('pwo:Workflow', 'pwo:hasStep', 'pwo:Step'), 49 | ('pwo:Workflow', 'pwo:hasFirstStep', 'pwo:Step'), 50 | ('pwo:Step', 'pwo:hasNextStep', 'pwo:Step'), 51 | ('pwo:Step', 'pwo:produces', 'owl:Thing'), 52 | ('pwo:Step', 'pwo:needs', 'owl:Thing'), 53 | ('pwo:Action', 'tisit:atTime', 'ti:TimeInterval'), 54 | ('pwo:Step', 'taskex:isExecutedIn', 'owl:Thing'), 55 | ('pwo:Step', 'parameter:hasParameter', 'time:DurationDescription'), 56 | # Deontic Ontology 57 | ('pro:DeonticSpecification', 'pro:hasPointed', 'pro:AuxiliaryParty'), 58 | ('pro:DeonticSpecification', 'pro:isHeld', 'pro:Interval'), 59 | ('pro:Bearer', 'pro:setsUp', 'pro:DeonticSpecification'), 60 | ('pro:DeonticSpecification', 'pro:componentOf', 'pro:LegalRule'), 61 | ('pro:Permission', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 62 | ('pro:Right', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 63 | ('pro:Permission', 'pro:foundedOn', 'pro:Right'), 64 | ('pro:Compliance', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 65 | ('pro:Obligation', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 66 | ('pro:Compliance', 'pro:complies', 'pro:Obligation'), 67 | ('pro:Right', 'pro:generates', 'pro:Obligation'), 68 | ('pro:Violation', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 69 | ('pro:Prohibition', SUBCLASSOF_PREDICATE, 'pro:DeonticSpecification'), 70 | ('pro:Violation', 'pro:foundedOn', 'pro:Obligation'), 71 | ('pro:Violation', 'pro:foundedOn', 'pro:Prohibition'), 72 | ('pro:PrescriptiveRule', SUBCLASSOF_PREDICATE, 'pro:LegalRule'), 73 | ('pro:ConstitutiveRule', SUBCLASSOF_PREDICATE, 'pro:LegalRule'), 74 | ('pro:Penalty', 'pro:repairs', 'pro:PrescriptiveRule'), 75 | # PrOnto Ontology 76 | ('pwo:Action', 'pro:generateRevision', 'allot:FRBRExpression'), 77 | ('pwo:Action', 'pro:isActedBy', 'lkif:Agent'), 78 | ('pwo:Action', 'taskex:executesTask', 'pwo:Step'), 79 | ('pwo:Workflow', 'pwo:hasStep', 'pwo:Step'), 80 | ('pwo:Step', 'pwo:needs', 'pro:InformationObject'), 81 | ('pwo:Step', 'pro:hasStepType', 'pro:StepType'), 82 | ('pwo:Step', 'pwo:produces', 'pro:InformationObject'), 83 | ('pwo:Step', 'pwo:needs', 'pro:InformationObject'), 84 | ('pwo:Step', 'pro:commits', 'pro:LegalRule'), 85 | ('pro:LegalRule', SUBCLASSOF_PREDICATE, 'pro:PrescriptiveRule'), 86 | ('pro:LegalRule', SUBCLASSOF_PREDICATE, 'pro:ConstitutiveRule'), 87 | ('pro:DeonticSpecification', 'pro:componentOf', 'pro:LegalRule'), 88 | ('pro:DeonticSpecification', 'pro:isHeld', 'ti:TimeInterval'), 89 | ('pro:ValueInTimeAndContext', 'tvc:atTime', 'ti:TimeInterval'), 90 | ('pro:ValueInTimeAndContext', 'tvc:withinContext', 'pro:Place'), 91 | ('pro:ValueInTimeAndContext', 'tvc:withinContext', 'lkif:Jurisdiction'), 92 | ('pro:ValueInTimeAndContext', 'tvc:withValue', 'pro:Value'), 93 | ] 94 | 95 | def __init__(self, model_options): 96 | # nltk.download('wordnet') 97 | self.max_syntagma_length = model_options.get('max_syntagma_length', 5) 98 | self.add_source = model_options.get('add_source', False) 99 | self.add_label = model_options.get('add_label', True) 100 | self.lemmatize_label = model_options.get('lemmatize_label', False) 101 | self.lattice_builder = 
ActiveActionTypingLatticeBuilder(templatize=False) 102 | super().__init__(model_options) 103 | 104 | @staticmethod 105 | def print_graph(edge_iter, file_name): 106 | edge_iter = filter(lambda x: '{obj}' not in x[1], edge_iter) 107 | edge_list = list(edge_iter) 108 | print(f'Printing {file_name} with {len(edge_list)} triples') 109 | save_graph(edge_list, file_name, max(min(256,len(edge_list)/2),32)) 110 | 111 | def build_edge_list(self): 112 | edge_list = super().build( 113 | max_syntagma_length=self.max_syntagma_length, 114 | add_subclasses=True, 115 | use_wordnet=True, 116 | add_source=self.add_source, 117 | add_label=self.add_label, 118 | lemmatize_label=self.lemmatize_label, 119 | to_rdf=True, 120 | ) 121 | edge_list = tuplefy(edge_list) 122 | return edge_list 123 | 124 | @staticmethod 125 | def get_hypernym_edge_list(concept_set): 126 | hyper = lambda s: s.hypernyms() 127 | # hypo = lambda s: s.hyponyms() 128 | concept_hypernyms_dict = {} 129 | for concept in concept_set: 130 | if not isinstance(concept, str) or not concept.startswith(WORDNET_PREFIX): 131 | continue 132 | synset = wn.synset(concept[3:]) # remove string WORDNET_PREFIX, 3 chars 133 | concept_hypernyms_dict[concept] = set(synset.closure(hyper)).union((synset,)) 134 | 135 | hypernym_edge_list = [ 136 | (concept, SUBCLASSOF_PREDICATE, WORDNET_PREFIX+hypernym.name()) 137 | for concept, hypernym_set in concept_hypernyms_dict.items() 138 | for hypernym in hypernym_set 139 | ] 140 | return hypernym_edge_list 141 | 142 | def extract_minimal_taxonomy(self, main_edge_list): 143 | concept_set = get_concept_set(main_edge_list) 144 | hypernym_edge_list = self.get_hypernym_edge_list(concept_set) 145 | hypernym_edge_list = self.lattice_builder.build_lattice(hypernym_edge_list) 146 | return list(unique_everseen(hypernym_edge_list)) 147 | 148 | @staticmethod 149 | def format_taxonomy(hypernym_edge_list): 150 | # hypernym_edge_list to RDF 151 | taxonomy_edge_list = [] 152 | for subj_list, pred_list, obj_list in hypernym_edge_list: 153 | for subj in subj_list: 154 | for _,pred in pred_list: 155 | for obj in obj_list: 156 | taxonomy_edge_list.append((pred,SUBCLASSOF_PREDICATE,subj)) 157 | if obj != pred: 158 | taxonomy_edge_list.append((obj,SUBCLASSOF_PREDICATE,pred)) 159 | return taxonomy_edge_list 160 | 161 | def connect_taxonomy_to_patterns(self, hypernym_edge_list): 162 | # get sorted concept set by moving parents on top of children 163 | hypernym_concept_set = get_concept_set(hypernym_edge_list) 164 | hypernym_concept_ancestors_list = [ 165 | (c, get_ancestors(c, hypernym_edge_list)) 166 | for c in hypernym_concept_set 167 | ] 168 | hypernym_concept_ancestors_list.sort(key=lambda x: len(x[-1])) 169 | 170 | pattern_edge_list = [] 171 | type_set_dict = {} 172 | for root,_ in hypernym_concept_ancestors_list: 173 | for key,value_list in self.WORDNET_TO_PATTERN_MAP.items(): 174 | for value in value_list: 175 | if re.search(re.compile(value), root) is not None: 176 | pattern_edge_list.append((root,'rdf:type',key)) 177 | if key not in type_set_dict: 178 | type_set_dict[key] = set() 179 | type_set_dict[key].add(root) 180 | 181 | for root,root_ancestors in hypernym_concept_ancestors_list: 182 | root_intension = set( 183 | predicate 184 | for fcc in self.lattice_builder.formal_concept_context_list 185 | for _,predicate in fcc.intension([root.strip()]) 186 | ) 187 | for intension in root_intension: 188 | for key,value_list in self.WORDNET_TO_PATTERN_MAP.items(): 189 | for value in value_list: 190 | if re.search(re.compile(value), intension) is 
not None: 191 | if ( key not in type_set_dict ) or ( next(filter(lambda a: a in type_set_dict[key], root_ancestors), None) is None ): 192 | pattern_edge_list.append((root,'rdf:type',key)) 193 | if key not in type_set_dict: 194 | type_set_dict[key] = set() 195 | type_set_dict[key].add(root) 196 | return unique_everseen(pattern_edge_list) 197 | 198 | def build(self): 199 | print('Building knowledge graph..') 200 | edge_list = self.build_edge_list() 201 | 202 | print('Extracting minimal taxonomy via FCA..') 203 | hypernym_edge_list = self.extract_minimal_taxonomy(edge_list) 204 | hypernym_concept_set = get_concept_set(hypernym_edge_list) 205 | 206 | print('Connecting known ontology patterns to concept taxonomy..') 207 | pattern_hinge_graph = self.connect_taxonomy_to_patterns(hypernym_edge_list) 208 | # self.print_graph(pattern_hinge_graph, 'kg_hinge') 209 | 210 | print('Creating taxonomy graph..') 211 | taxonomy_graph = self.format_taxonomy(hypernym_edge_list) 212 | # self.print_graph(taxonomy_graph, 'kg_taxonomy') 213 | taxonomy_graph += pattern_hinge_graph 214 | # self.print_graph(taxonomy_graph, 'kg_hinged_taxonomy') 215 | 216 | return edge_list + taxonomy_graph 217 | -------------------------------------------------------------------------------- /web_app/oke/core/models/knowledge_extraction/couple_abstractor.py: -------------------------------------------------------------------------------- 1 | from models.knowledge_extraction.couple_extractor import CoupleExtractor 2 | from misc.jsonld_lib import * 3 | import nltk 4 | from nltk.corpus import framenet as fn 5 | 6 | class CoupleAbstractor(CoupleExtractor): 7 | def abstract_couple_list(self, concept_dict_list): 8 | assert False, 'Not implemented' 9 | 10 | class WordnetAbstractor(CoupleAbstractor): 11 | 12 | # def __init__(self, model_options): 13 | # nltk.download('punkt') 14 | # nltk.download('averaged_perceptron_tagger') 15 | # nltk.download('wordnet') 16 | # super().__init__(model_options) 17 | 18 | ''' 19 | Firstly, relatedness and similarity are easily confused; the distinction is subtle but worth noting. 20 | 21 | Semantic relatedness measures how related two concepts are, using any kind of relation; algorithms: 22 | * Lexical Chains (Hirst and St-Onge, 1998) 23 | * Adapted/Extended Sense Overlaps algorithm (Banerjee and Pedersen, 2002/2003) 24 | * Vectorized Sense Overlaps (Patwardhan, 2003) 25 | 26 | Semantic similarity only considers the IS-A relation (i.e. hypernymy / hyponymy); algorithms: 27 | * Wu-Palmer measure (Wu and Palmer 1994) 28 | * Resnik measure (Resnik 1995) 29 | * Jiang-Conrath measure (Jiang and Conrath 1997) 30 | * Leacock-Chodorow measure (Leacock and Chodorow 1998) 31 | * Lin measure (Lin 1998) 32 | Resnik, Jiang-Conrath and Lin measures are based on information content. The information content of a synset is the negative log of the sum of all probabilities (computed from corpus frequencies) of all words in that synset (Resnik, 1995). 33 | Wu-Palmer and Leacock-Chodorow are based on path length; the similarity between two concepts/synsets depends on the number of nodes along the shortest path between them. 34 | 35 | The list given above is not exhaustive but, historically, pure similarity measures have become somewhat outdated, since relatedness algorithms consider more relation types and should theoretically give more disambiguating power when comparing concepts. 
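For a concrete illustration of the path-based measures listed above (this snippet is not part of the pipeline and assumes the NLTK WordNet corpus has been downloaded), the scores can be inspected directly:
>>> from nltk.corpus import wordnet as wn
>>> dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
>>> dog.path_similarity(cat)  # shortest-path measure
0.2
>>> dog.wup_similarity(cat)  # Wu-Palmer measure
0.8571428571428571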
36 | ''' 37 | def abstract_couple_list(self, concept_dict_list): 38 | from pywsd import disambiguate 39 | from pywsd.similarity import max_similarity 40 | from pywsd.lesk import simple_lesk, adapted_lesk, cosine_lesk 41 | 42 | concept_dict_list = [ 43 | self.get_couple_from_concept(concept_dict) 44 | if 'predicate' not in concept_dict else 45 | concept_dict 46 | for concept_dict in concept_dict_list 47 | ] 48 | disambiguation_cache = {} 49 | for couple in concept_dict_list: 50 | sentence_text = couple['source']['sentence_text'] 51 | if sentence_text not in disambiguation_cache: 52 | sentence_disambiguation = disambiguate( 53 | sentence_text, 54 | algorithm=cosine_lesk, 55 | #similarity_option='wu-palmer', 56 | ) 57 | disambiguation_cache[sentence_text] = {k.lower():v for k,v in sentence_disambiguation} 58 | synset_dict = disambiguation_cache[sentence_text] 59 | couple['concept']['synset'] = synset_dict.get(couple['concept']['text'], None) 60 | for concept_core_dict in couple['concept_core']: 61 | concept_core_dict['synset'] = synset_dict.get(concept_core_dict['text'], None) 62 | couple['predicate_core']['synset'] = synset_dict.get(couple['predicate_core']['text'], None) 63 | return concept_dict_list 64 | 65 | class FramenetAbstractor(CoupleAbstractor): 66 | FRAME_GF_CACHE = {} 67 | FE_IN_LU_BY_DEP_CACHE = {} 68 | LU_LIST = [lu for lu in fn.lus() if lu.name.split('.')[1] == 'v'] 69 | LU_KEY_LIST = [explode_concept_key(lu.name.split('.')[0]) for lu in LU_LIST] 70 | 71 | def __init__(self, model_options): 72 | # nltk.download('punkt') 73 | # nltk.download('averaged_perceptron_tagger') 74 | # nltk.download('framenet_v17') 75 | fn.propagate_semtypes() 76 | super().__init__(model_options) 77 | self.debug = model_options.get('debug', False) 78 | #self.with_frame_annotation = model_options.get('with_frame_annotation', True) 79 | self.lu_confidence_threshold = model_options.get('lu_confidence_threshold', 2/3) 80 | self.concept_confidence_threshold = model_options.get('concept_confidence_threshold', 1/2) 81 | 82 | @staticmethod 83 | def get_FE_and_GF_by_active_LU_annotation_list(lu_annotation_list): 84 | fe_dict = {} 85 | for annotation_dict in lu_annotation_list: 86 | is_passive_LU_embodiement = annotation_dict['is_passive_LU_embodiement'] 87 | for fe_tuple, gf_tuple in zip(annotation_dict['frame_element'],annotation_dict['grammatical_function']): 88 | gf = gf_tuple[-1] 89 | #print(fe_tuple[-1], gf) 90 | if gf not in ['Ext','Obj']: 91 | continue 92 | if is_passive_LU_embodiement: # get gf in active form 93 | gf = 'Ext' if gf == 'Obj' else 'Obj' 94 | fe = fe_tuple[-1] # every lexical unit has only one frame, so we can use the frame element has unique key 95 | if fe not in fe_dict: 96 | fe_dict[fe] = set() 97 | fe_dict[fe].add(gf) 98 | return fe_dict 99 | 100 | def is_passive_LU_embodiement(self, text, lexical_unit_offset): 101 | for token in self.nlp(text): 102 | #print(token, token.idx) 103 | if token.idx == lexical_unit_offset[0]: 104 | predicate_dict = self.get_predicate_dict(token) 105 | if predicate_dict is not None: 106 | return self.is_passive(predicate_dict['predicate']['span']) 107 | break 108 | return False 109 | 110 | def get_LU_annotation_list(self, lu): 111 | return [ 112 | { # dict_keys(['cDate', 'status', 'ID', '_type', 'layer', '_ascii', 'Target', 'FE', 'GF', 'PT', 'Other', 'Sent', 'Verb', 'sent', 'text', 'LU', 'frame']) 113 | #'text': annotation['text'], 114 | #'id': annotation['ID'], 115 | #'lexical_unit': annotation['LU'].name, 116 | 'frame_element': annotation['FE'][0], 117 | 
'grammatical_function': annotation['GF'], 118 | 'phrase_type': annotation['PT'], 119 | #'lexical_unit_offset': annotation['Target'], 120 | 'is_passive_LU_embodiement': self.is_passive_LU_embodiement(annotation['text'], annotation['Target'][0]), 121 | } 122 | for sub_corpus in lu.subCorpus 123 | for sentence in sub_corpus.sentence 124 | for annotation in sentence.annotationSet 125 | if annotation.get('GF',None) is not None 126 | and annotation.get('Target',None) is not None 127 | #and len(annotation['Target']) == 1 # Target 'rule out' has two separated offsets, even if it is a single target 128 | ] 129 | 130 | def get_possible_FE_in_LU_by_dependency(self, lu, dependency): 131 | lu_name = lu.name 132 | cache_key = '{0}.{1}'.format(lu_name, dependency) 133 | if cache_key not in self.FE_IN_LU_BY_DEP_CACHE: 134 | if lu_name not in self.FRAME_GF_CACHE: 135 | lu_annotation_list = self.get_LU_annotation_list(lu) 136 | frame_element_active_grammatical_functions = self.get_FE_and_GF_by_active_LU_annotation_list(lu_annotation_list) 137 | self.FRAME_GF_CACHE[lu_name] = frame_element_active_grammatical_functions 138 | else: 139 | frame_element_active_grammatical_functions = self.FRAME_GF_CACHE[lu_name] 140 | 141 | related_frame = lu.frame 142 | if self.debug: 143 | print('Frame:', str(related_frame.name)) 144 | 145 | abstract_couple_list = [] 146 | for abstract_concept, fe in related_frame.FE.items(): 147 | #if fe.coreType != 'Core': 148 | # continue 149 | 150 | fe_name = fe.name 151 | active_grammatical_functions = frame_element_active_grammatical_functions.get(fe_name,None) 152 | if active_grammatical_functions is None: 153 | continue 154 | valid_gf = 'Obj' if 'obj' in dependency else 'Ext' 155 | if valid_gf not in active_grammatical_functions: 156 | continue 157 | 158 | semantic_type = fe.semType.name if fe.semType is not None else None 159 | abstract_couple_list.append({'frame_element':abstract_concept, 'semantic_type': semantic_type}) 160 | if self.debug: 161 | print('Element:', {'fe': fe_name, 'active_gf': active_grammatical_functions}) 162 | self.FE_IN_LU_BY_DEP_CACHE[cache_key] = abstract_couple_list 163 | else: 164 | abstract_couple_list = self.FE_IN_LU_BY_DEP_CACHE[cache_key] 165 | return abstract_couple_list 166 | 167 | @staticmethod 168 | def stringify_couple(concept, predicate, dependency): 169 | subject = concept if 'subj' in dependency else 'x' 170 | object = concept if 'obj' in dependency else 'x' 171 | str = f'{subject} {predicate} {object}' 172 | #str = str[0].upper() + str[1:] + '.' 
173 | return str 174 | 175 | def abstract_couple(self, couple): 176 | if self.debug: 177 | print('###############################') 178 | print('Couple:', couple) 179 | is_passive_couple = couple['is_passive'] 180 | couple_dependency = 'subj' if ('subj' in couple['dependency'] and not is_passive_couple) or ('obj' in couple['dependency'] and is_passive_couple) else 'obj' 181 | couple_predicate = couple['predicate']['lemma'] 182 | fragment = self.stringify_couple(couple['concept']['lemma'], couple_predicate, couple_dependency) 183 | if self.debug: 184 | print('Fragment:', fragment) 185 | print('Is passive:', is_passive_couple) 186 | 187 | lu_argmax, lu_confidence = self.find_most_similar(couple_predicate, self.LU_KEY_LIST, cached=True) 188 | if lu_confidence < self.lu_confidence_threshold: 189 | return 190 | lu = self.LU_LIST[lu_argmax] 191 | lu_name = self.LU_KEY_LIST[lu_argmax] 192 | if self.debug: 193 | print('Most Similar LU:', lu_name) 194 | print('LU Confidence:', lu_confidence) 195 | 196 | # Find all possible frame elements 197 | abstract_couple_list = self.get_possible_FE_in_LU_by_dependency(lu, couple_dependency) 198 | if len(abstract_couple_list) == 0: 199 | return 200 | 201 | target_list = [ 202 | self.stringify_couple( 203 | explode_concept_key(abstract_concept['frame_element']).lower().strip(), 204 | couple_predicate, 205 | couple_dependency 206 | ) 207 | for abstract_concept in abstract_couple_list 208 | ] 209 | 210 | concept_argmax, concept_confidence = self.find_most_similar(fragment, target_list, cached=True) 211 | if concept_confidence < self.concept_confidence_threshold: 212 | return 213 | most_similar_concept = abstract_couple_list[concept_argmax] 214 | if self.debug: 215 | print('Abstract Concept:', most_similar_concept) 216 | print('Confidence:', concept_confidence) 217 | 218 | # Update couples 219 | couple['concept_annotation'] = { 220 | #'embodiment': couple['concept']['text'], 221 | 'confidence': concept_confidence, 222 | } 223 | couple['concept_annotation'].update(most_similar_concept) 224 | couple['predicate_annotation'] = { 225 | 'lexical_unit': lu_name, 226 | 'frame': lu.frame.name, 227 | 'confidence': lu_confidence, 228 | } 229 | 230 | def abstract_couple_list(self, concept_dict_list): 231 | concept_dict_list = [ 232 | self.get_couple_from_concept(concept_dict) 233 | if 'predicate' not in concept_dict else 234 | concept_dict 235 | for concept_dict in concept_dict_list 236 | ] 237 | for couple in concept_dict_list: 238 | self.abstract_couple(couple) 239 | return concept_dict_list 240 | -------------------------------------------------------------------------------- /web_app/oke/core/models/model_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | # os.environ["CUDA_VISIBLE_DEVICES"]="-1" 3 | import multiprocessing 4 | import types 5 | import spacy # for natural language processing 6 | # import neuralcoref # for Coreference Resolution 7 | # python3 -m spacy download en_core_web_md 8 | from sklearn.preprocessing import normalize 9 | import numpy as np 10 | import tensorflow as tf 11 | # import tensorflow.compat.v1 as tf 12 | # tf.disable_v2_behavior() # use 1.X API 13 | tf.get_logger().setLevel('ERROR') # Reduce logging output. 
14 | gpu_devices = tf.config.experimental.list_physical_devices('GPU') 15 | for dev in gpu_devices: 16 | tf.config.experimental.set_memory_growth(dev, True) 17 | import tensorflow_hub as hub 18 | import tensorflow_text 19 | from pathlib import Path 20 | from transformers import AutoConfig, AutoTokenizer, AutoModelWithLMHead, pipeline 21 | import torch 22 | 23 | from misc.doc_reader import load_or_create_cache, create_cache, load_cache 24 | 25 | import warnings 26 | warnings.filterwarnings('ignore') 27 | 28 | def get_best_gpu(): 29 | if torch.cuda.device_count() == 0: 30 | return -1 31 | return min( 32 | ( 33 | (i,torch.cuda.memory_allocated(i)) 34 | for i in range(torch.cuda.device_count()) 35 | ), 36 | key = lambda x:x[-1] 37 | )[0] 38 | 39 | is_listable = lambda x: type(x) in (list,tuple) 40 | 41 | class ModelManager(): 42 | # static members 43 | __nlp_models = {} 44 | __tf_embedders = {} 45 | __hf_embedders = {} 46 | 47 | def __init__(self, model_options=None): 48 | if not model_options: 49 | model_options = {} 50 | self.model_options = model_options 51 | self.disable_spacy_component = [] 52 | self.__batch_size = model_options.get('batch_size', 100) 53 | 54 | self.__spacy_cache = {} 55 | self.__tf_cache = {} 56 | self.__hf_cache = {} 57 | 58 | self.__spacy_model = model_options.get('spacy_model', 'en_core_web_md') 59 | self.__tf_model = model_options.get('tf_model', {}) 60 | self.__hf_model = model_options.get('hf_model', {}) 61 | 62 | def store_cache(self, cache_name): 63 | cache_dict = { 64 | 'tf_cache': self.__tf_cache, 65 | 'spacy_cache': self.__spacy_cache, 66 | 'hf_cache': self.__hf_cache, 67 | } 68 | create_cache(cache_name, lambda: cache_dict) 69 | 70 | def load_cache(self, cache_name): 71 | loaded_cache = load_cache(cache_name) 72 | if loaded_cache: 73 | 74 | tf_cache = loaded_cache.get('tf_cache',None) 75 | if tf_cache: 76 | self.__tf_cache = tf_cache 77 | 78 | hf_cache = loaded_cache.get('hf_cache',None) 79 | if hf_cache: 80 | self.__hf_cache = hf_cache 81 | 82 | spacy_cache = loaded_cache.get('spacy_cache',None) 83 | if spacy_cache: 84 | self.__spacy_cache = spacy_cache 85 | 86 | @staticmethod 87 | def get_cached_values(value_list, cache, fetch_fn, key_fn=lambda x:x): 88 | missing_values = [q for q in value_list if key_fn(q) not in cache] 89 | if len(missing_values) > 0: 90 | new_values = fetch_fn(missing_values) 91 | cache.update({key_fn(q):v for q,v in zip(missing_values, new_values)}) 92 | return [cache[key_fn(q)] for q in value_list] 93 | 94 | @staticmethod 95 | def load_nlp_model(spacy_model): 96 | print('## Loading Spacy model <{}>...'.format(spacy_model)) 97 | # go here for more information about Language Processing Pipeline (tokenizer, tagger, parser, etc..) 
98 | nlp = spacy.load(spacy_model) 99 | # nlp.add_pipe(nlp.create_pipe("merge_noun_chunks")) 100 | # nlp.add_pipe(nlp.create_pipe("merge_entities")) 101 | # nlp.add_pipe(nlp.create_pipe("merge_subtokens")) 102 | ################################# 103 | # nlp.add_pipe(neuralcoref.NeuralCoref(nlp.vocab), name='neuralcoref', last=True) # load NeuralCoref and add it to the pipe of SpaCy's model 104 | # def remove_unserializable_results(doc): # Workaround for serialising NeuralCoref's clusters 105 | # def cluster_as_doc(c): 106 | # c.main = c.main.as_doc() 107 | # c.mentions = [ 108 | # m.as_doc() 109 | # for m in c.mentions 110 | # ] 111 | # # doc.user_data = {} 112 | # if not getattr(doc,'_',None): 113 | # return doc 114 | # if not getattr(doc._,'coref_clusters',None): 115 | # return doc 116 | # for cluster in doc._.coref_clusters: 117 | # cluster_as_doc(cluster) 118 | # for token in doc: 119 | # for cluster in token._.coref_clusters: 120 | # cluster_as_doc(cluster) 121 | # return doc 122 | # nlp.add_pipe(remove_unserializable_results, last=True) 123 | print('## Spacy model loaded') 124 | return nlp 125 | 126 | @staticmethod 127 | def load_tf_model(tf_model): 128 | cache_dir = tf_model.get('cache_dir',None) 129 | if cache_dir: 130 | Path(cache_dir).mkdir(parents=True, exist_ok=True) 131 | os.environ["TFHUB_CACHE_DIR"] = cache_dir 132 | 133 | model_url = tf_model['url'] 134 | is_qa_model = 'qa' in model_url.lower() 135 | if is_qa_model: 136 | print(f'## Loading TF model <{model_url}> for QA...') 137 | else: 138 | print(f'## Loading TF model <{model_url}>...') 139 | module = hub.load(model_url) 140 | get_input = lambda y: tf.constant(tuple(map(lambda x: x[0] if is_listable(x) else x, y))) 141 | if is_qa_model: 142 | get_context = lambda y: tf.constant(tuple(map(lambda x: x[1] if is_listable(x) else '', y))) 143 | q_label = "query_encoder" if 'query_encoder' in module.signatures else 'question_encoder' 144 | q_module = lambda doc: module.signatures[q_label](input=get_input(doc))['outputs'].numpy() # The default signature is identical with the question_encoder signature. 145 | a_module = lambda doc: module.signatures['response_encoder'](input=get_input(doc), context=get_context(doc))['outputs'].numpy() 146 | else: 147 | q_module = a_module = lambda doc: module(get_input(doc)).numpy() 148 | print('## TF model loaded') 149 | return { 150 | 'question': q_module, 151 | 'answer': a_module 152 | } 153 | 154 | @staticmethod 155 | def load_hf_model(hf_model): 156 | model_name = hf_model['url'] 157 | model_type = hf_model['type'] 158 | model_framework = hf_model.get('framework', 'pt') 159 | cache_dir = hf_model.get('cache_dir',None) 160 | if cache_dir: 161 | model_path = os.path.join(cache_dir, model_name.replace('/','-')) 162 | if not os.path.isdir(model_path): 163 | os.mkdir(model_path) 164 | else: 165 | model_path = None 166 | print(f'###### Loading {model_type} model <{model_name}> for {model_framework} ######') 167 | config = AutoConfig.from_pretrained(model_name, cache_dir=model_path) # Download configuration from S3 and cache. 
168 | model = AutoModelWithLMHead.from_pretrained(model_name, cache_dir=model_path) 169 | tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_path) 170 | print(f'###### <{model_name}> loaded ######') 171 | return { 172 | 'pipeline': pipeline(model_type, model=model, tokenizer=tokenizer, framework=model_framework, device=get_best_gpu()), 173 | 'tokenizer': tokenizer, 174 | 'model': model, 175 | 'config': config, 176 | } 177 | 178 | def get_nlp_model(self): 179 | if ModelManager.__nlp_models.get(self.__spacy_model, None) is None: 180 | ModelManager.__nlp_models[self.__spacy_model] = ModelManager.load_nlp_model(self.__spacy_model) 181 | return ModelManager.__nlp_models[self.__spacy_model] 182 | 183 | def get_tf_model(self): 184 | model_key = self.__tf_model['url'] 185 | if ModelManager.__tf_embedders.get(model_key, None) is None: 186 | ModelManager.__tf_embedders[model_key] = ModelManager.load_tf_model(self.__tf_model) 187 | return ModelManager.__tf_embedders[model_key] 188 | 189 | def get_hf_model(self): 190 | model_key = (self.__hf_model['url'],self.__hf_model['type']) 191 | if ModelManager.__hf_embedders.get(model_key, None) is None: 192 | ModelManager.__hf_embedders[model_key] = ModelManager.load_hf_model(self.__hf_model) 193 | return ModelManager.__hf_embedders[model_key] 194 | 195 | def nlp(self, text_list, disable=None, n_threads=None, batch_size=None): 196 | if not disable: 197 | disable = self.disable_spacy_component 198 | if not n_threads: # real multi-processing: https://git.readerbench.com/eit/prepdoc/blob/f8e93b6d0a346e9a53dac2e70e5f1712d40d6e1e/examples/parallel_parse.py 199 | n_threads = multiprocessing.cpu_count() 200 | if not batch_size: 201 | batch_size = self.__batch_size 202 | def fetch_fn(missing_text): 203 | return self.get_nlp_model().pipe( 204 | missing_text, 205 | disable=disable, 206 | batch_size=min(batch_size, int(np.ceil(len(missing_text)/n_threads))), 207 | n_process=min(n_threads, len(missing_text)), # The keyword argument n_threads on the .pipe methods is now deprecated, as the v2.x models cannot release the global interpreter lock. (Future versions may introduce a n_process argument for parallel inference via multiprocessing.) 
- https://spacy.io/usage/v2-1#incompat 208 | ) 209 | return self.get_cached_values(text_list, self.__spacy_cache, fetch_fn) 210 | 211 | def run_tf_embedding(self, doc_list, norm=None, as_question=False): 212 | def fetch_fn(missing_queries): 213 | # print(missing_queries) 214 | tf_model = self.get_tf_model() 215 | # Feed missing_queries into current tf graph 216 | batch_list = ( 217 | missing_queries[i*self.__batch_size:(i+1)*self.__batch_size] 218 | for i in range(np.int(np.ceil(len(missing_queries)/self.__batch_size))) 219 | ) 220 | encoder = tf_model['question' if as_question else 'answer'] 221 | batched_embeddings = tuple(map(encoder, batch_list)) 222 | embeddings = np.concatenate(batched_embeddings, 0) 223 | # Normalize the embeddings, if required 224 | if norm is not None: 225 | embeddings = normalize(embeddings, norm=norm) 226 | return embeddings 227 | return np.array(self.get_cached_values(doc_list, self.__tf_cache, fetch_fn, key_fn=lambda x:(x,as_question))) 228 | 229 | def run_hf_task(self, inputs, **kwargs): 230 | def fetch_fn(missing_inputs): 231 | hf_model = self.get_hf_model() 232 | return [hf_model['pipeline'](i, **kwargs) for i in missing_inputs] 233 | cache_key = '.'.join(map(lambda x: '='.join(map(str,x)), sorted(kwargs.items(), key=lambda x:x[0]))) 234 | return self.get_cached_values(inputs, self.__hf_cache, fetch_fn, key_fn=lambda x: '.'.join((cache_key,x))) 235 | 236 | def get_similarity_vector(self, source_text_list, target_text_list, similarity_fn=np.inner, as_question=False): 237 | source_embedding = self.run_tf_embedding(doc_list=source_text_list, as_question=as_question) 238 | target_embeddings = self.run_tf_embedding(doc_list=target_text_list, as_question=False) 239 | return similarity_fn(source_embedding,target_embeddings) 240 | 241 | # np.inner == lambda x,y: np.matmul(x,np.transpose(y)) 242 | def find_most_similar(self, source_text, target_text_list, similarity_fn=np.inner, as_question=False): 243 | similarity_vec = self.get_similarity_vector( 244 | source_text=source_text, 245 | target_text_list=target_text_list, 246 | similarity_fn=similarity_fn, 247 | as_question=as_question, 248 | ) 249 | argmax = np.argmax(similarity_vec) 250 | return argmax, similarity_vec[argmax] 251 | -------------------------------------------------------------------------------- /web_app/oke/core/misc/graph_builder.py: -------------------------------------------------------------------------------- 1 | from more_itertools import unique_everseen 2 | from matplotlib import pyplot as plt 3 | import re 4 | import networkx as nx 5 | try: 6 | import pygraphviz 7 | from networkx.drawing.nx_agraph import graphviz_layout 8 | except ImportError: 9 | try: 10 | import pydotplus 11 | from networkx.drawing.nx_pydot import graphviz_layout 12 | except ImportError: 13 | raise ImportError("This example needs Graphviz and either PyGraphviz or PyDotPlus") 14 | 15 | import networkx as nx 16 | 17 | 18 | def get_betweenness_centrality(edge_list): 19 | # Betweenness centrality quantifies the number of times a node acts as a bridge along the shortest path between two other nodes. 
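# Toy illustration (hypothetical data, not taken from this project): in the chain a -> b -> c only 'b'
# bridges a shortest path, so nx.betweenness_centrality(nx.DiGraph([('a','b'),('b','c')]))
# returns roughly {'a': 0.0, 'b': 0.5, 'c': 0.0} under the default normalisation.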
20 | di_graph = nx.DiGraph() 21 | di_graph.add_edges_from(map(lambda x: (x[0],x[-1]), edge_list)) 22 | return nx.betweenness_centrality(di_graph) 23 | 24 | def get_concept_description_dict(graph, label_predicate, valid_concept_filter_fn=None): 25 | if valid_concept_filter_fn: 26 | concept_set = get_concept_set(filter(valid_concept_filter_fn, graph)) 27 | graph = filter(lambda x: x[0] in concept_set, graph) 28 | # print('Unique concepts:', len(concept_set)) 29 | uri_dict = {} # concept_description_dict 30 | for uri,_,label in filter(lambda x: x[1] == label_predicate, graph): 31 | if uri not in uri_dict: 32 | uri_dict[uri] = [] 33 | uri_dict[uri].append(label) 34 | return uri_dict 35 | 36 | def get_tuple_element_set(tuple_list, element_idx): 37 | tuple_element_set = set() 38 | element_iter = map(lambda x: x[element_idx], tuple_list) 39 | for element in element_iter: 40 | if isinstance(element, (list,tuple)): 41 | for e in element: 42 | tuple_element_set.add(e) 43 | else: 44 | tuple_element_set.add(element) 45 | return tuple_element_set 46 | 47 | def get_subject_set(edge_list): 48 | return get_tuple_element_set(edge_list, 0) 49 | 50 | def get_predicate_set(edge_list): 51 | return get_tuple_element_set(edge_list, 1) 52 | 53 | def get_object_set(edge_list): 54 | return get_tuple_element_set(edge_list, -1) 55 | 56 | def get_concept_set(edge_list): 57 | edge_list = list(edge_list) 58 | return get_subject_set(edge_list).union(get_object_set(edge_list)) 59 | 60 | def get_root_set(edge_list): 61 | edge_list = list(edge_list) 62 | return get_subject_set(edge_list).difference(get_object_set(edge_list)) 63 | 64 | def get_leaf_set(edge_list): 65 | edge_list = list(edge_list) 66 | return get_object_set(edge_list).difference(get_subject_set(edge_list)) 67 | 68 | def reverse_order(edge_list): 69 | return map(lambda edge: (edge[-1],edge[-2],edge[-3]), edge_list) 70 | 71 | def get_ancestors(node, edge_list): 72 | return get_object_set(filter_graph_by_root_set(list(reverse_order(edge_list)), [node])) 73 | 74 | def tuplefy(edge_list): 75 | def to_tuple(x): 76 | if type(x) is dict: 77 | return tuple(x.values()) 78 | if type(x) is list: 79 | return tuple(x) 80 | return x 81 | return [ 82 | tuple(map(to_tuple, edge)) 83 | for edge in edge_list 84 | ] 85 | 86 | def build_edge_dict(edge_list, key_fn=lambda x: x): 87 | edge_dict = {} 88 | for edge in edge_list: 89 | for subj in get_subject_set([edge]): 90 | subj_key = key_fn(subj) 91 | if subj_key not in edge_dict: 92 | edge_dict[subj_key] = [] 93 | edge_dict[subj_key].append(edge) 94 | return edge_dict 95 | 96 | def extract_rooted_edge_list(root, edge_dict): 97 | valid_edge_list = [] 98 | if root not in edge_dict: 99 | return valid_edge_list 100 | valid_edge_list += edge_dict[root] 101 | obj_to_explore = get_object_set(edge_dict[root]) 102 | del edge_dict[root] 103 | while len(obj_to_explore) > 0: 104 | obj = obj_to_explore.pop() 105 | if obj in edge_dict: 106 | valid_edge_list += edge_dict[obj] 107 | obj_to_explore |= get_object_set(edge_dict[obj]) 108 | del edge_dict[obj] 109 | valid_edge_list = list(unique_everseen(valid_edge_list)) 110 | return valid_edge_list 111 | 112 | def filter_graph_by_root_set(edge_list, root_set): 113 | edge_dict = build_edge_dict(edge_list) 114 | rooted_edge_list_iter = (extract_rooted_edge_list(root, edge_dict) for root in root_set) 115 | rooted_edge_list = sum(rooted_edge_list_iter, []) 116 | return rooted_edge_list 117 | 118 | def remove_leaves(edge_list, edge_to_remove_fn=lambda x:x): 119 | edge_list = list(edge_list) 120 | 
leaf_to_exclude_set = get_leaf_set(edge_list).intersection(get_object_set(filter(edge_to_remove_fn, edge_list))) 121 | edge_to_exclude_iter = filter(lambda x: len(get_object_set([x]).intersection(leaf_to_exclude_set))==0, edge_list) 122 | return list(edge_to_exclude_iter) 123 | 124 | def get_connected_graph_list(edge_list): 125 | edge_list = list(edge_list) 126 | edge_dict = build_edge_dict(edge_list) 127 | graph_list = [ 128 | extract_rooted_edge_list(root, edge_dict) 129 | for root in get_subject_set(edge_list) 130 | ] 131 | graph_list.sort(key=lambda x: len(x), reverse=True) 132 | 133 | for i,graph in enumerate(graph_list): 134 | if len(graph)==0: 135 | continue 136 | graph_concept_set = get_concept_set(graph) 137 | for j,other_graph in enumerate(graph_list): 138 | if i==j: 139 | continue 140 | if len(other_graph)==0: 141 | continue 142 | other_graph_concept_set = get_concept_set(other_graph) 143 | if len(graph_concept_set.intersection(other_graph_concept_set)) > 0: 144 | graph.extend(other_graph) 145 | graph_concept_set |= other_graph_concept_set 146 | other_graph.clear() 147 | graph_list = [ 148 | list(unique_everseen(graph)) 149 | for graph in filter(lambda x: len(x)>0, graph_list) 150 | ] 151 | return graph_list 152 | 153 | def get_biggest_connected_graph(edge_list): 154 | return max(get_connected_graph_list(edge_list), key=lambda x: len(x)) 155 | 156 | def save_graphml(edge_list, file_name): 157 | edge_list = list(edge_list) 158 | 159 | # Build graph 160 | graph=nx.DiGraph() # directed graph 161 | for subject, predicate, object in edge_list: 162 | graph.add_edge(subject, object, r=predicate) 163 | 164 | nx.write_graphml(graph, file_name+".graphml", prettyprint=True) 165 | 166 | graphml = ''' 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | ''' 182 | 183 | concept_set = get_concept_set(edge_list) 184 | for concept in concept_set: 185 | graphml += ''' 186 | 187 | 188 | 189 | 190 | 191 | {0} 192 | 193 | 194 | 195 | '''.format(concept) + '\n' 196 | 197 | for subj,pred,obj in edge_list: 198 | graphml += ''' 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | {2} 207 | 208 | 209 | 210 | 211 | '''.format(subj,obj,pred) + '\n' 212 | 213 | graphml += ''' 214 | 215 | 216 | 217 | ''' 218 | 219 | path = file_name+"_yEd.graphml" 220 | with open(path, 'w') as content_file: 221 | content_file.write(graphml) 222 | 223 | MAX_LABEL_LENGTH = 128 224 | def save_graph(edge_list, file_name, size=None): 225 | def stringify(x): 226 | if isinstance(x, (list,tuple)): 227 | if len(x)==0: 228 | return '' 229 | if len(x)==1: 230 | x = x[0] 231 | return str(x) 232 | edge_list = [tuple(map(stringify,edge)) for edge in edge_list] 233 | # Build graph 234 | save_graphml(edge_list, file_name) 235 | 236 | if size is None: 237 | return 238 | graph=nx.DiGraph() # directed graph 239 | format_str = lambda x: x[:MAX_LABEL_LENGTH].replace(':','.')#.replace('.',' ') 240 | for subject, predicate, object in map(lambda x: map(format_str,x),edge_list): 241 | graph.add_edge(subject, object, r=predicate) 242 | 243 | #initialze Figure 244 | plt.figure(num=None, figsize=(size, size)) 245 | plt.axis('off') 246 | fig = plt.figure(1) 247 | 248 | pos=graphviz_layout(graph,prog='twopi') 249 | nx.draw( 250 | graph, 251 | pos, 252 | font_size=16, 253 | with_labels=False, 254 | arrowstyle='wedge', 255 | ) 256 | nx.draw_networkx_labels( 257 | graph, 258 | pos, 259 | bbox=dict(boxstyle='square', fc="w", ec="k") 260 | ) 261 | #edge_labels={('A','B'):'AB',('B','C'):'BC',('B','D'):'BD'} 262 | 
nx.draw_networkx_edge_labels( 263 | graph, 264 | pos, 265 | edge_labels=nx.get_edge_attributes(graph,'r'), 266 | font_color='red' 267 | ) 268 | 269 | plt.savefig(file_name+'.png', bbox_inches="tight") 270 | plt.clf() 271 | del fig 272 | -------------------------------------------------------------------------------- /web_app/oke/core/models/classification/sentence_classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np # for fast array ops 4 | import misc.tfidf_lib as tfidf_lib 5 | from models.model_manager import ModelManager 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | from nltk.stem.snowball import SnowballStemmer # 8 | 9 | from misc.doc_reader import load_or_create_cache, create_cache, load_cache 10 | 11 | class SentenceClassifier(ModelManager): 12 | stemmer = SnowballStemmer("english") 13 | 14 | def __init__(self, model_options): 15 | super().__init__(model_options) 16 | self.disable_spacy_component = ["tagger", "ner", "textcat", "neuralcoref"] 17 | # Read options from input 18 | self.log = model_options.get('log', False) 19 | self.use_tf_model = model_options.get('tf_model', None) is not None 20 | self.with_topic_scaling = model_options.get('with_topic_scaling', False) 21 | self.use_combined_wordvec = self.with_topic_scaling or not self.use_tf_model 22 | self.with_document_log_length_scaling = model_options.get('with_document_log_length_scaling', False) 23 | self.with_centered_similarity = model_options.get('with_centered_similarity', False) 24 | # TF-IDF 25 | self.default_tfidf_importance = model_options.get('default_tfidf_importance', 1/2) # number in [0,1] 26 | self.default_tfidf_importance = np.clip(self.default_tfidf_importance, 0,1) 27 | self.with_stemmed_tfidf = model_options.get('with_stemmed_tfidf', False) 28 | self.very_big_corpus = model_options.get('very_big_corpus', False) 29 | self.query_cache = {} 30 | self.default_similarity_threshold = self.model_options.get('default_similarity_threshold', 0) 31 | 32 | if self.log: 33 | print('Initialising SentenceClassifier:') 34 | print(' with_stemmed_tfidf',self.with_stemmed_tfidf) 35 | print(' with_topic_scaling',self.with_topic_scaling) 36 | print(' with_document_log_length_scaling',self.with_document_log_length_scaling) 37 | print(' default_tfidf_importance',self.default_tfidf_importance) 38 | print(' use_combined_wordvec',self.use_combined_wordvec) 39 | 40 | def set_documents(self, id_doc_list, context_list=None): 41 | self.ids, self.documents = zip(*id_doc_list) 42 | self.contexts = context_list if context_list else list(self.documents) 43 | self.contextualised_documents = list(zip(self.documents,self.contexts)) 44 | if self.use_tf_model: 45 | self.contextualised_documents_embeddings = self.run_tf_embedding(doc_list=self.contextualised_documents, as_question=False) 46 | self.target_size = len(id_doc_list) 47 | # TF-IDF 48 | self.tfidf_prepared = False 49 | self.spacy_prepared = False 50 | return self 51 | 52 | def get_stemmed_token_list(self, token_list): 53 | return list(map(self.stemmer.stem, token_list)) 54 | 55 | def prepare_tfidf(self): 56 | # Get lemmatized documents 57 | lemmatized_document_iter = map(self.lemmatize_spacy_document, self.spacy_documents) 58 | if self.with_stemmed_tfidf: 59 | stemmed_documents = [ 60 | self.get_stemmed_token_list(token_list) 61 | for token_list in lemmatized_document_iter 62 | ] 63 | if self.log: 64 | print('stemmed_documents', stemmed_documents) 65 | words_vector = 
stemmed_documents 66 | else: 67 | words_vector = list(lemmatized_document_iter) 68 | # Build tf-idf model and similarities 69 | dictionary, tfidf_model, tfidf_corpus_similarities = tfidf_lib.build_tfidf(words_vector=words_vector, very_big_corpus=self.very_big_corpus) 70 | if self.log: 71 | print("Number of words in dictionary:",len(dictionary)) 72 | self.dictionary, self.tfidf_model, self.tfidf_corpus_similarities = dictionary, tfidf_model, tfidf_corpus_similarities 73 | self.tfidf_prepared = True 74 | 75 | def prepare_spacy(self): 76 | if not self.spacy_prepared: 77 | self.spacy_documents = self.nlp(self.contexts) 78 | self.spacy_prepared = True 79 | 80 | def lemmatize_spacy_document(self, doc): 81 | return [ 82 | token.lemma_.casefold().strip() 83 | for token in doc 84 | if not (token.is_stop or token.is_punct) and token.lemma_.lower() != '-pron-' 85 | ] 86 | 87 | def get_weighted_similarity(self, similarity_dict, query_length, tfidf_importance): 88 | semantic_similarity = similarity_dict.get('docvec' if self.use_tf_model else 'combined_wordvec', 0) 89 | syntactic_similarity = similarity_dict.get('tfidf', 0) 90 | # Build combined similarity 91 | if self.log: 92 | print('tfidf_importance', tfidf_importance) 93 | weighted_similarity = tfidf_importance*syntactic_similarity+(1-tfidf_importance)*semantic_similarity 94 | 95 | if self.with_topic_scaling: 96 | # Get the topic weight 97 | corpus_similarity = similarity_dict['corpus'] 98 | topic_weight = np.power(corpus_similarity,2) 99 | # Compute the weighted similarity for every sub-corpus 100 | # syntactic_similarity is high for a document when the query words and the document words are similar, but syntactic_similarity may be lower when we use words in the synsets 101 | # in order to address the aforementioned synset-words problem we sum the syntactic_similarity with the corpus_similarity before scaling it by the semantic_weight 102 | # we scale by the semantic_weight in order to give significantly more similarity to the documents semantically more closer to the query 103 | weighted_similarity *= topic_weight 104 | 105 | if self.with_document_log_length_scaling: 106 | # the bigger the sentence, the (smoothly) lower the weighted_similarity 107 | # thus we scale the weighted_similarity by the log of the query length 108 | weighted_similarity *= np.array(query_length)/np.max(query_length) # sum 1 to avoid similarity zeroing 109 | 110 | return weighted_similarity 111 | 112 | def classify(self, query_list, similarity_type, similarity_threshold=None, as_question=False, tfidf_importance=None): 113 | return self.get_index_of_most_similar_documents( 114 | self.get_query_similarity(query_list, as_question=as_question, tfidf_importance=tfidf_importance), 115 | similarity_threshold=similarity_threshold, 116 | similarity_type=similarity_type, 117 | ) 118 | 119 | def get_query_similarity(self, query_list, as_question=False, tfidf_importance=None): 120 | return self.get_formatted_query_similarity( 121 | query_list, # original query 122 | self.nlp(query_list), # Get the filtered query (Document object built using lemmas) 123 | as_question=as_question, 124 | tfidf_importance=tfidf_importance, 125 | ) 126 | 127 | def get_formatted_query_similarity(self, text_list, formatted_query_list, as_question=False, tfidf_importance=None): 128 | if tfidf_importance is None: 129 | tfidf_importance = self.default_tfidf_importance 130 | # Prepare spacy docs if they are not ready yet 131 | with_syntactic_similarity = tfidf_importance > 0 132 | with_semantic_similarity = 
tfidf_importance < 1 133 | if with_syntactic_similarity: 134 | self.prepare_spacy() 135 | self.prepare_tfidf() 136 | elif self.use_combined_wordvec: 137 | self.prepare_spacy() 138 | ################################################################################# 139 | # Build similarity dict 140 | similarity_dict = {} 141 | if with_syntactic_similarity: 142 | # Get the lemmatized query 143 | formatted_query_list = tuple(map(self.lemmatize_spacy_document, formatted_query_list)) 144 | if self.log: 145 | print('lemmatized_query', formatted_query_list) 146 | # Get the stemmed query for tf-idf 147 | if self.with_stemmed_tfidf: 148 | formatted_query_list = tuple(map(self.get_stemmed_token_list, formatted_query_list)) 149 | if self.log: 150 | print('stemmed_query', formatted_query_list) 151 | # Get tf-idf and docvec similarities 152 | similarity_dict['tfidf'] = np.array([ 153 | tfidf_lib.get_query_tfidf_similarity( 154 | formatted_query, 155 | self.dictionary, 156 | self.tfidf_model, 157 | self.tfidf_corpus_similarities 158 | ) 159 | for formatted_query in formatted_query_list 160 | ]) 161 | if with_semantic_similarity: 162 | if self.use_tf_model: 163 | # Get docvec similarity 164 | similarity_dict['docvec'] = np.inner( 165 | self.run_tf_embedding(doc_list=text_list, as_question=as_question), 166 | self.contextualised_documents_embeddings 167 | ) 168 | if self.use_combined_wordvec: 169 | get_avg_wordvec_similarity = lambda x: np.mean([q.vector for q in x], axis=0) 170 | # Get averaged wordvec similarity 171 | similarity_dict['combined_wordvec'] = cosine_similarity( 172 | list(map(get_avg_wordvec_similarity, formatted_query_list)), 173 | list(map(get_avg_wordvec_similarity, self.spacy_documents)) 174 | ) 175 | if self.with_topic_scaling: 176 | # Get the corpus similarity for every sub-corpus, by averaging the docvec similarities of every sub-corpus 177 | similarity_dict['corpus'] = np.mean(similarity_dict['combined_wordvec'],-1) 178 | similarity_dict['corpus'] = np.expand_dims(similarity_dict['corpus'], -1) # expand_dims because we have sub-corpus 179 | # Get the weighted similarity 180 | similarity_dict['weighted'] = self.get_weighted_similarity(similarity_dict=similarity_dict, query_length=np.array(list(map(len,formatted_query_list))), tfidf_importance=tfidf_importance) 181 | # Sum the weighted similarity across sub-corpus 182 | # similarity_dict['weighted'] = np.sum(similarity_dict['weighted'], 0) 183 | # Center the weighted_similarity vector 184 | if self.with_centered_similarity: 185 | # Center the weighted_similarity vector: Remove the average weighted_similarity 186 | similarity_dict['weighted'] -= np.mean(similarity_dict['weighted']) 187 | # Remove negative components, they are useless for the task 188 | similarity_dict['weighted'] = np.maximum(similarity_dict['weighted'], 0) 189 | return similarity_dict 190 | 191 | def get_index_of_most_similar_documents(self, similarity_dict, similarity_type, similarity_threshold=None): 192 | if similarity_threshold is None: 193 | similarity_threshold = self.default_similarity_threshold 194 | def get_similarity_dict_generator(i, similarity_ranking): 195 | # print('#'*100) 196 | similarity = similarity_dict[similarity_type][i] 197 | syntactic_similarity = similarity_dict['tfidf'][i] if 'tfidf' in similarity_dict else None 198 | semantic_similarity = similarity_dict['docvec'][i] if 'docvec' in similarity_dict else None 199 | for best in similarity_ranking: 200 | if similarity_threshold is not None and similarity[best] < similarity_threshold: 201 | return 202 | sim_dict = { 203 | 
'id':self.ids[best], 204 | 'doc':self.documents[best], 205 | 'index':int(best), 206 | 'similarity':float(similarity[best]), 207 | 'syntactic_similarity':float(syntactic_similarity[best]) if syntactic_similarity is not None else 0, 208 | 'semantic_similarity':float(semantic_similarity[best]) if semantic_similarity is not None else 0, 209 | } 210 | # print(best, sim_dict) 211 | if self.contexts: 212 | sim_dict['context'] = self.contexts[best] 213 | yield sim_dict 214 | similarity_list = similarity_dict[similarity_type] 215 | similarity_ranking_list = np.argsort(similarity_list, kind='stable', axis=-1) 216 | return [ 217 | get_similarity_dict_generator(i, similarity_ranking[::-1]) 218 | for i,similarity_ranking in enumerate(similarity_ranking_list) 219 | ] 220 | -------------------------------------------------------------------------------- /web_app/yai/static/js/stage_builder/domain_stage_builder.js: -------------------------------------------------------------------------------- 1 | // create a graph class 2 | class DirectedGraph { 3 | // defining vertex array and 4 | // adjacent list 5 | constructor() 6 | { 7 | this.AdjacencyList = new Map(); 8 | this.InverseAdjacencyList = new Map(); 9 | } 10 | 11 | // add edge to the graph 12 | addEdge(v, w) 13 | { 14 | if (!this.AdjacencyList.has(v)) // initialize the adjacent list with an empty array 15 | this.AdjacencyList.set(v, []) 16 | this.AdjacencyList.get(v).push(w); 17 | 18 | if (!this.InverseAdjacencyList.has(w)) // initialize the adjacent list with an empty array 19 | this.InverseAdjacencyList.set(w, []) 20 | this.InverseAdjacencyList.get(w).push(v); 21 | } 22 | 23 | getRoots() 24 | { 25 | var root_set = new Set(Array.from(this.AdjacencyList.keys())) 26 | for (var [source,target_list] of this.AdjacencyList.entries()) 27 | { 28 | for (var target of target_list) 29 | if (source != target) 30 | root_set.delete(target) 31 | } 32 | return Array.from(root_set) 33 | } 34 | 35 | getLeaves() 36 | { 37 | var root_set = new Set(Array.from(this.InverseAdjacencyList.keys())) 38 | for (var [source,target_list] of this.InverseAdjacencyList.entries()) 39 | { 40 | for (var target of target_list) 41 | { 42 | if (source != target) 43 | root_set.delete(target) 44 | } 45 | } 46 | return Array.from(root_set) 47 | } 48 | 49 | // Prints the vertex and adjacency list 50 | printGraph() 51 | { 52 | // get all the vertices 53 | var get_keys = this.AdjacencyList.keys(); 54 | 55 | // iterate over the vertices 56 | for (var i of get_keys) { 57 | // great the corresponding adjacency list 58 | // for the vertex 59 | var get_values = this.AdjacencyList.get(i); 60 | var conc = ""; 61 | 62 | // iterate over the adjacency list 63 | // concatenate the values into a string 64 | for (var j of get_values) 65 | conc += j + " "; 66 | 67 | // print the vertex and its adjacency list 68 | console.log(i + " -> " + conc); 69 | } 70 | } 71 | } 72 | 73 | function get_taxonomy_information(information_uri) 74 | { 75 | //console.log(information_uri) 76 | var query = [ 77 | "SELECT DISTINCT ?subject ?predicate ?object WHERE {", 78 | "<"+information_uri+"> rdfs:subClassOf* ?subject.", 79 | "?subject ?predicate ?object.", 80 | "}", 81 | ].join("\n"); 82 | //console.log(query) 83 | var query_result = query_sparql_endpoint(DBPEDIA_ENDPOINT, query) 84 | if (!query_result || !query_result.results || query_result.results.bindings.length==0) 85 | return null 86 | var tuple_list = query_result.results.bindings 87 | // Build subject map 88 | var subj_map = new Map() 89 | for (tuple of tuple_list) 90 | { 
91 | var subj = tuple.subject.value, pred = tuple.predicate.value, obj = tuple.object.value 92 | if (!subj_map.has(subj)) 93 | subj_map.set(subj, {'@id': subj}) 94 | subj_map.get(subj)[tuple.predicate.value] = tuple.object.value 95 | } 96 | function recursive_graph_building(subj) { 97 | var jsonld_graph = Object.assign({}, subj_map.get(subj)); 98 | for (var [key,value] of Object.entries(jsonld_graph)) 99 | { 100 | if (key=='@id') 101 | continue 102 | if (subj_map.has(value) && value!=subj) 103 | jsonld_graph[key] = recursive_graph_building(value) 104 | } 105 | return jsonld_graph 106 | } 107 | var jsonld_graph = recursive_graph_building(information_uri) 108 | // console.log(jsonld_graph) 109 | var ground = { 110 | '@type': 'JSON', 111 | '@value': JSON.stringify(query_result, null, 2) 112 | } 113 | jsonld_graph = format_jsonld(jsonld_graph, ground, query) 114 | return jsonld_graph 115 | } 116 | //console.log(get_taxonomy_information('http://dbpedia.org/class/yago/WikicatNeuralNetworks')) 117 | 118 | function get_typeset_hierarchy_leaves_from_dbpedia(type_list) 119 | { 120 | if (type_list.length == 1) 121 | return type_list 122 | var type_query = [] 123 | for (var type of type_list) 124 | { 125 | if (type_query.length > 0) 126 | type_query.push('UNION') 127 | type_query.push( 128 | [ 129 | "{", 130 | "SELECT DISTINCT ?class ?superclass WHERE {", 131 | "<"+type+"> rdfs:subClassOf* ?class.", 132 | "?class rdfs:subClassOf? ?superclass.", 133 | "}", 134 | "}" 135 | ].join("\n") 136 | ) 137 | } 138 | // console.log(PREFIX_MAP_STRING) 139 | var query = [ 140 | PREFIX_MAP_STRING, 141 | "SELECT DISTINCT ?class ?superclass WHERE {", 142 | type_query.join("\n"), 143 | "}" 144 | ].join("\n"); 145 | // console.log(query) 146 | 147 | function get_leaves(_data) 148 | { 149 | // console.log(_data) 150 | var results = _data.results.bindings; 151 | var class_hierarchy = new DirectedGraph() 152 | for (var i in results) 153 | { 154 | var row = results[i] 155 | var super_class = row['superclass'].value 156 | var sub_class = row['class'].value; 157 | class_hierarchy.addEdge(super_class, sub_class); 158 | } 159 | // console.log(class_hierarchy.getLeaves()) 160 | return class_hierarchy.getLeaves() 161 | } 162 | return get_leaves(query_sparql_endpoint(DBPEDIA_ENDPOINT, query)); 163 | } 164 | 165 | function build_minimal_type_graph(minimal_entity_graph, predicate=TYPE_URI, collector=HAS_ENTITY_URI) 166 | { 167 | // minimal_entity_graph = format_jsonld(minimal_entity_graph); 168 | var entity_dict = get_entity_dict(minimal_entity_graph); 169 | var tot_statement_count = count_graph_statements(minimal_entity_graph); 170 | minimal_entity_graph = get_URI_graph(minimal_entity_graph, predicate, collector, recursion=false); 171 | // Create the entity_type_map, in order to easily keep track of entities and their types 172 | var type_rdfitem_map = new Map(); 173 | var entity_type_map = new Map(); 174 | for (var sub_graph of minimal_entity_graph) 175 | { 176 | var type = sub_graph['@id']['@value']; 177 | type_rdfitem_map.set(type, sub_graph['@id']); 178 | var sub_graph_list = isArray(sub_graph[collector]) ? 
sub_graph[collector] : [sub_graph[collector]]; 179 | for (var g of sub_graph_list) 180 | { 181 | if (jQuery.isEmptyObject(g)) 182 | continue; 183 | var entity_id = g['@id']['@value'] 184 | if (!entity_type_map.has(entity_id)) 185 | { 186 | entity_type_map.set(entity_id, {}) 187 | entity_type_map.get(entity_id)[TYPESET_URI] = new Set() 188 | entity_type_map.get(entity_id)[collector] = g 189 | } 190 | entity_type_map.get(entity_id)[TYPESET_URI].add(type) 191 | } 192 | } 193 | // Remove redundant types from the entity_type_map. 194 | // Redundant types are those types that can be inferred from the other types, following superclass relations in dbpedia. 195 | // for (var [entity_id, type_dict] of entity_type_map.entries()) 196 | // { 197 | // var redundant_type_set = new Set(type_dict[TYPESET_URI]) 198 | // var type_hierarchy_leaves = get_typeset_hierarchy_leaves_from_dbpedia(Array.from(redundant_type_set)) 199 | // if (type_hierarchy_leaves.length == 0) // entity not found in dbpedia 200 | // continue 201 | // for (var minimal_type of type_hierarchy_leaves) 202 | // redundant_type_set.delete(minimal_type) 203 | // //console.log(entity_id, redundant_type_set) 204 | // for (var redundant_type of redundant_type_set) 205 | // type_dict[TYPESET_URI].delete(redundant_type) 206 | // } 207 | // Create the type_entity_map, merging similar type groups 208 | var type_entity_map = new Map(); 209 | var current_type_group_id = 0; 210 | for (var [entity_id, type_dict] of entity_type_map.entries()) 211 | { 212 | var minimal_type_list = Array.from(type_dict[TYPESET_URI]) 213 | if (minimal_type_list.length==0) 214 | continue 215 | var type_id = minimal_type_list.sort().join(' ') 216 | minimal_type_list = minimal_type_list.map(x => type_rdfitem_map.get(x)) // recover grounds 217 | if (!type_entity_map.has(type_id)) 218 | { 219 | if (minimal_type_list.length==1) 220 | { 221 | var type_uri_item = minimal_type_list[0]; 222 | var type_graph = {'@id': type_uri_item}; 223 | // type_graph[ENTITY_PERCENTAGE_URI] = build_RDF_item(0); 224 | type_graph[STATEMENT_COUNT_URI] = build_RDF_item(0); 225 | // type_graph[ENTITY_COUNT_URI] = build_RDF_item(0); 226 | // var class_information = get_taxonomy_information(type_uri_item['@value']); 227 | // if (class_information!==null) // build domain stage 228 | // type_graph[PROPERTY_LIST_URI] = class_information; 229 | type_entity_map.set(type_id, type_graph); 230 | } 231 | else 232 | { 233 | var class_set = [] 234 | for (var type_uri_item of minimal_type_list) 235 | { 236 | class_set.push(Object.assign({}, 237 | {'@id': type_uri_item}, 238 | entity_dict[type_uri_item['@value']], 239 | // get_taxonomy_information(type_uri_item['@value']) 240 | )); 241 | } 242 | if (class_set.length > 0) 243 | class_set = class_set.filter(x=>x['@id']['@value']!=PREFIX_MAP['owl']+'Thing'); 244 | var type_entity_dict = {}; 245 | type_entity_dict['@id'] = build_RDF_item('my:CompositeClass'+current_type_group_id); 246 | type_entity_dict[IS_COMPOSITE_CLASS_BOOL_URI] = build_RDF_item(true); 247 | // type_entity_dict[ENTITY_PERCENTAGE_URI] = build_RDF_item(0); 248 | type_entity_dict[STATEMENT_COUNT_URI] = build_RDF_item(0); 249 | // type_entity_dict[ENTITY_COUNT_URI] = build_RDF_item(0); 250 | // type_entity_dict[CLASS_COUNT_URI] = build_RDF_item(class_set.length); 251 | type_entity_dict[COMPOSITE_CLASS_SET_URI] = class_set; 252 | type_entity_map.set(type_id, type_entity_dict); 253 | current_type_group_id += 1; 254 | } 255 | type_entity_map.get(type_id)[collector] = []; 256 | } 257 | 
type_entity_map.get(type_id)[collector].push(type_dict[collector]); 258 | } 259 | // Return a new minimized type graph 260 | var type_entity_dict_list = Array.from(type_entity_map.values()); 261 | for (var i in type_entity_dict_list) 262 | { 263 | var type_entity_dict = type_entity_dict_list[i]; 264 | var type_uri = type_entity_dict['@id']['@value']; 265 | var type_statement_count = count_graph_statements(type_entity_dict[collector]); 266 | // type_entity_dict[ENTITY_PERCENTAGE_URI] = build_RDF_item(String((100*type_statement_count/tot_statement_count).toFixed(2))+'%'); 267 | type_entity_dict[STATEMENT_COUNT_URI] = build_RDF_item(type_statement_count); 268 | // type_entity_dict[ENTITY_COUNT_URI] = build_RDF_item(type_entity_dict[collector].length); 269 | if (type_uri in entity_dict) 270 | type_entity_dict_list[i] = Object.assign({}, type_entity_dict, entity_dict[type_uri]); 271 | } 272 | // Sort by entity count 273 | var minimal_type_graph = type_entity_dict_list.sort((a,b)=>b[STATEMENT_COUNT_URI]['@value']-a[STATEMENT_COUNT_URI]['@value']); 274 | // Add type info to entities that are also types 275 | for (var type_dict of minimal_type_graph) 276 | { 277 | if (!(LABEL_URI in type_dict)) 278 | type_dict[LABEL_URI] = format_link(get_dict_description(type_dict), false); 279 | // console.log(type_dict, (collector in type_dict)) 280 | if (!(collector in type_dict)) 281 | continue; 282 | for (var ent_dict of type_dict[collector]) 283 | { 284 | var ent_id = ent_dict['@id']['@value']; 285 | if (!type_entity_map.has(ent_id)) 286 | continue; 287 | // console.log(ent_id); 288 | for (var [k,v] of Object.entries(type_entity_map.get(ent_id))) 289 | ent_dict[k] = v; 290 | } 291 | } 292 | return minimal_type_graph; 293 | } -------------------------------------------------------------------------------- /web_app/oke/core/misc/doc_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import re 4 | import json 5 | from bs4 import BeautifulSoup 6 | from tika import parser 7 | import unicodedata 8 | from more_itertools import unique_everseen 9 | from misc.jsonld_lib import * 10 | import html 11 | 12 | get_bs_text = lambda x: re.sub(r'[ \n\t]+',' ',html.unescape(x.text)).strip() if x else None 13 | 14 | def normalize_string(content): 15 | content = unicodedata.normalize("NFKC", content) # normalize content 16 | content = re.sub(r'\r\n', '\n', content, flags=re.UNICODE) # normalize new lines 17 | content = re.sub(r'[\r\f\v]', '\n', content, flags=re.UNICODE) # normalize new lines 18 | content = re.sub(r'[-\x2D\xAD\x58A\x1806\xFE63\xFF0D\xE002D]\n+', '', content, flags=re.UNICODE) # remove word-breaks (hyphens) 19 | content = re.sub(r'[\x2010\x2011\x2027\x2043]\n+', ' ', content, flags=re.UNICODE) # remove line-breaks (hyphens) 20 | content = re.sub(r'([^\n.])\n+([^\n])', r'\1 \2', content, flags=re.UNICODE) # remove line-breaks 21 | content = re.sub(r'[ \t]+', ' ', content, flags=re.UNICODE) # normalize whitespaces 22 | # workarounds 23 | content = re.sub(r' - ', ' ', content, flags=re.UNICODE) # remove all hyphens 24 | content = re.sub(r'- *', '', content, flags=re.UNICODE) # remove all hyphens 25 | return content.strip() 26 | 27 | def get_document_list(directory): 28 | doc_list = [] 29 | for obj in os.listdir(directory): 30 | obj_path = os.path.join(directory, obj) 31 | if os.path.isfile(obj_path): 32 | doc_list.append(obj_path) 33 | elif os.path.isdir(obj_path): 34 | doc_list.extend(get_document_list(obj_path)) 35 | return doc_list 36 | 
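# Illustrative usage of the helpers above (a sketch, not part of the original module;
# the directory below is a hypothetical example path):
#
#   doc_list = get_document_list('documents/yai4law')    # recursively lists every file under the directory
#   normalize_string('the ob-\nligations arising')        # -> 'the obligations arising' (hyphenated line break removed)
#   DocParser().set_documents_path('documents/yai4law')   # wraps get_document_list + get_content_list (defined below)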
37 | def get_all_paths_to_leaf(root, element_set): 38 | if not root: 39 | return [[]] 40 | if root.name in element_set: 41 | return [[root]] 42 | children = list(root.findChildren(recursive=False)) 43 | if not children: 44 | return [[]] 45 | path_list = [ 46 | path 47 | for child in children 48 | for path in get_all_paths_to_leaf(child, element_set) 49 | ] 50 | merged_path_list = [] 51 | i = 0 52 | while i < len(path_list): 53 | child_path = [] 54 | while i < len(path_list) and len(path_list[i]) == 1: 55 | child_path += path_list[i] 56 | i+=1 57 | if i < len(path_list): 58 | child_path += path_list[i] 59 | i+=1 60 | if child_path: 61 | merged_path_list.append(child_path) 62 | return merged_path_list 63 | 64 | def get_next_siblings(e, name_set): 65 | next_siblings = [] 66 | sibling = e.find_next_sibling() 67 | while sibling and sibling.name in name_set: 68 | next_siblings.append(sibling) 69 | sibling = sibling.find_next_sibling() 70 | return next_siblings 71 | 72 | def read_jsonld_file(filename): 73 | file_id = os.path.basename(filename).replace(' ','_') 74 | # read file 75 | with open(f'{filename}.json', 'r') as f: 76 | data=f.read() 77 | # parse file 78 | obj = json.loads(data) 79 | triple_list = jsonld_to_triples(obj, file_id) 80 | # print(json.dumps(triple_list, indent=4)) 81 | annotated_text_list = [ 82 | { 83 | 'text': o if not is_rdf_item(o) else o['@value'], 84 | 'id': file_id, 85 | } 86 | for s,p,o in triple_list 87 | if not is_url(o) and p != HAS_LABEL_PREDICATE 88 | ] + [{ 89 | 'graph': triple_list 90 | }] 91 | return annotated_text_list 92 | 93 | def read_html_file(filename, short_extension=False): 94 | file_id = os.path.basename(filename).replace(' ','_') 95 | with open(filename+('.htm' if short_extension else '.html'), 'r', encoding='utf8', errors='ignore') as file: 96 | file_content = file.read() 97 | doc = BeautifulSoup(file_content, features="lxml") 98 | for script in doc(["script", "style"]): # remove all javascript and stylesheet code 99 | script.extract() 100 | annotated_text_list = [] 101 | p_to_ignore = set() 102 | elements_to_merge = set(['table','ul','ol']) 103 | for i,p in enumerate(doc.findAll("p")): 104 | p_text = get_bs_text(p) 105 | if p_text in p_to_ignore: 106 | continue 107 | p_to_ignore.add(p_text) 108 | # p_set = [p] + get_next_siblings(p,['p']) 109 | # p_to_ignore |= set(p_set) 110 | # p = p_set[-1] 111 | siblings_to_merge = get_next_siblings(p,elements_to_merge) 112 | if not siblings_to_merge: 113 | annotated_text_list.append({ 114 | 'text': p_text, 115 | # 'text': ' '.join(map(get_bs_text,p_set)), 116 | 'id': file_id, 117 | }) 118 | else: 119 | for sibling in siblings_to_merge: 120 | path_list = get_all_paths_to_leaf(sibling, ['p']) 121 | annotated_text_list += [ 122 | { 123 | 'text': ' '.join(map(get_bs_text, [p]+path)), 124 | 'id': file_id, 125 | } 126 | for path in path_list 127 | ] 128 | p_to_ignore |= set(map(get_bs_text, sum(path_list,[]))) 129 | # print(json.dumps(annotated_text_list, indent=4)) 130 | return list(unique_everseen(annotated_text_list, key=lambda x: x['text'])) 131 | 132 | def read_pdf_file(filename): # https://unicodelookup.com 133 | file_id = os.path.basename(filename).replace(' ','_') 134 | raw = parser.from_file(filename+'.pdf') 135 | return [ 136 | { 137 | 'text': paragraph.strip(), 138 | 'id': file_id 139 | } 140 | for paragraph in raw['content'].split('\n\n') 141 | if paragraph 142 | ] 143 | 144 | def read_akn_file(filename): 145 | file_id = os.path.basename(filename).replace(' ','_') 146 | doc_id = 
urify(os.path.basename(filename)) 147 | def get_num_jsonld(e): 148 | num = get_bs_text(e.num) 149 | if not num: 150 | return None 151 | return { 152 | '@id': doc_id+':'+e['eid'], 153 | HAS_LABEL_PREDICATE: num 154 | } 155 | def get_heading_jsonld(e): 156 | heading = get_bs_text(e.heading) 157 | jsonld = get_num_jsonld(e) 158 | if heading: 159 | if jsonld: 160 | jsonld['my:heading'] = heading 161 | else: 162 | return { 163 | '@id': doc_id+':'+e['eid'], 164 | 'my:heading': heading 165 | } 166 | return jsonld 167 | 168 | with open(filename+'.akn') as f: 169 | file_content = f.read() 170 | 171 | doc = BeautifulSoup(file_content, features="lxml") 172 | 173 | annotated_text_list = [] 174 | for i,p in enumerate(doc.findAll("p")): 175 | text = get_bs_text(p) 176 | # Get annotations 177 | text_annotation = {} 178 | # # Get parent list 179 | # parent_list = [{ 180 | # 'name': p.name, 181 | # 'attrs': p.attrs 182 | # }] 183 | # for parent in p.find_parents(): 184 | # if parent.name == 'akomantoso': # Ignore the remaining parents 185 | # break 186 | # parent_list.append({ 187 | # 'name': parent.name, 188 | # 'attrs': parent.attrs 189 | # }) 190 | # text_annotation['@id'] = doc_id+':'+json.dumps(parent_list) 191 | # Get block 192 | block_list = p.find_parent('blocklist') 193 | if block_list: 194 | list_introduction = block_list.find('listintroduction') 195 | if list_introduction: 196 | text = ' '.join((get_bs_text(list_introduction), text)) 197 | item = p.find_parent('item') 198 | item_num = get_num_jsonld(item) 199 | if item_num: 200 | text_annotation['my:block_id'] = item_num 201 | else: 202 | intro = p.find_parent('intro') 203 | if intro: 204 | continue 205 | list = p.find_parent('list') 206 | if list and list.intro: 207 | text = ' '.join((get_bs_text(list.intro.p), text)) 208 | # Get paragraph 209 | paragraph = p.find_parent('paragraph') 210 | if paragraph: 211 | paragraph_num = get_num_jsonld(paragraph) 212 | if paragraph_num: 213 | text_annotation['my:paragraph_id'] = paragraph_num 214 | # Get article 215 | article = p.find_parent('article') 216 | if article: 217 | article_num = get_num_jsonld(article) 218 | if article_num: 219 | text_annotation['my:article_id'] = article_num 220 | # Get section 221 | section = p.find_parent('section') 222 | if section: 223 | section_heading = get_heading_jsonld(section) 224 | if section_heading: 225 | text_annotation['my:section_id'] = section_heading 226 | # Get chapter 227 | chapter = p.find_parent('chapter') 228 | if chapter: 229 | chapter_heading = get_heading_jsonld(chapter) 230 | if chapter_heading: 231 | text_annotation['my:chapter_id'] = chapter_heading 232 | # Get references 233 | text_annotation['my:reference_id'] = [ 234 | { 235 | '@id': doc_id+':'+ref['href'], 236 | HAS_LABEL_PREDICATE: get_bs_text(ref), 237 | } 238 | for ref in p.findAll('ref', recursive=False) 239 | ] 240 | base_id = f'{file_id}_{i}' 241 | annotated_text_list.append({ 242 | 'text': text, 243 | 'id': file_id, 244 | 'annotation': { 245 | 'root': f'{ANONYMOUS_PREFIX}{base_id}_0', 246 | 'content': jsonld_to_triples(text_annotation, base_id), 247 | }, 248 | }) 249 | return annotated_text_list 250 | 251 | def get_content_list(doc_list): 252 | file_name = lambda x: os.path.splitext(x)[0] 253 | doc_set = set(doc_list) 254 | name_iter = unique_everseen(map(file_name, doc_list)) 255 | content_list = [] 256 | for obj_name in name_iter: 257 | if obj_name+'.akn' in doc_set: 258 | print('Parsing AKN:', obj_name) 259 | content_list += read_akn_file(obj_name) 260 | elif obj_name+'.html' in 
doc_set: 261 | print('Parsing HTML:', obj_name) 262 | content_list += read_html_file(obj_name) 263 | elif obj_name+'.htm' in doc_set: 264 | print('Parsing HTM:', obj_name) 265 | content_list += read_html_file(obj_name, True) 266 | elif obj_name+'.pdf' in doc_set: 267 | print('Parsing PDF:', obj_name) 268 | content_list += read_pdf_file(obj_name) 269 | elif obj_name+'.json' in doc_set: 270 | print('Parsing JSON-LD:', obj_name) 271 | content_list += read_jsonld_file(obj_name) 272 | return content_list 273 | 274 | def create_cache(file_name, create_fn): 275 | print(f'Creating cache <{file_name}>..') 276 | result = create_fn() 277 | with open(file_name, 'wb') as f: 278 | pickle.dump(result, f) 279 | return result 280 | 281 | def load_cache(file_name): 282 | if os.path.isfile(file_name): 283 | print(f'Loading cache <{file_name}>..') 284 | with open(file_name,'rb') as f: 285 | return pickle.load(f) 286 | return None 287 | 288 | def load_or_create_cache(file_name, create_fn): 289 | result = load_cache(file_name) 290 | if not result: 291 | result = create_cache(file_name, create_fn) 292 | return result 293 | 294 | class DocParser(): 295 | 296 | # def __init__(self, model_options): 297 | # super().__init__(model_options) 298 | 299 | def set_documents_path(self, doc_path): 300 | self.content_list = get_content_list(get_document_list(doc_path)) 301 | self.process_content_list() 302 | return self 303 | 304 | def set_document_list(self, doc_list): 305 | self.content_list = get_content_list(doc_list) 306 | self.process_content_list() 307 | return self 308 | 309 | def set_content_list(self, content_list): 310 | self.content_list = tuple(map(lambda x: x if isinstance(x,dict) else {'text':x,'id':x}, content_list)) 311 | self.process_content_list() 312 | return self 313 | 314 | def process_content_list(self): 315 | self.graph_list = tuple(filter(lambda x: x, map(lambda x: x.get('graph', None), self.content_list))) 316 | self.content_list = tuple(filter(lambda x: 'text' in x, self.content_list)) 317 | for doc_dict in self.content_list: 318 | doc_dict['normalised_text'] = normalize_string(doc_dict['text']) 319 | 320 | def get_doc_iter(self): 321 | for doc_dict in self.content_list: 322 | yield doc_dict['id'] 323 | 324 | def get_annotation_iter(self): 325 | for doc_dict in self.content_list: 326 | yield doc_dict.get('annotation',None) 327 | 328 | def get_graph_iter(self): 329 | return self.graph_list 330 | 331 | def get_content_iter(self, normalised=True): 332 | for doc_dict in self.content_list: 333 | yield doc_dict['normalised_text' if normalised else 'text'] 334 | -------------------------------------------------------------------------------- /web_app/oke/core/models/knowledge_extraction/couple_extractor.py: -------------------------------------------------------------------------------- 1 | from misc.doc_reader import DocParser 2 | from models.knowledge_extraction.concept_extractor import ConceptExtractor as CE 3 | import re 4 | # import json 5 | 6 | class CoupleExtractor(CE): 7 | # PREDICATE_COMPONENT = [ # https://universaldependencies.org/u/dep/all.html 8 | # 'prt', # particle 9 | # 'neg', # negation modifier 10 | # 'auxpass', # auxiliary (passive) 11 | # 'advcl', # adverbial clause modifier 12 | # 'agent', # agent 13 | # 'acomp', # adjectival complement 14 | # 'xcomp', # open clausal complement 15 | # 'pcomp', # complement of preposition 16 | # 'ccomp', # clausal complement 17 | # 'prep', # prepositional modifier 18 | # ] 19 | # HIDDEN_PREDICATE_COMPONENT = [ 20 | # 'aux', # auxiliaries 21 | # 'mark', 
# marker - https://universaldependencies.org/docs/en/dep/mark.html 22 | # 'advmod', # adverbial modifier 23 | # 'cc', # coordinating conjunction 24 | # ] 25 | # PREDICATE_REGEXP = re.compile('|'.join(PREDICATE_COMPONENT+HIDDEN_PREDICATE_COMPONENT)) 26 | CC_FILTER_FN = lambda x: x.pos_=='PUNCT' or x.dep_=='cc' # punctuation and conjunctions 27 | 28 | @staticmethod 29 | def is_passive(span): # return true if the sentence is passive - at the moment a sentence is assumed to be passive if it has an auxpass verb 30 | for token in span: 31 | if CE.get_token_dependency(token) == "auxpass": 32 | return True 33 | return False 34 | 35 | @staticmethod 36 | def is_verbal(span): # return true if the sentence is passive - at the moment a sentence is assumed to be passive if it has an auxpass verb 37 | for token in span: 38 | if token.pos_ == "VERB": 39 | return True 40 | return False 41 | 42 | @staticmethod 43 | def is_at_core(concept): 44 | concept_span = concept['concept']['span'] 45 | return len(concept_span)==1 and len(concept['concept_core'])==1 and concept['concept_core'][0]['span'][0] == concept_span[0] 46 | 47 | @staticmethod 48 | def get_couple_uid(couple): 49 | return (CE.get_concept_dict_uid(couple['concept']), CE.get_concept_dict_uid(couple['predicate']), couple['dependency']) 50 | 51 | @staticmethod 52 | def is_in_predicate(x,predicate_span): 53 | return x.idx > predicate_span[0].idx and x.idx < predicate_span[-1].idx 54 | 55 | @staticmethod 56 | def trim_noise(token_list): 57 | forbidden_dep = set(['cc', 'prep', 'punct']) 58 | return CE.trim(token_list, lambda x: CE.get_token_dependency(x) in forbidden_dep) 59 | 60 | @staticmethod 61 | def expand_predicate_core(predicate_set, subj_obj_set): # enrich predicate set with details, adding hidden related concepts (n-ary relations) 62 | hidden_related_concept_set = set(( 63 | hidden_related_concept 64 | for predicate_element in predicate_set 65 | for hidden_related_concept in CE.get_token_descendants(predicate_element, lambda x: x not in subj_obj_set and x not in predicate_set) 66 | )) 67 | return predicate_set | hidden_related_concept_set #| hidden_related_concept_detail_set 68 | 69 | @staticmethod 70 | def get_grammatical_connection(core, other_core, core_set): # can be one per core in core_set 71 | subj_obj_set = set((core,other_core)) 72 | # Search for indirect connections with other concepts 73 | core_super_set = set(core.ancestors) # do not use CE.get_token_ancestors here, it messes up with conjunctions 74 | core_super_set.add(core) 75 | other_core_super_set = set(other_core.ancestors) 76 | other_core_super_set.add(other_core) 77 | inter = core_super_set.intersection(other_core_super_set) 78 | if len(inter)==0: # core and other_core are not connected, continue 79 | return None 80 | # get paths connecting cores to each other 81 | core_path_to_inter,core_junction = CE.find_path_to_closest_in_set(core,inter) 82 | if core_junction: 83 | core_path_to_inter.add(core_junction) 84 | core_path_to_inter = core_path_to_inter.difference(subj_obj_set) 85 | if len(core_path_to_inter.intersection(core_set)) > 0: # avoid jumps 86 | return None 87 | other_core_path_to_inter,other_core_junction = CE.find_path_to_closest_in_set(other_core,inter) 88 | if other_core_junction: 89 | other_core_path_to_inter.add(other_core_junction) 90 | other_core_path_to_inter = other_core_path_to_inter.difference(subj_obj_set) 91 | if len(other_core_path_to_inter.intersection(core_set)) > 0: # avoid jumps 92 | return None 93 | # Get predicate set 94 | predicate_core_set = 
core_path_to_inter.union(other_core_path_to_inter) 95 | if len(predicate_core_set)==0: 96 | return None 97 | # Enrich predicate set with details, adding hidden related concepts (n-ary relations) 98 | predicate_set = CoupleExtractor.expand_predicate_core(predicate_core_set, subj_obj_set=subj_obj_set) 99 | # Add missing conjunctions 100 | if core in other_core.children: 101 | predicate_set |= set(filter(CoupleExtractor.CC_FILTER_FN, other_core.children)) 102 | elif other_core in core.children: 103 | predicate_set |= set(filter(CoupleExtractor.CC_FILTER_FN, core.children)) 104 | # Get predicate spans 105 | predicate_span = sorted(predicate_set, key=lambda x: x.idx) 106 | predicate_core_span = sorted(predicate_core_set, key=lambda x: x.idx) 107 | # # Remove consecutive punctuations 108 | # non_consecutive_puncts = set([',',';']) 109 | # predicate_span = [ 110 | # v 111 | # for i, v in enumerate(predicate_span) 112 | # if i == 0 113 | # or v.pos_ != 'PUNCT' 114 | # or v.pos_ != predicate_span[i-1].pos_ 115 | # or v.text not in non_consecutive_puncts 116 | # or predicate_span[i-1].text not in non_consecutive_puncts 117 | # ] 118 | return { 119 | 'predicate_span':predicate_span, 120 | 'predicate_core_span': predicate_core_span, 121 | 'cores_couple': (core,other_core), 122 | } 123 | 124 | @staticmethod 125 | def grammatical_connections_to_graph(caotic_triple_list): 126 | triple_list = [] 127 | for triple_dict in caotic_triple_list: 128 | predicate_span = triple_dict['predicate_span'] 129 | assert len(predicate_span) > 0, f'predicate_span is empty' 130 | predicate_core_span = triple_dict['predicate_core_span'] 131 | assert len(predicate_core_span) > 0, f'predicate_core_span is empty' 132 | core, other_core = triple_dict['cores_couple'] 133 | core_is_obj = re.search(CE.OBJ_REGEXP, CE.get_token_dependency(core)) is not None 134 | other_core_is_obj = re.search(CE.OBJ_REGEXP, CE.get_token_dependency(other_core)) is not None 135 | core_is_subj = re.search(CE.SUBJ_REGEXP, CE.get_token_dependency(core)) is not None 136 | other_core_is_subj = re.search(CE.SUBJ_REGEXP, CE.get_token_dependency(other_core)) is not None 137 | # Handle ambiguous dependencies 138 | # ambiguous_dep = core_is_obj != other_core_is_subj or core_is_subj != other_core_is_obj or core_is_obj == core_is_subj # or other_core_is_obj == other_core_is_subj 139 | if core_is_obj!=other_core_is_obj: 140 | if core_is_obj: 141 | subj = other_core 142 | obj = core 143 | else: 144 | subj = core 145 | obj = other_core 146 | elif core_is_subj!=other_core_is_subj: 147 | if core_is_subj: 148 | subj = core 149 | obj = other_core 150 | else: 151 | subj = other_core 152 | obj = core 153 | else: # position-based decision 154 | if core.idx < other_core.idx: 155 | subj = core 156 | obj = other_core 157 | else: 158 | subj = other_core 159 | obj = core 160 | triple = { 161 | 'subj': subj, 162 | 'obj': obj, 163 | 'predicate_span': predicate_span, # add predicate components 164 | # 'predicate_set': set(predicate_span), 165 | 'predicate_core_span': predicate_core_span, 166 | # 'predicate_core_set': set(predicate_core_span), 167 | } 168 | # print(triple) 169 | triple_list.append(triple) 170 | return triple_list 171 | 172 | @staticmethod 173 | def get_core_predicate_dict(core_set): 174 | # find the paths that connect the core concepts each other 175 | core_list = list(core_set) 176 | grammatical_connection_list = list(filter(lambda x: x is not None,( 177 | CoupleExtractor.get_grammatical_connection(core, other_core, core_set) 178 | for i,core in 
enumerate(core_list) 179 | for other_core in core_list[i+1:] 180 | ))) 181 | # print(grammatical_connection_list) 182 | directed_concept_graph = CoupleExtractor.grammatical_connections_to_graph(grammatical_connection_list) 183 | # print(directed_concept_graph) 184 | # create core predicate dict 185 | core_predicate_dict = {} 186 | for edge in directed_concept_graph: 187 | predicate_span = edge['predicate_span'] 188 | subj = edge['subj'] 189 | obj = edge['obj'] 190 | # print(subj,predicate_span,obj) 191 | 192 | get_concept_dict = lambda span: CoupleExtractor.get_concept_dict_from_span(span)#, hidden_dep_list=CoupleExtractor.HIDDEN_PREDICATE_COMPONENT) 193 | # get predicate_dict 194 | predicate_dict = get_concept_dict(predicate_span) 195 | # templatize predicate_dict 196 | triple_span = predicate_span + [subj,obj] 197 | triple_span = CoupleExtractor.trim_noise(sorted(triple_span, key=lambda x:x.idx)) 198 | subj_pos = triple_span.index(subj) 199 | obj_pos = triple_span.index(obj) 200 | if subj_pos < obj_pos: 201 | left_pivot = subj_pos 202 | right_pivot = obj_pos 203 | left_is_subj = True 204 | else: 205 | left_pivot = obj_pos 206 | right_pivot = subj_pos 207 | left_is_subj = False 208 | templatized_lemma = [] 209 | templatized_text = [] 210 | if left_pivot > 0: 211 | left_pdict = get_concept_dict(triple_span[:left_pivot]) 212 | templatized_lemma.append(left_pdict['lemma']) 213 | templatized_text.append(left_pdict['text']) 214 | templatized_lemma.append('{subj}' if left_is_subj else '{obj}') 215 | templatized_text.append('{subj}' if left_is_subj else '{obj}') 216 | if right_pivot > left_pivot+1: 217 | middle_pdict = get_concept_dict(triple_span[left_pivot+1:right_pivot]) 218 | templatized_lemma.append(middle_pdict['lemma']) 219 | templatized_text.append(middle_pdict['text']) 220 | templatized_lemma.append('{obj}' if left_is_subj else '{subj}') 221 | templatized_text.append('{obj}' if left_is_subj else '{subj}') 222 | if right_pivot < len(triple_span)-1: 223 | right_pdict = get_concept_dict(triple_span[right_pivot+1:]) 224 | templatized_lemma.append(right_pdict['lemma']) 225 | templatized_text.append(right_pdict['text']) 226 | predicate_dict['text'] = ' '.join(templatized_text) 227 | predicate_dict['lemma'] = ' '.join(templatized_lemma) 228 | # get predicate_core_dict 229 | predicate_core_dict = get_concept_dict(edge['predicate_core_span']) 230 | 231 | # populate core_predicate_dict 232 | if subj not in core_predicate_dict: 233 | core_predicate_dict[subj] = [] 234 | core_predicate_dict[subj].append({ 235 | 'dependency': 'subj', 236 | 'predicate': predicate_dict, 237 | 'predicate_core': predicate_core_dict, 238 | # 'missing_passivant': subj.i > predicate_span[0].i, 239 | # 'related_concepts_count': related_concepts_count, # if related_concepts_count > 2, then n-ary relation 240 | }) 241 | if obj not in core_predicate_dict: 242 | core_predicate_dict[obj] = [] 243 | core_predicate_dict[obj].append({ 244 | 'dependency': 'obj', 245 | 'predicate': predicate_dict, 246 | 'predicate_core': predicate_core_dict, 247 | # 'missing_passivant': False, 248 | # 'related_concepts_count': related_concepts_count, # if related_concepts_count > 2, then n-ary relation 249 | }) 250 | return core_predicate_dict 251 | 252 | def get_couple_list(self, doc_parser: DocParser): 253 | concept_list = self.get_concept_list(doc_parser) 254 | core_concept_dict = {} 255 | for concept in concept_list: 256 | core = concept['concept_core'][-1]['span'][0] 257 | if core not in core_concept_dict: 258 | core_concept_dict[core] = [] 259 
| core_concept_dict[core].append(concept) 260 | # print(core_concept_dict) 261 | 262 | core_predicate_dict = self.get_core_predicate_dict(set(core_concept_dict.keys())) 263 | # print(core_predicate_dict) 264 | couple_list = [] 265 | for core, core_concepts in core_concept_dict.items(): 266 | if core not in core_predicate_dict: 267 | # print(f'"{core}" not in core_predicate_dict') 268 | continue 269 | for concept_dict in core_concepts: 270 | concept_span_set = set(concept_dict['concept']['span']) 271 | is_at_core = self.is_at_core(concept_dict) 272 | for predicate_dict in core_predicate_dict[core]: 273 | if len(concept_span_set.intersection(predicate_dict['predicate']['span'])) > 0: 274 | # print(f'Discarding concept "{concept_dict["concept"]["text"]}", because it intersects its predicate: "{predicate_dict["predicate"]["text"]}".') 275 | continue 276 | couple_dict = { 277 | 'is_at_core': is_at_core, 278 | } 279 | couple_dict.update(concept_dict) 280 | couple_dict.update(predicate_dict) 281 | couple_list.append(couple_dict) 282 | # print([(c['dependency'],c['concept']['text'],c['predicate']['text']) for c in couple_list]) 283 | return couple_list 284 | -------------------------------------------------------------------------------- /web_app/yai/static/js/vue_component/explanation_components.js: -------------------------------------------------------------------------------- 1 | OVERVIEW_CACHE = {}; 2 | TAXONOMICAL_VIEW_CACHE = {}; 3 | ANNOTATION_CACHE = {}; 4 | ANNOTATED_HTML_CACHE = {}; 5 | KNOWN_KNOWLEDGE_GRAPH = []; 6 | 7 | Vue.component("template_tree", { 8 | template: ` 9 |
		<!-- [template markup not preserved in this listing] node body: the annotated item text,
		     a "[{{ isOpen ? 'Less..' : 'More..' }}]" toggle bound to isOpen, and, for parent
		     nodes, a nested list with one template_tree child per entry of item.children -->
20 | `, 21 | props: { 22 | item: Object, 23 | annotation_list: Array, 24 | }, 25 | data: function() { 26 | return { 27 | isOpen: this.item.expanded 28 | }; 29 | }, 30 | computed: { 31 | isParent: function() { 32 | return this.item.children && this.item.children.length; 33 | }, 34 | annotatedText: function() { 35 | var txt = this.item.text; 36 | if (txt in ANNOTATED_HTML_CACHE) 37 | return ANNOTATED_HTML_CACHE[txt]; 38 | // console.log('Annotating with:', this.annotation_list); 39 | return ANNOTATED_HTML_CACHE[txt] = annotate_html(txt, this.annotation_list, linkify); 40 | }, 41 | }, 42 | methods: { 43 | toggle: function() { 44 | this.isOpen = !this.isOpen; 45 | }, 46 | } 47 | }); 48 | 49 | Vue.component("overview", { 50 | template: ` 51 | 52 | 55 |
		<!-- [template markup not preserved in this listing] overview card body, in order:
		     the "{{ label }}" header, a "Loading overview, please wait a while.." notice,
		     a "No overview available." notice, an "{{error_message}}" alert, a nested
		     template_tree list for the taxonomical view, and one "{{question?question:'Extra'}}"
		     section per question, each with its own nested template_tree list -->
86 | `, 87 | props: { 88 | uri: String, 89 | label: String, 90 | active_fn: { 91 | type: Function, 92 | default: function () {} 93 | }, 94 | close_fn: { 95 | type: Function, 96 | default: function () {} 97 | }, 98 | onload_fn: { 99 | type: Function, 100 | default: function () {} 101 | }, 102 | }, 103 | data: function() { 104 | return { 105 | loading: true, 106 | empty: false, 107 | show_error_alert: false, 108 | error_message: '', 109 | 110 | question_overview_tree: {}, 111 | taxonomical_view: [], 112 | annotation_list: [], 113 | }; 114 | }, 115 | // methods: { 116 | // format_label: function(label) { 117 | // return tokenise(label).filter(x=>x!='').join(' '); 118 | // } 119 | // }, 120 | created: function() { 121 | var self = this; 122 | // self.uri = self.uri.toLowerCase(); 123 | if (self.uri in OVERVIEW_CACHE) 124 | { 125 | self.question_overview_tree = OVERVIEW_CACHE[self.uri] 126 | self.taxonomical_view = TAXONOMICAL_VIEW_CACHE[self.uri]; 127 | self.annotation_list = ANNOTATION_CACHE[self.uri]; 128 | self.loading = false; 129 | if (!self.question_overview_tree) 130 | self.empty = true; 131 | return; 132 | } 133 | console.log('Shifting towards topic:', self.uri, self.label); 134 | self.loading = true; 135 | $.ajax({ 136 | type: "GET", 137 | url: GET_OVERVIEW_API, 138 | responseType:'application/json', 139 | data: { 140 | 'concept_uri': self.uri, 141 | }, 142 | success: function (result) { 143 | console.log('Processing overview', result); 144 | self.show_error_alert = false; 145 | self.loading = false; 146 | self.onload_fn(); 147 | // Check cache 148 | if (!result) 149 | { 150 | self.empty = true; 151 | OVERVIEW_CACHE[self.uri] = null; 152 | ANNOTATION_CACHE[self.uri] = null; 153 | TAXONOMICAL_VIEW_CACHE[self.uri] = null; 154 | return; 155 | } 156 | self.empty = false; 157 | // Setup KNOWN_ENTITY_DICT 158 | var taxonomical_view = tuple_list_to_formatted_jsonld(result.taxonomical_view); 159 | // Update the known entity dict (cache) 160 | KNOWN_KNOWLEDGE_GRAPH = KNOWN_KNOWLEDGE_GRAPH.concat(taxonomical_view); 161 | KNOWN_ENTITY_DICT = get_typed_entity_dict_from_jsonld(KNOWN_KNOWLEDGE_GRAPH); 162 | // Setup and annotate question summary tree 163 | var annotation_list = result.annotation_list; 164 | // IMPORTANT: filter out all the annotations referring to the exact concept in overview. 
165 | // annotation_list = annotation_list.filter(x => x.annotation != self.uri); 166 | // Populate the question_overview_tree 167 | var question_summary_tree = result.question_summary_tree; 168 | if (question_summary_tree) 169 | { 170 | for (var [question,summary_tree] of Object.entries(question_summary_tree)) 171 | { 172 | if (!summary_tree.summary) 173 | continue; 174 | summary_tree = summary_tree_to_jsonld(summary_tree); 175 | summary_tree = format_jsonld(summary_tree); 176 | summary_tree = jsonld_to_nestedlist(summary_tree); 177 | self.question_overview_tree[question] = summary_tree; 178 | } 179 | } 180 | // Set taxonomical_view 181 | const prefixed_string = prefixed_string_to_uri(self.uri); 182 | self.taxonomical_view = jsonld_to_nestedlist(nest_jsonld(KNOWN_ENTITY_DICT[prefixed_string], KNOWN_ENTITY_DICT, [prefixed_string], 2)); 183 | self.annotation_list = annotation_list; 184 | // Cache question summary tree 185 | OVERVIEW_CACHE[self.uri] = self.question_overview_tree; 186 | ANNOTATION_CACHE[self.uri] = self.annotation_list; 187 | TAXONOMICAL_VIEW_CACHE[self.uri] = self.taxonomical_view; 188 | }, 189 | error: function(result) { 190 | const prefixed_string = prefixed_string_to_uri(self.uri); 191 | self.loading = false; 192 | if (self.uri in ANNOTATION_CACHE) 193 | { 194 | self.taxonomical_view = TAXONOMICAL_VIEW_CACHE[self.uri]; 195 | self.annotation_list = ANNOTATION_CACHE[self.uri]; 196 | } 197 | else 198 | { 199 | self.error_message = result; 200 | self.show_error_alert = true; 201 | // expand_link( 202 | // prefixed_string_to_uri(self.uri), 203 | // x=>{ 204 | // console.log(x); 205 | // }, 206 | // KNOWN_ENTITY_DICT 207 | // ); 208 | } 209 | }, 210 | }); 211 | }, 212 | }); 213 | 214 | Vue.component("answer", { 215 | template: ` 216 |
		<!-- [template markup not preserved in this listing] answer card body: a question input
		     handled by ask(), a "Loading answers, please wait a while.." notice, a "No answers
		     found." notice, "{{error_message}}" and "{{warning_message}}" alerts, the
		     "Question: {{ question_text }}" line, and an "Answer:" section rendered as nested
		     template_tree lists for the answer tree and its quality -->
252 | `, 253 | data: function() { 254 | return { 255 | show_error_alert: false, 256 | error_message: '', 257 | 258 | show_warning_alert: false, 259 | warning_message: '', 260 | 261 | empty_answers: false, 262 | loading_answers: false, 263 | question_text: '', 264 | answer_tree: null, 265 | answer_annotation_list: [], 266 | answer_quality: null, 267 | }; 268 | }, 269 | methods: { 270 | ask: function(event) { 271 | // console.log(event); 272 | var self = this; 273 | self.loading_answers = true; 274 | self.empty_answers = false; 275 | self.show_warning_alert = false; 276 | self.show_error_alert = false; 277 | 278 | var x = titlefy(event.target.value.replace(/(\r\n|\n|\r)/gm, "").trim()); 279 | console.log('Sending question:',x); 280 | $.ajax({ 281 | type: "GET", 282 | url: GET_ANSWER_API, 283 | responseType:'application/json', 284 | data: {'question': x}, 285 | success: function (result) { 286 | console.log('Processing answer'); 287 | // console.log('Getting answer:',JSON.stringify(result)); 288 | self.loading_answers = false; 289 | if (!result) 290 | { 291 | self.empty_answers = true; 292 | return; 293 | } 294 | const annotation_list = result.annotation_list; 295 | var question_summary_tree = result.question_summary_tree; 296 | const question = Object.keys(question_summary_tree)[0]; 297 | var summary_tree = summary_tree_to_jsonld(question_summary_tree[question]); 298 | const answer_quality = result.quality[question]; 299 | 300 | self.show_error_alert = false; 301 | self.empty_answers = false; 302 | self.question_text = question; 303 | self.answer_tree = jsonld_to_nestedlist(format_jsonld(summary_tree)); 304 | self.answer_annotation_list = annotation_list; 305 | self.answer_quality = jsonld_to_nestedlist(format_jsonld({'my:answer_quality': pydict_to_jsonld(answer_quality)})); 306 | 307 | // Show answer quality 308 | console.log('Answer quality:', answer_quality); 309 | if (answer_quality.semantic_similarity < 0.5) 310 | { 311 | self.warning_message = 'The following answers can be very imprecise. 
We struggled to extract them from data, maybe because this question cannot be properly answered using the available information.'; 312 | self.show_warning_alert = true; 313 | } 314 | }, 315 | error: function(result) { 316 | self.error_message = result; 317 | self.show_error_alert = true; 318 | }, 319 | }); 320 | }, 321 | } 322 | }); 323 | 324 | function summary_tree_to_jsonld(summary_tree) { 325 | var jsonld = {}; 326 | for (var [key,value] of Object.entries(summary_tree)) 327 | { 328 | if (key == 'children') 329 | continue; 330 | if (key == 'annotation') 331 | { 332 | if (value) 333 | { 334 | var source_id = prefixed_string_to_uri(summary_tree['source_id']); 335 | var jsonld_value = tuple_list_to_formatted_jsonld(value); 336 | var entity_dict = get_entity_dict_from_jsonld(jsonld_value); 337 | jsonld['my:hasSource'] = nest_jsonld(entity_dict[source_id], entity_dict, [source_id], 2); 338 | } 339 | } 340 | else 341 | jsonld['my:'+key] = value; 342 | } 343 | if (summary_tree.children && summary_tree.children.length) 344 | jsonld['my:sub_summary_list'] = summary_tree.children.map(summary_tree_to_jsonld); 345 | return jsonld; 346 | } 347 | 348 | function pydict_to_jsonld(pydict) { 349 | if (isDict(pydict)) 350 | { 351 | var jsonld = {}; 352 | for (var [key,value] of Object.entries(pydict)) 353 | jsonld['my:'+key] = pydict_to_jsonld(value); 354 | return jsonld; 355 | } 356 | if (isArray(pydict)) 357 | return pydict.map(pydict_to_jsonld); 358 | return pydict; 359 | } 360 | 361 | $(document).on('click', '.link', function(e) { 362 | var topic = e.target.dataset['topic'] || ""; 363 | topic = uri_to_prefixed_string(topic); 364 | // var is_first = (e.target.dataset['is_first'] == 'true'); 365 | var label = e.target.innerText; 366 | app.cards.push({ 367 | 'uri':topic, 368 | 'label':titlefy(label), 369 | 'deleted':false, 370 | }); 371 | if (!app.show_overview_modal) 372 | app.current_card_index = app.cards.length-1; 373 | app.show_overview_modal = true; 374 | }); 375 | --------------------------------------------------------------------------------