├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── data ├── properties │ ├── dbpedia-201610-manual-annotation.csv │ ├── dbpedia-201610-place-properties.csv │ ├── dbpedia-201610-properties.tsv │ └── dbpedia-ontology-properties.tsv └── templates │ ├── Annotations_F30_art.csv │ ├── LC-QuAD_v6_art.csv │ ├── LC-QuAD_v6_other.csv │ ├── LC-QuAD_v6_personal.csv │ ├── LC-QuAD_v6_sport.csv │ ├── LC-QuAD_v6_sport_art.csv │ ├── QALD-7.csv │ └── dbpedia-201610-place.csv ├── gsoc ├── aman │ ├── .DS_Store │ ├── GS_with_mve.csv │ ├── PIPELINE │ ├── README.md │ ├── composite_template.py │ ├── decision_tree.py │ ├── delete_lines.py │ ├── entity_errors.py │ ├── error_analysis.py │ ├── final_formatting.py │ ├── get_metadata.py │ ├── get_properties.py │ ├── integrate.py │ ├── log_place │ ├── metadata_place.txt │ ├── place_labels │ ├── placetemp │ │ ├── data_300.en │ │ ├── data_300.sparql │ │ └── resource_dump.json │ ├── range_place.py │ ├── remove_en.py │ ├── script.py │ ├── script2.py │ ├── sparql_generator.csv │ ├── sparql_generator.py │ ├── temp.py │ ├── tempout │ ├── test.txt │ ├── test_comp.csv │ ├── test_pipeline │ │ ├── 1 │ │ ├── 2 │ │ └── get_properties.py │ └── training_log ├── anand │ ├── .images │ │ ├── test-accuracy.png │ │ └── test-bleu.png │ ├── .pipeline_2 │ │ ├── .vscode │ │ │ └── settings.json │ │ ├── decision_tree.py │ │ ├── final_formatting.py │ │ ├── get_properties.py │ │ ├── integrate.py │ │ ├── person │ │ │ ├── decision_tree.csv │ │ │ ├── get_properties.csv │ │ │ ├── integrate.csv │ │ │ ├── mvedecision_tree.csv │ │ │ ├── sparql_generator.csv │ │ │ └── test_res.csv │ │ ├── sparql_generator.py │ │ └── test │ │ │ ├── check_test_res.csv │ │ │ ├── decision_tree.csv │ │ │ ├── get_properties.csv │ │ │ ├── integrate.csv │ │ │ ├── mvedecision_tree.csv │ │ │ └── sparql_generator.csv │ ├── .vscode │ │ └── settings.json │ ├── PIPELINE.md │ ├── pipeline_1 │ │ ├── pipeline_1_composite │ │ │ ├── composite_template.py │ │ │ ├── decision_tree.py │ │ │ ├── final_formatting.py │ │ │ ├── get_properties.py │ │ │ ├── integrate.py │ │ │ ├── person │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ │ ├── range_place.py │ │ │ ├── sparql_generator.py │ │ │ └── test3 │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ ├── pipeline_1_simple │ │ │ ├── decision_tree.py │ │ │ ├── final_formatting.py │ │ │ ├── get_properties.py │ │ │ ├── integrate.py │ │ │ ├── person │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ │ ├── sparql_generator.py │ │ │ ├── test │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ │ └── test_person │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ └── readme.md │ ├── pipeline_3 │ │ ├── .pipeline_3 │ │ │ ├── .vscode │ │ │ │ └── settings.json │ │ │ ├── Film │ │ │ │ ├── author.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ └── sentence_and_template_generator.ods │ │ │ ├── eliminator.py │ │ │ ├── 
eukaryotes │ │ │ │ ├── family.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── results.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ ├── sentence_and_template_generator.ods │ │ │ │ └── species.csv │ │ │ ├── fetch_ranks_sub.py │ │ │ ├── generate_templates.py │ │ │ ├── generate_url.py │ │ │ ├── get_properties.py │ │ │ ├── nspm_ready.csv │ │ │ ├── rank_test │ │ │ │ ├── affiliation.csv │ │ │ │ ├── birth place.csv │ │ │ │ ├── body discovered.csv │ │ │ │ ├── child.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ ├── sentence_and_template_generator.csv │ │ │ │ └── sentence_and_template_generator.ods │ │ │ ├── readme.md │ │ │ └── sentence_and_template_generator.py │ │ ├── pipeline_3_with_controlled_test_set │ │ │ ├── .images │ │ │ │ └── person_properties.png │ │ │ ├── .vscode │ │ │ │ └── settings.json │ │ │ ├── Eukaryotes │ │ │ │ ├── sentence_and_template_generator │ │ │ │ └── test.csv │ │ │ ├── Monument │ │ │ │ ├── architect.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ ├── tenant.csv │ │ │ │ └── test.csv │ │ │ ├── Organisation │ │ │ │ ├── administrator.csv │ │ │ │ ├── chairperson.csv │ │ │ │ ├── chaplain.csv │ │ │ │ ├── chief executive officer.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ └── test.csv │ │ │ ├── Person │ │ │ │ ├── alma mater.csv │ │ │ │ ├── birth place.csv │ │ │ │ ├── body discovered.csv │ │ │ │ ├── career station.csv │ │ │ │ ├── child.csv │ │ │ │ ├── college.csv │ │ │ │ ├── death place.csv │ │ │ │ ├── employer.csv │ │ │ │ ├── ethnicity.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── home town.csv │ │ │ │ ├── ideology.csv │ │ │ │ ├── nationality.csv │ │ │ │ ├── networth.csv │ │ │ │ ├── opponent.csv │ │ │ │ ├── parent.csv │ │ │ │ ├── partner.csv │ │ │ │ ├── person function.csv │ │ │ │ ├── place of burial.csv │ │ │ │ ├── relation.csv │ │ │ │ ├── relative.csv │ │ │ │ ├── residence.csv │ │ │ │ ├── resting place.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ └── test.csv │ │ │ ├── eliminator.py │ │ │ ├── fetch_ranks.py │ │ │ ├── fetch_ranks_sub.py │ │ │ ├── generate_templates.py │ │ │ ├── generate_url.py │ │ │ ├── get_properties.py │ │ │ ├── new_train.csv │ │ │ ├── nspm_ready.csv │ │ │ ├── question_generator.py │ │ │ ├── readme.md │ │ │ └── sentence_and_template_generator.py │ │ └── utility │ │ │ ├── .~lock.question_form.csv# │ │ │ ├── Test_Fixer │ │ │ ├── readme.md │ │ │ └── test_fixer.py │ │ │ ├── compare │ │ │ ├── compare.py │ │ │ ├── output_test │ │ │ ├── readme.md │ │ │ └── test.sparql │ │ │ ├── labels.json │ │ │ ├── new_extractor_fromGraphDBpediaEmbeddings │ │ │ ├── breaker.sh │ │ │ ├── embedding_extractor.py │ │ │ ├── indexer.py │ │ │ └── readme.md │ │ │ ├── old_extractor_from_GraphDBpediaEmbeddings │ │ │ ├── en_extract_embed.py │ │ │ ├── readme.md │ │ │ └── sparql_extract_embed.py │ │ │ ├── qald_json │ │ │ ├── interpreter.py │ │ │ ├── qald_json_gerbil_input.py │ │ │ ├── readme.md │ │ │ ├── shifter.sh │ │ │ ├── test.en │ │ │ └── test.sparql │ │ │ ├── question_form.csv │ │ │ ├── readme.md │ │ │ └── vocab_extractor_from_model │ │ │ ├── embedding_extractor.py │ │ │ └── readme.md │ └── readme.md └── zheyuan │ ├── pipeline │ ├── README.md │ ├── basic_sentence_and_template_generator.py │ ├── batch_paraphrase.py │ ├── bert_classifier.py │ ├── eliminator.py │ ├── fetch_ranks.py │ ├── fetch_ranks_sub.py │ ├── generate_templates.py │ ├── generate_url.py │ ├── get_properties.py │ ├── multi_generate_templates.py │ ├── paraphrase_questions.py │ ├── 
pipeline.sh │ ├── question_generator.py │ ├── sentence_and_template_generator.py │ └── textual_similarity.py │ └── utility │ ├── GloVe │ └── glove_finetune.py │ ├── benchmark │ ├── README.md │ ├── benchmark.py │ ├── extract_questions.py │ ├── interpreter.py │ ├── reconstruct_json.py │ └── retrieve_answers.py │ ├── queryFilter.py │ ├── question_form.csv │ ├── readme.md │ └── vocab_creator.py ├── nspm ├── __init__.py ├── analyse.py ├── data_gen.py ├── filter_dataset.py ├── generator.py ├── generator_test.py ├── generator_utils.py ├── interpreter.py ├── learner.py ├── nmt.py ├── prepare_dataset.py └── split_in_train_dev_test.py ├── requirements.txt └── test └── interpreter_airml_test.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.zip filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # macOS 104 | .DS_Store 105 | 106 | .idea/ 107 | 108 | data/*/* 109 | 110 | gsoc/anand/utility/test.py 111 | gsoc/anand/utility/part-r-00000 112 | gsoc/anand/pipeline_3/utility/part-r-00000 113 | data/pipeline_3* 114 | gsoc/anand/pipeline_3/utility/part-r-00000 115 | data/*/* 116 | v1/ 117 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 AKSW Research Group @ University of Leipzig 4 | Copyright (c) 2020 Liber AI Research 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /gsoc/aman/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/aman/.DS_Store -------------------------------------------------------------------------------- /gsoc/aman/PIPELINE: -------------------------------------------------------------------------------- 1 | 2 | ### AUTOMATIC TRAINING DATA GENERATION FOR NEURAL QA MODEL ### 3 | 4 | 5 | The final output file has each row of the form: 6 | ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 7 | 8 | # STEP 1 - Get properties from web page # 9 | Command: python get_properties.py --url > temp.csv 10 | - --url argument is the webpage from where property metadata is to scraped 11 | 12 | # STEP 2 - Get number of occurrences and URI # 13 | Store only the rows of required namespace properties 14 | 15 | # STEP 3 - Integrate STEP 2 values with their corresponding property metadata row in temp.csv # 16 | Command: python integrate.py temp.csv 17 | Output file: manual-annotation-updated-v2.csv (change it in the file if needed) 18 | - Change the namespace to the required (it is 'ontology' right now) 19 | 20 | # STEP 4 - MVE generation # 21 | Command: python decision_tree.py data/manual-annotation-updated-v2.csv 22 | Output file: GS_with_mve.csv 23 | 24 | # STEP 5 - SPARQL Query Template and Generator Query generation # 25 | Command: python sparql_generator.py GS_with_mve.csv 26 | 27 | # STEP 6 - Formatting the data into required format 28 | Command: python final_formatting.py data/GS-v3.csv data/annotations_place_v2.csv 29 | 30 | # STEP 7 - Follow the original data generation and training steps (readme of master branch) 31 | 32 | ### COMPOSITIONALITY EXPERIMENT: ### 33 | 34 | # STEP 1 - Create template annotations (all a[i]'s) 35 | Command: python range_place.py data/GS-v3.csv > data/annotations_compositions_combined.csv 36 | 37 | 38 | # STEP 2 - Create composite templates (a[i]○b true for all i <= sizeof list 'a') 39 | Command: python composite_template.py data/GS-v3.csv >> data/annotations_compositions_combined.csv 40 | 41 | # STEP 3 - Follow the original data generation and training steps (readme of master branch) 42 | 43 | # STEP 4 - Choose any 10% templates and their output and shift it to new file (test data), rest of the contents of this file should be split into 90% train and 10% dev using the split_in_train_dev_test.py script. 
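A minimal sketch of the 90% train / 10% dev split described in this step (illustrative only: the file names are placeholders, the held-out 10% of templates is assumed to have been moved to the test file already, and split_in_train_dev_test.py remains the script actually used):

import random

# Keep the English questions and their SPARQL queries line-aligned while splitting.
with open('data.en') as f_en, open('data.sparql') as f_sparql:
    pairs = list(zip(f_en, f_sparql))

random.shuffle(pairs)
n_dev = len(pairs) // 10              # roughly 10% of the remaining lines for dev
dev, train = pairs[:n_dev], pairs[n_dev:]

for name, split in (('train', train), ('dev', dev)):
    with open(name + '.en', 'w') as out_en, open(name + '.sparql', 'w') as out_sparql:
        for question, query in split:
            out_en.write(question)
            out_sparql.write(query)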
44 | 45 | # STEP 5 - Run the training 46 | Command: sh train.sh 47 | -------------------------------------------------------------------------------- /gsoc/aman/README.md: -------------------------------------------------------------------------------- 1 | ## Aman: Work done during DBpedia's Google Summer of Code 2018 2 | 3 | Hi, please find my blog here: https://amanmehta-maniac.github.io. - You will find details about what this project based on https://github.com/AKSW/NSpM had to offer. 4 | 5 | 1. To be able to generate the dataset automatically, there is a five step pipeline which you would have to follow, guided at 'PIPELINE' file. 6 | 2. Otherwise you can directly use the data generated under `./data/place_v2.zip` and `./data/Compositions_v2.zip`. 7 | -------------------------------------------------------------------------------- /gsoc/aman/composite_template.py: -------------------------------------------------------------------------------- 1 | import sys 2 | f = open(sys.argv[1],'r') 3 | lines = f.readlines() 4 | 5 | 6 | # RUN: python composite_template.py data/GS-v3.csv 7 | # Given format # 8 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 9 | 10 | # Required format : separated by semi-colon ## 11 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 12 | b = [] 13 | b.append("where is the") 14 | b.append("") 15 | b.append("of located in") 16 | 17 | for l in lines: 18 | l = l.strip().split(',') 19 | # print l 20 | if len(l) == 0: 21 | continue 22 | if 'place' in l[2].lower() and l[5]!='' and len(l[5])!=0and 'location of' not in l[7].lower(): 23 | 24 | newl,to_remove = [],[] 25 | newl.append("dbo:Place") 26 | newl.append("") 27 | newl.append("") 28 | 29 | l[1] = l[1].split() 30 | for i in range(len(l[1])): 31 | if '(' in l[1][i] or ')' in l[1][i]: 32 | to_remove.append(l[1][i]); 33 | continue 34 | for x in to_remove: 35 | l[1].remove(x); 36 | 37 | b[1] = " ".join(l[1]) 38 | # print b 39 | nlq = " ".join(b) 40 | 41 | spq = "select ?a where { " + l[5] + " ?b . 
?b ?a }" 42 | # print nlq + ";" + spq 43 | 44 | gq = l[-1] 45 | 46 | gq2 = gq.split()[1] 47 | gq2 = "distinct(" + gq2 + ")" 48 | gq = gq.split(); 49 | gq[1] = gq2; 50 | gq = " ".join(gq).replace("SELECT","select").replace("WHERE","where") 51 | 52 | 53 | newl.append((nlq)) 54 | newl.append((spq)) 55 | newl.append((gq)) 56 | newl = ";".join(newl) 57 | print newl -------------------------------------------------------------------------------- /gsoc/aman/decision_tree.py: -------------------------------------------------------------------------------- 1 | import sys, re 2 | 3 | f = open(sys.argv[1],'r') 4 | lines = f.readlines(); 5 | final_lines = [] 6 | 7 | lineno = 1 8 | 9 | # print lines[0].split(',') 10 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 11 | # 'MVE', 'Optimal Expression\r\n'] 12 | 13 | mve = "" 14 | for line in lines: 15 | if lineno == 1: 16 | lineno += 1 17 | continue 18 | line = line.strip().split(',') 19 | rng = line[2].lower() 20 | lbl = line[1] 21 | if 'person' in rng: 22 | rng = "who" 23 | else: 24 | rng = "what" 25 | line[7] = rng + " is the " + lbl + " of " 26 | line[8] = rng + " is the " + lbl + " of " 27 | mve += rng + " is the " + lbl + " of \n" 28 | final_lines.append(",".join(line)); 29 | 30 | 31 | fw = open('data/mve_output','w') 32 | fw.write(mve) 33 | 34 | fw2 = open('GS_with_mve.csv','w'); 35 | fw2.write("\n".join(final_lines)) 36 | 37 | -------------------------------------------------------------------------------- /gsoc/aman/delete_lines.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'rw') 4 | lines = f.readlines() 5 | 6 | to_del = [3636, 15366, 22096, 23913, 27938, 29413, 29452, 33507, 34670, 50813, 58739, 71547, 71747, 72127, 72699, 73110, 73146, 75803, 76512, 76977] 7 | to_del.reverse() 8 | 9 | for x in to_del: 10 | del lines[x-1] 11 | 12 | lines = "".join(lines) 13 | print lines -------------------------------------------------------------------------------- /gsoc/aman/entity_errors.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'r') 4 | g = open(sys.argv[2],'r') 5 | h = open(sys.argv[3],'r') 6 | 7 | 8 | fl = f.readlines() 9 | gl = g.readlines() 10 | hl = h.readlines() 11 | tot = 0 12 | entity_error, prop_error = 0, 0 13 | train_entities = set() 14 | 15 | for i in range(len(fl)): 16 | trainout = fl[i].strip().split() 17 | for t in trainout: 18 | if 'dbr_' in t: 19 | train_entities.add(t[1:-1]); 20 | break 21 | 22 | # print train_entities 23 | 24 | cnt = 0 25 | save = [] 26 | for i in range(len(gl)): 27 | testout = gl[i].strip().split() 28 | testEntity = "" 29 | for t in testout: 30 | if 'dbr_' in t: 31 | testEntity = t 32 | if testEntity[1:-1] in train_entities: 33 | save.append(i); 34 | cnt += 1 35 | tot += 1 36 | 37 | newtest_nlq, newtest_sparql = [], [] 38 | 39 | for i in save: 40 | newtest_sparql.append(gl[i]) 41 | 42 | for i in save: 43 | newtest_nlq.append(hl[i]) 44 | 45 | # print "".join(newtest_sparql) 46 | print "".join(newtest_nlq) 47 | 48 | 49 | # print cnt, tot 50 | 51 | 52 | -------------------------------------------------------------------------------- /gsoc/aman/error_analysis.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'r') 4 | g = open(sys.argv[2],'r') 5 | 6 | 7 | fl = f.readlines() 8 | gl = g.readlines() 9 | tot = 0 10 | 
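# error_analysis.py compares two line-aligned SPARQL files: sys.argv[1] holds the
# model's decoded output (the "my*" values below) and sys.argv[2] the expected
# output (the "req*" values). For each pair of lines that differ, the token
# containing 'dbr_' is taken as the entity and the token wrapped in <...> as the
# property; entity_error and prop_error count how often each differs from the
# expected value.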
entity_error, prop_error = 0, 0 11 | 12 | for i in range(len(fl)): 13 | if fl[i].strip() == gl[i].strip(): 14 | # print fl[i], gl[i] 15 | continue 16 | myout = fl[i].strip().split() 17 | myEntity, myProp = "", "" 18 | for t in myout: 19 | if 'dbr_' in t: 20 | myEntity = t 21 | if '<' in t: 22 | myProp = t[1:-1] 23 | 24 | reqout = gl[i].strip().split() 25 | reqEntity, reqProp = "", "" 26 | for t in reqout: 27 | if 'dbr_' in t: 28 | reqEntity = t 29 | if '<' in t: 30 | reqProp = t[1:-1] 31 | 32 | if reqEntity != myEntity: 33 | 34 | # print myEntity, reqEntity 35 | entity_error += 1 36 | if reqProp != myProp: 37 | print reqProp, myProp 38 | prop_error += 1 39 | tot += 1 40 | 41 | print (99/2034.0) 42 | -------------------------------------------------------------------------------- /gsoc/aman/final_formatting.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'r') 4 | # Given format # 5 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 6 | 7 | # Required format : separated by semi-colon ## 8 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 9 | 10 | lines = f.readlines(); 11 | f.close() 12 | fl = 1 13 | 14 | output = "" 15 | for line in lines: 16 | 17 | if fl: 18 | fl = 0 19 | continue 20 | l = line.split(','); 21 | # print l 22 | 23 | newl,to_remove = [],[] 24 | newl.append("dbo:Place") 25 | newl.append("") 26 | newl.append("") 27 | 28 | nlq = l[7].split(); 29 | for i in range(len(nlq)): 30 | if '(' in nlq[i] or ')' in nlq[i]: 31 | to_remove.append(nlq[i]); 32 | continue 33 | if '<' not in nlq[i] and '?' not in nlq[i]: 34 | nlq[i] = nlq[i].lower() 35 | 36 | for x in to_remove: 37 | nlq.remove(x); 38 | 39 | spq = l[-2].split(); 40 | for i in range(len(spq)): 41 | if '<' not in spq[i] and '?' not in spq[i]: 42 | spq[i] = spq[i].lower() 43 | 44 | gq = l[-1].split(); 45 | for i in range(len(gq)): 46 | if '<' not in gq[i] and '?' 
not in gq[i] and '[' not in gq[i]: 47 | gq[i] = gq[i].lower() 48 | 49 | newl.append(" ".join(nlq)) 50 | newl.append(" ".join(spq)) 51 | newl.append(" ".join(gq)) 52 | output += ";".join(newl) + "\n"; 53 | 54 | 55 | fw = open(sys.argv[2],'w') 56 | fw.write(output) 57 | fw.close() -------------------------------------------------------------------------------- /gsoc/aman/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib2, urllib, httplib, json, sys, csv, io 2 | import argparse 3 | from bs4 import BeautifulSoup 4 | 5 | parser = argparse.ArgumentParser() 6 | requiredNamed = parser.add_argument_group('Required Arguments'); 7 | requiredNamed.add_argument('--url', dest='url', metavar='url', help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 8 | args = parser.parse_args() 9 | 10 | quote_page = args.url 11 | page = urllib2.urlopen(quote_page) 12 | 13 | soup = BeautifulSoup(page, "html.parser") 14 | # print type(soup) 15 | fl = 0 16 | for rows in soup.find_all("tr"): 17 | 18 | x = rows.find_all("td"); 19 | 20 | if len(x) <= 2: 21 | fl = 1 22 | continue 23 | 24 | if fl == 1: 25 | fl = 2 26 | continue 27 | 28 | name = rows.find_all("td")[0].get_text().replace(" (edit)","") 29 | label = rows.find_all("td")[1].get_text() 30 | dom = rows.find_all("td")[2].get_text() 31 | rng = rows.find_all("td")[3].get_text() 32 | 33 | final = name + "," + label + "," + dom + "," + rng 34 | print final.encode('utf-8') 35 | 36 | # with io.open("test.csv", mode='w', encoding='utf-8') as toWrite: 37 | # writer = csv.writer(toWrite) 38 | # writer.writerows(props) 39 | 40 | -------------------------------------------------------------------------------- /gsoc/aman/integrate.py: -------------------------------------------------------------------------------- 1 | import sys, argparse 2 | 3 | parser = argparse.ArgumentParser() 4 | requiredNamed = parser.add_argument_group('Required Arguments'); 5 | requiredNamed.add_argument('--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=True) 6 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', help='eg: File which contains metadata of properties', required=True) 7 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', help='eg: File which contains uri and number of occurrences of properties', required=True) 8 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', help='File in which you want to store output', required=True) 9 | args = parser.parse_args() 10 | 11 | namespace = args.ns 12 | 13 | f = open(args.uri,'r') 14 | file = f.readlines() 15 | d = {}; 16 | 17 | for l in file: 18 | 19 | l = l.strip().split('\t') 20 | if l[0].split('/')[-2] != namespace: 21 | continue 22 | d[l[0].split('/')[-1]] = l[1]; 23 | 24 | # print d["abstract"]; 25 | 26 | 27 | f = open(args.inp,'r') 28 | manual = f.readlines() 29 | cnt,tot = 0,0; 30 | final = "" 31 | 32 | for m in manual: 33 | l = m.strip().split(',') 34 | m = l[0] 35 | tot += 1 36 | # if ':' in m: 37 | # print "lol", m 38 | if m in d: 39 | cnt += 1; 40 | l.append("http://dbpedia.org/" + namespace + "/" +m) 41 | l.append(d[m]) 42 | else: 43 | 44 | l.append('') 45 | l.append('') 46 | print m 47 | 48 | final += ",".join(l); 49 | final += '\n'; 50 | 51 | # print final 52 | f = open(args.out,'w'); 53 | f.write(final); 54 | print cnt, tot -------------------------------------------------------------------------------- /gsoc/aman/log_place: 
-------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?p ?lab ?dom WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:domain ?dom } . FILTER(lang(?lab) = 'en') . } OFFSET 0 LIMIT 10000 2 | SELECT DISTINCT ?p ?lab ?dom WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:domain ?dom } . FILTER(lang(?lab) = 'en') . } OFFSET 0 LIMIT 10000 3 | > 4 | SELECT DISTINCT ?p ?lab ?dom WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:domain ?dom } . FILTER(lang(?lab) = 'en') . } OFFSET 10000 LIMIT 10000 5 | SELECT DISTINCT ?p ?lab ?dom WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:domain ?dom } . FILTER(lang(?lab) = 'en') . } OFFSET 10000 LIMIT 10000 6 | > 7 | SELECT DISTINCT ?p ?lab ?rng WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:range ?rng } . FILTER(lang(?lab) = 'en') . } OFFSET 0 LIMIT 10000 8 | SELECT DISTINCT ?p ?lab ?rng WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:range ?rng } . FILTER(lang(?lab) = 'en') . } OFFSET 0 LIMIT 10000 9 | > 10 | -------------------------------------------------------------------------------- /gsoc/aman/range_place.py: -------------------------------------------------------------------------------- 1 | import sys 2 | f = open(sys.argv[1],'r') 3 | lines = f.readlines() 4 | 5 | # RUN: python range_place.py data/GS-v3.csv > data/annotations_compositions.csv 6 | 7 | 8 | for l in lines: 9 | l = l.split(','); 10 | if len(l) == 0: 11 | continue 12 | if l[5] == "" or len(l[5])==0: 13 | continue; 14 | if 'place' in l[2].lower() and l[5]!='': 15 | newl,to_remove = [],[] 16 | newl.append("dbo:Place") 17 | newl.append("") 18 | newl.append("") 19 | nlq = l[7].split(); 20 | for i in range(len(nlq)): 21 | if '(' in nlq[i] or ')' in nlq[i]: 22 | to_remove.append(nlq[i]); 23 | continue 24 | if '<' not in nlq[i] and '?' not in nlq[i]: 25 | nlq[i] = nlq[i].lower() 26 | 27 | for x in to_remove: 28 | nlq.remove(x); 29 | 30 | nlq = " ".join(nlq) 31 | 32 | spq = l[9].split(); 33 | for i in range(len(spq)): 34 | if '<' not in spq[i] and '?' not in spq[i]: 35 | spq[i] = spq[i].lower() 36 | 37 | spq = " ".join(spq) 38 | 39 | 40 | gq = l[-1].split() 41 | for i in range(len(gq)): 42 | if '<' not in gq[i] and '?' 
not in gq[i] and '[' not in gq[i]: 43 | gq[i] = gq[i].lower() 44 | 45 | gq = " ".join(gq) 46 | 47 | nlq = nlq.replace('','') 48 | spq = spq.replace('?x','?a').replace('','') 49 | gq = gq.replace('?x','?a').replace('','') 50 | newl.append((nlq)) 51 | newl.append((spq)) 52 | newl.append((gq)) 53 | 54 | print ";".join(newl) 55 | -------------------------------------------------------------------------------- /gsoc/aman/remove_en.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'rw') 4 | lines = f.readlines() 5 | 6 | final = [] 7 | lineno = [] 8 | i = 0 9 | for l in lines: 10 | i += 1 11 | if ' en ' in l: 12 | lineno.append(i) 13 | 14 | # print "\n".join(lineno) 15 | print lineno -------------------------------------------------------------------------------- /gsoc/aman/script.py: -------------------------------------------------------------------------------- 1 | f = open('data/manual.csv','rw') 2 | f2 = open('data/temp.csv','rw') 3 | f3 = open('data/newtemp.csv','w') 4 | 5 | l2 = f2.readlines(); 6 | 7 | lines = f.readlines() 8 | # print "xxxs" 9 | cnt = 0 10 | for line in lines: 11 | if cnt <= 200: 12 | cnt += 1 13 | continue 14 | line = line.split(',') 15 | x = l2[cnt-1].split(',') 16 | # print line 17 | if line[1] == '': 18 | line[1] = x[3].strip() 19 | if line[2] == '': 20 | line[2] = "what is the " + x[1] + " of " 21 | else: 22 | line[2] = line[2].replace("x","") 23 | if line[3] == '': 24 | line[3] = line[2] 25 | else: 26 | line[3] = line[3].replace("x","") 27 | # print line 28 | # line[1] 29 | newline = ",".join(line) 30 | print newline 31 | f3.write(newline) 32 | # print 33 | cnt += 1 34 | # if cnt > 200: 35 | # break -------------------------------------------------------------------------------- /gsoc/aman/script2.py: -------------------------------------------------------------------------------- 1 | f2 = open('data/temp.csv','rw') 2 | f3 = open('data/newtemp.csv','w') 3 | 4 | l2 = f2.readlines(); 5 | 6 | lines = f.readlines() 7 | # print "xxxs" 8 | cnt = 0 9 | for line in l2: 10 | 11 | line = line.split(',') 12 | line.append("what is the " + line[0] + " of ") 13 | line.append("what is the " + line[0] + " of ") 14 | # print line 15 | # if line[1] == '': 16 | # line[1] = x[3].strip() 17 | # if line[2] == '': 18 | # line[2] = "what is the " + x[1] + " of " 19 | # else: 20 | # line[2] = line[2].replace("x","") 21 | # if line[3] == '': 22 | # line[3] = line[2] 23 | # else: 24 | # line[3] = line[3].replace("x","") 25 | # print line 26 | # line[1] 27 | newline = ",".join(line) 28 | print newline 29 | f3.write(newline) 30 | -------------------------------------------------------------------------------- /gsoc/aman/sparql_generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'r') 4 | lines = f.readlines(); 5 | 6 | # print lines[0].split(',') 7 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 8 | # 'MVE', 'Optimal Expression, SPARQL-TEMPLATE, GENERATOR-QUERY-TEMPLATE\r\n'] 9 | # sparql_template = [] 10 | 11 | final = "" 12 | lineno = 1 13 | for line in lines: 14 | if lineno == 1: 15 | lineno += 1 16 | continue 17 | line = line.strip().split(',') 18 | # print lines 19 | if line[5]!='': 20 | # print line[5] 21 | line[-2] = 'SELECT ?x WHERE { <' + line[5] + '> ?x }' 22 | line[-1] = 'SELECT ?a WHERE { ?a <' + line[5] + '> [] . 
?a a }' 23 | 24 | final += ",".join(line) 25 | final += '\n' 26 | 27 | 28 | print final 29 | # fw = open() 30 | 31 | 32 | -------------------------------------------------------------------------------- /gsoc/aman/temp.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | proxy = urllib2.ProxyHandler({'https': 'http://proxy.iiit.ac.in:8080/'}) 3 | opener = urllib2.build_opener(proxy) 4 | urllib2.install_opener(opener) 5 | result = urllib2.urlopen('https://www.python.org') 6 | print result.read() -------------------------------------------------------------------------------- /gsoc/aman/test_pipeline/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib2, urllib, httplib, json, sys, csv, io 2 | import argparse 3 | from bs4 import BeautifulSoup 4 | 5 | parser = argparse.ArgumentParser() 6 | requiredNamed = parser.add_argument_group('Required Arguments'); 7 | requiredNamed.add_argument('--url', dest='url', metavar='url', help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 8 | args = parser.parse_args() 9 | 10 | quote_page = args.url 11 | page = urllib2.urlopen(quote_page) 12 | 13 | soup = BeautifulSoup(page, "html.parser") 14 | # print type(soup) 15 | 16 | for rows in soup.find_all("tr"): 17 | 18 | x = rows.find_all("td"); 19 | if len(x) <= 2: continue 20 | 21 | name = rows.find_all("td")[0].get_text().replace(" (edit)","") 22 | label = rows.find_all("td")[1].get_text() 23 | dom = rows.find_all("td")[2].get_text() 24 | rng = rows.find_all("td")[3].get_text() 25 | 26 | final = name + "," + label + "," + dom + "," + rng 27 | print final.encode('utf-8') 28 | 29 | # with io.open("test.csv", mode='w', encoding='utf-8') as toWrite: 30 | # writer = csv.writer(toWrite) 31 | # writer.writerows(props) 32 | 33 | -------------------------------------------------------------------------------- /gsoc/anand/.images/test-accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/.images/test-accuracy.png -------------------------------------------------------------------------------- /gsoc/anand/.images/test-bleu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/.images/test-bleu.png -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/petrichor/Projects/environments/gymnoz/bin/python" 3 | } -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/final_formatting.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | import argparse 4 | from sparql_generator import sparql_generator 5 | # Given format # 6 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 7 | 8 | # Required format : separated by semi-colon ## 9 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 10 | 11 | 12 | def final_formatting(input_file, uri_file, 
url, output_file, project_name, namespace,rs): 13 | 14 | if (int(rs) == 1) : 15 | open_files = open(input_file, 'r') 16 | lines = open_files.readlines() 17 | open_files.close() 18 | else: 19 | lines = sparql_generator(input_file=input_file, project_name=project_name, 20 | url=url, uri_file=uri_file, namespace=namespace) 21 | 22 | fl = 1 23 | 24 | output = "" 25 | 26 | """ 27 | - We iterate over the lines of the document. 28 | - Convet the line into a list containig elements o the 29 | string delimited by commas. 30 | - 31 | """ 32 | for line in tqdm(lines): 33 | 34 | if fl: 35 | fl = 0 36 | continue 37 | l = line.split(',') 38 | 39 | # print l 40 | 41 | newl, to_remove = [], [] 42 | name = url.split("/")[-1] 43 | newl.append("dbo:"+name) 44 | newl.append("") 45 | newl.append("") 46 | 47 | nlq = l[7].split() 48 | # The fuzzy score column is not present in the 49 | # autmatically created csv file. 50 | if(len(l) == 10): 51 | nlq = l[6].split() 52 | 53 | """ 54 | From MVE column a question is selected and each word 55 | is put into as an element of the list. 56 | """ 57 | for i in range(len(nlq)): 58 | if '(' in nlq[i] or ')' in nlq[i]: 59 | to_remove.append(nlq[i]) 60 | continue 61 | if '<' not in nlq[i] and '?' not in nlq[i]: 62 | nlq[i] = nlq[i].lower() 63 | 64 | for x in to_remove: 65 | nlq.remove(x) 66 | 67 | spq = l[-2].split() 68 | """ 69 | Query one 70 | """ 71 | for i in range(len(spq)): 72 | if '<' not in spq[i] and '?' not in spq[i]: 73 | spq[i] = spq[i].lower() 74 | 75 | """ 76 | Query two 77 | """ 78 | gq = l[-1].split() 79 | for i in range(len(gq)): 80 | if '<' not in gq[i] and '?' not in gq[i] and '[' not in gq[i]: 81 | gq[i] = gq[i].lower() 82 | 83 | newl.append(" ".join(nlq)) 84 | newl.append(" ".join(spq)) 85 | newl.append(" ".join(gq)) 86 | output += ";".join(newl) + "\n" 87 | 88 | fw = open(project_name+"/"+ output_file, 'w') 89 | fw.write(output) 90 | fw.close() 91 | 92 | 93 | if __name__ == "__main__": 94 | """ 95 | Section to parse the command line arguments. 
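Example invocation when running this step on its own (all values are
illustrative; the TSV is assumed to be the property/occurrence dump shipped
under data/properties/):

python final_formatting.py --rs 0 --url http://mappings.dbpedia.org/server/ontology/classes/Place --uri_file dbpedia-201610-properties.tsv --namespace ontology --output_file final_formatting.csv --project_name test

With --rs 0 the earlier steps (get_properties, integrate, decision_tree,
sparql_generator) are invoked automatically through sparql_generator(); with
--rs 1 the rows are read directly from the file given via --input_file.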
96 | """ 97 | parser = argparse.ArgumentParser() 98 | requiredNamed = parser.add_argument_group('Required Arguments') 99 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 100 | help='eg: File which contains metadata of properties', required=False) 101 | requiredNamed.add_argument( 102 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=False) 103 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 104 | help='File in which you want to store output', required=False) 105 | requiredNamed.add_argument('--project_name', dest='project_name', 106 | metavar='project_name', help='test', required=False) 107 | requiredNamed.add_argument('--url', dest='url', metavar='url', 108 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=False) 109 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 110 | help='eg: File which contains uri and number of occurrences of properties', required=False) 111 | requiredNamed.add_argument('--rs', dest='rs', metavar='rs', 112 | help='Toggle to run separately', required=True) 113 | 114 | args = parser.parse_args() 115 | input_file = args.inp 116 | uri_file = args.uri 117 | url = args.url 118 | rs =args.rs 119 | namespace = args.ns 120 | output_file = args.out 121 | project_name = args.project_name 122 | final_formatting(input_file=input_file, uri_file=uri_file, url=url, 123 | output_file=output_file, project_name=project_name, namespace=namespace, rs= rs) 124 | pass 125 | -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | """ 16 | page = urllib.request.urlopen(url) 17 | soup = BeautifulSoup(page, "html.parser") 18 | if(not os.path.isdir(project_name)): 19 | os.makedirs(project_name) 20 | output_file = open(project_name+"/" + output_file, 'w') 21 | fl = 0 22 | accum = [] 23 | for rows in tqdm(soup.find_all("tr")): 24 | x = rows.find_all("td") 25 | if len(x) <= 2: 26 | fl = 1 27 | continue 28 | if fl == 1: 29 | fl = 2 30 | continue 31 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 32 | label = rows.find_all("td")[1].get_text() 33 | dom = rows.find_all("td")[2].get_text() 34 | rng = rows.find_all("td")[3].get_text() 35 | 36 | final = name + "," + label + "," + dom + "," + rng 37 | accum.append(final) 38 | output_file.write(final+"\n") 39 | output_file.close() 40 | return accum 41 | 42 | 43 | """ 44 | Name, Label, Domain, Range 45 | """ 46 | 47 | if __name__ == "__main__": 48 | """ 49 | Section to parse the command line arguments. 
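Example (argument values are only illustrative):

python get_properties.py --url http://mappings.dbpedia.org/server/ontology/classes/Place --output_file get_properties.csv --project_name test

Each scraped table row is written to <project_name>/<output_file> as
'Name,Label,Domain,Range'.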
50 | """ 51 | parser = argparse.ArgumentParser() 52 | requiredNamed = parser.add_argument_group('Required Arguments') 53 | 54 | requiredNamed.add_argument('--url', dest='url', metavar='url', 55 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 56 | requiredNamed.add_argument( 57 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 58 | requiredNamed.add_argument( 59 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 60 | args = parser.parse_args() 61 | url = args.url 62 | output_file = args.out_put 63 | project_name = args.project_name 64 | get_properties(url = url, project_name= project_name, output_file = output_file) 65 | pass 66 | -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/integrate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | from tqdm import tqdm 4 | from get_properties import get_properties 5 | from tqdm import tqdm 6 | 7 | """ 8 | How was the tsv file created in the first place? 9 | - The tsv file is read. 10 | - A dictionary diction in made. 11 | - Every time the namespace is matched with the name 12 | space mentioned in the command line argument. 13 | - If the name space matches the dictionary diction here 14 | is updated with {name of the entity = frequency of occurance} 15 | """ 16 | 17 | 18 | def integrate(namespace, uri_file, output_file="integrate.csv", project_name="test_project", url="Enter a valid URL", input_file="Pleaes enter a valid file name"): 19 | print("Reading the TSV file: ") 20 | open_tsv = open(uri_file, 'r') 21 | read_tsv = open_tsv.readlines() 22 | diction = {} 23 | for line in tqdm(read_tsv): 24 | line = line.strip().split('\t') 25 | if line[0].split('/')[-2] != namespace: 26 | continue 27 | diction[line[0].split('/')[-1]] = line[1] 28 | 29 | open_tsv.close() 30 | 31 | """ 32 | Processing the input file. 33 | - The input file is read, out put from get_properties.py 34 | - Reading lines from the input files. 35 | - Iterating over every line of the read file. 36 | - Taking the name from the line. 37 | - if the given name is in the dictionry created above 38 | appending the url to the given name and corresponding 39 | frequency to the row entry(read line). Else appending 40 | an empty string. 
41 | - Joining all the elements of the list line with a comma, 42 | adding a new line character and then going for the next 43 | iteration after adding it to a variable final (string addition) 44 | """ 45 | 46 | 47 | if (__name__ == "__main__"): 48 | print("Reading the input file: ") 49 | open_inp = open(input_file, 'r') 50 | line_inp = open_inp.readlines() 51 | 52 | if (not __name__ == "__main__"): 53 | line_inp = get_properties(url=url, output_file="get_properties.csv", project_name = project_name) 54 | 55 | cnt, tot = 0, 0 56 | final = "" 57 | accum = [] 58 | for in_line in tqdm(line_inp): 59 | 60 | line = in_line.strip().split(',') 61 | in_line = line[0] 62 | tot += 1 63 | # if ':' in m: 64 | # print "lol", m 65 | if in_line in diction: 66 | cnt += 1 67 | line.append("http://dbpedia.org/" + namespace + "/" + in_line) 68 | line.append(diction[in_line]) 69 | else: 70 | 71 | line.append('') 72 | line.append('') 73 | # print in_line 74 | 75 | final += ",".join(line) 76 | accum.append(",".join(line)) 77 | final += '\n' 78 | 79 | """ 80 | The string final is the written to the output file name 81 | as given in the command line argument. 82 | """ 83 | # print final 84 | f = open(project_name+"/"+output_file, 'w') 85 | f.write(final) 86 | print("**************************************") 87 | print("Total number of entity whose URI was found: "+str(cnt) + 88 | "\nTotal number of entities present: " + str(tot)) 89 | return accum 90 | 91 | 92 | if __name__ == "__main__": 93 | """ 94 | Section to parse the command line arguments. 95 | """ 96 | parser = argparse.ArgumentParser() 97 | requiredNamed = parser.add_argument_group('Required Arguments') 98 | requiredNamed.add_argument( 99 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=True) 100 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 101 | help='Output from previous step', required=True) 102 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 103 | help='eg: File which contains uri and number of occurrences of properties', required=True) 104 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 105 | help='File in which you want to store output', required=True) 106 | requiredNamed.add_argument('--project_name', dest='project_name', 107 | metavar='project_name', help='test', required=True) 108 | args = parser.parse_args() 109 | namespace = args.ns 110 | input_file = args.inp 111 | uri_file = args.uri 112 | output_file = args.out 113 | project_name = args.project_name 114 | integrate(namespace, uri_file, output_file, 115 | project_name, "Enter a valid URL", input_file) 116 | pass 117 | -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/sparql_generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from decision_tree import decision_tree 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | """ 7 | Section to parse the command line arguments. 
8 | """ 9 | 10 | 11 | def sparql_generator(input_file, project_name, output_file="sparql_generator.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 12 | if __name__ == "__main__": 13 | f = open(input_file, 'r') 14 | lines = f.readlines() 15 | pass 16 | if not __name__ == "__main__": 17 | lines = decision_tree(input_file=input_file, project_name=project_name, 18 | url=url, uri_file=uri_file, namespace=namespace) 19 | pass 20 | 21 | # print lines[0].split(',') 22 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 23 | # 'MVE', 'Optimal Expression, SPARQL-TEMPLATE, GENERATOR-QUERY-TEMPLATE\r\n'] 24 | # sparql_template = [] 25 | 26 | """ 27 | - Read the file generated in the previous step. 28 | - Read the lines from the file an save it as a list. 29 | - If the frequency is known, Replace the 2nd last elemet of the formed list with 30 | a where statement, and last one witha where statement 31 | followed by an assertion if it is a place. 32 | - Join the updated list with comma as a delimeter and save 33 | add it in the string ending with a newline character. 34 | - Print the final on the terminal 35 | """ 36 | accum = [] 37 | final = "" 38 | lineno = 1 39 | for line in tqdm(lines): 40 | if lineno == 1: 41 | lineno += 1 42 | continue 43 | line = line.strip().split(',') 44 | # print lines 45 | if line[4] != '': 46 | # print line[5] 47 | # It was found the the MVE and OE was also required hence: 48 | #line[-2] = 'SELECT ?x WHERE { <' + line[5] + '> ?x }' 49 | #line[-1] = 'SELECT ?a WHERE { ?a <' + line[5] + '> [] . ?a a }' 50 | line.append('SELECT ?x WHERE { <' + line[4] + '> ?x }') 51 | line.append( 52 | 'SELECT ?a WHERE { ?a <' + line[4] + '> [] . ?a a }') 53 | 54 | final += ",".join(line) 55 | accum.append(",".join(line)) 56 | final += '\n' 57 | 58 | # print final 59 | 60 | # fw = open() 61 | 62 | """ 63 | This data generated might be required for further steps 64 | thus it is saved in another file named sparql.csv 65 | """ 66 | 67 | open(project_name+"/"+output_file, 'w').write(final) 68 | return accum 69 | 70 | 71 | if __name__ == "__main__": 72 | """ 73 | Section to parse the command line arguments. 
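Stand-alone example (file names are illustrative; the input is the CSV written
by decision_tree.py):

python sparql_generator.py --input_file test/decision_tree.csv --output_file sparql_generator.csv --project_name test

For every row whose URI column is non-empty, two SPARQL query templates (an
instance query and a generator query) are appended to the row.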
74 | """ 75 | parser = argparse.ArgumentParser() 76 | requiredNamed = parser.add_argument_group('Required Arguments') 77 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 78 | help='Output from previous step', required=True) 79 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 80 | help='File in which you want to store output', required=True) 81 | requiredNamed.add_argument('--project_name', dest='project_name', 82 | metavar='project_name', help='eg.:test', required=True) 83 | args = parser.parse_args() 84 | input_file = args.inp 85 | output_file = args.out 86 | project_name = args.project_name 87 | sparql_generator(input_file=input_file, output_file=output_file, 88 | project_name=project_name) 89 | pass 90 | -------------------------------------------------------------------------------- /gsoc/anand/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/petrichor/Projects/environments/gymnoz/bin/python" 3 | } -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/composite_template.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from range_place import range_place 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | # RUN: python composite_template.py data/GS-v3.csv 7 | # Given format # 8 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 9 | 10 | # Required format : separated by semi-colon ## 11 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 12 | 13 | def composite_template(input_file, uri_file, url, output_file, project_name, namespace,rs): 14 | if (int(rs) == 1) : 15 | open_files = open(input_file, 'r') 16 | lines = open_files.readlines() 17 | open_files.close() 18 | output_file_write = open(project_name+"/" + output_file, 'w') 19 | else: 20 | list_val = range_place(input_file=input_file, project_name=project_name, 21 | url=url, uri_file=uri_file, namespace=namespace) 22 | lines = list_val[0] 23 | output_file_write = list_val[1] 24 | 25 | 26 | b = [] 27 | b.append("where is the") 28 | b.append("") 29 | b.append("of located in") 30 | accum = [] 31 | for l in tqdm(lines): 32 | l = l.strip().split(',') 33 | # print l 34 | if len(l) == 0: 35 | continue 36 | if 'place' in l[2].lower() and l[5]!='' and len(l[5])!=0 and 'location of' not in l[7].lower(): 37 | 38 | newl,to_remove = [],[] 39 | newl.append("dbo:Place") 40 | newl.append("") 41 | newl.append("") 42 | 43 | l[1] = l[1].split() 44 | for i in range(len(l[1])): 45 | if '(' in l[1][i] or ')' in l[1][i]: 46 | to_remove.append(l[1][i]) 47 | continue 48 | for x in to_remove: 49 | l[1].remove(x) 50 | 51 | b[1] = " ".join(l[1]) 52 | # print b 53 | nlq = " ".join(b) 54 | # no Fuzzy score so the index decreases by 1 55 | spq = "select ?a where { " + l[4] + " ?b . 
?b ?a }" 56 | # print nlq + ";" + spq 57 | 58 | gq = l[-1] 59 | 60 | gq2 = gq.split()[1] 61 | gq2 = "distinct(" + gq2 + ")" 62 | gq = gq.split() 63 | gq[1] = gq2 64 | gq = " ".join(gq).replace("SELECT","select").replace("WHERE","where") 65 | 66 | 67 | newl.append((nlq)) 68 | newl.append((spq)) 69 | newl.append((gq)) 70 | newl = ";".join(newl) 71 | accum.append(newl) 72 | output_file_write.write("\n") 73 | output_file_write.write("\n".join(accum)) 74 | output_file_write.close() 75 | 76 | 77 | if __name__ == "__main__": 78 | """ 79 | Section to parse the command line arguments. 80 | """ 81 | parser = argparse.ArgumentParser() 82 | requiredNamed = parser.add_argument_group('Required Arguments') 83 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 84 | help='eg: File which contains metadata of properties', required=False) 85 | requiredNamed.add_argument( 86 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=False) 87 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 88 | help='File in which you want to store output', required=False) 89 | requiredNamed.add_argument('--project_name', dest='project_name', 90 | metavar='project_name', help='test', required=False) 91 | requiredNamed.add_argument('--url', dest='url', metavar='url', 92 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=False) 93 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 94 | help='eg: File which contains uri and number of occurrences of properties', required=False) 95 | requiredNamed.add_argument('--rs', dest='rs', metavar='rs', 96 | help='Toggle to run separately', required=True) 97 | 98 | args = parser.parse_args() 99 | input_file = args.inp 100 | uri_file = args.uri 101 | url = args.url 102 | rs =args.rs 103 | namespace = args.ns 104 | output_file = args.out 105 | project_name = args.project_name 106 | composite_template(input_file=input_file, uri_file=uri_file, url=url, 107 | output_file=output_file, project_name=project_name, namespace=namespace, rs= rs) 108 | pass -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/decision_tree.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import argparse 4 | from tqdm import tqdm 5 | from integrate import integrate 6 | 7 | 8 | def decision_tree(input_file, project_name, output_file="decision_tree.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 9 | if __name__ == "__main__": 10 | f = open(input_file, 'r') 11 | lines = f.readlines() 12 | pass 13 | if not __name__ == "__main__": 14 | lines = integrate(namespace=namespace, uri_file=uri_file, 15 | project_name=project_name, url=url) 16 | final_lines = [] 17 | lineno = 1 18 | 19 | """ 20 | print lines[0].split(',') 21 | ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 22 | 'MVE', 'Optimal Expression\r\n'] 23 | """ 24 | 25 | """ 26 | - The lines from the file generated in the previous steps 27 | is read and a for loop iterates through ecery row of 28 | - First we create a list of all elements seperated by commas. 29 | - If the range has the substring person, the we put as 30 | question who else what. 31 | - We append the question thus generate 2 times as minimum 32 | viable instruction and optimal expression. 
33 | - We create a variable names final lines and add strings, 34 | which are formed by adding strings formed by joining 35 | the elements of the list delemited by comma in each line. 36 | - We also create a string of the question generated 37 | delemited by a newline characte and store it in mve 38 | as a long string. 39 | - We output the series of question in mve_output. 40 | - We save the final_lines strind in a file named GS_with_mve.csv 41 | delimeted by a newline character. 42 | """ 43 | 44 | mve = "" 45 | for line in tqdm(lines): 46 | if lineno == 1: 47 | lineno += 1 48 | continue 49 | line = line.strip().split(',') 50 | rng = line[2].lower() 51 | lbl = line[1] 52 | if 'person' in rng: 53 | rng = "who" 54 | else: 55 | rng = "what" 56 | # The total length of a row in the list is 6, 57 | # thus 7 and 8 are out of range values. Thus I 58 | # replaced it with append. 59 | """ 60 | line[7] = rng + " is the " + lbl + " of " 61 | line[8] = rng + " is the " + lbl + " of " 62 | """ 63 | if(len(line) < 9): 64 | line.append(rng + " is the " + lbl + " of ") 65 | line.append(rng + " is the " + lbl + " of ") 66 | else: 67 | line[7] = rng + " is the " + lbl + " of " 68 | line[8] = rng + " is the " + lbl + " of " 69 | mve += rng + " is the " + lbl + " of \n" 70 | final_lines.append(",".join(line)) 71 | 72 | fw = open(project_name+"/"+"mve"+output_file, 'w') 73 | fw.write(mve) 74 | 75 | fw2 = open(project_name+"/"+output_file, 'w') 76 | fw2.write("\n".join(final_lines)) 77 | return final_lines 78 | 79 | if __name__ == "__main__": 80 | """ 81 | Section to parse the command line arguments. 82 | """ 83 | parser = argparse.ArgumentParser() 84 | requiredNamed = parser.add_argument_group('Required Arguments') 85 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 86 | help='Output from previous step', required=True) 87 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 88 | help='File in which you want to store output', required=True) 89 | requiredNamed.add_argument('--project_name', dest='project_name', 90 | metavar='project_name', help='test', required=True) 91 | args = parser.parse_args() 92 | input_file = args.inp 93 | output_file = args.out 94 | project_name = args.project_name 95 | decision_tree(input_file=input_file, output_file=output_file, 96 | project_name=project_name) 97 | pass 98 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/final_formatting.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | import argparse 4 | from sparql_generator import sparql_generator 5 | # Given format # 6 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 7 | 8 | # Required format : separated by semi-colon ## 9 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 10 | 11 | 12 | def final_formatting(input_file, uri_file, url, output_file, project_name, namespace,rs): 13 | 14 | if (int(rs) == 1) : 15 | open_files = open(input_file, 'r') 16 | lines = open_files.readlines() 17 | open_files.close() 18 | else: 19 | lines = sparql_generator(input_file=input_file, project_name=project_name, 20 | url=url, uri_file=uri_file, namespace=namespace) 21 | 22 | fl = 1 23 | 24 | output = "" 25 | 26 | """ 27 | - We iterate over the lines of the document. 
28 | - Convet the line into a list containig elements o the 29 | string delimited by commas. 30 | - 31 | """ 32 | for line in tqdm(lines): 33 | 34 | if fl: 35 | fl = 0 36 | continue 37 | l = line.split(',') 38 | 39 | # print l 40 | 41 | newl, to_remove = [], [] 42 | name = url.split("/")[-1] 43 | newl.append("dbo:"+name) 44 | newl.append("") 45 | newl.append("") 46 | 47 | nlq = l[7].split() 48 | # The fuzzy score column is not present in the 49 | # autmatically created csv file. 50 | if(len(l) == 10): 51 | nlq = l[6].split() 52 | 53 | """ 54 | From MVE column a question is selected and each word 55 | is put into as an element of the list. 56 | """ 57 | for i in range(len(nlq)): 58 | if '(' in nlq[i] or ')' in nlq[i]: 59 | to_remove.append(nlq[i]) 60 | continue 61 | if '<' not in nlq[i] and '?' not in nlq[i]: 62 | nlq[i] = nlq[i].lower() 63 | 64 | for x in to_remove: 65 | nlq.remove(x) 66 | 67 | spq = l[-2].split() 68 | """ 69 | Query one 70 | """ 71 | for i in range(len(spq)): 72 | if '<' not in spq[i] and '?' not in spq[i]: 73 | spq[i] = spq[i].lower() 74 | 75 | """ 76 | Query two 77 | """ 78 | gq = l[-1].split() 79 | for i in range(len(gq)): 80 | if '<' not in gq[i] and '?' not in gq[i] and '[' not in gq[i]: 81 | gq[i] = gq[i].lower() 82 | 83 | newl.append(" ".join(nlq)) 84 | newl.append(" ".join(spq)) 85 | newl.append(" ".join(gq)) 86 | output += ";".join(newl) + "\n" 87 | 88 | fw = open(project_name+"/"+ output_file, 'w') 89 | fw.write(output) 90 | fw.close() 91 | 92 | 93 | if __name__ == "__main__": 94 | """ 95 | Section to parse the command line arguments. 96 | """ 97 | parser = argparse.ArgumentParser() 98 | requiredNamed = parser.add_argument_group('Required Arguments') 99 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 100 | help='eg: File which contains metadata of properties', required=False) 101 | requiredNamed.add_argument( 102 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=False) 103 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 104 | help='File in which you want to store output', required=False) 105 | requiredNamed.add_argument('--project_name', dest='project_name', 106 | metavar='project_name', help='test', required=False) 107 | requiredNamed.add_argument('--url', dest='url', metavar='url', 108 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=False) 109 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 110 | help='eg: File which contains uri and number of occurrences of properties', required=False) 111 | requiredNamed.add_argument('--rs', dest='rs', metavar='rs', 112 | help='Toggle to run separately', required=True) 113 | 114 | args = parser.parse_args() 115 | input_file = args.inp 116 | uri_file = args.uri 117 | url = args.url 118 | rs =args.rs 119 | namespace = args.ns 120 | output_file = args.out 121 | project_name = args.project_name 122 | final_formatting(input_file=input_file, uri_file=uri_file, url=url, 123 | output_file=output_file, project_name=project_name, namespace=namespace, rs= rs) 124 | pass 125 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/get_properties.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | 
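# Usage sketch (values are illustrative; the flags are the ones defined in the
# argparse section at the bottom of this file):
#
#   python get_properties.py \
#       --url http://mappings.dbpedia.org/server/ontology/classes/Place \
#       --output_file get_properties.csv --project_name test
#
# This scrapes the property table of the given mappings page and writes one
# "Name,Label,Domain,Range" row per property to <project_name>/get_properties.csv.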
def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | """ 16 | page = urlopen(url) 17 | soup = BeautifulSoup(page, "html.parser") 18 | if(not os.path.isdir(project_name)): 19 | os.makedirs(project_name) 20 | output_file = open(project_name+"/" + output_file, 'w') 21 | fl = 0 22 | accum = [] 23 | for rows in tqdm(soup.find_all("tr")): 24 | x = rows.find_all("td") 25 | if len(x) <= 2: 26 | fl = 1 27 | continue 28 | if fl == 1: 29 | fl = 2 30 | continue 31 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 32 | label = rows.find_all("td")[1].get_text() 33 | dom = rows.find_all("td")[2].get_text() 34 | rng = rows.find_all("td")[3].get_text() 35 | 36 | final = name + "," + label + "," + dom + "," + rng 37 | accum.append(final) 38 | output_file.write(final+"\n") 39 | output_file.close() 40 | return accum 41 | 42 | 43 | """ 44 | Name, Label, Domain, Range 45 | """ 46 | 47 | if __name__ == "__main__": 48 | """ 49 | Section to parse the command line arguments. 50 | """ 51 | parser = argparse.ArgumentParser() 52 | requiredNamed = parser.add_argument_group('Required Arguments') 53 | 54 | requiredNamed.add_argument('--url', dest='url', metavar='url', 55 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 56 | requiredNamed.add_argument( 57 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 58 | requiredNamed.add_argument( 59 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 60 | args = parser.parse_args() 61 | url = args.url 62 | output_file = args.out_put 63 | project_name = args.project_name 64 | get_properties(url = url, project_name= project_name, output_file = output_file) 65 | pass 66 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/integrate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | from tqdm import tqdm 4 | from get_properties import get_properties 5 | from tqdm import tqdm 6 | 7 | """ 8 | How was the tsv file created in the first place? 9 | - The tsv file is read. 10 | - A dictionary diction in made. 11 | - Every time the namespace is matched with the name 12 | space mentioned in the command line argument. 13 | - If the name space matches the dictionary diction here 14 | is updated with {name of the entity = frequency of occurance} 15 | """ 16 | 17 | 18 | def integrate(namespace, uri_file, output_file="integrate.csv", project_name="test_project", url="Enter a valid URL", input_file="Pleaes enter a valid file name"): 19 | print("Reading the TSV file: ") 20 | open_tsv = open(uri_file, 'r') 21 | read_tsv = open_tsv.readlines() 22 | diction = {} 23 | for line in tqdm(read_tsv): 24 | line = line.strip().split('\t') 25 | if line[0].split('/')[-2] != namespace: 26 | continue 27 | diction[line[0].split('/')[-1]] = line[1] 28 | 29 | open_tsv.close() 30 | 31 | """ 32 | Processing the input file. 33 | - The input file is read, out put from get_properties.py 34 | - Reading lines from the input files. 35 | - Iterating over every line of the read file. 36 | - Taking the name from the line. 
37 | - if the given name is in the dictionry created above 38 | appending the url to the given name and corresponding 39 | frequency to the row entry(read line). Else appending 40 | an empty string. 41 | - Joining all the elements of the list line with a comma, 42 | adding a new line character and then going for the next 43 | iteration after adding it to a variable final (string addition) 44 | """ 45 | 46 | 47 | if (__name__ == "__main__"): 48 | print("Reading the input file: ") 49 | open_inp = open(input_file, 'r') 50 | line_inp = open_inp.readlines() 51 | 52 | if (not __name__ == "__main__"): 53 | line_inp = get_properties(url=url, output_file="get_properties.csv", project_name = project_name) 54 | 55 | cnt, tot = 0, 0 56 | final = "" 57 | accum = [] 58 | for in_line in tqdm(line_inp): 59 | 60 | line = in_line.strip().split(',') 61 | in_line = line[0] 62 | tot += 1 63 | # if ':' in m: 64 | # print "lol", m 65 | if in_line in diction: 66 | cnt += 1 67 | line.append("http://dbpedia.org/" + namespace + "/" + in_line) 68 | line.append(diction[in_line]) 69 | else: 70 | 71 | line.append('') 72 | line.append('') 73 | # print in_line 74 | 75 | final += ",".join(line) 76 | accum.append(",".join(line)) 77 | final += '\n' 78 | 79 | """ 80 | The string final is the written to the output file name 81 | as given in the command line argument. 82 | """ 83 | # print final 84 | f = open(project_name+"/"+output_file, 'w') 85 | f.write(final) 86 | print("**************************************") 87 | print("Total number of entity whose URI was found: "+str(cnt) + 88 | "\nTotal number of entities present: " + str(tot)) 89 | return accum 90 | 91 | 92 | if __name__ == "__main__": 93 | """ 94 | Section to parse the command line arguments. 95 | """ 96 | parser = argparse.ArgumentParser() 97 | requiredNamed = parser.add_argument_group('Required Arguments') 98 | requiredNamed.add_argument( 99 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=True) 100 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 101 | help='Output from previous step', required=True) 102 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 103 | help='eg: File which contains uri and number of occurrences of properties', required=True) 104 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 105 | help='File in which you want to store output', required=True) 106 | requiredNamed.add_argument('--project_name', dest='project_name', 107 | metavar='project_name', help='test', required=True) 108 | args = parser.parse_args() 109 | namespace = args.ns 110 | input_file = args.inp 111 | uri_file = args.uri 112 | output_file = args.out 113 | project_name = args.project_name 114 | integrate(namespace, uri_file, output_file, 115 | project_name, "Enter a valid URL", input_file) 116 | pass 117 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/range_place.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | from sparql_generator import sparql_generator 5 | from tqdm import tqdm 6 | 7 | def range_place(input_file, project_name, output_file="test_res.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 8 | if __name__ == "__main__": 9 | f = open(input_file, 'r') 10 | lines = f.readlines() 11 | f.close() 12 | print ("hello") 13 | if not __name__ == "__main__": 14 | lines = 
sparql_generator(input_file=input_file, project_name=project_name, 15 | url=url, uri_file=uri_file, namespace=namespace) 16 | 17 | output_file_write = open(project_name+"/" + output_file, 'w') 18 | name = url.split("/")[-1] 19 | accum = [] 20 | for l in tqdm(lines): 21 | l = l.split(',') 22 | if len(l) == 0: 23 | continue 24 | if l[5] == "" or len(l[5]) == 0: 25 | continue 26 | if name.lower() in l[2].lower() and l[5] != '': 27 | newl, to_remove = [], [] 28 | newl.append("dbo:"+name) 29 | newl.append("") 30 | newl.append("") 31 | nlq = l[7].split() 32 | for i in range(len(nlq)): 33 | if '(' in nlq[i] or ')' in nlq[i]: 34 | to_remove.append(nlq[i]) 35 | continue 36 | if '<' not in nlq[i] and '?' not in nlq[i]: 37 | nlq[i] = nlq[i].lower() 38 | 39 | for x in to_remove: 40 | nlq.remove(x) 41 | 42 | nlq = " ".join(nlq) 43 | 44 | spq = l[9].split() 45 | for i in range(len(spq)): 46 | if '<' not in spq[i] and '?' not in spq[i]: 47 | spq[i] = spq[i].lower() 48 | 49 | spq = " ".join(spq) 50 | 51 | gq = l[-1].split() 52 | for i in range(len(gq)): 53 | if '<' not in gq[i] and '?' not in gq[i] and '[' not in gq[i]: 54 | gq[i] = gq[i].lower() 55 | 56 | gq = " ".join(gq) 57 | 58 | nlq = nlq.replace('', '') 59 | spq = spq.replace('?x', '?a').replace('', '') 60 | gq = gq.replace('?x', '?a').replace('', '') 61 | newl.append((nlq)) 62 | newl.append((spq)) 63 | newl.append((gq)) 64 | accum.append(";".join(newl)) 65 | output_file_write.write("\n".join(accum)) 66 | return [lines,output_file_write] 67 | 68 | 69 | if __name__ == "__main__": 70 | """ 71 | Section to parse the command line arguments. 72 | """ 73 | parser = argparse.ArgumentParser() 74 | requiredNamed = parser.add_argument_group('Required Arguments') 75 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 76 | help='eg: File which contains metadata of properties', required=True) 77 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 78 | help='File in which you want to store output', required=True) 79 | requiredNamed.add_argument('--project_name', dest='project_name', 80 | metavar='project_name', help='test', required=True) 81 | requiredNamed.add_argument('--url', dest='url', metavar='url', 82 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 83 | 84 | args = parser.parse_args() 85 | input_file = args.inp 86 | output_file = args.out 87 | url = args.url 88 | project_name = args.project_name 89 | range_place(input_file=input_file, output_file=output_file, 90 | project_name=project_name, url=url) 91 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/sparql_generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from decision_tree import decision_tree 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | """ 7 | Section to parse the command line arguments. 
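Beyond the argument parsing, this module defines sparql_generator(), which
appends a SPARQL query template and a generator query template to every row
produced by decision_tree.py.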
8 | """ 9 | 10 | def sparql_generator(input_file, project_name, output_file="sparql_generator.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 11 | if __name__ == "__main__": 12 | f = open(input_file, 'r') 13 | lines = f.readlines() 14 | pass 15 | if not __name__ == "__main__": 16 | lines = decision_tree(input_file=input_file, project_name=project_name, 17 | url=url, uri_file=uri_file, namespace=namespace) 18 | pass 19 | 20 | # print lines[0].split(',') 21 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 22 | # 'MVE', 'Optimal Expression, SPARQL-TEMPLATE, GENERATOR-QUERY-TEMPLATE\r\n'] 23 | # sparql_template = [] 24 | 25 | """ 26 | - Read the file generated in the previous step. 27 | - Read the lines from the file an save it as a list. 28 | - If the frequency is known, Replace the 2nd last elemet of the formed list with 29 | a where statement, and last one witha where statement 30 | followed by an assertion if it is a place. 31 | - Join the updated list with comma as a delimeter and save 32 | add it in the string ending with a newline character. 33 | - Print the final on the terminal 34 | """ 35 | accum = [] 36 | final = "" 37 | lineno = 1 38 | for line in tqdm(lines): 39 | if lineno == 1: 40 | lineno += 1 41 | continue 42 | line = line.strip().split(',') 43 | # print lines 44 | if line[4] != '': 45 | # print line[5] 46 | # It was found the the MVE and OE was also required hence: 47 | #line[-2] = 'SELECT ?x WHERE { <' + line[5] + '> ?x }' 48 | #line[-1] = 'SELECT ?a WHERE { ?a <' + line[5] + '> [] . ?a a }' 49 | line.append('SELECT ?x WHERE { <' + line[4] + '> ?x }') 50 | line.append( 51 | 'SELECT ?a WHERE { ?a <' + line[4] + '> [] . ?a a }') 52 | 53 | final += ",".join(line) 54 | accum.append(",".join(line)) 55 | final += '\n' 56 | 57 | # print final 58 | 59 | # fw = open() 60 | 61 | """ 62 | This data generated might be required for further steps 63 | thus it is saved in another file named sparql.csv 64 | """ 65 | 66 | open(project_name+"/"+output_file, 'w').write(final) 67 | return accum 68 | 69 | 70 | if __name__ == "__main__": 71 | """ 72 | Section to parse the command line arguments. 
73 | """ 74 | parser = argparse.ArgumentParser() 75 | requiredNamed = parser.add_argument_group('Required Arguments') 76 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 77 | help='Output from previous step', required=True) 78 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 79 | help='File in which you want to store output', required=True) 80 | requiredNamed.add_argument('--project_name', dest='project_name', 81 | metavar='project_name', help='eg.:test', required=True) 82 | args = parser.parse_args() 83 | input_file = args.inp 84 | output_file = args.out 85 | project_name = args.project_name 86 | sparql_generator(input_file=input_file, output_file=output_file, 87 | project_name=project_name) 88 | pass 89 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/decision_tree.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import argparse 4 | from tqdm import tqdm 5 | from integrate import integrate 6 | 7 | 8 | def decision_tree(input_file, project_name, output_file="decision_tree.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 9 | if __name__ == "__main__": 10 | f = open(input_file, 'r') 11 | lines = f.readlines() 12 | pass 13 | if not __name__ == "__main__": 14 | lines = integrate(namespace=namespace, uri_file=uri_file, 15 | project_name=project_name, url=url) 16 | final_lines = [] 17 | lineno = 1 18 | 19 | """ 20 | print lines[0].split(',') 21 | ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 22 | 'MVE', 'Optimal Expression\r\n'] 23 | """ 24 | 25 | """ 26 | - The lines from the file generated in the previous steps 27 | is read and a for loop iterates through ecery row of 28 | - First we create a list of all elements seperated by commas. 29 | - If the range has the substring person, the we put as 30 | question who else what. 31 | - We append the question thus generate 2 times as minimum 32 | viable instruction and optimal expression. 33 | - We create a variable names final lines and add strings, 34 | which are formed by adding strings formed by joining 35 | the elements of the list delemited by comma in each line. 36 | - We also create a string of the question generated 37 | delemited by a newline characte and store it in mve 38 | as a long string. 39 | - We output the series of question in mve_output. 40 | - We save the final_lines strind in a file named GS_with_mve.csv 41 | delimeted by a newline character. 42 | """ 43 | 44 | mve = "" 45 | for line in tqdm(lines): 46 | if lineno == 1: 47 | lineno += 1 48 | continue 49 | line = line.strip().split(',') 50 | # Wrong index for range is corrected here 51 | rng = line[3].lower() 52 | lbl = line[1] 53 | if 'person' in rng: 54 | rng = "who" 55 | else: 56 | rng = "what" 57 | # The total length of a row in the list is 6, 58 | # thus 7 and 8 are out of range values. Thus I 59 | # replaced it with append. 
60 | """ 61 | line[7] = rng + " is the " + lbl + " of " 62 | line[8] = rng + " is the " + lbl + " of " 63 | """ 64 | if(len(line) < 9): 65 | line.append(rng + " is the " + lbl + " of ") 66 | line.append(rng + " is the " + lbl + " of ") 67 | else: 68 | line[7] = rng + " is the " + lbl + " of " 69 | line[8] = rng + " is the " + lbl + " of " 70 | mve += rng + " is the " + lbl + " of \n" 71 | final_lines.append(",".join(line)) 72 | 73 | fw = open(project_name+"/"+"mve"+output_file, 'w') 74 | fw.write(mve) 75 | 76 | fw2 = open(project_name+"/"+output_file, 'w') 77 | fw2.write("\n".join(final_lines)) 78 | return final_lines 79 | 80 | if __name__ == "__main__": 81 | """ 82 | Section to parse the command line arguments. 83 | """ 84 | parser = argparse.ArgumentParser() 85 | requiredNamed = parser.add_argument_group('Required Arguments') 86 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 87 | help='Output from previous step', required=True) 88 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 89 | help='File in which you want to store output', required=True) 90 | requiredNamed.add_argument('--project_name', dest='project_name', 91 | metavar='project_name', help='test', required=True) 92 | args = parser.parse_args() 93 | input_file = args.inp 94 | output_file = args.out 95 | project_name = args.project_name 96 | decision_tree(input_file=input_file, output_file=output_file, 97 | project_name=project_name) 98 | pass 99 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/final_formatting.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | import argparse 4 | from sparql_generator import sparql_generator 5 | # Given format # 6 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 7 | 8 | # Required format : separated by semi-colon ## 9 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 10 | 11 | 12 | def final_formatting(input_file, uri_file, url, output_file, project_name, namespace,rs): 13 | 14 | if (int(rs) == 1) : 15 | open_files = open(input_file, 'r') 16 | lines = open_files.readlines() 17 | open_files.close() 18 | else: 19 | lines = sparql_generator(input_file=input_file, project_name=project_name, 20 | url=url, uri_file=uri_file, namespace=namespace) 21 | 22 | fl = 1 23 | 24 | output = "" 25 | 26 | """ 27 | - We iterate over the lines of the document. 28 | - Convet the line into a list containig elements o the 29 | string delimited by commas. 30 | - 31 | """ 32 | for line in tqdm(lines): 33 | 34 | if fl: 35 | fl = 0 36 | continue 37 | l = line.split(',') 38 | 39 | # print l 40 | 41 | newl, to_remove = [], [] 42 | name = url.split("/")[-1] 43 | newl.append("dbo:"+name) 44 | newl.append("") 45 | newl.append("") 46 | 47 | nlq = l[7].split() 48 | # The fuzzy score column is not present in the 49 | # autmatically created csv file. 50 | if(len(l) == 10): 51 | nlq = l[6].split() 52 | 53 | """ 54 | From MVE column a question is selected and each word 55 | is put into as an element of the list. 56 | """ 57 | for i in range(len(nlq)): 58 | if '(' in nlq[i] or ')' in nlq[i]: 59 | to_remove.append(nlq[i]) 60 | continue 61 | if '<' not in nlq[i] and '?' 
not in nlq[i]: 62 | nlq[i] = nlq[i].lower() 63 | 64 | for x in to_remove: 65 | nlq.remove(x) 66 | 67 | spq = l[-2].split() 68 | """ 69 | Query one 70 | """ 71 | for i in range(len(spq)): 72 | if '<' not in spq[i] and '?' not in spq[i]: 73 | spq[i] = spq[i].lower() 74 | 75 | """ 76 | Query two 77 | """ 78 | gq = l[-1].split() 79 | for i in range(len(gq)): 80 | if '<' not in gq[i] and '?' not in gq[i] and '[' not in gq[i]: 81 | gq[i] = gq[i].lower() 82 | 83 | newl.append(" ".join(nlq)) 84 | newl.append(" ".join(spq)) 85 | newl.append(" ".join(gq)) 86 | output += ";".join(newl) + "\n" 87 | 88 | fw = open(project_name+"/"+ output_file, 'w') 89 | fw.write(output) 90 | fw.close() 91 | 92 | 93 | if __name__ == "__main__": 94 | """ 95 | Section to parse the command line arguments. 96 | """ 97 | parser = argparse.ArgumentParser() 98 | requiredNamed = parser.add_argument_group('Required Arguments') 99 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 100 | help='eg: File which contains metadata of properties', required=False) 101 | requiredNamed.add_argument( 102 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=False) 103 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 104 | help='File in which you want to store output', required=False) 105 | requiredNamed.add_argument('--project_name', dest='project_name', 106 | metavar='project_name', help='test', required=False) 107 | requiredNamed.add_argument('--url', dest='url', metavar='url', 108 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=False) 109 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 110 | help='eg: File which contains uri and number of occurrences of properties', required=False) 111 | requiredNamed.add_argument('--rs', dest='rs', metavar='rs', 112 | help='Toggle to run separately', required=True) 113 | 114 | args = parser.parse_args() 115 | input_file = args.inp 116 | uri_file = args.uri 117 | url = args.url 118 | rs =args.rs 119 | namespace = args.ns 120 | output_file = args.out 121 | project_name = args.project_name 122 | final_formatting(input_file=input_file, uri_file=uri_file, url=url, 123 | output_file=output_file, project_name=project_name, namespace=namespace, rs= rs) 124 | pass 125 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/get_properties.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 
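A typical output row, in the order [Name, Label, Domain, Range] (example
copied from a generated get_properties.csv elsewhere in this repository):

    director,film director,Film,Person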
15 | """ 16 | page = urlopen(url) 17 | soup = BeautifulSoup(page, "html.parser") 18 | if(not os.path.isdir(project_name)): 19 | os.makedirs(project_name) 20 | output_file = open(project_name+"/" + output_file, 'w') 21 | fl = 0 22 | accum = [] 23 | for rows in tqdm(soup.find_all("tr")): 24 | x = rows.find_all("td") 25 | if len(x) <= 2: 26 | fl = 1 27 | continue 28 | if fl == 1: 29 | fl = 2 30 | continue 31 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 32 | label = rows.find_all("td")[1].get_text() 33 | dom = rows.find_all("td")[2].get_text() 34 | rng = rows.find_all("td")[3].get_text() 35 | 36 | final = name + "," + label + "," + dom + "," + rng 37 | accum.append(final) 38 | output_file.write(final+"\n") 39 | output_file.close() 40 | return accum 41 | 42 | 43 | """ 44 | Name, Label, Domain, Range 45 | """ 46 | 47 | if __name__ == "__main__": 48 | """ 49 | Section to parse the command line arguments. 50 | """ 51 | parser = argparse.ArgumentParser() 52 | requiredNamed = parser.add_argument_group('Required Arguments') 53 | 54 | requiredNamed.add_argument('--url', dest='url', metavar='url', 55 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 56 | requiredNamed.add_argument( 57 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 58 | requiredNamed.add_argument( 59 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 60 | args = parser.parse_args() 61 | url = args.url 62 | output_file = args.out_put 63 | project_name = args.project_name 64 | get_properties(url = url, project_name= project_name, output_file = output_file) 65 | pass 66 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/integrate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | from tqdm import tqdm 4 | from get_properties import get_properties 5 | from tqdm import tqdm 6 | 7 | """ 8 | How was the tsv file created in the first place? 9 | - The tsv file is read. 10 | - A dictionary diction in made. 11 | - Every time the namespace is matched with the name 12 | space mentioned in the command line argument. 13 | - If the name space matches the dictionary diction here 14 | is updated with {name of the entity = frequency of occurance} 15 | """ 16 | 17 | 18 | def integrate(namespace, uri_file, output_file="integrate.csv", project_name="test_project", url="Enter a valid URL", input_file="Pleaes enter a valid file name"): 19 | print("Reading the TSV file: ") 20 | open_tsv = open(uri_file, 'r') 21 | read_tsv = open_tsv.readlines() 22 | diction = {} 23 | for line in tqdm(read_tsv): 24 | line = line.strip().split('\t') 25 | if line[0].split('/')[-2] != namespace: 26 | continue 27 | diction[line[0].split('/')[-1]] = line[1] 28 | 29 | open_tsv.close() 30 | 31 | """ 32 | Processing the input file. 33 | - The input file is read, out put from get_properties.py 34 | - Reading lines from the input files. 35 | - Iterating over every line of the read file. 36 | - Taking the name from the line. 37 | - if the given name is in the dictionry created above 38 | appending the url to the given name and corresponding 39 | frequency to the row entry(read line). Else appending 40 | an empty string. 
41 | - Joining all the elements of the list line with a comma, 42 | adding a new line character and then going for the next 43 | iteration after adding it to a variable final (string addition) 44 | """ 45 | 46 | 47 | if (__name__ == "__main__"): 48 | print("Reading the input file: ") 49 | open_inp = open(input_file, 'r') 50 | line_inp = open_inp.readlines() 51 | 52 | if (not __name__ == "__main__"): 53 | line_inp = get_properties(url=url, output_file="get_properties.csv", project_name = project_name) 54 | 55 | cnt, tot = 0, 0 56 | final = "" 57 | accum = [] 58 | for in_line in tqdm(line_inp): 59 | 60 | line = in_line.strip().split(',') 61 | in_line = line[0] 62 | tot += 1 63 | # if ':' in m: 64 | # print "lol", m 65 | if in_line in diction: 66 | cnt += 1 67 | line.append("http://dbpedia.org/" + namespace + "/" + in_line) 68 | line.append(diction[in_line]) 69 | else: 70 | 71 | line.append('') 72 | line.append('') 73 | # print in_line 74 | 75 | final += ",".join(line) 76 | accum.append(",".join(line)) 77 | final += '\n' 78 | 79 | """ 80 | The string final is the written to the output file name 81 | as given in the command line argument. 82 | """ 83 | # print final 84 | f = open(project_name+"/"+output_file, 'w') 85 | f.write(final) 86 | print("**************************************") 87 | print("Total number of entity whose URI was found: "+str(cnt) + 88 | "\nTotal number of entities present: " + str(tot)) 89 | return accum 90 | 91 | 92 | if __name__ == "__main__": 93 | """ 94 | Section to parse the command line arguments. 95 | """ 96 | parser = argparse.ArgumentParser() 97 | requiredNamed = parser.add_argument_group('Required Arguments') 98 | requiredNamed.add_argument( 99 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=True) 100 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 101 | help='Output from previous step', required=True) 102 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 103 | help='eg: File which contains uri and number of occurrences of properties', required=True) 104 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 105 | help='File in which you want to store output', required=True) 106 | requiredNamed.add_argument('--project_name', dest='project_name', 107 | metavar='project_name', help='test', required=True) 108 | args = parser.parse_args() 109 | namespace = args.ns 110 | input_file = args.inp 111 | uri_file = args.uri 112 | output_file = args.out 113 | project_name = args.project_name 114 | integrate(namespace, uri_file, output_file, 115 | project_name, "Enter a valid URL", input_file) 116 | pass 117 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/sparql_generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from decision_tree import decision_tree 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | """ 7 | Section to parse the command line arguments. 
8 | """ 9 | 10 | def sparql_generator(input_file, project_name, output_file="sparql_generator.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 11 | if __name__ == "__main__": 12 | f = open(input_file, 'r') 13 | lines = f.readlines() 14 | pass 15 | if not __name__ == "__main__": 16 | lines = decision_tree(input_file=input_file, project_name=project_name, 17 | url=url, uri_file=uri_file, namespace=namespace) 18 | pass 19 | 20 | # print lines[0].split(',') 21 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 22 | # 'MVE', 'Optimal Expression, SPARQL-TEMPLATE, GENERATOR-QUERY-TEMPLATE\r\n'] 23 | # sparql_template = [] 24 | 25 | """ 26 | - Read the file generated in the previous step. 27 | - Read the lines from the file an save it as a list. 28 | - If the frequency is known, Replace the 2nd last elemet of the formed list with 29 | a where statement, and last one witha where statement 30 | followed by an assertion if it is a place. 31 | - Join the updated list with comma as a delimeter and save 32 | add it in the string ending with a newline character. 33 | - Print the final on the terminal 34 | """ 35 | accum = [] 36 | final = "" 37 | lineno = 1 38 | for line in tqdm(lines): 39 | if lineno == 1: 40 | lineno += 1 41 | continue 42 | line = line.strip().split(',') 43 | # print lines 44 | if line[4] != '': 45 | # print line[5] 46 | # It was found the the MVE and OE was also required hence: 47 | #line[-2] = 'SELECT ?x WHERE { <' + line[5] + '> ?x }' 48 | #line[-1] = 'SELECT ?a WHERE { ?a <' + line[5] + '> [] . ?a a }' 49 | line.append('SELECT ?x WHERE { <' + line[4] + '> ?x }') 50 | line.append( 51 | 'SELECT ?a WHERE { ?a <' + line[4] + '> [] . ?a a }') 52 | 53 | final += ",".join(line) 54 | accum.append(",".join(line)) 55 | final += '\n' 56 | 57 | # print final 58 | 59 | # fw = open() 60 | 61 | """ 62 | This data generated might be required for further steps 63 | thus it is saved in another file named sparql.csv 64 | """ 65 | 66 | open(project_name+"/"+output_file, 'w').write(final) 67 | return accum 68 | 69 | 70 | if __name__ == "__main__": 71 | """ 72 | Section to parse the command line arguments. 
73 | """ 74 | parser = argparse.ArgumentParser() 75 | requiredNamed = parser.add_argument_group('Required Arguments') 76 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 77 | help='Output from previous step', required=True) 78 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 79 | help='File in which you want to store output', required=True) 80 | requiredNamed.add_argument('--project_name', dest='project_name', 81 | metavar='project_name', help='eg.:test', required=True) 82 | args = parser.parse_args() 83 | input_file = args.inp 84 | output_file = args.out 85 | project_name = args.project_name 86 | sparql_generator(input_file=input_file, output_file=output_file, 87 | project_name=project_name) 88 | pass 89 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/petrichor/Projects/environments/gymnoz/bin/python" 3 | } -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/Film/get_properties.csv: -------------------------------------------------------------------------------- 1 | afdbId,afdb id,Film,xsd:string 2 | allcinemaId,allcinema id,Film,xsd:string 3 | alternativeTitle,alternative title,Work,rdf:langString 4 | amgid,amgId,Film,xsd:string 5 | author,author,Work,Person 6 | basedOn,based on,Work,Work 7 | bgafdId,bgafd id,Film,xsd:string 8 | bibo:pages,pages,Work,xsd:string 9 | chiefEditor,chief editor,Work,Person 10 | cinematography,cinematography,Film,Person 11 | cites,cites,Work,xsd:string 12 | commissioner,commissioner,Work,xsd:string 13 | completionDate,completion date,Work,xsd:date 14 | composer,composer,Work,Person 15 | costumeDesigner,costume designer,Film,Person 16 | coverArtist,cover artist,Work,Person 17 | dc:description,description,Work,xsd:string 18 | dc:publisher,publisher,Work,xsd:string 19 | dcc,Dewey Decimal Classification,Work,xsd:string 20 | dct:references,references,Work,owl:Thing 21 | dct:source,source,Work,owl:Thing 22 | director,film director,Film,Person 23 | eTeatrId,e-teatr.pl id,Film,xsd:string 24 | editing,editing,Film,Person 25 | egafdId,egafd id,Film,xsd:string 26 | eurobabeIndexId,eurobabe index id,Film,xsd:string 27 | fileSize,size,Work,InformationUnit 28 | filename,filename,Work,xsd:string 29 | filmAudioType,film audio type,Film,xsd:string 30 | filmColourType,film colour type,Film,xsd:string 31 | filmRuntime,film runtime,Film,Time 32 | firstBroadcast,first broadcast,Film,xsd:string 33 | gross,gross,Film,Currency 34 | iafdId,iafd id,Film,xsd:string 35 | idAllocine,Allocine ID,Film,xsd:string 36 | license,license,Work,owl:Thing 37 | mainCharacter,main character,Work,Person 38 | makeupArtist,makeup artist,Film,Person 39 | musicComposer,music composer,Work,MusicalArtist 40 | narrator,narrator,Work,Person 41 | originalLanguage,original language,Work,Language 42 | originalTitle,original title,Work,rdf:langString 43 | previousWork,previous work,Work,Work 44 | producedBy,produced by,Film,Company 45 | producer,producer,Work,Agent 46 | productionCompany,production company,Work,Company 47 | publisher,publisher,Work,Agent 48 | quebecerTitle,quebecer title,Film,xsd:string 49 | releaseLocation,release location,Work,Place 50 | runtime,runtime,Work,Time 51 | setDesigner,set designer,Film,Person 52 | skos:notation,notation,Work,xsd:string 53 | specialEffects,special 
effects,Film,Person 54 | starring,starring,Work,Actor 55 | subjectTerm,subject term,Work,xsd:string 56 | subsequentWork,subsequent work,Work,Work 57 | titleLanguage,title language,Film,xsd:string 58 | translator,translator,Work,Person 59 | writer,writer,Work,Person 60 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/Film/sentence_and_template_generator.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/.pipeline_3/Film/sentence_and_template_generator.ods -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/eliminator.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | 4 | def eliminator(input_file, output_file,threshold): 5 | lines = open(input_file,'r').readlines() 6 | print(len(lines)) 7 | accum = [] 8 | nspm_ready = open(output_file,'w') 9 | for line in tqdm(lines): 10 | values = line.split(";") 11 | if(int(values[-1])>int(threshold)): 12 | accum.append(";".join(values[:-1])+"\n") 13 | nspm_ready.write(accum[-1]) 14 | nspm_ready.close() 15 | 16 | 17 | if __name__ == "__main__": 18 | """ 19 | Section to parse the command line arguments. 20 | """ 21 | parser = argparse.ArgumentParser() 22 | requiredNamed = parser.add_argument_group('Required Arguments') 23 | 24 | requiredNamed.add_argument('--input', dest='input', metavar='input', 25 | help='Input file name ', required=True) 26 | requiredNamed.add_argument( 27 | '--output_file', dest='output', metavar='output', help='Output file name', required=True) 28 | requiredNamed.add_argument( 29 | '--threshold', dest='threshold', metavar='threshold', help='threshold', required=True) 30 | args = parser.parse_args() 31 | input_file = args.input 32 | output_file = args.output 33 | threshold = args.threshold 34 | eliminator(input_file=input_file, output_file=output_file,threshold=threshold) 35 | pass 36 | 37 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/eukaryotes/get_properties.csv: -------------------------------------------------------------------------------- 1 | binomial,binomial,Species,owl:Thing 2 | binomialAuthority,binomial authority,Species,owl:Thing 3 | clade,clade,Species,owl:Thing 4 | classis,classis,Species,owl:Thing 5 | conservationStatus,conservation status,Species,xsd:string 6 | conservationStatusSystem,conservation status system,Species,xsd:string 7 | domain,domain,Species,owl:Thing 8 | extinctionYear,extinction year,Species,xsd:gYear 9 | family,family,Species,Species 10 | fossil,fossil,Species,Species 11 | genus,genus,Species,owl:Thing 12 | kingdom,kingdom,Species,owl:Thing 13 | order,order (taxonomy),Species,owl:Thing 14 | parentheses,parentheses,Species,owl:Thing 15 | phylum,phylum,Species,owl:Thing 16 | redListIdNL,red list ID NL,Species,xsd:integer 17 | scientificName,scientific name,Species,xsd:string 18 | species,species,Species,Species 19 | subClassis,sub-classis,Species,owl:Thing 20 | subFamily,sub-family,Species,Taxon 21 | subGenus,subgenus,Species,owl:Thing 22 | subOrder,sub-order,Species,owl:Thing 23 | subTribus,subtribus,Species,Species 24 | superFamily,super-family,Species,Taxon 25 | superOrder,super-order,Species,owl:Thing 26 | superTribus,supertribus,Species,Species 27 | taxon,has 
taxon,Species,Taxon 28 | tribus,tribus,Species,Species 29 | woRMS,WoRMS,Species,owl:Thing 30 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/eukaryotes/sentence_and_template_generator.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/.pipeline_3/eukaryotes/sentence_and_template_generator.ods -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/fetch_ranks_sub.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='part-r-00000'): 5 | sub = open(filename,'r').readlines() 6 | diction={} 7 | 8 | print("Loading Rankings") 9 | for val in tqdm(sub): 10 | diction[val.split('\t')[0].strip()[1:-1].strip()] = float(val.split('\t')[-2].split('"')[1]) 11 | return diction 12 | 13 | if __name__ == "__main__": 14 | fetch_ranks() 15 | pass 16 | 17 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/generate_templates.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from get_properties import get_properties 3 | from generate_url import generate_url 4 | from sentence_and_template_generator import sentence_and_template_generator 5 | import os 6 | from fetch_ranks_sub import fetch_ranks 7 | import logging 8 | 9 | def generate_templates(label,project_name,depth=1,output_file="sentence_and_template_generator"): 10 | """ 11 | Funtion to generate templates | wrapper function for rest of the functions. 12 | """ 13 | val = generate_url(label) 14 | url = val[0] 15 | about = (val[1]) 16 | count =0 17 | vessel= [] 18 | 19 | diction = fetch_ranks("../utility/part-r-00000") 20 | if(not os.path.isdir(project_name)): 21 | os.makedirs(project_name) 22 | output_file = open(project_name+"/" + output_file, 'w') 23 | 24 | # Create a logger object 25 | logger = logging.getLogger() 26 | 27 | # Configure logger 28 | logging.basicConfig(filename=project_name+"/logfile.log", format='%(filename)s: %(message)s', filemode='w') 29 | 30 | # Setting threshold level 31 | logger.setLevel(logging.DEBUG) 32 | 33 | # Use the logging methods 34 | #logger.debug("This is a debug message") 35 | logger.info("This is a log file.") 36 | #logger.warning("This is a warning message") 37 | #logger.error("This is an error message") 38 | #logger.critical("This is a critical message") 39 | 40 | list_of_property_information = get_properties(url=url,project_name=project_name,output_file = "get_properties.csv") 41 | for property_line in list_of_property_information: 42 | count+=1 43 | prop = property_line.split(',') 44 | print("**************\n"+str(prop)) 45 | sentence_and_template_generator(log=logger,diction=diction,output_file=output_file,mother_ontology=about.strip().replace("http://dbpedia.org/ontology/","dbo:"),vessel=vessel,project_name=project_name ,prop=prop, suffix = " of ?",count = 2) 46 | output_file.close() 47 | 48 | if __name__ == "__main__": 49 | """ 50 | Section to parse the command line arguments. 
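Illustrative invocation (the label value is an example; the script also
expects the page-rank dump ../utility/part-r-00000 read by fetch_ranks above):

    python generate_templates.py --label place --project_name test --depth 1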
51 | """ 52 | parser = argparse.ArgumentParser() 53 | requiredNamed = parser.add_argument_group('Required Arguments') 54 | 55 | requiredNamed.add_argument('--label', dest='label', metavar='label', 56 | help='label: person, place etc.', required=True) 57 | requiredNamed.add_argument( 58 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 59 | requiredNamed.add_argument( 60 | '--depth', dest='depth', metavar='depth', help='Mention the depth you want to go in the knowledge graph (The number of questions will increase exponentially!), e.g. 2', required=False) 61 | args = parser.parse_args() 62 | label = args.label 63 | project_name = args.project_name 64 | depth = args.depth 65 | generate_templates(label=label,project_name=project_name,depth=depth) 66 | pass -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | - This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | - This code on execution creates a csv which contains all the properties, ontology, 16 | class related information and data types as field values in each row. 17 | - This function also returns a 2D list of the information mentioned above to the calling 18 | function 19 | """ 20 | page = urllib.request.urlopen(url) 21 | soup = BeautifulSoup(page, "html.parser") 22 | if(not os.path.isdir(project_name)): 23 | os.makedirs(project_name) 24 | output_file = open(project_name+"/" + output_file, 'w') 25 | fl = 0 26 | accum = [] 27 | for rows in tqdm(soup.find_all("tr")): 28 | x = rows.find_all("td") 29 | if len(x) <= 2: 30 | fl = 1 31 | continue 32 | if fl == 1: 33 | fl = 2 34 | continue 35 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 36 | label = rows.find_all("td")[1].get_text() 37 | dom = rows.find_all("td")[2].get_text() 38 | rng = rows.find_all("td")[3].get_text() 39 | URL_name = ((rows.find_all("td")[0].find('a').attrs['href'])) 40 | final = name + "," + label + "," + dom + "," + rng 41 | #+ ","+ URL_name.split(':')[-1] 42 | accum.append(final) 43 | output_file.write(final+"\n") 44 | output_file.close() 45 | return accum 46 | 47 | 48 | """ 49 | Name, Label, Domain, Range, URL_name 50 | """ 51 | 52 | if __name__ == "__main__": 53 | """ 54 | Section to parse the command line arguments. 
55 | """ 56 | parser = argparse.ArgumentParser() 57 | requiredNamed = parser.add_argument_group('Required Arguments') 58 | 59 | requiredNamed.add_argument('--url', dest='url', metavar='url', 60 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 61 | requiredNamed.add_argument( 62 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 63 | requiredNamed.add_argument( 64 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 65 | args = parser.parse_args() 66 | url = args.url 67 | output_file = args.out_put 68 | project_name = args.project_name 69 | get_properties(url = url, project_name= project_name, output_file = output_file) 70 | pass 71 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/rank_test/affiliation.csv: -------------------------------------------------------------------------------- 1 | administrator,administrator,Organisation,Person 2 | age,age,Agent,xsd:integer 3 | artPatron,patron (art),Agent,Artist 4 | ceo,chief executive officer,Organisation,Person 5 | chairperson,chairperson,Organisation,Person 6 | championships,championships,Agent,xsd:nonNegativeInteger 7 | chaplain,chaplain,Organisation,Person 8 | childOrganisation,child organisation,Organisation,Organisation 9 | denomination,denomination,Agent,owl:Thing 10 | discipline,discipline,Agent,owl:Thing 11 | endowment,endowment,Organisation,Currency 12 | formationDate,formation date,Organisation,xsd:date 13 | formationYear,formation year,Organisation,xsd:gYear 14 | foundationPlace,foundation place,Organisation,City 15 | generalCouncil,general council,Agent,TermOfOffice 16 | headquarter,headquarter,Organisation,PopulatedPlace 17 | hometown,home town,Agent,Settlement 18 | ideology,ideology,Agent,Ideology 19 | juniorSeason,junior season,Agent,owl:Thing 20 | leaderFunction,leaderFunction,Organisation,PersonFunction 21 | legalForm,legal form,Organisation,owl:Thing 22 | locationCity,location city,Organisation,City 23 | mainOrgan,main organ,Organisation,owl:Thing 24 | managerSeason,manager season,Agent,owl:Thing 25 | membership,membership,Organisation,rdf:langString 26 | mergedWith,merged with,Organisation,Organisation 27 | nationalSelection,national selection,Agent,owl:Thing 28 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 29 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 30 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 31 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 32 | organisationMember,organisation member,Organisation,OrganisationMember 33 | owns,owns,Agent,Thing 34 | parentOrganisation,parent organisation,Organisation,Organisation 35 | playerSeason,player season,Agent,owl:Thing 36 | product,product,Organisation,owl:Thing 37 | ranking,ranking,Organisation,xsd:positiveInteger 38 | regionServed,region served,Organisation,Place 39 | regionalCouncil,regional council,Agent,TermOfOffice 40 | revenue,revenue,Organisation,Currency 41 | roleInEvent,A Person's role in an event,Agent,Event 42 | season,season,Agent,owl:Thing 43 | secretaryGeneral,secretary,Organisation,Person 44 | service,service,Organisation,owl:Thing 45 | staff,staff,Organisation,xsd:nonNegativeInteger 46 | superintendent,superintendent,Organisation,Person 47 | trustee,trustee,Organisation,Person 48 | 
-------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/rank_test/sentence_and_template_generator.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/.pipeline_3/rank_test/sentence_and_template_generator.ods -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/.images/person_properties.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/.images/person_properties.png -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/usr/bin/python" 3 | } -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Eukaryotes/sentence_and_template_generator: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Eukaryotes/sentence_and_template_generator -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Eukaryotes/test.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Eukaryotes/test.csv -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Monument/get_properties.csv: -------------------------------------------------------------------------------- 1 | architect,architect,ArchitecturalStructure,Architect 2 | architectualBureau,architectual bureau,ArchitecturalStructure,Company 3 | architecturalStyle,architectural style,ArchitecturalStructure,owl:Thing 4 | buildingEndYear,building end year,ArchitecturalStructure,xsd:gYear 5 | buildingStartYear,building start year,ArchitecturalStructure,xsd:gYear 6 | construction,construction,ArchitecturalStructure,owl:Thing 7 | constructionMaterial,construction material,ArchitecturalStructure,owl:Thing 8 | currentlyUsedFor,currently used for,ArchitecturalStructure,xsd:string 9 | dateUnveiled,date unveiled,Monument,xsd:date 10 | demolitionDate,demolition date,ArchitecturalStructure,xsd:date 11 | demolitionYear,demolition year,ArchitecturalStructure,xsd:gYear 12 | features,features,ArchitecturalStructure,Work 13 | groupCommemorated,group commemorated,Monument,xsd:string 14 | initiallyUsedFor,initally used for,ArchitecturalStructure,xsd:string 15 | maintainedBy,maintained by,ArchitecturalStructure,owl:Thing 16 | rebuildingDate,rebuilding date,ArchitecturalStructure,xsd:date 17 | rebuildingYear,rebuilding year,ArchitecturalStructure,xsd:gYear 18 | reopeningDate,reopening date,ArchitecturalStructure,xsd:date 19 | reopeningYear,reopening year,ArchitecturalStructure,xsd:gYear 20 | 
tenant,tenant,ArchitecturalStructure,Organisation 21 | visitorStatisticsAsOf,visitor statistics as of,ArchitecturalStructure,xsd:gYear 22 | visitorsPerDay,visitors per day,ArchitecturalStructure,xsd:nonNegativeInteger 23 | visitorsPerYear,visitors per year,ArchitecturalStructure,xsd:nonNegativeInteger 24 | visitorsPercentageChange,visitor percentage change,ArchitecturalStructure,xsd:double 25 | visitorsTotal,visitors total,ArchitecturalStructure,xsd:nonNegativeInteger 26 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Monument/tenant.csv: -------------------------------------------------------------------------------- 1 | administrator,administrator,Organisation,Person 2 | age,age,Agent,xsd:integer 3 | artPatron,patron (art),Agent,Artist 4 | ceo,chief executive officer,Organisation,Person 5 | chairperson,chairperson,Organisation,Person 6 | championships,championships,Agent,xsd:nonNegativeInteger 7 | chaplain,chaplain,Organisation,Person 8 | childOrganisation,child organisation,Organisation,Organisation 9 | denomination,denomination,Agent,owl:Thing 10 | discipline,discipline,Agent,owl:Thing 11 | endowment,endowment,Organisation,Currency 12 | formationDate,formation date,Organisation,xsd:date 13 | formationYear,formation year,Organisation,xsd:gYear 14 | foundationPlace,foundation place,Organisation,City 15 | generalCouncil,general council,Agent,TermOfOffice 16 | headquarter,headquarter,Organisation,PopulatedPlace 17 | hometown,home town,Agent,Settlement 18 | ideology,ideology,Agent,Ideology 19 | juniorSeason,junior season,Agent,owl:Thing 20 | leaderFunction,leaderFunction,Organisation,PersonFunction 21 | legalForm,legal form,Organisation,owl:Thing 22 | locationCity,location city,Organisation,City 23 | mainOrgan,main organ,Organisation,owl:Thing 24 | managerSeason,manager season,Agent,owl:Thing 25 | membership,membership,Organisation,rdf:langString 26 | mergedWith,merged with,Organisation,Organisation 27 | nationalSelection,national selection,Agent,owl:Thing 28 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 29 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 30 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 31 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 32 | organisationMember,organisation member,Organisation,OrganisationMember 33 | owns,owns,Agent,Thing 34 | parentOrganisation,parent organisation,Organisation,Organisation 35 | playerSeason,player season,Agent,owl:Thing 36 | product,product,Organisation,owl:Thing 37 | ranking,ranking,Organisation,xsd:positiveInteger 38 | regionServed,region served,Organisation,Place 39 | regionalCouncil,regional council,Agent,TermOfOffice 40 | revenue,revenue,Organisation,Currency 41 | roleInEvent,A Person's role in an event,Agent,Event 42 | season,season,Agent,owl:Thing 43 | secretaryGeneral,secretary,Organisation,Person 44 | service,service,Organisation,owl:Thing 45 | staff,staff,Organisation,xsd:nonNegativeInteger 46 | superintendent,superintendent,Organisation,Person 47 | trustee,trustee,Organisation,Person 48 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Monument/test.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Monument/test.csv -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Organisation/get_properties.csv: -------------------------------------------------------------------------------- 1 | administrator,administrator,Organisation,Person 2 | age,age,Agent,xsd:integer 3 | artPatron,patron (art),Agent,Artist 4 | ceo,chief executive officer,Organisation,Person 5 | chairperson,chairperson,Organisation,Person 6 | championships,championships,Agent,xsd:nonNegativeInteger 7 | chaplain,chaplain,Organisation,Person 8 | childOrganisation,child organisation,Organisation,Organisation 9 | denomination,denomination,Agent,owl:Thing 10 | discipline,discipline,Agent,owl:Thing 11 | endowment,endowment,Organisation,Currency 12 | formationDate,formation date,Organisation,xsd:date 13 | formationYear,formation year,Organisation,xsd:gYear 14 | foundationPlace,foundation place,Organisation,City 15 | generalCouncil,general council,Agent,TermOfOffice 16 | headquarter,headquarter,Organisation,PopulatedPlace 17 | hometown,home town,Agent,Settlement 18 | ideology,ideology,Agent,Ideology 19 | juniorSeason,junior season,Agent,owl:Thing 20 | leaderFunction,leaderFunction,Organisation,PersonFunction 21 | legalForm,legal form,Organisation,owl:Thing 22 | locationCity,location city,Organisation,City 23 | mainOrgan,main organ,Organisation,owl:Thing 24 | managerSeason,manager season,Agent,owl:Thing 25 | membership,membership,Organisation,rdf:langString 26 | mergedWith,merged with,Organisation,Organisation 27 | nationalSelection,national selection,Agent,owl:Thing 28 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 29 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 30 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 31 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 32 | organisationMember,organisation member,Organisation,OrganisationMember 33 | owns,owns,Agent,Thing 34 | parentOrganisation,parent organisation,Organisation,Organisation 35 | playerSeason,player season,Agent,owl:Thing 36 | product,product,Organisation,owl:Thing 37 | ranking,ranking,Organisation,xsd:positiveInteger 38 | regionServed,region served,Organisation,Place 39 | regionalCouncil,regional council,Agent,TermOfOffice 40 | revenue,revenue,Organisation,Currency 41 | roleInEvent,A Person's role in an event,Agent,Event 42 | season,season,Agent,owl:Thing 43 | secretaryGeneral,secretary,Organisation,Person 44 | service,service,Organisation,owl:Thing 45 | staff,staff,Organisation,xsd:nonNegativeInteger 46 | superintendent,superintendent,Organisation,Person 47 | trustee,trustee,Organisation,Person 48 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/alma mater.csv: -------------------------------------------------------------------------------- 1 | actingHeadteacher,acting headteacher,EducationalInstitution,Person 2 | administrator,administrator,Organisation,Person 3 | alumni,alumni,EducationalInstitution,Person 4 | assistantPrincipal,assistant principal,EducationalInstitution,owl:Thing 5 | brinCode,BRIN code,EducationalInstitution,xsd:string 6 | campusType,campus type,EducationalInstitution,rdf:langString 7 | ceo,chief executive 
officer,Organisation,Person 8 | chairperson,chairperson,Organisation,Person 9 | chaplain,chaplain,Organisation,Person 10 | childOrganisation,child organisation,Organisation,Organisation 11 | closed,closed,EducationalInstitution,xsd:date 12 | custodian,custodian,EducationalInstitution,Person 13 | dean,dean,EducationalInstitution,Person 14 | educationSystem,education system,EducationalInstitution,owl:Thing 15 | endowment,endowment,Organisation,Currency 16 | facultySize,faculty size,EducationalInstitution,xsd:nonNegativeInteger 17 | formationDate,formation date,Organisation,xsd:date 18 | formationYear,formation year,Organisation,xsd:gYear 19 | foundationPlace,foundation place,Organisation,City 20 | head,head,EducationalInstitution,Person 21 | headquarter,headquarter,Organisation,PopulatedPlace 22 | leaderFunction,leaderFunction,Organisation,PersonFunction 23 | legalForm,legal form,Organisation,owl:Thing 24 | locationCity,location city,Organisation,City 25 | mainOrgan,main organ,Organisation,owl:Thing 26 | membership,membership,Organisation,rdf:langString 27 | mergedWith,merged with,Organisation,Organisation 28 | nationalRanking,national ranking,EducationalInstitution,xsd:positiveInteger 29 | numberOfAcademicStaff,number of academic staff,EducationalInstitution,xsd:nonNegativeInteger 30 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 31 | numberOfGraduateStudents,number of graduate students,EducationalInstitution,xsd:nonNegativeInteger 32 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 33 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 34 | numberOfStudents,number of students,EducationalInstitution,xsd:nonNegativeInteger 35 | numberOfUndergraduateStudents,number of undergraduate students,EducationalInstitution,xsd:nonNegativeInteger 36 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 37 | offeredClasses,offered classes,EducationalInstitution,xsd:string 38 | officialSchoolColour,official school colour,EducationalInstitution,xsd:string 39 | organisationMember,organisation member,Organisation,OrganisationMember 40 | parentOrganisation,parent organisation,Organisation,Organisation 41 | principal,principal,EducationalInstitution,Person 42 | product,product,Organisation,owl:Thing 43 | ranking,ranking,Organisation,xsd:positiveInteger 44 | rector,rector,EducationalInstitution,Person 45 | regionServed,region served,Organisation,Place 46 | revenue,revenue,Organisation,Currency 47 | secretaryGeneral,secretary,Organisation,Person 48 | service,service,Organisation,owl:Thing 49 | staff,staff,Organisation,xsd:nonNegativeInteger 50 | superintendent,superintendent,Organisation,Person 51 | trustee,trustee,Organisation,Person 52 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/career station.csv: -------------------------------------------------------------------------------- 1 | end,end,TimePeriod,xsd:date 2 | numberOfGoals,number of goals scored,CareerStation,xsd:integer 3 | numberOfMatches,number of matches or caps,CareerStation,xsd:integer 4 | start,start,TimePeriod,xsd:date 5 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/college.csv: -------------------------------------------------------------------------------- 1 | actingHeadteacher,acting headteacher,EducationalInstitution,Person 2 | 
administrator,administrator,Organisation,Person 3 | alumni,alumni,EducationalInstitution,Person 4 | assistantPrincipal,assistant principal,EducationalInstitution,owl:Thing 5 | brinCode,BRIN code,EducationalInstitution,xsd:string 6 | campusType,campus type,EducationalInstitution,rdf:langString 7 | ceo,chief executive officer,Organisation,Person 8 | chairperson,chairperson,Organisation,Person 9 | chaplain,chaplain,Organisation,Person 10 | childOrganisation,child organisation,Organisation,Organisation 11 | closed,closed,EducationalInstitution,xsd:date 12 | custodian,custodian,EducationalInstitution,Person 13 | dean,dean,EducationalInstitution,Person 14 | educationSystem,education system,EducationalInstitution,owl:Thing 15 | endowment,endowment,Organisation,Currency 16 | facultySize,faculty size,EducationalInstitution,xsd:nonNegativeInteger 17 | formationDate,formation date,Organisation,xsd:date 18 | formationYear,formation year,Organisation,xsd:gYear 19 | foundationPlace,foundation place,Organisation,City 20 | head,head,EducationalInstitution,Person 21 | headquarter,headquarter,Organisation,PopulatedPlace 22 | leaderFunction,leaderFunction,Organisation,PersonFunction 23 | legalForm,legal form,Organisation,owl:Thing 24 | locationCity,location city,Organisation,City 25 | mainOrgan,main organ,Organisation,owl:Thing 26 | membership,membership,Organisation,rdf:langString 27 | mergedWith,merged with,Organisation,Organisation 28 | nationalRanking,national ranking,EducationalInstitution,xsd:positiveInteger 29 | numberOfAcademicStaff,number of academic staff,EducationalInstitution,xsd:nonNegativeInteger 30 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 31 | numberOfGraduateStudents,number of graduate students,EducationalInstitution,xsd:nonNegativeInteger 32 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 33 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 34 | numberOfStudents,number of students,EducationalInstitution,xsd:nonNegativeInteger 35 | numberOfUndergraduateStudents,number of undergraduate students,EducationalInstitution,xsd:nonNegativeInteger 36 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 37 | offeredClasses,offered classes,EducationalInstitution,xsd:string 38 | officialSchoolColour,official school colour,EducationalInstitution,xsd:string 39 | organisationMember,organisation member,Organisation,OrganisationMember 40 | parentOrganisation,parent organisation,Organisation,Organisation 41 | principal,principal,EducationalInstitution,Person 42 | product,product,Organisation,owl:Thing 43 | ranking,ranking,Organisation,xsd:positiveInteger 44 | rector,rector,EducationalInstitution,Person 45 | regionServed,region served,Organisation,Place 46 | revenue,revenue,Organisation,Currency 47 | secretaryGeneral,secretary,Organisation,Person 48 | service,service,Organisation,owl:Thing 49 | staff,staff,Organisation,xsd:nonNegativeInteger 50 | superintendent,superintendent,Organisation,Person 51 | trustee,trustee,Organisation,Person 52 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/employer.csv: -------------------------------------------------------------------------------- 1 | administrator,administrator,Organisation,Person 2 | age,age,Agent,xsd:integer 3 | artPatron,patron (art),Agent,Artist 4 | ceo,chief executive officer,Organisation,Person 5 | chairperson,chairperson,Organisation,Person 6 | 
championships,championships,Agent,xsd:nonNegativeInteger 7 | chaplain,chaplain,Organisation,Person 8 | childOrganisation,child organisation,Organisation,Organisation 9 | denomination,denomination,Agent,owl:Thing 10 | discipline,discipline,Agent,owl:Thing 11 | endowment,endowment,Organisation,Currency 12 | formationDate,formation date,Organisation,xsd:date 13 | formationYear,formation year,Organisation,xsd:gYear 14 | foundationPlace,foundation place,Organisation,City 15 | generalCouncil,general council,Agent,TermOfOffice 16 | headquarter,headquarter,Organisation,PopulatedPlace 17 | hometown,home town,Agent,Settlement 18 | ideology,ideology,Agent,Ideology 19 | juniorSeason,junior season,Agent,owl:Thing 20 | leaderFunction,leaderFunction,Organisation,PersonFunction 21 | legalForm,legal form,Organisation,owl:Thing 22 | locationCity,location city,Organisation,City 23 | mainOrgan,main organ,Organisation,owl:Thing 24 | managerSeason,manager season,Agent,owl:Thing 25 | membership,membership,Organisation,rdf:langString 26 | mergedWith,merged with,Organisation,Organisation 27 | nationalSelection,national selection,Agent,owl:Thing 28 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 29 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 30 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 31 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 32 | organisationMember,organisation member,Organisation,OrganisationMember 33 | owns,owns,Agent,Thing 34 | parentOrganisation,parent organisation,Organisation,Organisation 35 | playerSeason,player season,Agent,owl:Thing 36 | product,product,Organisation,owl:Thing 37 | ranking,ranking,Organisation,xsd:positiveInteger 38 | regionServed,region served,Organisation,Place 39 | regionalCouncil,regional council,Agent,TermOfOffice 40 | revenue,revenue,Organisation,Currency 41 | roleInEvent,A Person's role in an event,Agent,Event 42 | season,season,Agent,owl:Thing 43 | secretaryGeneral,secretary,Organisation,Person 44 | service,service,Organisation,owl:Thing 45 | staff,staff,Organisation,xsd:nonNegativeInteger 46 | superintendent,superintendent,Organisation,Person 47 | trustee,trustee,Organisation,Person 48 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/ideology.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/ideology.csv -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/eliminator.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | 4 | def eliminator(): 5 | """ 6 | The function remove the templates which are considered as less popular 7 | based on the proposed ranking mechanism, the input files should be pre processed 8 | and a TRUE or FALSE should be added as the last column. 9 | 10 | This function just removes the entries with FALSE as the last entry in a row 11 | and create a file name new_train.csv to be used for futher purposes. 
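    Illustrative usage (the input file name is a placeholder, not a file shipped with
    this repository):

        python eliminator.py --location <templates file with a TRUE/FALSE last column>

    Rows whose last column is TRUE are kept: the last two columns are dropped and the
    remaining fields are re-joined with ';' before being written to new_train.csv;
    rows ending in FALSE are discarded.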
12 | """ 13 | lines = open(location,'r').readlines() 14 | print(len(lines)) 15 | accum = [] 16 | nspm_ready = open("new_train.csv",'w') 17 | for line in tqdm(lines): 18 | values = line.split(",") 19 | if(len(values)<8): 20 | print("Input file is of wrong format, please add the corect bolean values as the last column entry to use this function") 21 | print(values[-1]) 22 | if(values[-1]=="TRUE\n"): 23 | accum.append(";".join(values[:-2])+"\n") 24 | nspm_ready.write(accum[-1]) 25 | nspm_ready.close() 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | requiredNamed = parser.add_argument_group('Required Arguments') 30 | 31 | requiredNamed.add_argument('--location', dest='location', metavar='location', 32 | help='location of the file to be pruned.', required=True) 33 | args = parser.parse_args() 34 | location = args.location 35 | eliminator(location) 36 | pass 37 | 38 | 39 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/fetch_ranks.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='../utility/wikidata.rank'): 5 | """ 6 | The function loads rank from a supplied position. 7 | """ 8 | sub = open(filename,'r').readlines() 9 | diction={} 10 | 11 | print("Loading Rankings") 12 | for val in tqdm(sub): 13 | diction[val.split('\t')[0]] = float(val.split('\t')[1]) 14 | return diction 15 | 16 | if __name__ == "__main__": 17 | fetch_ranks() 18 | pass 19 | 20 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/fetch_ranks_sub.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='part-r-00000'): 5 | """ 6 | The function loads ranks from a supplied location, 7 | The ranks fileshould belong to the subjective 3d format 8 | of saving ranks. 9 | """ 10 | sub = open(filename,'r').readlines() 11 | diction={} 12 | 13 | print("Loading Rankings") 14 | for val in tqdm(sub): 15 | diction[val.split('\t')[0].strip()[1:-1].strip()] = float(val.split('\t')[-2].split('"')[1]) 16 | return diction 17 | 18 | if __name__ == "__main__": 19 | fetch_ranks() 20 | pass 21 | 22 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/generate_templates.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from get_properties import get_properties 3 | from generate_url import generate_url 4 | from sentence_and_template_generator import sentence_and_template_generator 5 | import os 6 | from fetch_ranks_sub import fetch_ranks 7 | import logging 8 | 9 | def generate_templates(label,project_name,depth=1,output_file="sentence_and_template_generator"): 10 | """ 11 | The function acts as a wrapper for the whole package of supplied source code. 
12 | """ 13 | val = generate_url(label) 14 | url = val[0] 15 | about = (val[1]) 16 | count =0 17 | vessel= [] 18 | depth=int(depth) 19 | diction = fetch_ranks("../utility/part-r-00000") 20 | if(not os.path.isdir(project_name)): 21 | os.makedirs(project_name) 22 | output_file = open(project_name+"/" + output_file, 'w') 23 | test_set = open(project_name+"/" + "test.csv", 'w') 24 | prop_dic = {} 25 | for iterator in range(depth): 26 | prop_dic[iterator] = [] 27 | # Create a logger object 28 | logger = logging.getLogger() 29 | 30 | # Configure logger 31 | logging.basicConfig(filename=project_name+"/logfile.log", format='%(filename)s: %(message)s', filemode='w') 32 | 33 | # Setting threshold level 34 | logger.setLevel(logging.DEBUG) 35 | 36 | # Use the logging methods 37 | #logger.debug("This is a debug message") 38 | logger.info("This is a log file.") 39 | #logger.warning("This is a warning message") 40 | #logger.error("This is an error message") 41 | #logger.critical("This is a critical message") 42 | 43 | list_of_property_information = get_properties(url=url,project_name=project_name,output_file = "get_properties.csv") 44 | for property_line in list_of_property_information: 45 | count+=1 46 | prop = property_line.split(',') 47 | print("**************\n"+str(prop)) 48 | sentence_and_template_generator(original_count=depth,prop_dic=prop_dic,test_set=test_set,log=logger,diction=diction,output_file=output_file,mother_ontology=about.strip().replace("http://dbpedia.org/ontology/","dbo:"),vessel=vessel,project_name=project_name ,prop=prop, suffix = " of ?",count = depth) 49 | output_file.close() 50 | 51 | if __name__ == "__main__": 52 | """ 53 | Section to parse the command line arguments. 54 | """ 55 | parser = argparse.ArgumentParser() 56 | requiredNamed = parser.add_argument_group('Required Arguments') 57 | 58 | requiredNamed.add_argument('--label', dest='label', metavar='label', 59 | help='label: person, place etc.', required=True) 60 | requiredNamed.add_argument( 61 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 62 | requiredNamed.add_argument( 63 | '--depth', dest='depth', metavar='depth', help='Mention the depth you want to go in the knowledge graph (The number of questions will increase exponentially!), e.g. 2', required=False) 64 | args = parser.parse_args() 65 | label = args.label 66 | project_name = args.project_name 67 | depth = args.depth 68 | generate_templates(label=label,project_name=project_name,depth=depth) 69 | pass -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | - This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | - This code on execution creates a csv which contains all the properties, ontology, 16 | class related information and data types as field values in each row. 
17 | - This function also returns a 2D list of the information mentioned above to the calling 18 | function 19 | """ 20 | page = urllib.request.urlopen(url) 21 | soup = BeautifulSoup(page, "html.parser") 22 | if(not os.path.isdir(project_name)): 23 | os.makedirs(project_name) 24 | output_file = open(project_name+"/" + output_file, 'w') 25 | fl = 0 26 | accum = [] 27 | for rows in tqdm(soup.find_all("tr")): 28 | x = rows.find_all("td") 29 | if len(x) <= 2: 30 | fl = 1 31 | continue 32 | if fl == 1: 33 | fl = 2 34 | continue 35 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 36 | label = rows.find_all("td")[1].get_text() 37 | dom = rows.find_all("td")[2].get_text() 38 | rng = rows.find_all("td")[3].get_text() 39 | URL_name = ((rows.find_all("td")[0].find('a').attrs['href'])) 40 | final = name + "," + label + "," + dom + "," + rng 41 | #+ ","+ URL_name.split(':')[-1] 42 | accum.append(final) 43 | output_file.write(final+"\n") 44 | output_file.close() 45 | return accum 46 | 47 | 48 | """ 49 | Name, Label, Domain, Range, URL_name 50 | """ 51 | 52 | if __name__ == "__main__": 53 | """ 54 | Section to parse the command line arguments. 55 | """ 56 | parser = argparse.ArgumentParser() 57 | requiredNamed = parser.add_argument_group('Required Arguments') 58 | 59 | requiredNamed.add_argument('--url', dest='url', metavar='url', 60 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 61 | requiredNamed.add_argument( 62 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 63 | requiredNamed.add_argument( 64 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 65 | args = parser.parse_args() 66 | url = args.url 67 | output_file = args.out_put 68 | project_name = args.project_name 69 | get_properties(url = url, project_name= project_name, output_file = output_file) 70 | pass 71 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/question_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def question_generator(query): 5 | pass 6 | 7 | 8 | 9 | if __name__ == "__main__": 10 | """ 11 | Section to parse the command line arguments. 
12 | """ 13 | parser = argparse.ArgumentParser() 14 | requiredNamed = parser.add_argument_group('Required Arguments') 15 | 16 | requiredNamed.add_argument('--url', dest='url', metavar='url', 17 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 18 | requiredNamed.add_argument( 19 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 20 | requiredNamed.add_argument( 21 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 22 | args = parser.parse_args() 23 | url = args.url 24 | output_file = args.out_put 25 | project_name = args.project_name 26 | pass -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/.~lock.question_form.csv#: -------------------------------------------------------------------------------- 1 | ,petrichor,DragonFeaster,24.06.2019 07:05,file:///home/petrichor/.config/libreoffice/4; -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/Test_Fixer/readme.md: -------------------------------------------------------------------------------- 1 | # Dataset thresholder 2 | 3 | This code creates a test set making sure the following constraints are 4 | followed: 5 | - The vocabulary in the test set has been learned in a separate context in the 6 | train set. 7 | - Frequency Thresholding: The vocabulary in the test set is present in the train 8 | set for a given number of times. 9 | 10 | To run the code, download the files from [here](https://nspm-models.s3.eu-west-2.amazonaws.com/misc/anand-pipeline_3-Test_Fixer-files.zip), then run the following command: 11 | 12 | ```python 13 | python text_fixer.py 14 | ``` 15 | 16 | Minimum requirements: 17 | 18 | - `train.sparql` file containing the SPARQL queries of the training set. 19 | - `old_test.sparql` The test set containing all the test SPARQL queries. 20 | - `vocab.sparql` Vocabulary of the training set. 21 | 22 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/compare/compare.py: -------------------------------------------------------------------------------- 1 | """ 2 | The comparer will compare 2 files and determine the following: 3 | - In a line by line inspection, how many lines were not exactly the same. 4 | - A dictionary containing the number of errors in the matched lines like: 5 | { 6 | 0: 34044, 7 | 1: 36629, 8 | 2: 16682, 9 | 3: 4291, 10 | 7: 82, 11 | 8: 173, 12 | 11: 18, 13 | 12: 22, 14 | 'Wrong number of tokens': 22051 15 | } 16 | """ 17 | 18 | from tqdm import tqdm 19 | 20 | ref = open("test.sparql",'r').readlines() 21 | test = open("output_test",'r').readlines() 22 | diction_error = {} 23 | counter = 0 24 | for val in tqdm(range(len(ref))): 25 | error_count = 0 26 | ref_s = ref[val].split(" ") 27 | test_s = test[val].split(" ") 28 | """ print("```") 29 | print("reference:"+ref[val]) 30 | print("test:"+test[val]) 31 | print("```") """ 32 | if (ref_s != test_s): 33 | counter+=1 34 | try: 35 | for count in range(len(ref_s)): 36 | if(ref_s[count] == test_s[count]): 37 | continue 38 | else: 39 | error_count += 1 40 | #print("ref:"+ref_s[count]+"
") 41 | #print("test:"+test_s[count]+"
") 42 | except: 43 | error_count = "Wrong number of tokens" 44 | #print("Wrong number of tokens") 45 | if(error_count not in diction_error.keys()): 46 | diction_error[(error_count)] = 1 47 | diction_error[(error_count)] += 1 48 | #print("\n----\n") 49 | print(counter) 50 | print(diction_error) -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/compare/readme.md: -------------------------------------------------------------------------------- 1 | # Compare SPARQL files 2 | 3 | The comparer will compare 2 files and determine the following: 4 | - In a line by line inspection, how many lines were not exactly the same. 5 | - A dictionary containing the number of errors in the matched lines like: 6 | { 7 | 0: 34044, 8 | 1: 36629, 9 | 2: 16682, 10 | 3: 4291, 11 | 7: 82, 12 | 8: 173, 13 | 11: 18, 14 | 12: 22, 15 | 'Wrong number of tokens': 22051 16 | } 17 | - To run the code please use the following: 18 | ``` 19 | python compare.py 20 | ``` 21 | Minimum requirements: 22 | - `` and `` the 2 files to be compared. -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/labels.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/utility/labels.json -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/new_extractor_fromGraphDBpediaEmbeddings/breaker.sh: -------------------------------------------------------------------------------- 1 | echo "Creating data_fragments folder" 2 | mkdir data_fragments 3 | cd data_fragments 4 | echo "Usage: ./breaker.sh " 5 | echo "Example: ./breaker.sh 1000MB pageRank.txt" 6 | echo "It will take some time, please remain calm." 7 | split -b $1 ../$2 -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/new_extractor_fromGraphDBpediaEmbeddings/embedding_extractor.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | import os 4 | index = open("index.csv").readlines() 5 | diction = {} 6 | missed_counter = 0 7 | print(""" 8 | Loading the index information from the index.csv file. 9 | """) 10 | 11 | for line in tqdm(range(len(index))): 12 | index[line] = index[line].split('\t') 13 | key = index[line][0].strip() 14 | diction[key] = {} 15 | diction[key]['file'] = index[line][1] 16 | diction[key]['line'] = index[line][2].strip() 17 | """ 18 | This part of the code creates the a json file containing the information which can be loaded easily. 19 | """ 20 | """ 21 | with open("data_file.json", "w") as write_file: 22 | json.dump(diction, write_file) 23 | """ 24 | 25 | file_diction = {} 26 | 27 | a = [f for f in os.listdir("data_fragments")] 28 | 29 | for val in (a): 30 | file_diction[val.strip()] = [] 31 | 32 | vocab = open("vocab.sparql",'r').readlines() 33 | filename = [] 34 | dict_keys = diction.keys() 35 | 36 | print(""" 37 | Checking and accumulating information for words obtained from the vocabulary from the index thus loaded. 38 | """) 39 | 40 | for word in tqdm(vocab): 41 | word = word.strip() 42 | if(word in dict_keys ): 43 | file_diction[diction[word]["file"]].append(diction[word]) 44 | else: 45 | missed_counter+=1 46 | 47 | 48 | print(""" 49 | Loading information from the broken files to extract the required embeddings. 
50 | """) 51 | accum = [] 52 | for files in tqdm(a): 53 | file_reader = open("data_fragments/"+files).readlines() 54 | for words_in_file in file_diction[files.strip()]: 55 | accum.append(file_reader[int(words_in_file["line"].strip())].strip()) 56 | 57 | print(""" 58 | Writing the extracted embeddings in a file for future use. 59 | """) 60 | final = ("\n".join(accum)).replace('\t',' ') 61 | final = final.replace("http://dbpedia.org/resource/","dbr_") 62 | final = final.replace("http://dbpedia.org/ontology/","dbo_") 63 | open("new_vocbulary.csv",'w').write(final) 64 | 65 | print("Missed words: "+str(missed_counter)) 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/new_extractor_fromGraphDBpediaEmbeddings/indexer.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code extracts all the embeddings from the pageRank.txt and records the following information in the following manner. 3 | 4 | \t\t 5 | 6 | This index will contain information regarding the position of all the words so that the corresponding embeddings can be extracted easily. Withput having to query through the whole embedding file. 7 | """ 8 | 9 | import sys 10 | import os 11 | from tqdm import tqdm 12 | 13 | a = [f for f in os.listdir("data_fragments/")] 14 | for files in tqdm(a): 15 | #print(files) 16 | lines = open("data_fragments/"+files).readlines() 17 | writer = open("index.csv",'a') 18 | for line in range(len(lines)): 19 | lines[line] = lines[line].split("\t") 20 | word = lines[line][0] 21 | if "http://dbpedia.org/resource/" in (word): 22 | word = word.replace("http://dbpedia.org/resource/","dbr_") 23 | if "http://dbpedia.org/ontology/" in (word): 24 | word = word.replace("http://dbpedia.org/ontology/","dbo_") 25 | writer.write('\t'.join([word,files,str(line)])+'\n') 26 | writer.close() -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/new_extractor_fromGraphDBpediaEmbeddings/readme.md: -------------------------------------------------------------------------------- 1 | # SPARQL embedding extractor 2 | ## With Significant decrease in time taken compared to older model: within ~1 minute. 3 | 4 | - The code needs the big embedding file to be downloaded from the following link: [https://zenodo.org/record/1320038#.XT8CeHUzbEG](https://zenodo.org/record/1320038#.XT8CeHUzbEG) 5 | - Run the following utility to make the process faster by breaking the files into smaller files. 6 | - If the files fail to load, then decrease the size of file in the script command. (breaker.sh) 7 | 8 | The bash script: 9 | ```bash 10 | echo "Creating data_fragments folder" 11 | mkdir data_fragments 12 | cd data_fragments 13 | echo "Usage: ./breaker.sh " 14 | echo "Example: ./breaker.sh 1000MB pageRank.txt" 15 | echo "It will take some time, please remain calm." 16 | split -b $1 ../$2 17 | ``` 18 | 19 | ## How to use? | Different components of this utility, 20 | - For the utility to run the pageRank.txt files need to br present in this directory. 21 | - First the `breaker.sh` script should be run as per the instructions stated above. 22 | - After running the breaker.sh script run the following command on the terminal: 23 | ```bash 24 | python indexer.py 25 | ``` 26 | - This will create an index for all the words present in the `pageRank.txt` file. 
27 | - After this we will require a vocab.sparql file which is the vocabulary (list of words for which you want to extract the embeddings.) 28 | - Copy the vocab.sparql in this directory. 29 | - Run the `embedding_extractor.py` code using the following command: 30 | ```bash 31 | python embedding_extractor.py 32 | ``` 33 | - The new embeddings will be created with the name `new_vocabulary.csv`: which is an embedding file to be used in NMT. The file has the following format: 34 | ``` 35 | 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/old_extractor_from_GraphDBpediaEmbeddings/en_extract_embed.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | reader = open("pageRank.txt",'r') 3 | vocab = open("vocab.sparql",'r').readlines() 4 | count = 0 5 | temp = "" 6 | dict_vocab = [] 7 | vocab_count = 0 8 | 9 | while(True): 10 | line = reader.readline() 11 | if(line == temp): 12 | break 13 | count+=1 14 | temp = line 15 | 16 | #print(line) 17 | line = line.split("\t") 18 | """ if "http://dbpedia.org/resource/" in (line[0]): 19 | line[0] = line[0].replace("http://dbpedia.org/resource/","dbr_") 20 | if "http://dbpedia.org/ontology/" in (line[0]): 21 | line[0] = line[0].replace("http://dbpedia.org/ontology/","dbo_") """ 22 | for words in vocab: 23 | if(words.strip() == line[0]): 24 | vocab_count+=1 25 | dict_vocab.append(words.strip() +" " +" ".join(line[1:])) 26 | #print(words.strip()) 27 | break 28 | print(count) 29 | print(vocab_count) 30 | reader.close() 31 | 32 | writer = open("new_vocab.sparql",'w').write("\n".join(dict_vocab)) 33 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/old_extractor_from_GraphDBpediaEmbeddings/readme.md: -------------------------------------------------------------------------------- 1 | # Old and slow embedding extractor 2 | 3 | - Please download the pageRank.txt file from the following link: https://zenodo.org/record/1320038#.XT8CeHUzbEG. 4 | - Run the extractor code in the following way: 5 | 6 | ``` 7 | For SPARQL: python sparql_extract_embed.py 8 | For English: python english_extract_embed.py 9 | ``` 10 | 11 | ## Note: 12 | 13 | It works for file with the following format: 14 | ``` 15 | word 16 | word 17 | word 18 | word 19 | word 20 | word 21 | word 22 | word 23 | ``` 24 | 25 | - The SPARQL prefixed python code replaces DBpedia urls with their shortforms like dbo, dbr. 26 | - The en prefixed python code runs on the word without any modifications. 
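Both extractors normalise DBpedia URIs to the short forms used in the vocabulary files. A minimal sketch of that step is shown below; the function name is illustrative and not part of the repository, but the two replacements mirror the ones hard-coded in `sparql_extract_embed.py` and `indexer.py`.

```python
def shorten_uri(token: str) -> str:
    # Replace the full DBpedia namespaces with the dbr_/dbo_ short forms
    # expected by the SPARQL vocabulary files.
    token = token.replace("http://dbpedia.org/resource/", "dbr_")
    token = token.replace("http://dbpedia.org/ontology/", "dbo_")
    return token

# Example: shorten_uri("http://dbpedia.org/ontology/birthPlace") -> "dbo_birthPlace"
```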
27 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/old_extractor_from_GraphDBpediaEmbeddings/sparql_extract_embed.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | reader = open("pageRank.txt",'r') 3 | vocab = open("vocab.sparql",'r').readlines() 4 | count = 0 5 | temp = "" 6 | dict_vocab = [] 7 | vocab_count = 0 8 | expected_count= 5774165 9 | for val in tqdm(range(expected_count)): 10 | line = reader.readline() 11 | if(line == temp): 12 | break 13 | count+=1 14 | temp = line 15 | 16 | #print(line) 17 | line = line.split("\t") 18 | if "http://dbpedia.org/resource/" in (line[0]): 19 | line[0] = line[0].replace("http://dbpedia.org/resource/","dbr_") 20 | if "http://dbpedia.org/ontology/" in (line[0]): 21 | line[0] = line[0].replace("http://dbpedia.org/ontology/","dbo_") 22 | for words in vocab: 23 | if(words.strip() == line[0]): 24 | vocab_count+=1 25 | dict_vocab.append(words.strip() +" " +" ".join(line[1:])) 26 | #print(words.strip()) 27 | break 28 | print(count) 29 | print(vocab_count) 30 | reader.close() 31 | 32 | writer = open("new_vocab.sparql",'w').write("\n".join(dict_vocab)) 33 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/interpreter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Interpreter module 5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html 8 | https://arxiv.org/abs/1708.07624 9 | 10 | Version 0.1.0-akaha 11 | 12 | """ 13 | import sys 14 | import re 15 | 16 | from generator_utils import decode,fix_URI 17 | 18 | def interpreter(val): 19 | reload(sys) 20 | sys.setdefaultencoding("utf-8") 21 | encoded_sparql = val 22 | decoded_sparql = decode(encoded_sparql) 23 | decoded_sparql = fix_URI(decoded_sparql) 24 | return( decoded_sparql) 25 | 26 | if __name__ == '__main__': 27 | reload(sys) 28 | sys.setdefaultencoding("utf-8") 29 | encoded_sparql = sys.argv[1] 30 | decoded_sparql = decode(encoded_sparql) 31 | decoded_sparql = fix_URI(decoded_sparql) 32 | print( decoded_sparql) 33 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/qald_json_gerbil_input.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import f1_score 2 | from interpreter import interpreter 3 | import os 4 | import numpy as np 5 | from nltk.corpus import stopwords 6 | import urllib 7 | from bs4 import BeautifulSoup 8 | from tqdm import tqdm 9 | from collections import OrderedDict 10 | import json 11 | 12 | base = {"dataset":{"id": "stuff"}} 13 | base["questions"] = [] 14 | question_lines = open('qald_json/test.en', 'r').readlines() 15 | lines = open('qald_json/test.sparql', 'r').readlines() 16 | lines = list(map(interpreter, tuple(lines))) 17 | 18 | for valu in range(len(lines)): 19 | lines[valu] = lines[valu].replace("limit\n","limit 1\n") 20 | 21 | #print("".join(lines)) 22 | #print(len(lines)) 23 | 24 | import urllib2 25 | contents = urllib2.urlopen 26 | accum = [] 27 | count = 0 28 | stop = set(stopwords.words('english')) 29 | for valu in tqdm(range(len(lines))): 30 | count+=1 31 | query = urllib.quote(lines[valu]) 32 | url2 = 
"https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query+"&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+" 33 | #url2 = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query+"&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+" 34 | #page = urllib2.urlopen(url2) 35 | #soup = BeautifulSoup(page, "html.parser") 36 | try: 37 | page = urllib2.urlopen(url2) 38 | except: 39 | print(url2) 40 | continue 41 | soup = BeautifulSoup(page, "html.parser") 42 | js_dic = json.loads(str(soup)) 43 | bindings = js_dic["results"]["bindings"] 44 | #print (bindings) 45 | answer = [] 46 | #print("************") 47 | """ for rows in (soup.find_all("tr")): 48 | for td in rows.find_all("td"): 49 | answer.append(td.getText()) 50 | """ 51 | que = {} 52 | que["id"] = str(valu) 53 | que["answertype"] = "resource" # Check 54 | que["aggregation"] = False 55 | que["onlydbo"] = True 56 | que["hybrid"] = False 57 | que["question"] = [{"language":"en", "string":question_lines[valu][:-1], "keywords" : " ".join([i for i in question_lines[valu].lower().split() if i not in stop] )}] 58 | que["query"] = {"sparql":lines[valu][:-1]} 59 | que["answers"] = [] 60 | anc_accum = [] 61 | #answer_unit = js_dic 62 | answer_unit = {"head": {"vars": ["uri"]}} 63 | for ans in bindings: 64 | ans = ans["x"] 65 | if("dbpedia" in ans): 66 | temp = {"uri": {"type": ans["type"], "value": ans["value"]}} 67 | else: 68 | temp = {"uri": {"type": ans["type"], "value": ans["value"]}} 69 | anc_accum.append(temp) 70 | answer_unit["results"] = {} 71 | answer_unit["results"]["bindings"] = anc_accum 72 | que["answers"].append(answer_unit) 73 | accum.append(que) 74 | 75 | """ if(count>10): 76 | break """ 77 | 78 | base["questions"] = accum 79 | 80 | import json 81 | with open('qald_json/data.json', 'w') as outfile: 82 | json.dump(OrderedDict(base), outfile, ensure_ascii=False, indent=2) 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/readme.md: -------------------------------------------------------------------------------- 1 | # QALD JSON format generator 2 | 3 | - Runs on Python 2.7 4 | 5 | - It requires the interpreter function to run this. Thus to use this please copy this folder to the main folder and copy the python file to the man folder, for your ease please run the following code on the terminal : 6 | ``` 7 | ./shifter.sh 8 | ``` 9 | - The QALD format generator is a program that generates the input for the gerbil portal. 10 | - The function requires the english and it's translated version to be present in the this folder with the following file names: `test.en` and `test.sparql`. 11 | - The question and corresponding SPARQL form should have the same line number. 12 | - The code can be run by running the `qald_json_gerbil_input.py` present outside this folder. An example of the output json file is as follows: 13 | 14 | ```json 15 | { 16 | "questions": [{ 17 | "id": "1", 18 | "question": [{ 19 | "language": "en", 20 | "string": "Which German cities have more than 250000 inhabitants?" 21 | }], 22 | "query": { 23 | "sparql": "SELECT DISTINCT ?uri WHERE { { ?uri . } UNION { ?uri . } ?uri . ?uri ?population . 
FILTER ( ?population > 250000 ) } " 24 | }, 25 | "answers": [{ 26 | "head": { 27 | "vars": [ 28 | "uri" 29 | ] 30 | }, 31 | "results": { 32 | "bindings": [{ 33 | "uri": { 34 | "type": "uri", 35 | "value": "http://dbpedia.org/resource/Bonn" 36 | } 37 | }] 38 | } 39 | }] 40 | }] 41 | } 42 | ``` 43 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/shifter.sh: -------------------------------------------------------------------------------- 1 | cd ../ 2 | cp -r qald_json ../../../../ 3 | cd ../../../../qald_json 4 | cp qald_json_gerbil_input.py ../ 5 | cp interpreter.py ../ 6 | echo "Files shifted to the main folder." -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/test.en: -------------------------------------------------------------------------------- 1 | What is the binomial authority of species of university of maryland arboretum & botanical garden ? 2 | What is the binomial authority of species of cyclura cychlura figginsi ? 3 | What is the number of binomial authority of species of cyclura cychlura figginsi ? 4 | What is the number of binomial authority of species of university of maryland arboretum & botanical garden ? 5 | What is the conservation status of species of crotalus mitchellii muertensis ? 6 | What is the conservation status of species of cyclura cychlura figginsi ? 7 | What is the number of conservation status of species of crotalus mitchellii muertensis ? 8 | What is the number of conservation status of species of cyclura cychlura figginsi ? 9 | What is the conservation status system of species of cyclura cychlura figginsi ? 10 | What is the conservation status system of species of crotalus mitchellii muertensis ? 11 | What is the number of conservation status system of species of cyclura cychlura figginsi ? 12 | What is the number of conservation status system of species of crotalus mitchellii muertensis ? 13 | What is the division of species of university of maryland arboretum & botanical garden ? 14 | What is the number of division of species of university of maryland arboretum & botanical garden ? 15 | What is the family of species of crotalus mitchellii muertensis ? 16 | What is the family of species of university of maryland arboretum & botanical garden ? 17 | What is the family of species of cyclura cychlura figginsi ? 18 | What is the genus of species of university of maryland arboretum & botanical garden ? 19 | What is the number of genus of species of university of maryland arboretum & botanical garden ? 20 | What is the kingdom of species of cyclura cychlura figginsi ? 21 | What is the kingdom of species of university of maryland arboretum & botanical garden ? 22 | What is the kingdom of species of crotalus mitchellii muertensis ? 23 | What is the number of kingdom of species of cyclura cychlura figginsi ? 24 | What is the number of kingdom of species of university of maryland arboretum & botanical garden ? 25 | What is the order (taxonomy) of species of cyclura cychlura figginsi ? 26 | What is the order (taxonomy) of species of university of maryland arboretum & botanical garden ? 27 | What is the number of order (taxonomy) of species of university of maryland arboretum & botanical garden ? 28 | What is the number of order (taxonomy) of species of cyclura cychlura figginsi ? 29 | What is the phylum of species of cyclura cychlura figginsi ? 30 | What is the phylum of species of crotalus mitchellii muertensis ? 
31 | What is the number of phylum of species of cyclura cychlura figginsi ? 32 | What is the number of phylum of species of crotalus mitchellii muertensis ? 33 | What is the synonym of species of crotalus mitchellii muertensis ? 34 | What is the number of synonym of species of crotalus mitchellii muertensis ? 35 | What is the Link from a Wikipage to an external page of species of crotalus mitchellii muertensis ? 36 | What is the number of Link from a Wikipage to an external page of species of crotalus mitchellii muertensis ? 37 | What is the Wikipage page ID of species of university of maryland arboretum & botanical garden ? 38 | What is the Wikipage page ID of species of crotalus mitchellii muertensis ? 39 | What is the Wikipage page ID of species of cyclura cychlura figginsi ? 40 | What is the Wikipage redirect of species of black-chinned emperor tamarin ? 41 | What is the number of Wikipage redirect of species of black-chinned emperor tamarin ? 42 | What is the Wikipage revision ID of species of cyclura cychlura figginsi ? 43 | What is the Wikipage revision ID of species of crotalus mitchellii muertensis ? 44 | What is the Wikipage revision ID of species of university of maryland arboretum & botanical garden ? 45 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/question_form.csv: -------------------------------------------------------------------------------- 1 | When is the ,Who is the ,What is the ,Where is the ,What is the number of ,Which one is the oldest based on ,Which one the highest based on,Which one the highest based on 2 | select ?x ,select ?x ,select ?x ,select ?x ,select count(*) as ?x ,select distinct(?x) ,select distinct(?x),select distinct(?x) 3 | } , } , } , } , } , } order by ?x limit 1 ,} order by ?x limit 1,} order by ?x limit 1 4 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/readme.md: -------------------------------------------------------------------------------- 1 | # Utilities 2 | 3 | Download the SubjectiveEye3D dataset here. The link: [https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz](https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz) 4 | 5 | ## Note 6 | 7 | Apart from the main project a number of utilities were created various tasks from full fledge code to few lines of script. A brief of those utilities is as follows. I encourage you to feel free to use them for evaluating, pre-processing and analyse different aspects of the project. The utilities are present in `pepeline_3/utility` 8 | 9 | - Comparer 10 | - Relavent embedding extractor 11 | - QALD JSON geberator to enable use in GERBIL 12 | - Test fixer for thresholding the question present in the general test set. 13 | - Vocab extractor from previous tensorflow models. 14 | 15 | Specific instructions for running each of the utilities is provided in their respective directories. 16 | 17 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/vocab_extractor_from_model/embedding_extractor.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import tensorflow as tf 3 | import numpy as np 4 | """ 5 | - The following code when run with proper model location is capable of extracting the trained embeddings of a given model. 
6 | - The embeddings are present in the form: 7 | - The embedding decoder outputs sparql language embeddings 8 | - The embedding encoder outputs english language embeddings 9 | """ 10 | def restore_session(self, session): 11 | saver = tf.train.import_meta_graph('./translate.ckpt-32000.meta') 12 | saver.restore(session, './translate.ckpt-32000') 13 | 14 | 15 | def test_word2vec(): 16 | opts = Options() 17 | with tf.Graph().as_default(), tf.Session() as session: 18 | with tf.device("/cpu:0"): 19 | model = Word2Vec(opts, session) 20 | model.restore_session(session) 21 | model.get_embedding("assistance") 22 | accum = [] 23 | with tf.Session() as sess: 24 | saver = tf.train.import_meta_graph('translate.ckpt-32000.meta') 25 | print("***************") 26 | print(saver.restore(sess, "translate.ckpt-32000")) 27 | print(tf.all_variables()) 28 | lis = (sess.run(('embeddings/decoder/embedding_decoder:0'))) 29 | print(np.shape(lis)) 30 | decode = open('vocab.sparql','r').readlines() 31 | embed = open('embed_vocab.sparql','w') 32 | if(len(decode) == np.shape(lis)[0]): 33 | for dec in range(len(decode)): 34 | accum.append([decode[dec][:-1]]+list(lis[dec,:])) 35 | temp = ' '.join(str(v) for v in accum[-1]) 36 | #print(temp) 37 | embed.write(temp+'\n') 38 | embed.close() 39 | 40 | 41 | lis = (sess.run(('embeddings/encoder/embedding_encoder:0'))) 42 | print(np.shape(lis)) 43 | decode = open('vocab.en','r').readlines() 44 | embed = open('embed_vocab.en','w') 45 | if(len(decode) == np.shape(lis)[0]): 46 | for dec in range(len(decode)): 47 | accum.append([decode[dec][:-1]]+list(lis[dec,:])) 48 | temp = ' '.join(str(v) for v in accum[-1]) 49 | #print(temp) 50 | embed.write(temp+'\n') 51 | embed.close() 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/vocab_extractor_from_model/readme.md: -------------------------------------------------------------------------------- 1 | # Embedding Extractor 2 | 3 | - Download the files from [here](https://nspm-models.s3.eu-west-2.amazonaws.com/misc/anand-pipeline_3-vocab_extractor_from_model-files.zip) 4 | - The following code when run with proper model location is capable of extracting the trained embeddings of a given model. 5 | - The embeddings are present in the form: ` ` 6 | - The embedding decoder outputs sparql language embeddings 7 | - The embedding encoder outputs english language embeddings 8 | 9 | -------------------------------------------------------------------------------- /gsoc/anand/readme.md: -------------------------------------------------------------------------------- 1 | # A Neural QA Model for DBpedia 2 | ## Abstract 3 | With booming amount of information being continuously added to the internet, organising the facts becomes a very difficult task. Currently DBpedia hosts billions of such data points and corresponding relations in the RDF format. 4 | 5 | Extracting data from such data sources requires a query to be made in SPARQL and the response to the query is a link that contains the information pertaining to the answer or the answer itself. 6 | 7 | Accessing such data is difficult for a lay user, who does not know how to write a query. This proposal tries to built upon a System :(​ https://github.com/AKSW/NSpM/tree/master ​) — which tries to make this humongous linked data available to a larger user base in their natural languages(now restricted to English) by improving, adding and amending upon the existing codebase. 
8 | 9 | The primary objective of the project is to be able to translate any natural language question into a valid SPARQL query. 10 | 11 | >> You can find the supporting blogs at: https://anandpanchbhai.com/A-Neural-QA-Model-for-DBpedia/ 12 | 13 | ## Pipeline 1 14 | 15 | The code is a completely automated and fixed version of what was done by the previous developer working on the project. The instructions for running it are provided inside the `pipeline_1` folder. 16 | 17 | ## Pipeline 3 18 | 19 | Pipeline 3 refers to the newest code implementation that was introduced as part of GSoC'19 by Anand Panchbhai. The highlights of this pipeline include the following: 20 | 21 | - A fast, automated and custom question set generator for DBpedia which can generate questions for any class. 22 | - It also generates test sets to test different compositionality-based queries. 23 | - A ranking of the questions is also generated as part of the project, which helps us understand which questions might be more natural than others. A higher rank signifies a higher probability of the question being natural. 24 | - This pipeline was used to create test sets on which further experiments were done, and the corresponding results are as follows: 25 | 26 | ### Eukaryotes 27 | 28 | With the following configuration: 29 | 30 | | Size | Layers | Dropout | Attention Mechanism | Embeddings | BLEU | Accuracy | 31 | |------ | ----------| -------- | ------------------- |------------|-----|---------| 32 | |128 | 2 | 0.7 | Scaled Luong | Yes: SPARQL (RDF2VEC), English Previous embeddings | 93 | 63| 33 | 34 | A grid search was done on this dataset to determine the best possible hyperparameters for the NMT model pertaining to the English-to-SPARQL conversion. The grid search stats can be found at [Grid Search](https://anandpanchbhai.com/A-Neural-QA-Model-for-DBpedia/static/GridSearch/GridSearch.html). 35 | 36 | ### Person 37 | 38 | After successfully completing the grid search on the Eukaryotes data, we moved on to checking the viability of the model on other ontologies, namely Person. The dataset was humongous compared to what we had dealt with earlier: the train set contained 302277 queries and the test set after thresholding contained 113982 queries. We were still able to get some interesting results for the best model configuration obtained from the grid search done earlier: 39 | 40 | With the following configuration: 41 | 42 | | Size | Layers | Dropout | Attention Mechanism | Embeddings | BLEU | Accuracy | 43 | |------ | ----------| -------- | ------------------- |------------|-----|---------| 44 | |128 | 2 | 0.7 | Scaled Luong | Yes: SPARQL (RDF2VEC), English Previous embeddings | 80 | 40| 45 | 46 | The graphs are as follows: 47 | 48 | ![Test BLEU](.images/test-bleu.png) 49 | BLEU scores for the Person test set. 50 | ![Test Accuracy](.images/test-accuracy.png) 51 | Accuracy scores for the Person test set. 52 | 53 | 54 | ## Utilities 55 | 56 | Apart from the main project, a number of utilities were created for various tasks, ranging from full-fledged code to a few lines of script. A brief overview of those utilities follows; feel free to use them for evaluating, pre-processing and analysing different aspects of the project. The utilities are present in `pipeline_3/utility`: 57 | 58 | - Comparer 59 | - Relevant embedding extractor 60 | - QALD JSON generator to enable use in GERBIL 61 | - Test fixer for thresholding the questions present in the general test set. 62 | - Vocab extractor from previous TensorFlow models.
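For reference, the best configuration reported in the tables above roughly corresponds to an invocation like the one below. This is only a sketch: it assumes the standard `tensorflow/nmt` toolkit and its usual flags, and the data prefixes and output directory are illustrative placeholders, not paths from this repository.

```bash
python -m nmt.nmt \
  --attention=scaled_luong \
  --num_units=128 --num_layers=2 --dropout=0.7 \
  --src=en --tgt=sparql \
  --vocab_prefix=data/person/vocab \
  --train_prefix=data/person/train \
  --dev_prefix=data/person/dev \
  --test_prefix=data/person/test \
  --out_dir=person_model \
  --metrics=bleu
```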
63 | 64 | Specific instructions for running each of the utilities are provided in their respective directories. 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/README.md: -------------------------------------------------------------------------------- 1 | 2 | # One-Command Pipeline # 3 | To run the complete pipeline, please use the command: 4 | 5 | ```bash 6 | ./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer] [$4 Integer] [$5 Integer] 7 | ``` 8 | 9 | | Parameter | Description | Type | Default | 10 | | :--------:|-------------|------|--------:| 11 | | $1 | The project's name | String | Required | 12 | | $2 | Dimension of the GloVe embeddings | Integer {50,100,200,300} | 300 | 13 | | $3 | Number of units in the LSTM cells | Integer | 512 | 14 | | $4 | Training steps | Integer | 60000 | 15 | | $5 | Examples per template | Integer | 600 | 16 | 17 | Examples: 18 | 19 | ```bash 20 | ./pipeline.sh Project1 21 | ``` 22 | ```bash 23 | ./pipeline.sh Project2 300 512 60000 600 24 | ``` 25 | 26 | 27 | # Code Notes # 28 | ## Paraphrase Questions 29 | After the creation of the templates and the elimination of the never-asked queries, the questions are passed to the Paraphraser. 30 | - `paraphrase_questions.py`: This paraphrases a question template and returns several possible candidates with their scores (potentially textual similarity, POS tags, etc.). The main pipeline then selects templates according to a strategy and adds them to the template set. 31 | - `textual_similarity.py`: This calculates the similarity scores between the candidates and the original question template. 32 | 33 | To test the Paraphraser, run: 34 | ```bash 35 | python paraphrase_questions.py --sentence "what is your name ?"
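# Note: the paraphraser relies on a pre-trained T5 model; in the full pipeline,
# batch_paraphrase.py downloads it via get_pretrained_model(const.URL) on first use.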
36 | ``` 37 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/batch_paraphrase.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import tensorflow as tf 4 | tf.compat.v1.enable_eager_execution() 5 | from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced 6 | from constant import Constant 7 | 8 | const = Constant() 9 | seperator = "\t" 10 | 11 | const.URL = "https://datascience-models-ramsri.s3.amazonaws.com/t5_paraphraser.zip" 12 | 13 | def batch_paraphrase(templates_path, model_dir): 14 | folder_path = get_pretrained_model(const.URL) 15 | set_seed(42) 16 | tokenizer, device, model = prepare_model(folder_path) 17 | dir = os.path.realpath(templates_path) 18 | with open(dir, "r") as lines: 19 | with open(dir + "_paraphrased", "w") as w: 20 | for line in lines: 21 | prop = line.strip("\n").split(seperator) 22 | question = prop[3] 23 | paraphrased_candidates = paraphrase_questions(tokenizer, device, model, question) 24 | paraphrased = pick_final_sentence(question, paraphrased_candidates) 25 | advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir) 26 | w.write(line) 27 | print("Original", line) 28 | # for i, candidate in enumerate(paraphrased_candidates): 29 | # new_prop = prop[:-1] 30 | # new_prop[3] = candidate 31 | # new_prop.append("Paraphrased {}\n".format(i)) 32 | # print(new_prop) 33 | # new_line = seperator.join(new_prop) 34 | # 35 | # w.write(new_line) 36 | new_prop = prop[:-1] 37 | new_prop[3] = paraphrased 38 | new_prop.append("Paraphrased \n") 39 | new_line = seperator.join(new_prop) 40 | w.write(new_line) 41 | print("Paraphrase", new_line) 42 | 43 | new_prop = prop[:-1] 44 | new_prop[3] = advanced 45 | new_prop.append("Paraphrased advanced\n") 46 | new_line = seperator.join(new_prop) 47 | w.write(new_line) 48 | print("Advanced", new_line) 49 | 50 | 51 | if __name__=="__main__": 52 | parser = argparse.ArgumentParser() 53 | requiredNamed = parser.add_argument_group('Required Arguments') 54 | 55 | requiredNamed.add_argument('--templates', dest='templates', metavar='templates file', 56 | help='templates file', required=True) 57 | requiredNamed.add_argument('--model', dest='model', metavar='model_dir', 58 | help='path of directory of the fine-tuned model', required=True) 59 | 60 | 61 | args = parser.parse_args() 62 | templates_path = args.templates 63 | model_dir = args.model 64 | batch_paraphrase(templates_path, model_dir) 65 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/bert_classifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch #version==1.6.0 3 | from transformers import BertTokenizer 4 | from transformers import BertForSequenceClassification 5 | import numpy as np 6 | 7 | 8 | 9 | def load_model(model_dir, device): 10 | model = BertForSequenceClassification.from_pretrained(model_dir) 11 | tokenizer = BertTokenizer.from_pretrained(model_dir) 12 | 13 | # Copy the model to the GPU. 14 | model.to(device) 15 | return model, tokenizer 16 | 17 | def encode(test_set, tokenizer): 18 | input_ids = [] 19 | attention_masks = [] 20 | for sent in test_set[:10]: 21 | encoded_dict = tokenizer.encode_plus( 22 | sent[0], 23 | sent[1], # Sentence to encode. 
24 | add_special_tokens=True, # Add '[CLS]' and '[SEP]' 25 | max_length=64, # Pad & truncate all sentences. 26 | truncation=True, 27 | pad_to_max_length=True, 28 | return_attention_mask=True, # Construct attn. masks. 29 | return_tensors='pt', # Return pytorch tensors. 30 | ) 31 | input_ids.append(encoded_dict['input_ids']) 32 | 33 | # And its attention mask (simply differentiates padding from non-padding). 34 | attention_masks.append(encoded_dict['attention_mask']) 35 | input_ids = torch.cat(input_ids, dim=0) 36 | attention_masks = torch.cat(attention_masks, dim=0) 37 | return input_ids, attention_masks 38 | 39 | def predict(device, test_set, model, tokenizer): 40 | input_ids, attention_masks = encode(test_set, tokenizer) 41 | model.eval() 42 | # Tracking variables 43 | predictions = [] 44 | # Add batch to GPU 45 | b_input_ids = input_ids.to(device) 46 | b_input_mask = attention_masks.to(device) 47 | # Telling the model not to compute or store gradients, saving memory and 48 | # speeding up prediction 49 | with torch.no_grad(): 50 | # Forward pass, calculate logit predictions 51 | outputs = model(b_input_ids, token_type_ids=None, 52 | attention_mask=b_input_mask) 53 | logits = outputs[0] 54 | # Move logits and labels to CPU 55 | logits = logits.detach().cpu().numpy() 56 | # Store predictions and true labels 57 | predictions.append(logits) 58 | pred_labels = [] 59 | print(predictions) 60 | for i in range(len(predictions[0])): 61 | # get the highest score of logits to be the class 62 | # the result will be 0,1,2 so it should be -1 63 | pred_labels.append(np.argmax(predictions[0][i]).flatten()[0]-1) 64 | return pred_labels 65 | 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser() 68 | requiredNamed = parser.add_argument_group('Required Arguments') 69 | 70 | requiredNamed.add_argument('--model', dest='model', metavar='model folder', 71 | help='Bert fine-tuned model\'s folder path', required=True) 72 | # requiredNamed.add_argument('--testset', dest='testset', metavar='testset', 73 | # help='A list: [[Origin, Paraphrase1],[O, P2]..]', required=True) 74 | 75 | 76 | # If there's a GPU available... 77 | if torch.cuda.is_available(): 78 | 79 | # Tell PyTorch to use the GPU. 80 | device = torch.device("cuda") 81 | 82 | print('There are %d GPU(s) available.' % torch.cuda.device_count()) 83 | 84 | print('We will use the GPU:', torch.cuda.get_device_name(0)) 85 | 86 | # If not... 87 | else: 88 | print('No GPU available, using the CPU instead.') 89 | device = torch.device("cpu") 90 | 91 | 92 | testset = [['When is the birth date of
?', 'When is the birthday of ?'], 93 | ["When is the birth date of ?", "When was born ?"], 94 | ["When is the birth date of ?", "Where does come from ?"], 95 | ["When is the birth date of ?","What is the birth name of ?"], 96 | ["What is the ingredient of ?","What is the Ingredient for ?"], 97 | ["What is the ingredient of ?","What is 's ingredient ?"]] 98 | args = parser.parse_args() 99 | model = args.model 100 | # testset = args.testset 101 | model, tokenizer = load_model(model, device) 102 | pred_labels = predict(device, testset, model, tokenizer) 103 | for i, pair in enumerate(testset): 104 | print(" ".join(pair), pred_labels[i]) -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/eliminator.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | 4 | def eliminator(location): 5 | """ 6 | The function removes the templates which are considered less popular 7 | based on the proposed ranking mechanism; the input file should be pre-processed 8 | and a TRUE or FALSE should be added as the last column. 9 | 10 | This function just removes the entries with FALSE as the last entry in a row 11 | and creates a file named new_train.csv to be used for further purposes. 12 | """ 13 | lines = open(location,'r').readlines() 14 | print(len(lines)) 15 | accum = [] 16 | nspm_ready = open("new_train.csv",'w') 17 | for line in tqdm(lines): 18 | values = line.split(",") 19 | if(len(values)<8): 20 | print("Input file is of wrong format, please add the correct boolean values as the last column entry to use this function") 21 | print(values[-1]) 22 | if(values[-1]=="TRUE\n"): 23 | accum.append(";".join(values[:-2])+"\n") 24 | nspm_ready.write(accum[-1]) 25 | nspm_ready.close() 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | requiredNamed = parser.add_argument_group('Required Arguments') 30 | 31 | requiredNamed.add_argument('--location', dest='location', metavar='location', 32 | help='location of the file to be pruned.', required=True) 33 | args = parser.parse_args() 34 | location = args.location 35 | eliminator(location) 36 | pass 37 | 38 | 39 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/fetch_ranks.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='../utility/wikidata.rank'): 5 | """ 6 | The function loads ranks from a supplied location. 7 | """ 8 | sub = open(filename,'r').readlines() 9 | diction={} 10 | 11 | print("Loading Rankings") 12 | for val in tqdm(sub): 13 | diction[val.split('\t')[0]] = float(val.split('\t')[1]) 14 | return diction 15 | 16 | if __name__ == "__main__": 17 | fetch_ranks() 18 | pass 19 | 20 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/fetch_ranks_sub.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='part-r-00000'): 5 | """ 6 | The function loads ranks from a supplied location. 7 | The ranks file should belong to the SubjectiveEye3D format 8 | of saving ranks.
9 | """ 10 | sub = open(filename,'r').readlines() 11 | diction={} 12 | 13 | print("Loading Rankings") 14 | for val in tqdm(sub): 15 | diction[val.split('\t')[0].strip()[1:-1].strip()] = float(val.split('\t')[-2].split('"')[1]) 16 | return diction 17 | 18 | if __name__ == "__main__": 19 | fetch_ranks() 20 | pass 21 | 22 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv", multi=False): 12 | """ 13 | - This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | - This code on execution creates a csv which contains all the properties, ontology, 16 | class related information and data types as field values in each row. 17 | - This function also returns a 2D list of the information mentioned above to the calling 18 | function 19 | """ 20 | page = urllib.request.urlopen(url) 21 | soup = BeautifulSoup(page, "html.parser") 22 | if(not os.path.isdir(project_name)): 23 | os.makedirs(project_name) 24 | if multi: 25 | output_file = open(project_name + "/" + output_file, 'a', encoding="utf-8") 26 | else: 27 | output_file = open(project_name+"/" + output_file, 'w', encoding="utf-8") 28 | fl = 0 29 | accum = [] 30 | for rows in tqdm(soup.find_all("tr")): 31 | x = rows.find_all("td") 32 | if len(x) <= 2: 33 | fl = 1 34 | continue 35 | if fl == 1: 36 | fl = 2 37 | continue 38 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 39 | label = rows.find_all("td")[1].get_text() 40 | dom = rows.find_all("td")[2].get_text() 41 | rng = rows.find_all("td")[3].get_text() 42 | if rows.find_all("td")[0].find('a'): 43 | URL_name = ((rows.find_all("td")[0].find('a').attrs['href'])) 44 | 45 | final = name + "," + label + "," + dom + "," + rng 46 | #+ ","+ URL_name.split(':')[-1] 47 | accum.append(final) 48 | output_file.write(final+"\n") 49 | output_file.close() 50 | return accum 51 | 52 | 53 | """ 54 | Name, Label, Domain, Range, URL_name 55 | """ 56 | 57 | if __name__ == "__main__": 58 | """ 59 | Section to parse the command line arguments. 
60 | """ 61 | parser = argparse.ArgumentParser() 62 | requiredNamed = parser.add_argument_group('Required Arguments') 63 | 64 | requiredNamed.add_argument('--url', dest='url', metavar='url', 65 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 66 | requiredNamed.add_argument( 67 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 68 | requiredNamed.add_argument( 69 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 70 | args = parser.parse_args() 71 | url = args.url 72 | output_file = args.out_put 73 | project_name = args.project_name 74 | get_properties(url = url, project_name= project_name, output_file = output_file) 75 | pass 76 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/question_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def question_generator(query): 5 | pass 6 | 7 | 8 | 9 | if __name__ == "__main__": 10 | """ 11 | Section to parse the command line arguments. 12 | """ 13 | parser = argparse.ArgumentParser() 14 | requiredNamed = parser.add_argument_group('Required Arguments') 15 | 16 | requiredNamed.add_argument('--url', dest='url', metavar='url', 17 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 18 | requiredNamed.add_argument( 19 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 20 | requiredNamed.add_argument( 21 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 22 | args = parser.parse_args() 23 | url = args.url 24 | output_file = args.out_put 25 | project_name = args.project_name 26 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/textual_similarity.py: -------------------------------------------------------------------------------- 1 | import math, argparse 2 | import numpy as np 3 | import tensorflow as tf 4 | import tensorflow_hub as hub 5 | import os 6 | import nltk 7 | nltk.download('punkt') 8 | nltk.download('averaged_perceptron_tagger') 9 | nltk.download('universal_tagset') 10 | from nltk import pos_tag, word_tokenize, RegexpParser 11 | 12 | from constant import Constant 13 | const = Constant() 14 | const.MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder-large/5" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"] 15 | # os.environ['TFHUB_CACHE_DIR'] = '/tmp/tfhub_modules' 16 | print('start') 17 | embed = hub.load(const.MODULE_URL) 18 | 19 | def similarities(sentence, paraphrases): 20 | vectors = embed([sentence] + paraphrases) 21 | cosine_similarities = [] 22 | for v2 in vectors[1:]: 23 | cosine_similarities.append(cosine_similarity(np.array(vectors[0]), np.array(v2))) 24 | 25 | return cosine_similarities 26 | 27 | def similarity(sentence,paraphrase): 28 | vectors = embed([sentence, paraphrase]) 29 | return cosine_similarity(vectors[0], vectors[1]) 30 | 31 | # def cosine_similarities(v1, vectors): 32 | # #Calculate semantic similarity between two original v1 and paraphrased vectors 33 | # similarities = [] 34 | # for v2 in vectors: 35 | # similarities.append(cosine_similarity(v1,v2)) 36 | # return similarities 37 | 38 | 39 | def cosine_similarity(v1, v2): 40 | # Calculate semantic similarity between two sentence 
vectors 41 | mag1 = np.linalg.norm(v1) 42 | mag2 = np.linalg.norm(v2) 43 | if (not mag1) or (not mag2): 44 | return 0 45 | return np.dot(v1, v2) / (mag1 * mag2) 46 | 47 | 48 | def prof_similarity(v1, v2): 49 | #Calculate the semantic similarity based on the angular distance 50 | cosine = cosine_similarity(v1, v2) 51 | prof_similarity = 1 - math.acos(cosine) / math.pi 52 | return prof_similarity 53 | 54 | def minDistance(s1, s2): 55 | """ 56 | :type s1: str 57 | :type s2: str 58 | :rtype: int 59 | """ 60 | 61 | len1 = len(s1) 62 | len2 = len(s2) 63 | dp = [[0 for _ in range(len2 + 1)] for _ in range(len1 + 1)] 64 | for i in range(len1 + 1): 65 | for j in range(len2 + 1): 66 | if i > 0 and j == 0: 67 | dp[i][j] = dp[i - 1][j] + 1 68 | elif j > 0 and i == 0: 69 | dp[i][j] = dp[i][j - 1] + 1 70 | elif j > 0 and i > 0: 71 | res1 = dp[i - 1][j] + 1 72 | res2 = dp[i][j - 1] + 1 73 | res3 = not s1[i - 1] == s2[j - 1] and dp[i - 1][j - 1] + 1 or dp[i - 1][j - 1] 74 | dp[i][j] = min(res1, res2, res3) 75 | return dp[len1][len2] 76 | 77 | 78 | def words_distance(sentence1, sentence2): 79 | return minDistance(word_tokenize(sentence1), word_tokenize(sentence2)) 80 | 81 | 82 | def tags_distance(sentence1, sentence2): 83 | tagged1 = pos_tag(word_tokenize(sentence1), tagset='universal') 84 | tagged2 = pos_tag(word_tokenize(sentence2), tagset='universal') 85 | tags1 = [j for i, j in tagged1] 86 | tags2 = [j for i, j in tagged2] 87 | return minDistance(tags1, tags2) 88 | 89 | 90 | from collections import Counter 91 | 92 | def count_NNP(sentence): 93 | tokens = word_tokenize(sentence) 94 | tagged = pos_tag(tokens) 95 | tag = [j for i, j in tagged] 96 | result = Counter(tag) 97 | return result["NNP"] 98 | 99 | def has_NNP(sentence, num): 100 | tokens = word_tokenize(sentence) 101 | tagged = pos_tag(tokens) 102 | tag = [j for i, j in tagged] 103 | result = Counter(tag) 104 | return result["NNP"] <= num 105 | 106 | 107 | 108 | if __name__=="__main__": 109 | parser = argparse.ArgumentParser() 110 | requiredNamed = parser.add_argument_group('Required Arguments') 111 | 112 | requiredNamed.add_argument('--s1', dest='sentence1', metavar='sentence1', 113 | help='a sentence', required=True) 114 | requiredNamed.add_argument('--s2', dest='sentence2', metavar='sentence2', 115 | help='another sentence', required=True) 116 | args = parser.parse_args() 117 | s1 = args.sentence1 118 | s2 = args.sentence2 119 | print("cosine similarity:", similarity(s1, s2), "Edit distance: ", edit_distance(s1,s2)) 120 | pass 121 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/GloVe/glove_finetune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import numpy as np 4 | from collections import Counter 5 | from nltk.corpus import brown 6 | from mittens import GloVe, Mittens 7 | from sklearn.feature_extraction import stop_words 8 | from sklearn.feature_extraction.text import CountVectorizer 9 | 10 | 11 | def glove2dict(glove_filename): 12 | with open(glove_filename, encoding='utf-8') as f: 13 | reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE) 14 | embed = {line[0]: np.array(list(map(float, line[1:]))) 15 | for line in reader} 16 | return embed 17 | 18 | def batch_finetune(finetune_glove, batch_word, dimension): 19 | oov = [token for token in batch_word if token not in finetune_glove.keys()] 20 | 21 | en_doc = [' '.join(batch_word)] 22 | 23 | corp_vocab = list(set(oov)) 24 | cv = 
CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab) 25 | X = cv.fit_transform(en_doc) 26 | Xc = (X.T * X) 27 | Xc.setdiag(0) 28 | coocc_ar = Xc.toarray() 29 | 30 | mittens_model = Mittens(n=dimension, max_iter=1800) 31 | new_embeddings = mittens_model.fit( 32 | coocc_ar, 33 | vocab=corp_vocab, 34 | initial_embedding_dict=finetune_glove) 35 | 36 | newglove = dict(zip(corp_vocab, new_embeddings)) 37 | finetune_glove.update(newglove) 38 | return finetune_glove 39 | 40 | def calculate_unknown(finetune_glove,dimension): 41 | vecs = np.zeros((len(finetune_glove), dimension), dtype=np.float32) 42 | for i, key in enumerate(finetune_glove): 43 | vecs[i] = np.array(finetune_glove[key], dtype=np.float32) 44 | unknown = np.mean(vecs, axis=0) 45 | return unknown 46 | 47 | def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300): 48 | word_en = [] 49 | with open(project_path+"/data.en", "r") as lines: 50 | for sentence in lines: 51 | sentence = sentence.strip("\n") 52 | sentence = " " + sentence + " " 53 | for word in sentence.split(): 54 | word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?")) 55 | print(len(word_en), word_en[:20]) 56 | 57 | vocab_en = list(set(word_en) - set(["", ""])) 58 | 59 | pre_glove = glove2dict(glove_path) 60 | stride = 700000 61 | start = 0 62 | end = start+stride 63 | finetune_glove = pre_glove.copy() 64 | while end"] = unk 73 | with open(project_path+"/embed.en", "w") as w: 74 | for word in finetune_glove: 75 | w.write(word + " " + str(list(finetune_glove[word])).replace("[", "").replace("]", "").replace(",", "") + "\n") 76 | 77 | if __name__=="__main__": 78 | parser = argparse.ArgumentParser() 79 | requiredNamed = parser.add_argument_group('Required Arguments') 80 | 81 | requiredNamed.add_argument('--path', dest='path', metavar='path', 82 | help='path of project that contains the data..eb/sparql files', required=True) 83 | requiredNamed.add_argument('--dimension', dest='dimension', metavar='dimension', 84 | help='path of project that contains the data..eb/sparql files', required=False) 85 | 86 | args = parser.parse_args() 87 | path = args.path 88 | dimension = args.dimension 89 | 90 | if dimension: 91 | if dimension <= 50: 92 | dimension = 50 93 | elif dimension <= 100: 94 | dimension = 100 95 | elif dimension <= 200: 96 | dimension = 200 97 | else: 98 | dimension = 300 99 | finetune_glove(path,"glove.6B/glove.6B."+dimension+"d.txt", dimension=dimension) 100 | else: 101 | finetune_glove(path, "glove.6B/glove.6B.300d.txt") 102 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline of Benchmark # 2 | This benchmark pipeline uses irbench as a local tool to calculate metrics on final answers. 3 | 4 | ## To run the code 5 | Firstly, you need to follow the instructions of NSpM to train the NMT model. Let's say that the model is trained on `monument_300`. 
6 | 7 | Then, run our pipeline to generate the answers JSON file: 8 | ```bash 9 | python benchmark.py --model <modelId> --test <testset file name> [--answer <answer file name>] 10 | ``` 11 | For example, using [this file](https://nspm-models.s3.eu-west-2.amazonaws.com/misc/qald-9-train-multilingual.qald.json): 12 | ```bash 13 | python benchmark.py --model monument_300 --test qald-9-train-multilingual.qald.json 14 | ``` 15 | 16 | Finally, evaluate our answers using irbench. 17 | Remember to clone the irbench project and download their release jar file: 18 | ```bash 19 | java -jar irbench-v0.0.1-beta.2.jar -evaluate "qald-9-train-multilingual" "/answer.qald.json" "f-score" 20 | ``` 21 | For other configuration details, please visit their [site](https://github.com/AKSW/irbench); some tips on the JDK version are given in my [blogs](https://baiblanc.github.io/2020/06/23/GSOC-Week-Three/) -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from tqdm import tqdm 4 | from interpreter import interprete 5 | from extract_questions import read_json, write_to_ask 6 | from retrieve_answers import read_sparqls, retrieve 7 | from reconstruct_json import construct_json 8 | 9 | 10 | 11 | def benchmark(trained_model, test_set, answer_file="answers.json"): 12 | # Deconstruct the questions and infos from test set 13 | questions_info, questions = read_json(test_set) 14 | 15 | # Write the questions to to_ask.txt 16 | write_to_ask(questions) 17 | 18 | # Use Interpreter to interprete them into decoded queries stored in output_decoded.txt 19 | interprete(trained_model) 20 | 21 | # Use the sparql endpoint (http://dbpedia.org/sparql) to retrieve answers of the queries 22 | sparqls = read_sparqls() 23 | answers = [] 24 | print("Retrieving answers of queries via SPARQL endpoint") 25 | for query in tqdm(sparqls): 26 | try: 27 | answer_group = retrieve(query) 28 | except: 29 | answer_group = [] 30 | answers.append(answer_group) 31 | 32 | json_file = construct_json(test_set.replace(".qald.json",""), questions_info, questions, sparqls, answers) 33 | path = "../gsoc/zheyuan/utility/benchmark/" 34 | with open(path+"answers-"+test_set, "w") as f: 35 | # js = json.dumps(json_file, indent=4, separators=(',', ':')) 36 | json.dump(json_file, f, indent=4, separators=(', ', ': ')) 37 | 38 | 39 | 40 | if __name__ == "__main__": 41 | """ 42 | Section to parse the command line arguments.
43 | """ 44 | parser = argparse.ArgumentParser() 45 | requiredNamed = parser.add_argument_group('Required Arguments') 46 | 47 | requiredNamed.add_argument('--model', dest='model', metavar='[modelId]', 48 | help='the trained model', required=True) 49 | requiredNamed.add_argument('--test', dest='test', metavar='[testset file name]', 50 | help='the testing qald set file name', required=True) 51 | requiredNamed.add_argument('--answer', dest='answer', metavar='[answer file name]', 52 | help='the answers of qald dataset file name', required=False) 53 | args = parser.parse_args() 54 | trained_model = args.model 55 | test_set = args.test 56 | answer_file = args.answer 57 | benchmark(trained_model,test_set, answer_file) 58 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/extract_questions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def read_json(file): 5 | with open(file,'r') as load_f: 6 | load_dict = json.load(load_f) 7 | dataset_id = load_dict['dataset']['id'] 8 | questions = {} 9 | questions_info = [] 10 | 11 | info_names = ['id','answertype','aggregation','onlydbo','hybrid'] 12 | for question_dict in load_dict['questions']: 13 | id = "" 14 | question_infos = {} 15 | for key in question_dict: 16 | value = question_dict[key] 17 | 18 | 19 | if key == "id": 20 | id = value 21 | if key in info_names: 22 | question_infos[key] = value 23 | 24 | elif key == "question": 25 | for question in value: 26 | if question['language'] == "en": 27 | questions[id] = question["string"] 28 | questions_info.append(question_infos) 29 | 30 | return questions_info, questions 31 | def write_to_ask(questions): 32 | with open('to_ask1.txt', 'w') as write_f: 33 | for key in questions: 34 | question = questions[key] 35 | write_f.write(question.lower().replace("?"," ?")+"\n") 36 | 37 | if __name__ == "__main__": 38 | """ 39 | Section to parse the command line arguments. 40 | """ 41 | parser = argparse.ArgumentParser() 42 | requiredNamed = parser.add_argument_group('Required Arguments') 43 | 44 | requiredNamed.add_argument('--path', dest='path', metavar='[path]', 45 | help='the test set\'s file path', required=True) 46 | args = parser.parse_args() 47 | path = args.path 48 | questions_info, questions = read_json(path) 49 | print(questions_info, questions) 50 | # write_to_ask(questions) 51 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/interpreter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def interprete(trained_model_folder): 4 | os.chdir('../../../../nmt') 5 | os.system('pwd') 6 | print('start') 7 | folder_name = 'data/'+trained_model_folder 8 | print('python -m nmt.nmt --vocab_prefix=../' + folder_name + '/vocab --out_dir=../' + folder_name + '_model --inference_input_file=../gsoc/zheyuan/utility/benchmark/to_ask1.txt --inference_output_file=../gsoc/zheyuan/utility/benchmark/output1.txt --src=en --tgt=sparql | tail -n4') 9 | os.system( 10 | 'python -m nmt.nmt --vocab_prefix=../' + folder_name + '/vocab --out_dir=../' + folder_name + '_model --inference_input_file=../gsoc/zheyuan/utility/benchmark/to_ask1.txt --inference_output_file=../gsoc/zheyuan/utility/benchmark/output1.txt --src=en --tgt=sparql | tail -n4') 11 | 12 | os.system('''if [ $? 
-eq 0 ] 13 | then 14 | echo "" 15 | echo "ANSWER IN SPARQL SEQUENCE:" 16 | ENCODED="$(cat ../gsoc/zheyuan/utility/benchmark/output1.txt)" 17 | python ../interpreter.py "${ENCODED}" > ../gsoc/zheyuan/utility/benchmark/output_decoded1.txt 18 | cat ../gsoc/zheyuan/utility/benchmark/output_decoded1.txt 19 | echo "" 20 | fi''') 21 | print('end') 22 | 23 | if __name__ == "__main__": 24 | """ 25 | Section to test the Interpreter. 26 | """ 27 | interprete('monument_300') 28 | pass 29 | 30 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/reconstruct_json.py: -------------------------------------------------------------------------------- 1 | def construct_json(dataset_id,infos, questions, sparqls, answers): 2 | qald_test_answers_dict = {} 3 | qald_test_answers_dict["dataset"] = {'id':dataset_id} 4 | qald_test_answers_dict['questions'] = [] 5 | print(len(answers)) 6 | for index,info in enumerate(infos): 7 | 8 | question_dict = info 9 | 10 | id = info["id"] 11 | question = questions[id] 12 | question_dict["question"] = [{ 13 | "language" : "en", 14 | "string" : question 15 | }] 16 | question_dict["query"] = {"sparql" : sparqls[index]} 17 | question_dict["answers"] = answers[index] 18 | print(answers[index]) 19 | qald_test_answers_dict['questions'].append(question_dict) 20 | return qald_test_answers_dict 21 | 22 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/retrieve_answers.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import urllib.request 5 | import urllib.parse 6 | from bs4 import BeautifulSoup 7 | 8 | def read_sparqls(): 9 | os.system("pwd") 10 | sparqls = [] 11 | file_path = "../gsoc/zheyuan/utility/benchmark/output_decoded1.txt" 12 | with open(file_path, 'r') as lines: 13 | for line in lines: 14 | sparqls.append(line) 15 | return sparqls 16 | 17 | def retrieve(query): 18 | try: # python3 19 | query = urllib.parse.quote_plus(query) 20 | except: # python2 21 | query = urllib.quote_plus(query) 22 | url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=" + query + "&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+" 23 | page = urllib.request.urlopen(url) 24 | soup = BeautifulSoup(page, "html.parser") 25 | total = len(soup.find_all("tr")) 26 | answers = [] 27 | 28 | for rows in (soup.find_all("tr")): 29 | answer_dict = { 30 | "head": { 31 | "vars": ["uri"] 32 | }, "results": { 33 | "bindings": [] 34 | } 35 | } 36 | for td in rows.find_all("a"): 37 | for a in td: 38 | uri = { 39 | "uri": { 40 | "type": "uri", 41 | "value": a 42 | } 43 | } 44 | 45 | 46 | answer_dict["results"]["bindings"].append(uri) 47 | 48 | for td in rows.find_all("pre"): 49 | for pre in td: 50 | # Eliminate the answer if it is longer than 50(not a URI nor a simple literal) 51 | if len(pre) <= 50: 52 | uri = { 53 | "uri": { 54 | "type": "uri", 55 | "value": a 56 | } 57 | } 58 | 59 | answer_dict["results"]["bindings"].append(uri) 60 | if answer_dict["results"]["bindings"]: 61 | answers.append(answer_dict) 62 | 63 | if not answers: 64 | return [{ 65 | "head" : { 66 | "vars" : [ "date" ] 67 | }, 68 | "results" : { } 69 | }] 70 | return answers 71 | 72 | 73 | if __name__ == "__main__": 74 | """ 75 | Section to parse the command line arguments. 
76 | """ 77 | # parser = argparse.ArgumentParser() 78 | # requiredNamed = parser.add_argument_group('Required Arguments') 79 | # 80 | # requiredNamed.add_argument('--query', dest='query', metavar='query', 81 | # help='query of SPARQL', required=True) 82 | # args = parser.parse_args() 83 | # query = args.query 84 | answer_groups = [] 85 | i = 1 86 | with open("../output_decoded.txt", 'r') as lines: 87 | for line in lines: 88 | i+=1 89 | try: 90 | answer_group = retrieve(line) 91 | except: 92 | answer_group=[] 93 | answer_groups.append(answer_group) 94 | 95 | print(len(answer_groups), answer_groups) 96 | 97 | 98 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/queryFilter.py: -------------------------------------------------------------------------------- 1 | import re 2 | from tqdm import tqdm 3 | from bs4 import BeautifulSoup 4 | import urllib.request 5 | 6 | labels = ['publisher', 'leaderName', 'mayor', 'country', 'musicComposer', 'routeEnd', 'starring', 'targetAirport', 'timeZone', 'origin', 'architect', 'team', 'Holiday', 'party', 'language', 'activeYearsEndDate', 'Protein', 'founder', 'foundingDate', 'governmentType', 'deathDate', 'type', 'birthName', 'vicePresident', 'knownFor', 'birthYear', 'crosses', 'city', 'height', 'ingredient', 'spouse', 'battle', 'child', 'location', 'doctoralAdvisor', 'portrayer', 'wineRegion', 'influenced', 'Beverage', 'developer', 'programmingLanguage', 'completionDate', 'budget', 'Organisation', 'numberOfPages', 'Sport', 'deathCause', 'growingGrape', 'product', 'capital', 'bandMember', 'largestCity', 'director', 'mission', 'ethnicGroup', 'officialLanguage', 'leader', 'foundationPlace', 'writer', 'date', 'abbreviation', 'dissolutionDate', 'successor', 'runtime', 'sourceCountry', 'maximumDepth', 'numberOfLocations', 'currency', 'state', 'birthDate', 'series', 'firstAscentPerson', 'composer', 'creator', 'influencedBy', 'almaMater', 'presenter', 'editor', 'discoverer', 'areaTotal', 'restingPlace', 'deathPlace', 'class', 'populationTotal', 'alias', 'owner', 'author', 'birthPlace', 'award'] 7 | ontologies = [] 8 | for label in labels: 9 | if re.match(r'^[A-Z].*', label): 10 | print(label) 11 | ontologies.append(label) 12 | # ontologies = '[Holiday,Protein,Beverage,Organisation,Sport]' 13 | properties = list(set(labels) ^ set(ontologies)) 14 | print(properties) 15 | domains = set() 16 | for property in properties: 17 | url = "http://dbpedia.org/ontology/"+property 18 | page = urllib.request.urlopen(url) 19 | soup = BeautifulSoup(page, "html.parser") 20 | for a in soup.find_all(rel="rdfs:domain"): 21 | domains.add(re.search(r'dbo:([a-zA-Z]+)', a.text).group(1)) 22 | 23 | print(re.search(r'dbo:([a-zA-Z]+)', a.text).group(1)) 24 | print(list(domains | set(ontologies))) 25 | # '[Airline, MeanOfTransportation, Mountain, Organisation, Beverage, Food, Software, Film, Protein, WrittenWork, Work, TelevisionShow, Band, ArchitecturalStructure, PopulatedPlace, Sport, Stream, Person, RouteOfTransportation, Place, Bridge, FictionalCharacter, Holiday, WineRegion, Scientist, Grape]' 26 | # python multi_generate_templates.py --label '[Airline, MeanOfTransportation, Mountain, Organisation, Beverage, Food, Software, Film, Protein, WrittenWork, Work, TelevisionShow, Band, ArchitecturalStructure, PopulatedPlace, Sport, Stream, Person, RouteOfTransportation, Place, Bridge, FictionalCharacter, Holiday, WineRegion, Scientist, Grape]'--project_name test --depth 1 --multi True 
-------------------------------------------------------------------------------- /gsoc/zheyuan/utility/question_form.csv: -------------------------------------------------------------------------------- 1 | When is the ,Who is the ,What is the ,Where is the ,What is the number of ,Which one is the oldest based on ,Which one the highest based on ,Which one the highest based on 2 | select ?x ,select ?x ,select ?x ,select ?x ,select count(*) as ?x ,select distinct(?x) ,select distinct(?x) ,select distinct(?x) 3 | } , } , } , } , } , } order by ?x limit 1 ,} order by ?x limit 1,} order by ?x limit 1 4 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/readme.md: -------------------------------------------------------------------------------- 1 | # Utilities 2 | 3 | Download the SubjectiveEye3D dataset here. The link: [https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz](https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz) 4 | 5 | ```bash 6 | wget https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz 7 | gzip -d part-r-00000.gz 8 | ``` 9 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/vocab_creator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tqdm import tqdm 3 | 4 | def english_vocab(project_path): 5 | print("Creating english vocabulary") 6 | vocab_en = [] 7 | word_en = [] 8 | with open(project_path+"/data.en", "r") as lines: 9 | for sentence in tqdm(lines): 10 | sentence = sentence.strip("\n") 11 | for word in sentence.split(): 12 | word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?")) 13 | 14 | vocab_en = list(set(word_en)) 15 | try: 16 | vocab_en.remove("") 17 | except: 18 | print("There is no \'\' in vocab_en") 19 | with open(project_path+"/vocab.en", "w") as w: 20 | for vocab in vocab_en: 21 | 22 | w.write(vocab.strip() + "\n") 23 | 24 | def sparql_vocab(project_path): 25 | print("Creating SPARQL vocabulary") 26 | 27 | vocab_sparql = [] 28 | with open(project_path+"/data.sparql", "r") as lines: 29 | for sentence in tqdm(lines): 30 | sentence = sentence.strip("\n") 31 | for word in sentence.split(): 32 | if word == "dbr_Flying_Legend": 33 | print(sentence) 34 | vocab_sparql.append(word) 35 | vocab_sparql = list(set(vocab_sparql)) 36 | with open(project_path+"/vocab.sparql", "w") as w: 37 | for vocab in vocab_sparql: 38 | w.write(vocab.strip() + "\n") 39 | 40 | def add_s_tokens(path): 41 | with open(path+"/data.sparql", "r") as lines: 42 | with open(path+"/../../GloVe/data_s.sparql", "w") as w: 43 | for line in lines: 44 | new_line = " " + line.strip() + " \n" 45 | w.write(new_line) 46 | 47 | if __name__=="__main__": 48 | parser = argparse.ArgumentParser() 49 | requiredNamed = parser.add_argument_group('Required Arguments') 50 | 51 | requiredNamed.add_argument('--path', dest='path', metavar='path', 52 | help='path of project that contains the data.en/sparql files', required=True) 53 | args = parser.parse_args() 54 | path = args.path 55 | english_vocab(path) 56 | sparql_vocab(path) 57 | add_s_tokens(path) 58 | pass -------------------------------------------------------------------------------- /nspm/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | -------------------------------------------------------------------------------- /nspm/data_gen.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Data generation. 5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 2.0.0 10 | 11 | """ 12 | import argparse 13 | import tensorflow as tf 14 | from sklearn.model_selection import train_test_split 15 | 16 | from prepare_dataset import load_dataset, convert 17 | 18 | 19 | global output_direc 20 | 21 | 22 | def merging_datafile(input_dir, output_dir): 23 | input_diren = input_dir + '/data.en' 24 | input_dirspq = input_dir + '/data.sparql' 25 | output_dir += '/data.txt' 26 | file1 = open(input_diren, 'r', encoding="utf8") 27 | Lines1 = file1.readlines() 28 | file2 = open(input_dirspq, 'r', encoding="utf8") 29 | Lines2 = file2.readlines() 30 | s = [] 31 | for i in range(len(Lines1)): 32 | s.append(Lines1[i].replace('\n', " ") + "\t " + Lines2[i]) 33 | 34 | filef = open(output_dir, 'w', encoding="utf8") 35 | filef.writelines(s) 36 | file1.close() 37 | file2.close() 38 | filef.close() 39 | return output_dir 40 | 41 | 42 | def data_gen(input_dir, output_dir): 43 | 44 | output_direc = merging_datafile(input_dir, output_dir) 45 | 46 | input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(output_direc) 47 | 48 | # Calculate max_length of the target tensors 49 | max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1] 50 | input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2) 51 | 52 | # Show length 53 | print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)) 54 | 55 | print("Input Language; index to word mapping") 56 | convert(inp_lang, input_tensor_train[0]) 57 | print() 58 | print("Target Language; index to word mapping") 59 | convert(targ_lang, target_tensor_train[0]) 60 | buffer_size = len(input_tensor_train) 61 | batch_size = 16 62 | steps_per_epoch = len(input_tensor_train) // batch_size 63 | embedding_dim = 256 64 | units = 1024 65 | vocab_inp_size = len(inp_lang.word_index) + 1 66 | vocab_tar_size = len(targ_lang.word_index) + 1 67 | 68 | dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(buffer_size) 69 | dataset = dataset.batch(batch_size, drop_remainder=True) 70 | example_input_batch, example_target_batch = next(iter(dataset)) 71 | 72 | return dataset, vocab_inp_size, vocab_tar_size, embedding_dim, units, batch_size, example_input_batch, steps_per_epoch, targ_lang, max_length_targ, max_length_inp, inp_lang, targ_lang 73 | 74 | 75 | if __name__ == '__main__': 76 | 77 | parser = argparse.ArgumentParser() 78 | requiredNamed = parser.add_argument_group('required named arguments') 79 | requiredNamed.add_argument( 80 | '--input', dest='input', metavar='inputDirectory', help='dataset directory', required=True) 81 | requiredNamed.add_argument( 82 | '--output', dest='output', metavar='outputDirectory', help='dataset directory', required=True) 83 | requiredNamed.add_argument( 84 | '--inputstr', dest='inputstr', metavar='inputString', help='Input string for translation', required=False) 85 | args = parser.parse_args() 86 | input_dir = args.input 87 | output_dir = args.output 88 | 89 | data_gen(input_dir, output_dir) 90 | -------------------------------------------------------------------------------- /nspm/filter_dataset.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Script to filter data by a given criterion. 5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 2.0.0 10 | 11 | """ 12 | import argparse 13 | import collections 14 | import json 15 | import os 16 | import sys 17 | 18 | from generator_utils import encode, save_cache, extract_encoded_entities 19 | import importlib 20 | 21 | 22 | if __name__ == '__main__': 23 | parser = argparse.ArgumentParser() 24 | requiredNamed = parser.add_argument_group('required named arguments') 25 | requiredNamed.add_argument('--dataset', dest='dataset', metavar='data_300.en', help='dataset', required=True) 26 | requiredNamed.add_argument('--used_resources', dest='used_resources', metavar='used_resources.json', help='json file', required=True) 27 | requiredNamed.add_argument('--minimum', dest='minimum', metavar='15', help='minimum number of occurence', required=True) 28 | requiredNamed.add_argument('--comp', dest='comp', metavar='all|any', help='require minimum for all/any resources in the query', required=True) 29 | args = parser.parse_args() 30 | 31 | dataset_file = args.dataset 32 | used_resources_file = args.used_resources 33 | MINIMUM = int(args.minimum) 34 | COMP = any if args.comp == 'any' else all 35 | 36 | importlib.reload(sys) 37 | sys.setdefaultencoding("utf-8") 38 | 39 | 40 | dataset_root, _ = os.path.splitext(dataset_file) 41 | used_resources_root, _ = os.path.splitext(used_resources_file) 42 | filtered_sparql_file = '{}_filtered_{:d}_{}.sparql'.format(dataset_root, MINIMUM, COMP.__name__) 43 | filtered_en_file = '{}_filtered_{:d}_{}.en'.format(dataset_root, MINIMUM, COMP.__name__) 44 | 45 | used_resources = collections.Counter(json.loads(open(used_resources_file).read())) 46 | filtered_resources = [elem_cnt for elem_cnt in list(used_resources.items()) if elem_cnt[1] >= MINIMUM] 47 | save_cache('{}_filter_{:d}.json'.format(used_resources_root, MINIMUM), collections.Counter(dict(filtered_resources))) 48 | valid_encoded_resources = [encode(elem_cnt1[0]) for elem_cnt1 in filtered_resources] 49 | check = lambda encoded_entity : encoded_entity in valid_encoded_resources 50 | 51 | valid_lines = [] 52 | filtered_queries = [] 53 | with open(dataset_root+'.sparql', 'r') as sparql_file: 54 | for linenumber, line in enumerate(sparql_file): 55 | entities = extract_encoded_entities(line) 56 | valid = COMP(list(map(check, entities))) 57 | if valid: 58 | filtered_queries.append(line) 59 | valid_lines.append(linenumber) 60 | 61 | filtered_questions = [] 62 | with open(dataset_root+'.en', 'r') as en_file: 63 | for linenumber, line in enumerate(en_file): 64 | if linenumber in valid_lines: 65 | filtered_questions.append(line) 66 | 67 | with open(filtered_en_file, 'w') as filtered: 68 | filtered.writelines(filtered_questions) 69 | 70 | with open(filtered_sparql_file, 'w') as filtered: 71 | filtered.writelines(filtered_queries) 72 | -------------------------------------------------------------------------------- /nspm/generator_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Generator test unit. 
5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 1.0.0 10 | 11 | """ 12 | import generator 13 | import generator_utils 14 | import operator 15 | 16 | 17 | def test_extract_variables(): 18 | query = 'select distinct(?x) ?y where { ?x a C . ?x a ?y}' 19 | query2 = 'select distinct ?a where' 20 | 21 | result = generator_utils.extract_variables(query) 22 | result2 = generator_utils.extract_variables(query2) 23 | 24 | assert result == ['x', 'y'] 25 | assert result2 == ['a'] 26 | 27 | 28 | def test_single_resource_sort(): 29 | matches = [{'usages': [17]}, {'usages': [0]}, {'usages': [3]}, {'usages': [2]}, {'usages': [1]}] 30 | 31 | result = sorted(matches, key=generator.prioritize_usage) 32 | 33 | assert list(map(operator.itemgetter(0), list(map(operator.itemgetter('usages'), result)))) == [17, 3, 2, 1, 0 ] 34 | 35 | 36 | def test_couple_resource_sort(): 37 | matches = [{'usages': [17, 2]}, {'usages': [0, 0]}, {'usages': [3, 2]}, {'usages': [2, 2]}, {'usages': [1, 2]}] 38 | 39 | result = sorted(matches, key=generator.prioritize_usage) 40 | 41 | assert list(map(operator.itemgetter('usages'), result)) == [[17, 2], [3, 2], [2, 2], [1, 2], [0, 0]] 42 | 43 | 44 | def test_encoding(): 45 | original = 'SELECT ?city WHERE { ?m skos:broader dbc:Cities_in_Germany . ?city dct:subject ?m . ?city dbo:areaTotal ?area . ?b dbo:artist dbr:John_Halsey_(musician) } order by asc (?area)' 46 | expected_encoding = 'SELECT var_city WHERE brack_open var_m skos_broader dbc_Cities_in_Germany sep_dot var_city dct_subject var_m sep_dot var_city dbo_areaTotal var_area sep_dot var_b dbo_artist dbr_John_Halsey_ attr_open musician attr_close brack_close _oba_ var_area ' 47 | 48 | result = generator_utils.encode(original) 49 | 50 | assert result == expected_encoding 51 | assert str.strip(generator_utils.decode(result)) == original 52 | 53 | 54 | def test_shorten_query(): 55 | shorten = generator_utils.shorten_query 56 | 57 | assert shorten('ORDER BY var_area') == '_oba_ var_area' 58 | assert shorten('order by asc par_open var_area par_close') == '_oba_ var_area' 59 | assert shorten('order by desc attr_open var_area attr_close') == '_obd_ var_area' 60 | 61 | 62 | def test_normalize_predicates(): 63 | alt = 'dbp_placeOfBirth' 64 | 65 | assert generator_utils.normalize_predicates(alt) == 'dbo_birthPlace' -------------------------------------------------------------------------------- /nspm/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Dataset preparation. 5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 2.0.0 10 | 11 | """ 12 | import tensorflow as tf 13 | 14 | import unicodedata 15 | import re 16 | import io 17 | 18 | 19 | def unicode_to_ascii(s): 20 | return ''.join(c for c in unicodedata.normalize('NFD', s) 21 | if unicodedata.category(c) != 'Mn') 22 | 23 | 24 | def preprocess_sentence(w): 25 | w = unicode_to_ascii(w.strip()) 26 | 27 | # creating a space between a word and the punctuation following it 28 | # eg: "he is a boy." => "he is a boy ." 
29 | # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation 30 | w = re.sub(r"([?.!,¿])", r" \1 ", w) 31 | w = re.sub(r'[" "]+', " ", w) 32 | 33 | # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",") 34 | w = re.sub(r"[^a-zA-Z?.!,¿_]+", " ", w) 35 | 36 | w = w.strip() 37 | 38 | # adding a start and an end token to the sentence 39 | # so that the model know when to start and stop predicting. 40 | w = ' ' + w + ' ' 41 | return w 42 | 43 | 44 | def create_dataset(path, num_examples): 45 | lines = io.open(path, encoding='UTF-8').read().strip().split('\n') 46 | 47 | word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]] 48 | 49 | return zip(*word_pairs) 50 | 51 | 52 | def tokenize(lang): 53 | lang_tokenizer = tf.keras.preprocessing.text.Tokenizer( 54 | filters='',lower=False) 55 | lang_tokenizer.fit_on_texts(lang) 56 | 57 | tensor = lang_tokenizer.texts_to_sequences(lang) 58 | 59 | tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, 60 | padding='post') 61 | 62 | return tensor, lang_tokenizer 63 | 64 | 65 | def load_dataset(path, num_examples=None): 66 | # creating cleaned input, output pairs 67 | inp_lang_wp, targ_lang_wp = create_dataset(path, num_examples) 68 | 69 | input_tensor, inp_lang = tokenize(inp_lang_wp) 70 | target_tensor, targ_lang = tokenize(targ_lang_wp) 71 | 72 | return input_tensor, target_tensor, inp_lang, targ_lang 73 | 74 | 75 | def convert(lang, tensor): 76 | for t in tensor: 77 | if t!=0: 78 | print ("%d ----> %s" % (t, lang.index_word[t])) 79 | 80 | 81 | def merging_datafile(input_dir,output_dir): 82 | input_diren=input_dir+'/data.en' 83 | input_dirspq=input_dir+'/data.sparql' 84 | output_dir+='/data.txt' 85 | file1 = open(input_diren,'r',encoding="utf8") 86 | Lines1 = file1.readlines() 87 | file2 = open(input_dirspq,'r',encoding="utf8") 88 | Lines2 = file2.readlines() 89 | s=[] 90 | for i in range(len(Lines1)): 91 | s.append(Lines1[i].replace('\n'," ")+"\t "+Lines2[i]) 92 | 93 | filef = open(output_dir,'w',encoding="utf8") 94 | filef.writelines(s) 95 | file1.close() 96 | file2.close() 97 | filef.close() 98 | return output_dir 99 | -------------------------------------------------------------------------------- /nspm/split_in_train_dev_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Script to split data into train, dev, and test sets. 
5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 2.0.0 10 | 11 | """ 12 | import argparse 13 | #!/usr/bin/env python 14 | import random 15 | import os 16 | import io 17 | 18 | TRAINING_PERCENTAGE = 80 19 | TEST_PERCENTAGE = 10 20 | DEV_PERCENTAGE = 10 21 | 22 | if __name__ == '__main__': 23 | parser = argparse.ArgumentParser() 24 | requiredNamed = parser.add_argument_group('required named arguments') 25 | requiredNamed.add_argument('--lines', dest='lines', metavar='lines', 26 | help='total number of lines (wc -l )', required=True) 27 | requiredNamed.add_argument('--dataset', dest='dataset', 28 | metavar='dataset.sparql', help='sparql dataset file', required=True) 29 | args = parser.parse_args() 30 | 31 | lines = int(args.lines) 32 | dataset_file = os.path.splitext(args.dataset)[0] 33 | sparql_file = dataset_file + '.sparql' 34 | en_file = dataset_file + '.en' 35 | 36 | random.seed() 37 | 38 | test_and_dev_percentage = sum([TEST_PERCENTAGE, DEV_PERCENTAGE]) 39 | number_of_test_and_dev_examples = int( 40 | lines * test_and_dev_percentage / 100) 41 | number_of_dev_examples = int( 42 | number_of_test_and_dev_examples * DEV_PERCENTAGE / test_and_dev_percentage) 43 | 44 | dev_and_test = random.sample(range(lines), number_of_test_and_dev_examples) 45 | dev = random.sample(dev_and_test, number_of_dev_examples) 46 | with io.open(sparql_file, encoding="utf-8") as original_sparql, io.open(en_file, encoding="utf-8") as original_en: 47 | sparql = original_sparql.readlines() 48 | english = original_en.readlines() 49 | 50 | dev_sparql_lines = [] 51 | dev_en_lines = [] 52 | train_sparql_lines = [] 53 | train_en_lines = [] 54 | test_sparql_lines = [] 55 | test_en_lines = [] 56 | 57 | for i in range(len(sparql)): 58 | sparql_line = sparql[i] 59 | en_line = english[i] 60 | if i in dev_and_test: 61 | if i in dev: 62 | dev_sparql_lines.append(sparql_line) 63 | dev_en_lines.append(en_line) 64 | else: 65 | test_sparql_lines.append(sparql_line) 66 | test_en_lines.append(en_line) 67 | else: 68 | train_sparql_lines.append(sparql_line) 69 | train_en_lines.append(en_line) 70 | 71 | with io.open('train.sparql', 'w', encoding="utf-8") as train_sparql, io.open('train.en', 'w', encoding="utf-8") as train_en, \ 72 | io.open('dev.sparql', 'w', encoding="utf-8") as dev_sparql, io.open('dev.en', 'w', encoding="utf-8") as dev_en, \ 73 | io.open('test.sparql', 'w', encoding="utf-8") as test_sparql, io.open('test.en', 'w', encoding="utf-8") as test_en: 74 | 75 | train_sparql.writelines(train_sparql_lines) 76 | train_en.writelines(train_en_lines) 77 | dev_sparql.writelines(dev_sparql_lines) 78 | dev_en.writelines(dev_en_lines) 79 | test_sparql.writelines(test_sparql_lines) 80 | test_en.writelines(test_en_lines) 81 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | astor==0.8.1 3 | backports.functools-lru-cache==1.6.1 4 | backports.weakref==1.0rc1 5 | beautifulsoup4==4.8.2 6 | bleach==3.3.0 7 | enum34==1.1.9 8 | funcsigs==1.0.2 9 | gast==0.2.2 10 | grpcio==1.27.2 11 | h5py==2.10.0 12 | html5lib==0.9999999 13 | Markdown==3.1.1 14 | mock==3.0.5 15 | numpy==1.19.3 16 | protobuf==3.11.3 17 | pytest==6.2.2 18 | rdflib==5.0.0 19 | six==1.14.0 20 | soupsieve==1.9.5 21 | tensorboard==2.1.0 22 | tensorflow==2.4.0 23 | termcolor==1.1.0 24 | tqdm==4.56.0 25 | Werkzeug==1.0.1 26 | airML==0.0.3 
27 | -------------------------------------------------------------------------------- /test/interpreter_airml_test.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import unittest 3 | 4 | 5 | class TestAirML(unittest.TestCase): 6 | def test_airml_input_args_with_valid_kn(self): 7 | process = subprocess.Popen( 8 | ['python3', 'interpreter.py', "--airml", "http://nspm.org/art", "--output", "test", "--inputstr", 9 | '"yuncken freeman has architected in how many cities?"'], stdout=subprocess.PIPE) 10 | output, err = process.communicate() 11 | output = output.decode("utf-8") 12 | self.assertTrue("http://nspm.org/art KB installed." in output) 13 | self.assertTrue("Predicted translation:" in output) 14 | 15 | def test_airml_input_args_with_invalid_kn(self): 16 | process = subprocess.Popen( 17 | ['python3', 'interpreter.py', "--airml", "http://nspm.org/arts", "--output", "test", "--inputstr", 18 | '"yuncken freeman has architected in how many cities?"'], stdout=subprocess.PIPE) 19 | output, err = process.communicate() 20 | output = output.decode("utf-8") 21 | self.assertTrue("Predicted translation:" not in output) 22 | 23 | def test_airml_without_input_arg(self): 24 | process = subprocess.Popen( 25 | ['python3', 'interpreter.py', "--output", "test", "--inputstr", 26 | '"yuncken freeman has architected in how many cities?"'], stdout=subprocess.PIPE) 27 | output, err = process.communicate() 28 | output = output.decode("utf-8") 29 | self.assertTrue("--input or --airml argument should be provided to load the model." in output) 30 | self.assertTrue("Predicted translation:" not in output) 31 | 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | --------------------------------------------------------------------------------