├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── data ├── properties │ ├── dbpedia-201610-manual-annotation.csv │ ├── dbpedia-201610-place-properties.csv │ ├── dbpedia-201610-properties.tsv │ └── dbpedia-ontology-properties.tsv └── templates │ ├── Annotations_F30_art.csv │ ├── LC-QuAD_v6_art.csv │ ├── LC-QuAD_v6_other.csv │ ├── LC-QuAD_v6_personal.csv │ ├── LC-QuAD_v6_sport.csv │ ├── LC-QuAD_v6_sport_art.csv │ ├── QALD-7.csv │ └── dbpedia-201610-place.csv ├── gsoc ├── aman │ ├── .DS_Store │ ├── GS_with_mve.csv │ ├── PIPELINE │ ├── README.md │ ├── composite_template.py │ ├── decision_tree.py │ ├── delete_lines.py │ ├── entity_errors.py │ ├── error_analysis.py │ ├── final_formatting.py │ ├── get_metadata.py │ ├── get_properties.py │ ├── integrate.py │ ├── log_place │ ├── metadata_place.txt │ ├── place_labels │ ├── placetemp │ │ ├── data_300.en │ │ ├── data_300.sparql │ │ └── resource_dump.json │ ├── range_place.py │ ├── remove_en.py │ ├── script.py │ ├── script2.py │ ├── sparql_generator.csv │ ├── sparql_generator.py │ ├── temp.py │ ├── tempout │ ├── test.txt │ ├── test_comp.csv │ ├── test_pipeline │ │ ├── 1 │ │ ├── 2 │ │ └── get_properties.py │ └── training_log ├── anand │ ├── .images │ │ ├── test-accuracy.png │ │ └── test-bleu.png │ ├── .pipeline_2 │ │ ├── .vscode │ │ │ └── settings.json │ │ ├── decision_tree.py │ │ ├── final_formatting.py │ │ ├── get_properties.py │ │ ├── integrate.py │ │ ├── person │ │ │ ├── decision_tree.csv │ │ │ ├── get_properties.csv │ │ │ ├── integrate.csv │ │ │ ├── mvedecision_tree.csv │ │ │ ├── sparql_generator.csv │ │ │ └── test_res.csv │ │ ├── sparql_generator.py │ │ └── test │ │ │ ├── check_test_res.csv │ │ │ ├── decision_tree.csv │ │ │ ├── get_properties.csv │ │ │ ├── integrate.csv │ │ │ ├── mvedecision_tree.csv │ │ │ └── sparql_generator.csv │ ├── .vscode │ │ └── settings.json │ ├── PIPELINE.md │ ├── pipeline_1 │ │ ├── pipeline_1_composite │ │ │ ├── composite_template.py │ │ │ ├── decision_tree.py │ │ │ ├── final_formatting.py │ │ │ ├── get_properties.py │ │ │ ├── integrate.py │ │ │ ├── person │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ │ ├── range_place.py │ │ │ ├── sparql_generator.py │ │ │ └── test3 │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ ├── pipeline_1_simple │ │ │ ├── decision_tree.py │ │ │ ├── final_formatting.py │ │ │ ├── get_properties.py │ │ │ ├── integrate.py │ │ │ ├── person │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ │ ├── sparql_generator.py │ │ │ ├── test │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ │ └── test_person │ │ │ │ ├── decision_tree.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── integrate.csv │ │ │ │ ├── mvedecision_tree.csv │ │ │ │ ├── sparql_generator.csv │ │ │ │ └── test_res.csv │ │ └── readme.md │ ├── pipeline_3 │ │ ├── .pipeline_3 │ │ │ ├── .vscode │ │ │ │ └── settings.json │ │ │ ├── Film │ │ │ │ ├── author.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ └── sentence_and_template_generator.ods │ │ │ ├── eliminator.py │ │ │ ├── 
eukaryotes │ │ │ │ ├── family.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── results.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ ├── sentence_and_template_generator.ods │ │ │ │ └── species.csv │ │ │ ├── fetch_ranks_sub.py │ │ │ ├── generate_templates.py │ │ │ ├── generate_url.py │ │ │ ├── get_properties.py │ │ │ ├── nspm_ready.csv │ │ │ ├── rank_test │ │ │ │ ├── affiliation.csv │ │ │ │ ├── birth place.csv │ │ │ │ ├── body discovered.csv │ │ │ │ ├── child.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ ├── sentence_and_template_generator.csv │ │ │ │ └── sentence_and_template_generator.ods │ │ │ ├── readme.md │ │ │ └── sentence_and_template_generator.py │ │ ├── pipeline_3_with_controlled_test_set │ │ │ ├── .images │ │ │ │ └── person_properties.png │ │ │ ├── .vscode │ │ │ │ └── settings.json │ │ │ ├── Eukaryotes │ │ │ │ ├── sentence_and_template_generator │ │ │ │ └── test.csv │ │ │ ├── Monument │ │ │ │ ├── architect.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ ├── tenant.csv │ │ │ │ └── test.csv │ │ │ ├── Organisation │ │ │ │ ├── administrator.csv │ │ │ │ ├── chairperson.csv │ │ │ │ ├── chaplain.csv │ │ │ │ ├── chief executive officer.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ └── test.csv │ │ │ ├── Person │ │ │ │ ├── alma mater.csv │ │ │ │ ├── birth place.csv │ │ │ │ ├── body discovered.csv │ │ │ │ ├── career station.csv │ │ │ │ ├── child.csv │ │ │ │ ├── college.csv │ │ │ │ ├── death place.csv │ │ │ │ ├── employer.csv │ │ │ │ ├── ethnicity.csv │ │ │ │ ├── get_properties.csv │ │ │ │ ├── home town.csv │ │ │ │ ├── ideology.csv │ │ │ │ ├── nationality.csv │ │ │ │ ├── networth.csv │ │ │ │ ├── opponent.csv │ │ │ │ ├── parent.csv │ │ │ │ ├── partner.csv │ │ │ │ ├── person function.csv │ │ │ │ ├── place of burial.csv │ │ │ │ ├── relation.csv │ │ │ │ ├── relative.csv │ │ │ │ ├── residence.csv │ │ │ │ ├── resting place.csv │ │ │ │ ├── sentence_and_template_generator │ │ │ │ └── test.csv │ │ │ ├── eliminator.py │ │ │ ├── fetch_ranks.py │ │ │ ├── fetch_ranks_sub.py │ │ │ ├── generate_templates.py │ │ │ ├── generate_url.py │ │ │ ├── get_properties.py │ │ │ ├── new_train.csv │ │ │ ├── nspm_ready.csv │ │ │ ├── question_generator.py │ │ │ ├── readme.md │ │ │ └── sentence_and_template_generator.py │ │ └── utility │ │ │ ├── .~lock.question_form.csv# │ │ │ ├── Test_Fixer │ │ │ ├── readme.md │ │ │ └── test_fixer.py │ │ │ ├── compare │ │ │ ├── compare.py │ │ │ ├── output_test │ │ │ ├── readme.md │ │ │ └── test.sparql │ │ │ ├── labels.json │ │ │ ├── new_extractor_fromGraphDBpediaEmbeddings │ │ │ ├── breaker.sh │ │ │ ├── embedding_extractor.py │ │ │ ├── indexer.py │ │ │ └── readme.md │ │ │ ├── old_extractor_from_GraphDBpediaEmbeddings │ │ │ ├── en_extract_embed.py │ │ │ ├── readme.md │ │ │ └── sparql_extract_embed.py │ │ │ ├── qald_json │ │ │ ├── interpreter.py │ │ │ ├── qald_json_gerbil_input.py │ │ │ ├── readme.md │ │ │ ├── shifter.sh │ │ │ ├── test.en │ │ │ └── test.sparql │ │ │ ├── question_form.csv │ │ │ ├── readme.md │ │ │ └── vocab_extractor_from_model │ │ │ ├── embedding_extractor.py │ │ │ └── readme.md │ └── readme.md └── zheyuan │ ├── pipeline │ ├── README.md │ ├── basic_sentence_and_template_generator.py │ ├── batch_paraphrase.py │ ├── bert_classifier.py │ ├── eliminator.py │ ├── fetch_ranks.py │ ├── fetch_ranks_sub.py │ ├── generate_templates.py │ ├── generate_url.py │ ├── get_properties.py │ ├── multi_generate_templates.py │ ├── paraphrase_questions.py │ ├── 
pipeline.sh │ ├── question_generator.py │ ├── sentence_and_template_generator.py │ └── textual_similarity.py │ └── utility │ ├── GloVe │ └── glove_finetune.py │ ├── benchmark │ ├── README.md │ ├── benchmark.py │ ├── extract_questions.py │ ├── interpreter.py │ ├── reconstruct_json.py │ └── retrieve_answers.py │ ├── queryFilter.py │ ├── question_form.csv │ ├── readme.md │ └── vocab_creator.py ├── nspm ├── __init__.py ├── analyse.py ├── data_gen.py ├── filter_dataset.py ├── generator.py ├── generator_test.py ├── generator_utils.py ├── interpreter.py ├── learner.py ├── nmt.py ├── prepare_dataset.py └── split_in_train_dev_test.py ├── requirements.txt └── test └── interpreter_airml_test.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.zip filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # macOS 104 | .DS_Store 105 | 106 | .idea/ 107 | 108 | data/*/* 109 | 110 | gsoc/anand/utility/test.py 111 | gsoc/anand/utility/part-r-00000 112 | gsoc/anand/pipeline_3/utility/part-r-00000 113 | data/pipeline_3* 114 | gsoc/anand/pipeline_3/utility/part-r-00000 115 | data/*/* 116 | v1/ 117 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 AKSW Research Group @ University of Leipzig 4 | Copyright (c) 2020 Liber AI Research 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /gsoc/aman/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/aman/.DS_Store -------------------------------------------------------------------------------- /gsoc/aman/PIPELINE: -------------------------------------------------------------------------------- 1 | 2 | ### AUTOMATIC TRAINING DATA GENERATION FOR NEURAL QA MODEL ### 3 | 4 | 5 | The final output file has each row of the form: 6 | ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 7 | 8 | # STEP 1 - Get properties from web page # 9 | Command: python get_properties.py --url > temp.csv 10 | - --url argument is the webpage from where property metadata is to scraped 11 | 12 | # STEP 2 - Get number of occurrences and URI # 13 | Store only the rows of required namespace properties 14 | 15 | # STEP 3 - Integrate STEP 2 values with their corresponding property metadata row in temp.csv # 16 | Command: python integrate.py temp.csv 17 | Output file: manual-annotation-updated-v2.csv (change it in the file if needed) 18 | - Change the namespace to the required (it is 'ontology' right now) 19 | 20 | # STEP 4 - MVE generation # 21 | Command: python decision_tree.py data/manual-annotation-updated-v2.csv 22 | Output file: GS_with_mve.csv 23 | 24 | # STEP 5 - SPARQL Query Template and Generator Query generation # 25 | Command: python sparql_generator.py GS_with_mve.csv 26 | 27 | # STEP 6 - Formatting the data into required format 28 | Command: python final_formatting.py data/GS-v3.csv data/annotations_place_v2.csv 29 | 30 | # STEP 7 - Follow the original data generation and training steps (readme of master branch) 31 | 32 | ### COMPOSITIONALITY EXPERIMENT: ### 33 | 34 | # STEP 1 - Create template annotations (all a[i]'s) 35 | Command: python range_place.py data/GS-v3.csv > data/annotations_compositions_combined.csv 36 | 37 | 38 | # STEP 2 - Create composite templates (a[i]○b true for all i <= sizeof list 'a') 39 | Command: python composite_template.py data/GS-v3.csv >> data/annotations_compositions_combined.csv 40 | 41 | # STEP 3 - Follow the original data generation and training steps (readme of master branch) 42 | 43 | # STEP 4 - Choose any 10% templates and their output and shift it to new file (test data), rest of the contents of this file should be split into 90% train and 10% dev using the split_in_train_dev_test.py script. 
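A minimal sketch of the 90% train / 10% dev split described in this step (illustrative only: the file names are placeholders, the held-out 10% of templates is assumed to have been moved to the test file already, and split_in_train_dev_test.py remains the script actually used):

import random

# Keep the English questions and their SPARQL queries line-aligned while splitting.
with open('data.en') as f_en, open('data.sparql') as f_sparql:
    pairs = list(zip(f_en, f_sparql))

random.shuffle(pairs)
n_dev = len(pairs) // 10              # roughly 10% of the remaining lines for dev
dev, train = pairs[:n_dev], pairs[n_dev:]

for name, split in (('train', train), ('dev', dev)):
    with open(name + '.en', 'w') as out_en, open(name + '.sparql', 'w') as out_sparql:
        for question, query in split:
            out_en.write(question)
            out_sparql.write(query)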
44 | 45 | # STEP 5 - Run the training 46 | Command: sh train.sh 47 | -------------------------------------------------------------------------------- /gsoc/aman/README.md: -------------------------------------------------------------------------------- 1 | ## Aman: Work done during DBpedia's Google Summer of Code 2018 2 | 3 | Hi, please find my blog here: https://amanmehta-maniac.github.io. - You will find details about what this project based on https://github.com/AKSW/NSpM had to offer. 4 | 5 | 1. To be able to generate the dataset automatically, there is a five step pipeline which you would have to follow, guided at 'PIPELINE' file. 6 | 2. Otherwise you can directly use the data generated under `./data/place_v2.zip` and `./data/Compositions_v2.zip`. 7 | -------------------------------------------------------------------------------- /gsoc/aman/composite_template.py: -------------------------------------------------------------------------------- 1 | import sys 2 | f = open(sys.argv[1],'r') 3 | lines = f.readlines() 4 | 5 | 6 | # RUN: python composite_template.py data/GS-v3.csv 7 | # Given format # 8 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 9 | 10 | # Required format : separated by semi-colon ## 11 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 12 | b = [] 13 | b.append("where is the") 14 | b.append("") 15 | b.append("of located in") 16 | 17 | for l in lines: 18 | l = l.strip().split(',') 19 | # print l 20 | if len(l) == 0: 21 | continue 22 | if 'place' in l[2].lower() and l[5]!='' and len(l[5])!=0and 'location of' not in l[7].lower(): 23 | 24 | newl,to_remove = [],[] 25 | newl.append("dbo:Place") 26 | newl.append("") 27 | newl.append("") 28 | 29 | l[1] = l[1].split() 30 | for i in range(len(l[1])): 31 | if '(' in l[1][i] or ')' in l[1][i]: 32 | to_remove.append(l[1][i]); 33 | continue 34 | for x in to_remove: 35 | l[1].remove(x); 36 | 37 | b[1] = " ".join(l[1]) 38 | # print b 39 | nlq = " ".join(b) 40 | 41 | spq = "select ?a where { " + l[5] + " ?b . 
?b ?a }" 42 | # print nlq + ";" + spq 43 | 44 | gq = l[-1] 45 | 46 | gq2 = gq.split()[1] 47 | gq2 = "distinct(" + gq2 + ")" 48 | gq = gq.split(); 49 | gq[1] = gq2; 50 | gq = " ".join(gq).replace("SELECT","select").replace("WHERE","where") 51 | 52 | 53 | newl.append((nlq)) 54 | newl.append((spq)) 55 | newl.append((gq)) 56 | newl = ";".join(newl) 57 | print newl -------------------------------------------------------------------------------- /gsoc/aman/decision_tree.py: -------------------------------------------------------------------------------- 1 | import sys, re 2 | 3 | f = open(sys.argv[1],'r') 4 | lines = f.readlines(); 5 | final_lines = [] 6 | 7 | lineno = 1 8 | 9 | # print lines[0].split(',') 10 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 11 | # 'MVE', 'Optimal Expression\r\n'] 12 | 13 | mve = "" 14 | for line in lines: 15 | if lineno == 1: 16 | lineno += 1 17 | continue 18 | line = line.strip().split(',') 19 | rng = line[2].lower() 20 | lbl = line[1] 21 | if 'person' in rng: 22 | rng = "who" 23 | else: 24 | rng = "what" 25 | line[7] = rng + " is the " + lbl + " of " 26 | line[8] = rng + " is the " + lbl + " of " 27 | mve += rng + " is the " + lbl + " of \n" 28 | final_lines.append(",".join(line)); 29 | 30 | 31 | fw = open('data/mve_output','w') 32 | fw.write(mve) 33 | 34 | fw2 = open('GS_with_mve.csv','w'); 35 | fw2.write("\n".join(final_lines)) 36 | 37 | -------------------------------------------------------------------------------- /gsoc/aman/delete_lines.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'rw') 4 | lines = f.readlines() 5 | 6 | to_del = [3636, 15366, 22096, 23913, 27938, 29413, 29452, 33507, 34670, 50813, 58739, 71547, 71747, 72127, 72699, 73110, 73146, 75803, 76512, 76977] 7 | to_del.reverse() 8 | 9 | for x in to_del: 10 | del lines[x-1] 11 | 12 | lines = "".join(lines) 13 | print lines -------------------------------------------------------------------------------- /gsoc/aman/entity_errors.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'r') 4 | g = open(sys.argv[2],'r') 5 | h = open(sys.argv[3],'r') 6 | 7 | 8 | fl = f.readlines() 9 | gl = g.readlines() 10 | hl = h.readlines() 11 | tot = 0 12 | entity_error, prop_error = 0, 0 13 | train_entities = set() 14 | 15 | for i in range(len(fl)): 16 | trainout = fl[i].strip().split() 17 | for t in trainout: 18 | if 'dbr_' in t: 19 | train_entities.add(t[1:-1]); 20 | break 21 | 22 | # print train_entities 23 | 24 | cnt = 0 25 | save = [] 26 | for i in range(len(gl)): 27 | testout = gl[i].strip().split() 28 | testEntity = "" 29 | for t in testout: 30 | if 'dbr_' in t: 31 | testEntity = t 32 | if testEntity[1:-1] in train_entities: 33 | save.append(i); 34 | cnt += 1 35 | tot += 1 36 | 37 | newtest_nlq, newtest_sparql = [], [] 38 | 39 | for i in save: 40 | newtest_sparql.append(gl[i]) 41 | 42 | for i in save: 43 | newtest_nlq.append(hl[i]) 44 | 45 | # print "".join(newtest_sparql) 46 | print "".join(newtest_nlq) 47 | 48 | 49 | # print cnt, tot 50 | 51 | 52 | -------------------------------------------------------------------------------- /gsoc/aman/error_analysis.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'r') 4 | g = open(sys.argv[2],'r') 5 | 6 | 7 | fl = f.readlines() 8 | gl = g.readlines() 9 | tot = 0 10 | 
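# error_analysis.py compares two line-aligned SPARQL files: sys.argv[1] holds the
# model's decoded output (the "my*" values below) and sys.argv[2] the expected
# output (the "req*" values). For each pair of lines that differ, the token
# containing 'dbr_' is taken as the entity and the token wrapped in <...> as the
# property; entity_error and prop_error count how often each differs from the
# expected value.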
entity_error, prop_error = 0, 0 11 | 12 | for i in range(len(fl)): 13 | if fl[i].strip() == gl[i].strip(): 14 | # print fl[i], gl[i] 15 | continue 16 | myout = fl[i].strip().split() 17 | myEntity, myProp = "", "" 18 | for t in myout: 19 | if 'dbr_' in t: 20 | myEntity = t 21 | if '<' in t: 22 | myProp = t[1:-1] 23 | 24 | reqout = gl[i].strip().split() 25 | reqEntity, reqProp = "", "" 26 | for t in reqout: 27 | if 'dbr_' in t: 28 | reqEntity = t 29 | if '<' in t: 30 | reqProp = t[1:-1] 31 | 32 | if reqEntity != myEntity: 33 | 34 | # print myEntity, reqEntity 35 | entity_error += 1 36 | if reqProp != myProp: 37 | print reqProp, myProp 38 | prop_error += 1 39 | tot += 1 40 | 41 | print (99/2034.0) 42 | -------------------------------------------------------------------------------- /gsoc/aman/final_formatting.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'r') 4 | # Given format # 5 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 6 | 7 | # Required format : separated by semi-colon ## 8 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 9 | 10 | lines = f.readlines(); 11 | f.close() 12 | fl = 1 13 | 14 | output = "" 15 | for line in lines: 16 | 17 | if fl: 18 | fl = 0 19 | continue 20 | l = line.split(','); 21 | # print l 22 | 23 | newl,to_remove = [],[] 24 | newl.append("dbo:Place") 25 | newl.append("") 26 | newl.append("") 27 | 28 | nlq = l[7].split(); 29 | for i in range(len(nlq)): 30 | if '(' in nlq[i] or ')' in nlq[i]: 31 | to_remove.append(nlq[i]); 32 | continue 33 | if '<' not in nlq[i] and '?' not in nlq[i]: 34 | nlq[i] = nlq[i].lower() 35 | 36 | for x in to_remove: 37 | nlq.remove(x); 38 | 39 | spq = l[-2].split(); 40 | for i in range(len(spq)): 41 | if '<' not in spq[i] and '?' not in spq[i]: 42 | spq[i] = spq[i].lower() 43 | 44 | gq = l[-1].split(); 45 | for i in range(len(gq)): 46 | if '<' not in gq[i] and '?' 
not in gq[i] and '[' not in gq[i]: 47 | gq[i] = gq[i].lower() 48 | 49 | newl.append(" ".join(nlq)) 50 | newl.append(" ".join(spq)) 51 | newl.append(" ".join(gq)) 52 | output += ";".join(newl) + "\n"; 53 | 54 | 55 | fw = open(sys.argv[2],'w') 56 | fw.write(output) 57 | fw.close() -------------------------------------------------------------------------------- /gsoc/aman/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib2, urllib, httplib, json, sys, csv, io 2 | import argparse 3 | from bs4 import BeautifulSoup 4 | 5 | parser = argparse.ArgumentParser() 6 | requiredNamed = parser.add_argument_group('Required Arguments'); 7 | requiredNamed.add_argument('--url', dest='url', metavar='url', help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 8 | args = parser.parse_args() 9 | 10 | quote_page = args.url 11 | page = urllib2.urlopen(quote_page) 12 | 13 | soup = BeautifulSoup(page, "html.parser") 14 | # print type(soup) 15 | fl = 0 16 | for rows in soup.find_all("tr"): 17 | 18 | x = rows.find_all("td"); 19 | 20 | if len(x) <= 2: 21 | fl = 1 22 | continue 23 | 24 | if fl == 1: 25 | fl = 2 26 | continue 27 | 28 | name = rows.find_all("td")[0].get_text().replace(" (edit)","") 29 | label = rows.find_all("td")[1].get_text() 30 | dom = rows.find_all("td")[2].get_text() 31 | rng = rows.find_all("td")[3].get_text() 32 | 33 | final = name + "," + label + "," + dom + "," + rng 34 | print final.encode('utf-8') 35 | 36 | # with io.open("test.csv", mode='w', encoding='utf-8') as toWrite: 37 | # writer = csv.writer(toWrite) 38 | # writer.writerows(props) 39 | 40 | -------------------------------------------------------------------------------- /gsoc/aman/integrate.py: -------------------------------------------------------------------------------- 1 | import sys, argparse 2 | 3 | parser = argparse.ArgumentParser() 4 | requiredNamed = parser.add_argument_group('Required Arguments'); 5 | requiredNamed.add_argument('--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=True) 6 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', help='eg: File which contains metadata of properties', required=True) 7 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', help='eg: File which contains uri and number of occurrences of properties', required=True) 8 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', help='File in which you want to store output', required=True) 9 | args = parser.parse_args() 10 | 11 | namespace = args.ns 12 | 13 | f = open(args.uri,'r') 14 | file = f.readlines() 15 | d = {}; 16 | 17 | for l in file: 18 | 19 | l = l.strip().split('\t') 20 | if l[0].split('/')[-2] != namespace: 21 | continue 22 | d[l[0].split('/')[-1]] = l[1]; 23 | 24 | # print d["abstract"]; 25 | 26 | 27 | f = open(args.inp,'r') 28 | manual = f.readlines() 29 | cnt,tot = 0,0; 30 | final = "" 31 | 32 | for m in manual: 33 | l = m.strip().split(',') 34 | m = l[0] 35 | tot += 1 36 | # if ':' in m: 37 | # print "lol", m 38 | if m in d: 39 | cnt += 1; 40 | l.append("http://dbpedia.org/" + namespace + "/" +m) 41 | l.append(d[m]) 42 | else: 43 | 44 | l.append('') 45 | l.append('') 46 | print m 47 | 48 | final += ",".join(l); 49 | final += '\n'; 50 | 51 | # print final 52 | f = open(args.out,'w'); 53 | f.write(final); 54 | print cnt, tot -------------------------------------------------------------------------------- /gsoc/aman/log_place: 
-------------------------------------------------------------------------------- 1 | SELECT DISTINCT ?p ?lab ?dom WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:domain ?dom } . FILTER(lang(?lab) = 'en') . } OFFSET 0 LIMIT 10000 2 | SELECT DISTINCT ?p ?lab ?dom WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:domain ?dom } . FILTER(lang(?lab) = 'en') . } OFFSET 0 LIMIT 10000 3 | > 4 | SELECT DISTINCT ?p ?lab ?dom WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:domain ?dom } . FILTER(lang(?lab) = 'en') . } OFFSET 10000 LIMIT 10000 5 | SELECT DISTINCT ?p ?lab ?dom WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:domain ?dom } . FILTER(lang(?lab) = 'en') . } OFFSET 10000 LIMIT 10000 6 | > 7 | SELECT DISTINCT ?p ?lab ?rng WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:range ?rng } . FILTER(lang(?lab) = 'en') . } OFFSET 0 LIMIT 10000 8 | SELECT DISTINCT ?p ?lab ?rng WHERE { ?s ?p ?o . ?s a dbo:Place . ?p a rdf:Property . ?p rdfs:label ?lab . OPTIONAL { ?p rdfs:range ?rng } . FILTER(lang(?lab) = 'en') . } OFFSET 0 LIMIT 10000 9 | > 10 | -------------------------------------------------------------------------------- /gsoc/aman/range_place.py: -------------------------------------------------------------------------------- 1 | import sys 2 | f = open(sys.argv[1],'r') 3 | lines = f.readlines() 4 | 5 | # RUN: python range_place.py data/GS-v3.csv > data/annotations_compositions.csv 6 | 7 | 8 | for l in lines: 9 | l = l.split(','); 10 | if len(l) == 0: 11 | continue 12 | if l[5] == "" or len(l[5])==0: 13 | continue; 14 | if 'place' in l[2].lower() and l[5]!='': 15 | newl,to_remove = [],[] 16 | newl.append("dbo:Place") 17 | newl.append("") 18 | newl.append("") 19 | nlq = l[7].split(); 20 | for i in range(len(nlq)): 21 | if '(' in nlq[i] or ')' in nlq[i]: 22 | to_remove.append(nlq[i]); 23 | continue 24 | if '<' not in nlq[i] and '?' not in nlq[i]: 25 | nlq[i] = nlq[i].lower() 26 | 27 | for x in to_remove: 28 | nlq.remove(x); 29 | 30 | nlq = " ".join(nlq) 31 | 32 | spq = l[9].split(); 33 | for i in range(len(spq)): 34 | if '<' not in spq[i] and '?' not in spq[i]: 35 | spq[i] = spq[i].lower() 36 | 37 | spq = " ".join(spq) 38 | 39 | 40 | gq = l[-1].split() 41 | for i in range(len(gq)): 42 | if '<' not in gq[i] and '?' 
not in gq[i] and '[' not in gq[i]: 43 | gq[i] = gq[i].lower() 44 | 45 | gq = " ".join(gq) 46 | 47 | nlq = nlq.replace('','') 48 | spq = spq.replace('?x','?a').replace('','') 49 | gq = gq.replace('?x','?a').replace('','') 50 | newl.append((nlq)) 51 | newl.append((spq)) 52 | newl.append((gq)) 53 | 54 | print ";".join(newl) 55 | -------------------------------------------------------------------------------- /gsoc/aman/remove_en.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'rw') 4 | lines = f.readlines() 5 | 6 | final = [] 7 | lineno = [] 8 | i = 0 9 | for l in lines: 10 | i += 1 11 | if ' en ' in l: 12 | lineno.append(i) 13 | 14 | # print "\n".join(lineno) 15 | print lineno -------------------------------------------------------------------------------- /gsoc/aman/script.py: -------------------------------------------------------------------------------- 1 | f = open('data/manual.csv','rw') 2 | f2 = open('data/temp.csv','rw') 3 | f3 = open('data/newtemp.csv','w') 4 | 5 | l2 = f2.readlines(); 6 | 7 | lines = f.readlines() 8 | # print "xxxs" 9 | cnt = 0 10 | for line in lines: 11 | if cnt <= 200: 12 | cnt += 1 13 | continue 14 | line = line.split(',') 15 | x = l2[cnt-1].split(',') 16 | # print line 17 | if line[1] == '': 18 | line[1] = x[3].strip() 19 | if line[2] == '': 20 | line[2] = "what is the " + x[1] + " of " 21 | else: 22 | line[2] = line[2].replace("x","") 23 | if line[3] == '': 24 | line[3] = line[2] 25 | else: 26 | line[3] = line[3].replace("x","") 27 | # print line 28 | # line[1] 29 | newline = ",".join(line) 30 | print newline 31 | f3.write(newline) 32 | # print 33 | cnt += 1 34 | # if cnt > 200: 35 | # break -------------------------------------------------------------------------------- /gsoc/aman/script2.py: -------------------------------------------------------------------------------- 1 | f2 = open('data/temp.csv','rw') 2 | f3 = open('data/newtemp.csv','w') 3 | 4 | l2 = f2.readlines(); 5 | 6 | lines = f.readlines() 7 | # print "xxxs" 8 | cnt = 0 9 | for line in l2: 10 | 11 | line = line.split(',') 12 | line.append("what is the " + line[0] + " of ") 13 | line.append("what is the " + line[0] + " of ") 14 | # print line 15 | # if line[1] == '': 16 | # line[1] = x[3].strip() 17 | # if line[2] == '': 18 | # line[2] = "what is the " + x[1] + " of " 19 | # else: 20 | # line[2] = line[2].replace("x","") 21 | # if line[3] == '': 22 | # line[3] = line[2] 23 | # else: 24 | # line[3] = line[3].replace("x","") 25 | # print line 26 | # line[1] 27 | newline = ",".join(line) 28 | print newline 29 | f3.write(newline) 30 | -------------------------------------------------------------------------------- /gsoc/aman/sparql_generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | f = open(sys.argv[1],'r') 4 | lines = f.readlines(); 5 | 6 | # print lines[0].split(',') 7 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 8 | # 'MVE', 'Optimal Expression, SPARQL-TEMPLATE, GENERATOR-QUERY-TEMPLATE\r\n'] 9 | # sparql_template = [] 10 | 11 | final = "" 12 | lineno = 1 13 | for line in lines: 14 | if lineno == 1: 15 | lineno += 1 16 | continue 17 | line = line.strip().split(',') 18 | # print lines 19 | if line[5]!='': 20 | # print line[5] 21 | line[-2] = 'SELECT ?x WHERE { <' + line[5] + '> ?x }' 22 | line[-1] = 'SELECT ?a WHERE { ?a <' + line[5] + '> [] . 
?a a }' 23 | 24 | final += ",".join(line) 25 | final += '\n' 26 | 27 | 28 | print final 29 | # fw = open() 30 | 31 | 32 | -------------------------------------------------------------------------------- /gsoc/aman/temp.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | proxy = urllib2.ProxyHandler({'https': 'http://proxy.iiit.ac.in:8080/'}) 3 | opener = urllib2.build_opener(proxy) 4 | urllib2.install_opener(opener) 5 | result = urllib2.urlopen('https://www.python.org') 6 | print result.read() -------------------------------------------------------------------------------- /gsoc/aman/test_pipeline/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib2, urllib, httplib, json, sys, csv, io 2 | import argparse 3 | from bs4 import BeautifulSoup 4 | 5 | parser = argparse.ArgumentParser() 6 | requiredNamed = parser.add_argument_group('Required Arguments'); 7 | requiredNamed.add_argument('--url', dest='url', metavar='url', help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 8 | args = parser.parse_args() 9 | 10 | quote_page = args.url 11 | page = urllib2.urlopen(quote_page) 12 | 13 | soup = BeautifulSoup(page, "html.parser") 14 | # print type(soup) 15 | 16 | for rows in soup.find_all("tr"): 17 | 18 | x = rows.find_all("td"); 19 | if len(x) <= 2: continue 20 | 21 | name = rows.find_all("td")[0].get_text().replace(" (edit)","") 22 | label = rows.find_all("td")[1].get_text() 23 | dom = rows.find_all("td")[2].get_text() 24 | rng = rows.find_all("td")[3].get_text() 25 | 26 | final = name + "," + label + "," + dom + "," + rng 27 | print final.encode('utf-8') 28 | 29 | # with io.open("test.csv", mode='w', encoding='utf-8') as toWrite: 30 | # writer = csv.writer(toWrite) 31 | # writer.writerows(props) 32 | 33 | -------------------------------------------------------------------------------- /gsoc/anand/.images/test-accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/.images/test-accuracy.png -------------------------------------------------------------------------------- /gsoc/anand/.images/test-bleu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/.images/test-bleu.png -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/petrichor/Projects/environments/gymnoz/bin/python" 3 | } -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/final_formatting.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | import argparse 4 | from sparql_generator import sparql_generator 5 | # Given format # 6 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 7 | 8 | # Required format : separated by semi-colon ## 9 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 10 | 11 | 12 | def final_formatting(input_file, uri_file, 
url, output_file, project_name, namespace,rs): 13 | 14 | if (int(rs) == 1) : 15 | open_files = open(input_file, 'r') 16 | lines = open_files.readlines() 17 | open_files.close() 18 | else: 19 | lines = sparql_generator(input_file=input_file, project_name=project_name, 20 | url=url, uri_file=uri_file, namespace=namespace) 21 | 22 | fl = 1 23 | 24 | output = "" 25 | 26 | """ 27 | - We iterate over the lines of the document. 28 | - Convet the line into a list containig elements o the 29 | string delimited by commas. 30 | - 31 | """ 32 | for line in tqdm(lines): 33 | 34 | if fl: 35 | fl = 0 36 | continue 37 | l = line.split(',') 38 | 39 | # print l 40 | 41 | newl, to_remove = [], [] 42 | name = url.split("/")[-1] 43 | newl.append("dbo:"+name) 44 | newl.append("") 45 | newl.append("") 46 | 47 | nlq = l[7].split() 48 | # The fuzzy score column is not present in the 49 | # autmatically created csv file. 50 | if(len(l) == 10): 51 | nlq = l[6].split() 52 | 53 | """ 54 | From MVE column a question is selected and each word 55 | is put into as an element of the list. 56 | """ 57 | for i in range(len(nlq)): 58 | if '(' in nlq[i] or ')' in nlq[i]: 59 | to_remove.append(nlq[i]) 60 | continue 61 | if '<' not in nlq[i] and '?' not in nlq[i]: 62 | nlq[i] = nlq[i].lower() 63 | 64 | for x in to_remove: 65 | nlq.remove(x) 66 | 67 | spq = l[-2].split() 68 | """ 69 | Query one 70 | """ 71 | for i in range(len(spq)): 72 | if '<' not in spq[i] and '?' not in spq[i]: 73 | spq[i] = spq[i].lower() 74 | 75 | """ 76 | Query two 77 | """ 78 | gq = l[-1].split() 79 | for i in range(len(gq)): 80 | if '<' not in gq[i] and '?' not in gq[i] and '[' not in gq[i]: 81 | gq[i] = gq[i].lower() 82 | 83 | newl.append(" ".join(nlq)) 84 | newl.append(" ".join(spq)) 85 | newl.append(" ".join(gq)) 86 | output += ";".join(newl) + "\n" 87 | 88 | fw = open(project_name+"/"+ output_file, 'w') 89 | fw.write(output) 90 | fw.close() 91 | 92 | 93 | if __name__ == "__main__": 94 | """ 95 | Section to parse the command line arguments. 
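Example invocation when running this step on its own (all values are
illustrative; the TSV is assumed to be the property/occurrence dump shipped
under data/properties/):

python final_formatting.py --rs 0 --url http://mappings.dbpedia.org/server/ontology/classes/Place --uri_file dbpedia-201610-properties.tsv --namespace ontology --output_file final_formatting.csv --project_name test

With --rs 0 the earlier steps (get_properties, integrate, decision_tree,
sparql_generator) are invoked automatically through sparql_generator(); with
--rs 1 the rows are read directly from the file given via --input_file.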
96 | """ 97 | parser = argparse.ArgumentParser() 98 | requiredNamed = parser.add_argument_group('Required Arguments') 99 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 100 | help='eg: File which contains metadata of properties', required=False) 101 | requiredNamed.add_argument( 102 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=False) 103 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 104 | help='File in which you want to store output', required=False) 105 | requiredNamed.add_argument('--project_name', dest='project_name', 106 | metavar='project_name', help='test', required=False) 107 | requiredNamed.add_argument('--url', dest='url', metavar='url', 108 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=False) 109 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 110 | help='eg: File which contains uri and number of occurrences of properties', required=False) 111 | requiredNamed.add_argument('--rs', dest='rs', metavar='rs', 112 | help='Toggle to run separately', required=True) 113 | 114 | args = parser.parse_args() 115 | input_file = args.inp 116 | uri_file = args.uri 117 | url = args.url 118 | rs =args.rs 119 | namespace = args.ns 120 | output_file = args.out 121 | project_name = args.project_name 122 | final_formatting(input_file=input_file, uri_file=uri_file, url=url, 123 | output_file=output_file, project_name=project_name, namespace=namespace, rs= rs) 124 | pass 125 | -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | """ 16 | page = urllib.request.urlopen(url) 17 | soup = BeautifulSoup(page, "html.parser") 18 | if(not os.path.isdir(project_name)): 19 | os.makedirs(project_name) 20 | output_file = open(project_name+"/" + output_file, 'w') 21 | fl = 0 22 | accum = [] 23 | for rows in tqdm(soup.find_all("tr")): 24 | x = rows.find_all("td") 25 | if len(x) <= 2: 26 | fl = 1 27 | continue 28 | if fl == 1: 29 | fl = 2 30 | continue 31 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 32 | label = rows.find_all("td")[1].get_text() 33 | dom = rows.find_all("td")[2].get_text() 34 | rng = rows.find_all("td")[3].get_text() 35 | 36 | final = name + "," + label + "," + dom + "," + rng 37 | accum.append(final) 38 | output_file.write(final+"\n") 39 | output_file.close() 40 | return accum 41 | 42 | 43 | """ 44 | Name, Label, Domain, Range 45 | """ 46 | 47 | if __name__ == "__main__": 48 | """ 49 | Section to parse the command line arguments. 
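Example (argument values are only illustrative):

python get_properties.py --url http://mappings.dbpedia.org/server/ontology/classes/Place --output_file get_properties.csv --project_name test

Each scraped table row is written to <project_name>/<output_file> as
'Name,Label,Domain,Range'.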
50 | """ 51 | parser = argparse.ArgumentParser() 52 | requiredNamed = parser.add_argument_group('Required Arguments') 53 | 54 | requiredNamed.add_argument('--url', dest='url', metavar='url', 55 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 56 | requiredNamed.add_argument( 57 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 58 | requiredNamed.add_argument( 59 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 60 | args = parser.parse_args() 61 | url = args.url 62 | output_file = args.out_put 63 | project_name = args.project_name 64 | get_properties(url = url, project_name= project_name, output_file = output_file) 65 | pass 66 | -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/integrate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | from tqdm import tqdm 4 | from get_properties import get_properties 5 | from tqdm import tqdm 6 | 7 | """ 8 | How was the tsv file created in the first place? 9 | - The tsv file is read. 10 | - A dictionary diction in made. 11 | - Every time the namespace is matched with the name 12 | space mentioned in the command line argument. 13 | - If the name space matches the dictionary diction here 14 | is updated with {name of the entity = frequency of occurance} 15 | """ 16 | 17 | 18 | def integrate(namespace, uri_file, output_file="integrate.csv", project_name="test_project", url="Enter a valid URL", input_file="Pleaes enter a valid file name"): 19 | print("Reading the TSV file: ") 20 | open_tsv = open(uri_file, 'r') 21 | read_tsv = open_tsv.readlines() 22 | diction = {} 23 | for line in tqdm(read_tsv): 24 | line = line.strip().split('\t') 25 | if line[0].split('/')[-2] != namespace: 26 | continue 27 | diction[line[0].split('/')[-1]] = line[1] 28 | 29 | open_tsv.close() 30 | 31 | """ 32 | Processing the input file. 33 | - The input file is read, out put from get_properties.py 34 | - Reading lines from the input files. 35 | - Iterating over every line of the read file. 36 | - Taking the name from the line. 37 | - if the given name is in the dictionry created above 38 | appending the url to the given name and corresponding 39 | frequency to the row entry(read line). Else appending 40 | an empty string. 
41 | - Joining all the elements of the list line with a comma, 42 | adding a new line character and then going for the next 43 | iteration after adding it to a variable final (string addition) 44 | """ 45 | 46 | 47 | if (__name__ == "__main__"): 48 | print("Reading the input file: ") 49 | open_inp = open(input_file, 'r') 50 | line_inp = open_inp.readlines() 51 | 52 | if (not __name__ == "__main__"): 53 | line_inp = get_properties(url=url, output_file="get_properties.csv", project_name = project_name) 54 | 55 | cnt, tot = 0, 0 56 | final = "" 57 | accum = [] 58 | for in_line in tqdm(line_inp): 59 | 60 | line = in_line.strip().split(',') 61 | in_line = line[0] 62 | tot += 1 63 | # if ':' in m: 64 | # print "lol", m 65 | if in_line in diction: 66 | cnt += 1 67 | line.append("http://dbpedia.org/" + namespace + "/" + in_line) 68 | line.append(diction[in_line]) 69 | else: 70 | 71 | line.append('') 72 | line.append('') 73 | # print in_line 74 | 75 | final += ",".join(line) 76 | accum.append(",".join(line)) 77 | final += '\n' 78 | 79 | """ 80 | The string final is the written to the output file name 81 | as given in the command line argument. 82 | """ 83 | # print final 84 | f = open(project_name+"/"+output_file, 'w') 85 | f.write(final) 86 | print("**************************************") 87 | print("Total number of entity whose URI was found: "+str(cnt) + 88 | "\nTotal number of entities present: " + str(tot)) 89 | return accum 90 | 91 | 92 | if __name__ == "__main__": 93 | """ 94 | Section to parse the command line arguments. 95 | """ 96 | parser = argparse.ArgumentParser() 97 | requiredNamed = parser.add_argument_group('Required Arguments') 98 | requiredNamed.add_argument( 99 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=True) 100 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 101 | help='Output from previous step', required=True) 102 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 103 | help='eg: File which contains uri and number of occurrences of properties', required=True) 104 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 105 | help='File in which you want to store output', required=True) 106 | requiredNamed.add_argument('--project_name', dest='project_name', 107 | metavar='project_name', help='test', required=True) 108 | args = parser.parse_args() 109 | namespace = args.ns 110 | input_file = args.inp 111 | uri_file = args.uri 112 | output_file = args.out 113 | project_name = args.project_name 114 | integrate(namespace, uri_file, output_file, 115 | project_name, "Enter a valid URL", input_file) 116 | pass 117 | -------------------------------------------------------------------------------- /gsoc/anand/.pipeline_2/sparql_generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from decision_tree import decision_tree 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | """ 7 | Section to parse the command line arguments. 
8 | """ 9 | 10 | 11 | def sparql_generator(input_file, project_name, output_file="sparql_generator.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 12 | if __name__ == "__main__": 13 | f = open(input_file, 'r') 14 | lines = f.readlines() 15 | pass 16 | if not __name__ == "__main__": 17 | lines = decision_tree(input_file=input_file, project_name=project_name, 18 | url=url, uri_file=uri_file, namespace=namespace) 19 | pass 20 | 21 | # print lines[0].split(',') 22 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 23 | # 'MVE', 'Optimal Expression, SPARQL-TEMPLATE, GENERATOR-QUERY-TEMPLATE\r\n'] 24 | # sparql_template = [] 25 | 26 | """ 27 | - Read the file generated in the previous step. 28 | - Read the lines from the file an save it as a list. 29 | - If the frequency is known, Replace the 2nd last elemet of the formed list with 30 | a where statement, and last one witha where statement 31 | followed by an assertion if it is a place. 32 | - Join the updated list with comma as a delimeter and save 33 | add it in the string ending with a newline character. 34 | - Print the final on the terminal 35 | """ 36 | accum = [] 37 | final = "" 38 | lineno = 1 39 | for line in tqdm(lines): 40 | if lineno == 1: 41 | lineno += 1 42 | continue 43 | line = line.strip().split(',') 44 | # print lines 45 | if line[4] != '': 46 | # print line[5] 47 | # It was found the the MVE and OE was also required hence: 48 | #line[-2] = 'SELECT ?x WHERE { <' + line[5] + '> ?x }' 49 | #line[-1] = 'SELECT ?a WHERE { ?a <' + line[5] + '> [] . ?a a }' 50 | line.append('SELECT ?x WHERE { <' + line[4] + '> ?x }') 51 | line.append( 52 | 'SELECT ?a WHERE { ?a <' + line[4] + '> [] . ?a a }') 53 | 54 | final += ",".join(line) 55 | accum.append(",".join(line)) 56 | final += '\n' 57 | 58 | # print final 59 | 60 | # fw = open() 61 | 62 | """ 63 | This data generated might be required for further steps 64 | thus it is saved in another file named sparql.csv 65 | """ 66 | 67 | open(project_name+"/"+output_file, 'w').write(final) 68 | return accum 69 | 70 | 71 | if __name__ == "__main__": 72 | """ 73 | Section to parse the command line arguments. 
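Stand-alone example (file names are illustrative; the input is the CSV written
by decision_tree.py):

python sparql_generator.py --input_file test/decision_tree.csv --output_file sparql_generator.csv --project_name test

For every row whose URI column is non-empty, two SPARQL query templates (an
instance query and a generator query) are appended to the row.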
74 | """ 75 | parser = argparse.ArgumentParser() 76 | requiredNamed = parser.add_argument_group('Required Arguments') 77 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 78 | help='Output from previous step', required=True) 79 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 80 | help='File in which you want to store output', required=True) 81 | requiredNamed.add_argument('--project_name', dest='project_name', 82 | metavar='project_name', help='eg.:test', required=True) 83 | args = parser.parse_args() 84 | input_file = args.inp 85 | output_file = args.out 86 | project_name = args.project_name 87 | sparql_generator(input_file=input_file, output_file=output_file, 88 | project_name=project_name) 89 | pass 90 | -------------------------------------------------------------------------------- /gsoc/anand/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/petrichor/Projects/environments/gymnoz/bin/python" 3 | } -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/composite_template.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from range_place import range_place 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | # RUN: python composite_template.py data/GS-v3.csv 7 | # Given format # 8 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 9 | 10 | # Required format : separated by semi-colon ## 11 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 12 | 13 | def composite_template(input_file, uri_file, url, output_file, project_name, namespace,rs): 14 | if (int(rs) == 1) : 15 | open_files = open(input_file, 'r') 16 | lines = open_files.readlines() 17 | open_files.close() 18 | output_file_write = open(project_name+"/" + output_file, 'w') 19 | else: 20 | list_val = range_place(input_file=input_file, project_name=project_name, 21 | url=url, uri_file=uri_file, namespace=namespace) 22 | lines = list_val[0] 23 | output_file_write = list_val[1] 24 | 25 | 26 | b = [] 27 | b.append("where is the") 28 | b.append("") 29 | b.append("of located in") 30 | accum = [] 31 | for l in tqdm(lines): 32 | l = l.strip().split(',') 33 | # print l 34 | if len(l) == 0: 35 | continue 36 | if 'place' in l[2].lower() and l[5]!='' and len(l[5])!=0 and 'location of' not in l[7].lower(): 37 | 38 | newl,to_remove = [],[] 39 | newl.append("dbo:Place") 40 | newl.append("") 41 | newl.append("") 42 | 43 | l[1] = l[1].split() 44 | for i in range(len(l[1])): 45 | if '(' in l[1][i] or ')' in l[1][i]: 46 | to_remove.append(l[1][i]) 47 | continue 48 | for x in to_remove: 49 | l[1].remove(x) 50 | 51 | b[1] = " ".join(l[1]) 52 | # print b 53 | nlq = " ".join(b) 54 | # no Fuzzy score so the index decreases by 1 55 | spq = "select ?a where { " + l[4] + " ?b . 
?b ?a }" 56 | # print nlq + ";" + spq 57 | 58 | gq = l[-1] 59 | 60 | gq2 = gq.split()[1] 61 | gq2 = "distinct(" + gq2 + ")" 62 | gq = gq.split() 63 | gq[1] = gq2 64 | gq = " ".join(gq).replace("SELECT","select").replace("WHERE","where") 65 | 66 | 67 | newl.append((nlq)) 68 | newl.append((spq)) 69 | newl.append((gq)) 70 | newl = ";".join(newl) 71 | accum.append(newl) 72 | output_file_write.write("\n") 73 | output_file_write.write("\n".join(accum)) 74 | output_file_write.close() 75 | 76 | 77 | if __name__ == "__main__": 78 | """ 79 | Section to parse the command line arguments. 80 | """ 81 | parser = argparse.ArgumentParser() 82 | requiredNamed = parser.add_argument_group('Required Arguments') 83 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 84 | help='eg: File which contains metadata of properties', required=False) 85 | requiredNamed.add_argument( 86 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=False) 87 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 88 | help='File in which you want to store output', required=False) 89 | requiredNamed.add_argument('--project_name', dest='project_name', 90 | metavar='project_name', help='test', required=False) 91 | requiredNamed.add_argument('--url', dest='url', metavar='url', 92 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=False) 93 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 94 | help='eg: File which contains uri and number of occurrences of properties', required=False) 95 | requiredNamed.add_argument('--rs', dest='rs', metavar='rs', 96 | help='Toggle to run separately', required=True) 97 | 98 | args = parser.parse_args() 99 | input_file = args.inp 100 | uri_file = args.uri 101 | url = args.url 102 | rs =args.rs 103 | namespace = args.ns 104 | output_file = args.out 105 | project_name = args.project_name 106 | composite_template(input_file=input_file, uri_file=uri_file, url=url, 107 | output_file=output_file, project_name=project_name, namespace=namespace, rs= rs) 108 | pass -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/decision_tree.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import argparse 4 | from tqdm import tqdm 5 | from integrate import integrate 6 | 7 | 8 | def decision_tree(input_file, project_name, output_file="decision_tree.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 9 | if __name__ == "__main__": 10 | f = open(input_file, 'r') 11 | lines = f.readlines() 12 | pass 13 | if not __name__ == "__main__": 14 | lines = integrate(namespace=namespace, uri_file=uri_file, 15 | project_name=project_name, url=url) 16 | final_lines = [] 17 | lineno = 1 18 | 19 | """ 20 | print lines[0].split(',') 21 | ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 22 | 'MVE', 'Optimal Expression\r\n'] 23 | """ 24 | 25 | """ 26 | - The lines from the file generated in the previous steps 27 | is read and a for loop iterates through ecery row of 28 | - First we create a list of all elements seperated by commas. 29 | - If the range has the substring person, the we put as 30 | question who else what. 31 | - We append the question thus generate 2 times as minimum 32 | viable instruction and optimal expression. 
33 | - We create a variable names final lines and add strings, 34 | which are formed by adding strings formed by joining 35 | the elements of the list delemited by comma in each line. 36 | - We also create a string of the question generated 37 | delemited by a newline characte and store it in mve 38 | as a long string. 39 | - We output the series of question in mve_output. 40 | - We save the final_lines strind in a file named GS_with_mve.csv 41 | delimeted by a newline character. 42 | """ 43 | 44 | mve = "" 45 | for line in tqdm(lines): 46 | if lineno == 1: 47 | lineno += 1 48 | continue 49 | line = line.strip().split(',') 50 | rng = line[2].lower() 51 | lbl = line[1] 52 | if 'person' in rng: 53 | rng = "who" 54 | else: 55 | rng = "what" 56 | # The total length of a row in the list is 6, 57 | # thus 7 and 8 are out of range values. Thus I 58 | # replaced it with append. 59 | """ 60 | line[7] = rng + " is the " + lbl + " of " 61 | line[8] = rng + " is the " + lbl + " of " 62 | """ 63 | if(len(line) < 9): 64 | line.append(rng + " is the " + lbl + " of ") 65 | line.append(rng + " is the " + lbl + " of ") 66 | else: 67 | line[7] = rng + " is the " + lbl + " of " 68 | line[8] = rng + " is the " + lbl + " of " 69 | mve += rng + " is the " + lbl + " of \n" 70 | final_lines.append(",".join(line)) 71 | 72 | fw = open(project_name+"/"+"mve"+output_file, 'w') 73 | fw.write(mve) 74 | 75 | fw2 = open(project_name+"/"+output_file, 'w') 76 | fw2.write("\n".join(final_lines)) 77 | return final_lines 78 | 79 | if __name__ == "__main__": 80 | """ 81 | Section to parse the command line arguments. 82 | """ 83 | parser = argparse.ArgumentParser() 84 | requiredNamed = parser.add_argument_group('Required Arguments') 85 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 86 | help='Output from previous step', required=True) 87 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 88 | help='File in which you want to store output', required=True) 89 | requiredNamed.add_argument('--project_name', dest='project_name', 90 | metavar='project_name', help='test', required=True) 91 | args = parser.parse_args() 92 | input_file = args.inp 93 | output_file = args.out 94 | project_name = args.project_name 95 | decision_tree(input_file=input_file, output_file=output_file, 96 | project_name=project_name) 97 | pass 98 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/final_formatting.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | import argparse 4 | from sparql_generator import sparql_generator 5 | # Given format # 6 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 7 | 8 | # Required format : separated by semi-colon ## 9 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 10 | 11 | 12 | def final_formatting(input_file, uri_file, url, output_file, project_name, namespace,rs): 13 | 14 | if (int(rs) == 1) : 15 | open_files = open(input_file, 'r') 16 | lines = open_files.readlines() 17 | open_files.close() 18 | else: 19 | lines = sparql_generator(input_file=input_file, project_name=project_name, 20 | url=url, uri_file=uri_file, namespace=namespace) 21 | 22 | fl = 1 23 | 24 | output = "" 25 | 26 | """ 27 | - We iterate over the lines of the document. 
28 | - Convet the line into a list containig elements o the 29 | string delimited by commas. 30 | - 31 | """ 32 | for line in tqdm(lines): 33 | 34 | if fl: 35 | fl = 0 36 | continue 37 | l = line.split(',') 38 | 39 | # print l 40 | 41 | newl, to_remove = [], [] 42 | name = url.split("/")[-1] 43 | newl.append("dbo:"+name) 44 | newl.append("") 45 | newl.append("") 46 | 47 | nlq = l[7].split() 48 | # The fuzzy score column is not present in the 49 | # autmatically created csv file. 50 | if(len(l) == 10): 51 | nlq = l[6].split() 52 | 53 | """ 54 | From MVE column a question is selected and each word 55 | is put into as an element of the list. 56 | """ 57 | for i in range(len(nlq)): 58 | if '(' in nlq[i] or ')' in nlq[i]: 59 | to_remove.append(nlq[i]) 60 | continue 61 | if '<' not in nlq[i] and '?' not in nlq[i]: 62 | nlq[i] = nlq[i].lower() 63 | 64 | for x in to_remove: 65 | nlq.remove(x) 66 | 67 | spq = l[-2].split() 68 | """ 69 | Query one 70 | """ 71 | for i in range(len(spq)): 72 | if '<' not in spq[i] and '?' not in spq[i]: 73 | spq[i] = spq[i].lower() 74 | 75 | """ 76 | Query two 77 | """ 78 | gq = l[-1].split() 79 | for i in range(len(gq)): 80 | if '<' not in gq[i] and '?' not in gq[i] and '[' not in gq[i]: 81 | gq[i] = gq[i].lower() 82 | 83 | newl.append(" ".join(nlq)) 84 | newl.append(" ".join(spq)) 85 | newl.append(" ".join(gq)) 86 | output += ";".join(newl) + "\n" 87 | 88 | fw = open(project_name+"/"+ output_file, 'w') 89 | fw.write(output) 90 | fw.close() 91 | 92 | 93 | if __name__ == "__main__": 94 | """ 95 | Section to parse the command line arguments. 96 | """ 97 | parser = argparse.ArgumentParser() 98 | requiredNamed = parser.add_argument_group('Required Arguments') 99 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 100 | help='eg: File which contains metadata of properties', required=False) 101 | requiredNamed.add_argument( 102 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=False) 103 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 104 | help='File in which you want to store output', required=False) 105 | requiredNamed.add_argument('--project_name', dest='project_name', 106 | metavar='project_name', help='test', required=False) 107 | requiredNamed.add_argument('--url', dest='url', metavar='url', 108 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=False) 109 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 110 | help='eg: File which contains uri and number of occurrences of properties', required=False) 111 | requiredNamed.add_argument('--rs', dest='rs', metavar='rs', 112 | help='Toggle to run separately', required=True) 113 | 114 | args = parser.parse_args() 115 | input_file = args.inp 116 | uri_file = args.uri 117 | url = args.url 118 | rs =args.rs 119 | namespace = args.ns 120 | output_file = args.out 121 | project_name = args.project_name 122 | final_formatting(input_file=input_file, uri_file=uri_file, url=url, 123 | output_file=output_file, project_name=project_name, namespace=namespace, rs= rs) 124 | pass 125 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/get_properties.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | 
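# Usage sketch (values are illustrative; the flags are the ones defined in the
# argparse section at the bottom of this file):
#
#   python get_properties.py \
#       --url http://mappings.dbpedia.org/server/ontology/classes/Place \
#       --output_file get_properties.csv --project_name test
#
# This scrapes the property table of the given mappings page and writes one
# "Name,Label,Domain,Range" row per property to <project_name>/get_properties.csv.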
def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | """ 16 | page = urlopen(url) 17 | soup = BeautifulSoup(page, "html.parser") 18 | if(not os.path.isdir(project_name)): 19 | os.makedirs(project_name) 20 | output_file = open(project_name+"/" + output_file, 'w') 21 | fl = 0 22 | accum = [] 23 | for rows in tqdm(soup.find_all("tr")): 24 | x = rows.find_all("td") 25 | if len(x) <= 2: 26 | fl = 1 27 | continue 28 | if fl == 1: 29 | fl = 2 30 | continue 31 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 32 | label = rows.find_all("td")[1].get_text() 33 | dom = rows.find_all("td")[2].get_text() 34 | rng = rows.find_all("td")[3].get_text() 35 | 36 | final = name + "," + label + "," + dom + "," + rng 37 | accum.append(final) 38 | output_file.write(final+"\n") 39 | output_file.close() 40 | return accum 41 | 42 | 43 | """ 44 | Name, Label, Domain, Range 45 | """ 46 | 47 | if __name__ == "__main__": 48 | """ 49 | Section to parse the command line arguments. 50 | """ 51 | parser = argparse.ArgumentParser() 52 | requiredNamed = parser.add_argument_group('Required Arguments') 53 | 54 | requiredNamed.add_argument('--url', dest='url', metavar='url', 55 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 56 | requiredNamed.add_argument( 57 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 58 | requiredNamed.add_argument( 59 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 60 | args = parser.parse_args() 61 | url = args.url 62 | output_file = args.out_put 63 | project_name = args.project_name 64 | get_properties(url = url, project_name= project_name, output_file = output_file) 65 | pass 66 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/integrate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | from tqdm import tqdm 4 | from get_properties import get_properties 5 | from tqdm import tqdm 6 | 7 | """ 8 | How was the tsv file created in the first place? 9 | - The tsv file is read. 10 | - A dictionary diction in made. 11 | - Every time the namespace is matched with the name 12 | space mentioned in the command line argument. 13 | - If the name space matches the dictionary diction here 14 | is updated with {name of the entity = frequency of occurance} 15 | """ 16 | 17 | 18 | def integrate(namespace, uri_file, output_file="integrate.csv", project_name="test_project", url="Enter a valid URL", input_file="Pleaes enter a valid file name"): 19 | print("Reading the TSV file: ") 20 | open_tsv = open(uri_file, 'r') 21 | read_tsv = open_tsv.readlines() 22 | diction = {} 23 | for line in tqdm(read_tsv): 24 | line = line.strip().split('\t') 25 | if line[0].split('/')[-2] != namespace: 26 | continue 27 | diction[line[0].split('/')[-1]] = line[1] 28 | 29 | open_tsv.close() 30 | 31 | """ 32 | Processing the input file. 33 | - The input file is read, out put from get_properties.py 34 | - Reading lines from the input files. 35 | - Iterating over every line of the read file. 36 | - Taking the name from the line. 
37 | - if the given name is in the dictionry created above 38 | appending the url to the given name and corresponding 39 | frequency to the row entry(read line). Else appending 40 | an empty string. 41 | - Joining all the elements of the list line with a comma, 42 | adding a new line character and then going for the next 43 | iteration after adding it to a variable final (string addition) 44 | """ 45 | 46 | 47 | if (__name__ == "__main__"): 48 | print("Reading the input file: ") 49 | open_inp = open(input_file, 'r') 50 | line_inp = open_inp.readlines() 51 | 52 | if (not __name__ == "__main__"): 53 | line_inp = get_properties(url=url, output_file="get_properties.csv", project_name = project_name) 54 | 55 | cnt, tot = 0, 0 56 | final = "" 57 | accum = [] 58 | for in_line in tqdm(line_inp): 59 | 60 | line = in_line.strip().split(',') 61 | in_line = line[0] 62 | tot += 1 63 | # if ':' in m: 64 | # print "lol", m 65 | if in_line in diction: 66 | cnt += 1 67 | line.append("http://dbpedia.org/" + namespace + "/" + in_line) 68 | line.append(diction[in_line]) 69 | else: 70 | 71 | line.append('') 72 | line.append('') 73 | # print in_line 74 | 75 | final += ",".join(line) 76 | accum.append(",".join(line)) 77 | final += '\n' 78 | 79 | """ 80 | The string final is the written to the output file name 81 | as given in the command line argument. 82 | """ 83 | # print final 84 | f = open(project_name+"/"+output_file, 'w') 85 | f.write(final) 86 | print("**************************************") 87 | print("Total number of entity whose URI was found: "+str(cnt) + 88 | "\nTotal number of entities present: " + str(tot)) 89 | return accum 90 | 91 | 92 | if __name__ == "__main__": 93 | """ 94 | Section to parse the command line arguments. 95 | """ 96 | parser = argparse.ArgumentParser() 97 | requiredNamed = parser.add_argument_group('Required Arguments') 98 | requiredNamed.add_argument( 99 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=True) 100 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 101 | help='Output from previous step', required=True) 102 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 103 | help='eg: File which contains uri and number of occurrences of properties', required=True) 104 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 105 | help='File in which you want to store output', required=True) 106 | requiredNamed.add_argument('--project_name', dest='project_name', 107 | metavar='project_name', help='test', required=True) 108 | args = parser.parse_args() 109 | namespace = args.ns 110 | input_file = args.inp 111 | uri_file = args.uri 112 | output_file = args.out 113 | project_name = args.project_name 114 | integrate(namespace, uri_file, output_file, 115 | project_name, "Enter a valid URL", input_file) 116 | pass 117 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/range_place.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | from sparql_generator import sparql_generator 5 | from tqdm import tqdm 6 | 7 | def range_place(input_file, project_name, output_file="test_res.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 8 | if __name__ == "__main__": 9 | f = open(input_file, 'r') 10 | lines = f.readlines() 11 | f.close() 12 | print ("hello") 13 | if not __name__ == "__main__": 14 | lines = 
sparql_generator(input_file=input_file, project_name=project_name, 15 | url=url, uri_file=uri_file, namespace=namespace) 16 | 17 | output_file_write = open(project_name+"/" + output_file, 'w') 18 | name = url.split("/")[-1] 19 | accum = [] 20 | for l in tqdm(lines): 21 | l = l.split(',') 22 | if len(l) == 0: 23 | continue 24 | if l[5] == "" or len(l[5]) == 0: 25 | continue 26 | if name.lower() in l[2].lower() and l[5] != '': 27 | newl, to_remove = [], [] 28 | newl.append("dbo:"+name) 29 | newl.append("") 30 | newl.append("") 31 | nlq = l[7].split() 32 | for i in range(len(nlq)): 33 | if '(' in nlq[i] or ')' in nlq[i]: 34 | to_remove.append(nlq[i]) 35 | continue 36 | if '<' not in nlq[i] and '?' not in nlq[i]: 37 | nlq[i] = nlq[i].lower() 38 | 39 | for x in to_remove: 40 | nlq.remove(x) 41 | 42 | nlq = " ".join(nlq) 43 | 44 | spq = l[9].split() 45 | for i in range(len(spq)): 46 | if '<' not in spq[i] and '?' not in spq[i]: 47 | spq[i] = spq[i].lower() 48 | 49 | spq = " ".join(spq) 50 | 51 | gq = l[-1].split() 52 | for i in range(len(gq)): 53 | if '<' not in gq[i] and '?' not in gq[i] and '[' not in gq[i]: 54 | gq[i] = gq[i].lower() 55 | 56 | gq = " ".join(gq) 57 | 58 | nlq = nlq.replace('', '') 59 | spq = spq.replace('?x', '?a').replace('', '') 60 | gq = gq.replace('?x', '?a').replace('', '') 61 | newl.append((nlq)) 62 | newl.append((spq)) 63 | newl.append((gq)) 64 | accum.append(";".join(newl)) 65 | output_file_write.write("\n".join(accum)) 66 | return [lines,output_file_write] 67 | 68 | 69 | if __name__ == "__main__": 70 | """ 71 | Section to parse the command line arguments. 72 | """ 73 | parser = argparse.ArgumentParser() 74 | requiredNamed = parser.add_argument_group('Required Arguments') 75 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 76 | help='eg: File which contains metadata of properties', required=True) 77 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 78 | help='File in which you want to store output', required=True) 79 | requiredNamed.add_argument('--project_name', dest='project_name', 80 | metavar='project_name', help='test', required=True) 81 | requiredNamed.add_argument('--url', dest='url', metavar='url', 82 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 83 | 84 | args = parser.parse_args() 85 | input_file = args.inp 86 | output_file = args.out 87 | url = args.url 88 | project_name = args.project_name 89 | range_place(input_file=input_file, output_file=output_file, 90 | project_name=project_name, url=url) 91 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_composite/sparql_generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from decision_tree import decision_tree 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | """ 7 | Section to parse the command line arguments. 
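Beyond the argument parsing, this module defines sparql_generator(), which
appends a SPARQL query template and a generator query template to every row
produced by decision_tree.py.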
8 | """ 9 | 10 | def sparql_generator(input_file, project_name, output_file="sparql_generator.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 11 | if __name__ == "__main__": 12 | f = open(input_file, 'r') 13 | lines = f.readlines() 14 | pass 15 | if not __name__ == "__main__": 16 | lines = decision_tree(input_file=input_file, project_name=project_name, 17 | url=url, uri_file=uri_file, namespace=namespace) 18 | pass 19 | 20 | # print lines[0].split(',') 21 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 22 | # 'MVE', 'Optimal Expression, SPARQL-TEMPLATE, GENERATOR-QUERY-TEMPLATE\r\n'] 23 | # sparql_template = [] 24 | 25 | """ 26 | - Read the file generated in the previous step. 27 | - Read the lines from the file an save it as a list. 28 | - If the frequency is known, Replace the 2nd last elemet of the formed list with 29 | a where statement, and last one witha where statement 30 | followed by an assertion if it is a place. 31 | - Join the updated list with comma as a delimeter and save 32 | add it in the string ending with a newline character. 33 | - Print the final on the terminal 34 | """ 35 | accum = [] 36 | final = "" 37 | lineno = 1 38 | for line in tqdm(lines): 39 | if lineno == 1: 40 | lineno += 1 41 | continue 42 | line = line.strip().split(',') 43 | # print lines 44 | if line[4] != '': 45 | # print line[5] 46 | # It was found the the MVE and OE was also required hence: 47 | #line[-2] = 'SELECT ?x WHERE { <' + line[5] + '> ?x }' 48 | #line[-1] = 'SELECT ?a WHERE { ?a <' + line[5] + '> [] . ?a a }' 49 | line.append('SELECT ?x WHERE { <' + line[4] + '> ?x }') 50 | line.append( 51 | 'SELECT ?a WHERE { ?a <' + line[4] + '> [] . ?a a }') 52 | 53 | final += ",".join(line) 54 | accum.append(",".join(line)) 55 | final += '\n' 56 | 57 | # print final 58 | 59 | # fw = open() 60 | 61 | """ 62 | This data generated might be required for further steps 63 | thus it is saved in another file named sparql.csv 64 | """ 65 | 66 | open(project_name+"/"+output_file, 'w').write(final) 67 | return accum 68 | 69 | 70 | if __name__ == "__main__": 71 | """ 72 | Section to parse the command line arguments. 
73 | """ 74 | parser = argparse.ArgumentParser() 75 | requiredNamed = parser.add_argument_group('Required Arguments') 76 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 77 | help='Output from previous step', required=True) 78 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 79 | help='File in which you want to store output', required=True) 80 | requiredNamed.add_argument('--project_name', dest='project_name', 81 | metavar='project_name', help='eg.:test', required=True) 82 | args = parser.parse_args() 83 | input_file = args.inp 84 | output_file = args.out 85 | project_name = args.project_name 86 | sparql_generator(input_file=input_file, output_file=output_file, 87 | project_name=project_name) 88 | pass 89 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/decision_tree.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import argparse 4 | from tqdm import tqdm 5 | from integrate import integrate 6 | 7 | 8 | def decision_tree(input_file, project_name, output_file="decision_tree.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 9 | if __name__ == "__main__": 10 | f = open(input_file, 'r') 11 | lines = f.readlines() 12 | pass 13 | if not __name__ == "__main__": 14 | lines = integrate(namespace=namespace, uri_file=uri_file, 15 | project_name=project_name, url=url) 16 | final_lines = [] 17 | lineno = 1 18 | 19 | """ 20 | print lines[0].split(',') 21 | ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 22 | 'MVE', 'Optimal Expression\r\n'] 23 | """ 24 | 25 | """ 26 | - The lines from the file generated in the previous steps 27 | is read and a for loop iterates through ecery row of 28 | - First we create a list of all elements seperated by commas. 29 | - If the range has the substring person, the we put as 30 | question who else what. 31 | - We append the question thus generate 2 times as minimum 32 | viable instruction and optimal expression. 33 | - We create a variable names final lines and add strings, 34 | which are formed by adding strings formed by joining 35 | the elements of the list delemited by comma in each line. 36 | - We also create a string of the question generated 37 | delemited by a newline characte and store it in mve 38 | as a long string. 39 | - We output the series of question in mve_output. 40 | - We save the final_lines strind in a file named GS_with_mve.csv 41 | delimeted by a newline character. 42 | """ 43 | 44 | mve = "" 45 | for line in tqdm(lines): 46 | if lineno == 1: 47 | lineno += 1 48 | continue 49 | line = line.strip().split(',') 50 | # Wrong index for range is corrected here 51 | rng = line[3].lower() 52 | lbl = line[1] 53 | if 'person' in rng: 54 | rng = "who" 55 | else: 56 | rng = "what" 57 | # The total length of a row in the list is 6, 58 | # thus 7 and 8 are out of range values. Thus I 59 | # replaced it with append. 
60 | """ 61 | line[7] = rng + " is the " + lbl + " of " 62 | line[8] = rng + " is the " + lbl + " of " 63 | """ 64 | if(len(line) < 9): 65 | line.append(rng + " is the " + lbl + " of ") 66 | line.append(rng + " is the " + lbl + " of ") 67 | else: 68 | line[7] = rng + " is the " + lbl + " of " 69 | line[8] = rng + " is the " + lbl + " of " 70 | mve += rng + " is the " + lbl + " of \n" 71 | final_lines.append(",".join(line)) 72 | 73 | fw = open(project_name+"/"+"mve"+output_file, 'w') 74 | fw.write(mve) 75 | 76 | fw2 = open(project_name+"/"+output_file, 'w') 77 | fw2.write("\n".join(final_lines)) 78 | return final_lines 79 | 80 | if __name__ == "__main__": 81 | """ 82 | Section to parse the command line arguments. 83 | """ 84 | parser = argparse.ArgumentParser() 85 | requiredNamed = parser.add_argument_group('Required Arguments') 86 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 87 | help='Output from previous step', required=True) 88 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 89 | help='File in which you want to store output', required=True) 90 | requiredNamed.add_argument('--project_name', dest='project_name', 91 | metavar='project_name', help='test', required=True) 92 | args = parser.parse_args() 93 | input_file = args.inp 94 | output_file = args.out 95 | project_name = args.project_name 96 | decision_tree(input_file=input_file, output_file=output_file, 97 | project_name=project_name) 98 | pass 99 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/final_formatting.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tqdm import tqdm 3 | import argparse 4 | from sparql_generator import sparql_generator 5 | # Given format # 6 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 'MVE', 'Optimal Expression', 'SPARQL Query Template', 'Generator Query\r\n'] 7 | 8 | # Required format : separated by semi-colon ## 9 | # [ class_name, empty, empty, NLQ (MVE), Sparql Query, Generator Query] # 10 | 11 | 12 | def final_formatting(input_file, uri_file, url, output_file, project_name, namespace,rs): 13 | 14 | if (int(rs) == 1) : 15 | open_files = open(input_file, 'r') 16 | lines = open_files.readlines() 17 | open_files.close() 18 | else: 19 | lines = sparql_generator(input_file=input_file, project_name=project_name, 20 | url=url, uri_file=uri_file, namespace=namespace) 21 | 22 | fl = 1 23 | 24 | output = "" 25 | 26 | """ 27 | - We iterate over the lines of the document. 28 | - Convet the line into a list containig elements o the 29 | string delimited by commas. 30 | - 31 | """ 32 | for line in tqdm(lines): 33 | 34 | if fl: 35 | fl = 0 36 | continue 37 | l = line.split(',') 38 | 39 | # print l 40 | 41 | newl, to_remove = [], [] 42 | name = url.split("/")[-1] 43 | newl.append("dbo:"+name) 44 | newl.append("") 45 | newl.append("") 46 | 47 | nlq = l[7].split() 48 | # The fuzzy score column is not present in the 49 | # autmatically created csv file. 50 | if(len(l) == 10): 51 | nlq = l[6].split() 52 | 53 | """ 54 | From MVE column a question is selected and each word 55 | is put into as an element of the list. 56 | """ 57 | for i in range(len(nlq)): 58 | if '(' in nlq[i] or ')' in nlq[i]: 59 | to_remove.append(nlq[i]) 60 | continue 61 | if '<' not in nlq[i] and '?' 
not in nlq[i]: 62 | nlq[i] = nlq[i].lower() 63 | 64 | for x in to_remove: 65 | nlq.remove(x) 66 | 67 | spq = l[-2].split() 68 | """ 69 | Query one 70 | """ 71 | for i in range(len(spq)): 72 | if '<' not in spq[i] and '?' not in spq[i]: 73 | spq[i] = spq[i].lower() 74 | 75 | """ 76 | Query two 77 | """ 78 | gq = l[-1].split() 79 | for i in range(len(gq)): 80 | if '<' not in gq[i] and '?' not in gq[i] and '[' not in gq[i]: 81 | gq[i] = gq[i].lower() 82 | 83 | newl.append(" ".join(nlq)) 84 | newl.append(" ".join(spq)) 85 | newl.append(" ".join(gq)) 86 | output += ";".join(newl) + "\n" 87 | 88 | fw = open(project_name+"/"+ output_file, 'w') 89 | fw.write(output) 90 | fw.close() 91 | 92 | 93 | if __name__ == "__main__": 94 | """ 95 | Section to parse the command line arguments. 96 | """ 97 | parser = argparse.ArgumentParser() 98 | requiredNamed = parser.add_argument_group('Required Arguments') 99 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 100 | help='eg: File which contains metadata of properties', required=False) 101 | requiredNamed.add_argument( 102 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=False) 103 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 104 | help='File in which you want to store output', required=False) 105 | requiredNamed.add_argument('--project_name', dest='project_name', 106 | metavar='project_name', help='test', required=False) 107 | requiredNamed.add_argument('--url', dest='url', metavar='url', 108 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=False) 109 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 110 | help='eg: File which contains uri and number of occurrences of properties', required=False) 111 | requiredNamed.add_argument('--rs', dest='rs', metavar='rs', 112 | help='Toggle to run separately', required=True) 113 | 114 | args = parser.parse_args() 115 | input_file = args.inp 116 | uri_file = args.uri 117 | url = args.url 118 | rs =args.rs 119 | namespace = args.ns 120 | output_file = args.out 121 | project_name = args.project_name 122 | final_formatting(input_file=input_file, uri_file=uri_file, url=url, 123 | output_file=output_file, project_name=project_name, namespace=namespace, rs= rs) 124 | pass 125 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/get_properties.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 
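A typical output row, in the order [Name, Label, Domain, Range] (example
copied from a generated get_properties.csv elsewhere in this repository):

    director,film director,Film,Person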
15 | """ 16 | page = urlopen(url) 17 | soup = BeautifulSoup(page, "html.parser") 18 | if(not os.path.isdir(project_name)): 19 | os.makedirs(project_name) 20 | output_file = open(project_name+"/" + output_file, 'w') 21 | fl = 0 22 | accum = [] 23 | for rows in tqdm(soup.find_all("tr")): 24 | x = rows.find_all("td") 25 | if len(x) <= 2: 26 | fl = 1 27 | continue 28 | if fl == 1: 29 | fl = 2 30 | continue 31 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 32 | label = rows.find_all("td")[1].get_text() 33 | dom = rows.find_all("td")[2].get_text() 34 | rng = rows.find_all("td")[3].get_text() 35 | 36 | final = name + "," + label + "," + dom + "," + rng 37 | accum.append(final) 38 | output_file.write(final+"\n") 39 | output_file.close() 40 | return accum 41 | 42 | 43 | """ 44 | Name, Label, Domain, Range 45 | """ 46 | 47 | if __name__ == "__main__": 48 | """ 49 | Section to parse the command line arguments. 50 | """ 51 | parser = argparse.ArgumentParser() 52 | requiredNamed = parser.add_argument_group('Required Arguments') 53 | 54 | requiredNamed.add_argument('--url', dest='url', metavar='url', 55 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 56 | requiredNamed.add_argument( 57 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 58 | requiredNamed.add_argument( 59 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 60 | args = parser.parse_args() 61 | url = args.url 62 | output_file = args.out_put 63 | project_name = args.project_name 64 | get_properties(url = url, project_name= project_name, output_file = output_file) 65 | pass 66 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/integrate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | from tqdm import tqdm 4 | from get_properties import get_properties 5 | from tqdm import tqdm 6 | 7 | """ 8 | How was the tsv file created in the first place? 9 | - The tsv file is read. 10 | - A dictionary diction in made. 11 | - Every time the namespace is matched with the name 12 | space mentioned in the command line argument. 13 | - If the name space matches the dictionary diction here 14 | is updated with {name of the entity = frequency of occurance} 15 | """ 16 | 17 | 18 | def integrate(namespace, uri_file, output_file="integrate.csv", project_name="test_project", url="Enter a valid URL", input_file="Pleaes enter a valid file name"): 19 | print("Reading the TSV file: ") 20 | open_tsv = open(uri_file, 'r') 21 | read_tsv = open_tsv.readlines() 22 | diction = {} 23 | for line in tqdm(read_tsv): 24 | line = line.strip().split('\t') 25 | if line[0].split('/')[-2] != namespace: 26 | continue 27 | diction[line[0].split('/')[-1]] = line[1] 28 | 29 | open_tsv.close() 30 | 31 | """ 32 | Processing the input file. 33 | - The input file is read, out put from get_properties.py 34 | - Reading lines from the input files. 35 | - Iterating over every line of the read file. 36 | - Taking the name from the line. 37 | - if the given name is in the dictionry created above 38 | appending the url to the given name and corresponding 39 | frequency to the row entry(read line). Else appending 40 | an empty string. 
41 | - Joining all the elements of the list line with a comma, 42 | adding a new line character and then going for the next 43 | iteration after adding it to a variable final (string addition) 44 | """ 45 | 46 | 47 | if (__name__ == "__main__"): 48 | print("Reading the input file: ") 49 | open_inp = open(input_file, 'r') 50 | line_inp = open_inp.readlines() 51 | 52 | if (not __name__ == "__main__"): 53 | line_inp = get_properties(url=url, output_file="get_properties.csv", project_name = project_name) 54 | 55 | cnt, tot = 0, 0 56 | final = "" 57 | accum = [] 58 | for in_line in tqdm(line_inp): 59 | 60 | line = in_line.strip().split(',') 61 | in_line = line[0] 62 | tot += 1 63 | # if ':' in m: 64 | # print "lol", m 65 | if in_line in diction: 66 | cnt += 1 67 | line.append("http://dbpedia.org/" + namespace + "/" + in_line) 68 | line.append(diction[in_line]) 69 | else: 70 | 71 | line.append('') 72 | line.append('') 73 | # print in_line 74 | 75 | final += ",".join(line) 76 | accum.append(",".join(line)) 77 | final += '\n' 78 | 79 | """ 80 | The string final is the written to the output file name 81 | as given in the command line argument. 82 | """ 83 | # print final 84 | f = open(project_name+"/"+output_file, 'w') 85 | f.write(final) 86 | print("**************************************") 87 | print("Total number of entity whose URI was found: "+str(cnt) + 88 | "\nTotal number of entities present: " + str(tot)) 89 | return accum 90 | 91 | 92 | if __name__ == "__main__": 93 | """ 94 | Section to parse the command line arguments. 95 | """ 96 | parser = argparse.ArgumentParser() 97 | requiredNamed = parser.add_argument_group('Required Arguments') 98 | requiredNamed.add_argument( 99 | '--namespace', dest='ns', metavar='ns', help='eg: "ontology"', required=True) 100 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 101 | help='Output from previous step', required=True) 102 | requiredNamed.add_argument('--uri_file', dest='uri', metavar='uri', 103 | help='eg: File which contains uri and number of occurrences of properties', required=True) 104 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 105 | help='File in which you want to store output', required=True) 106 | requiredNamed.add_argument('--project_name', dest='project_name', 107 | metavar='project_name', help='test', required=True) 108 | args = parser.parse_args() 109 | namespace = args.ns 110 | input_file = args.inp 111 | uri_file = args.uri 112 | output_file = args.out 113 | project_name = args.project_name 114 | integrate(namespace, uri_file, output_file, 115 | project_name, "Enter a valid URL", input_file) 116 | pass 117 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_1/pipeline_1_simple/sparql_generator.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from decision_tree import decision_tree 3 | import argparse 4 | from tqdm import tqdm 5 | 6 | """ 7 | Section to parse the command line arguments. 
8 | """ 9 | 10 | def sparql_generator(input_file, project_name, output_file="sparql_generator.csv", url="Use a valid URL", uri_file="Proper URI file", namespace="Valid namespace"): 11 | if __name__ == "__main__": 12 | f = open(input_file, 'r') 13 | lines = f.readlines() 14 | pass 15 | if not __name__ == "__main__": 16 | lines = decision_tree(input_file=input_file, project_name=project_name, 17 | url=url, uri_file=uri_file, namespace=namespace) 18 | pass 19 | 20 | # print lines[0].split(',') 21 | # ['Property', 'Label ', 'Range', 'Fuzzy Score', 'Comment about expr', 'URI', 'Number of Occurrences', 22 | # 'MVE', 'Optimal Expression, SPARQL-TEMPLATE, GENERATOR-QUERY-TEMPLATE\r\n'] 23 | # sparql_template = [] 24 | 25 | """ 26 | - Read the file generated in the previous step. 27 | - Read the lines from the file an save it as a list. 28 | - If the frequency is known, Replace the 2nd last elemet of the formed list with 29 | a where statement, and last one witha where statement 30 | followed by an assertion if it is a place. 31 | - Join the updated list with comma as a delimeter and save 32 | add it in the string ending with a newline character. 33 | - Print the final on the terminal 34 | """ 35 | accum = [] 36 | final = "" 37 | lineno = 1 38 | for line in tqdm(lines): 39 | if lineno == 1: 40 | lineno += 1 41 | continue 42 | line = line.strip().split(',') 43 | # print lines 44 | if line[4] != '': 45 | # print line[5] 46 | # It was found the the MVE and OE was also required hence: 47 | #line[-2] = 'SELECT ?x WHERE { <' + line[5] + '> ?x }' 48 | #line[-1] = 'SELECT ?a WHERE { ?a <' + line[5] + '> [] . ?a a }' 49 | line.append('SELECT ?x WHERE { <' + line[4] + '> ?x }') 50 | line.append( 51 | 'SELECT ?a WHERE { ?a <' + line[4] + '> [] . ?a a }') 52 | 53 | final += ",".join(line) 54 | accum.append(",".join(line)) 55 | final += '\n' 56 | 57 | # print final 58 | 59 | # fw = open() 60 | 61 | """ 62 | This data generated might be required for further steps 63 | thus it is saved in another file named sparql.csv 64 | """ 65 | 66 | open(project_name+"/"+output_file, 'w').write(final) 67 | return accum 68 | 69 | 70 | if __name__ == "__main__": 71 | """ 72 | Section to parse the command line arguments. 
73 | """ 74 | parser = argparse.ArgumentParser() 75 | requiredNamed = parser.add_argument_group('Required Arguments') 76 | requiredNamed.add_argument('--input_file', dest='inp', metavar='inp', 77 | help='Output from previous step', required=True) 78 | requiredNamed.add_argument('--output_file', dest='out', metavar='out', 79 | help='File in which you want to store output', required=True) 80 | requiredNamed.add_argument('--project_name', dest='project_name', 81 | metavar='project_name', help='eg.:test', required=True) 82 | args = parser.parse_args() 83 | input_file = args.inp 84 | output_file = args.out 85 | project_name = args.project_name 86 | sparql_generator(input_file=input_file, output_file=output_file, 87 | project_name=project_name) 88 | pass 89 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/petrichor/Projects/environments/gymnoz/bin/python" 3 | } -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/Film/get_properties.csv: -------------------------------------------------------------------------------- 1 | afdbId,afdb id,Film,xsd:string 2 | allcinemaId,allcinema id,Film,xsd:string 3 | alternativeTitle,alternative title,Work,rdf:langString 4 | amgid,amgId,Film,xsd:string 5 | author,author,Work,Person 6 | basedOn,based on,Work,Work 7 | bgafdId,bgafd id,Film,xsd:string 8 | bibo:pages,pages,Work,xsd:string 9 | chiefEditor,chief editor,Work,Person 10 | cinematography,cinematography,Film,Person 11 | cites,cites,Work,xsd:string 12 | commissioner,commissioner,Work,xsd:string 13 | completionDate,completion date,Work,xsd:date 14 | composer,composer,Work,Person 15 | costumeDesigner,costume designer,Film,Person 16 | coverArtist,cover artist,Work,Person 17 | dc:description,description,Work,xsd:string 18 | dc:publisher,publisher,Work,xsd:string 19 | dcc,Dewey Decimal Classification,Work,xsd:string 20 | dct:references,references,Work,owl:Thing 21 | dct:source,source,Work,owl:Thing 22 | director,film director,Film,Person 23 | eTeatrId,e-teatr.pl id,Film,xsd:string 24 | editing,editing,Film,Person 25 | egafdId,egafd id,Film,xsd:string 26 | eurobabeIndexId,eurobabe index id,Film,xsd:string 27 | fileSize,size,Work,InformationUnit 28 | filename,filename,Work,xsd:string 29 | filmAudioType,film audio type,Film,xsd:string 30 | filmColourType,film colour type,Film,xsd:string 31 | filmRuntime,film runtime,Film,Time 32 | firstBroadcast,first broadcast,Film,xsd:string 33 | gross,gross,Film,Currency 34 | iafdId,iafd id,Film,xsd:string 35 | idAllocine,Allocine ID,Film,xsd:string 36 | license,license,Work,owl:Thing 37 | mainCharacter,main character,Work,Person 38 | makeupArtist,makeup artist,Film,Person 39 | musicComposer,music composer,Work,MusicalArtist 40 | narrator,narrator,Work,Person 41 | originalLanguage,original language,Work,Language 42 | originalTitle,original title,Work,rdf:langString 43 | previousWork,previous work,Work,Work 44 | producedBy,produced by,Film,Company 45 | producer,producer,Work,Agent 46 | productionCompany,production company,Work,Company 47 | publisher,publisher,Work,Agent 48 | quebecerTitle,quebecer title,Film,xsd:string 49 | releaseLocation,release location,Work,Place 50 | runtime,runtime,Work,Time 51 | setDesigner,set designer,Film,Person 52 | skos:notation,notation,Work,xsd:string 53 | specialEffects,special 
effects,Film,Person 54 | starring,starring,Work,Actor 55 | subjectTerm,subject term,Work,xsd:string 56 | subsequentWork,subsequent work,Work,Work 57 | titleLanguage,title language,Film,xsd:string 58 | translator,translator,Work,Person 59 | writer,writer,Work,Person 60 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/Film/sentence_and_template_generator.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/.pipeline_3/Film/sentence_and_template_generator.ods -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/eliminator.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | 4 | def eliminator(input_file, output_file,threshold): 5 | lines = open(input_file,'r').readlines() 6 | print(len(lines)) 7 | accum = [] 8 | nspm_ready = open(output_file,'w') 9 | for line in tqdm(lines): 10 | values = line.split(";") 11 | if(int(values[-1])>int(threshold)): 12 | accum.append(";".join(values[:-1])+"\n") 13 | nspm_ready.write(accum[-1]) 14 | nspm_ready.close() 15 | 16 | 17 | if __name__ == "__main__": 18 | """ 19 | Section to parse the command line arguments. 20 | """ 21 | parser = argparse.ArgumentParser() 22 | requiredNamed = parser.add_argument_group('Required Arguments') 23 | 24 | requiredNamed.add_argument('--input', dest='input', metavar='input', 25 | help='Input file name ', required=True) 26 | requiredNamed.add_argument( 27 | '--output_file', dest='output', metavar='output', help='Output file name', required=True) 28 | requiredNamed.add_argument( 29 | '--threshold', dest='threshold', metavar='threshold', help='threshold', required=True) 30 | args = parser.parse_args() 31 | input_file = args.input 32 | output_file = args.output 33 | threshold = args.threshold 34 | eliminator(input_file=input_file, output_file=output_file,threshold=threshold) 35 | pass 36 | 37 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/eukaryotes/get_properties.csv: -------------------------------------------------------------------------------- 1 | binomial,binomial,Species,owl:Thing 2 | binomialAuthority,binomial authority,Species,owl:Thing 3 | clade,clade,Species,owl:Thing 4 | classis,classis,Species,owl:Thing 5 | conservationStatus,conservation status,Species,xsd:string 6 | conservationStatusSystem,conservation status system,Species,xsd:string 7 | domain,domain,Species,owl:Thing 8 | extinctionYear,extinction year,Species,xsd:gYear 9 | family,family,Species,Species 10 | fossil,fossil,Species,Species 11 | genus,genus,Species,owl:Thing 12 | kingdom,kingdom,Species,owl:Thing 13 | order,order (taxonomy),Species,owl:Thing 14 | parentheses,parentheses,Species,owl:Thing 15 | phylum,phylum,Species,owl:Thing 16 | redListIdNL,red list ID NL,Species,xsd:integer 17 | scientificName,scientific name,Species,xsd:string 18 | species,species,Species,Species 19 | subClassis,sub-classis,Species,owl:Thing 20 | subFamily,sub-family,Species,Taxon 21 | subGenus,subgenus,Species,owl:Thing 22 | subOrder,sub-order,Species,owl:Thing 23 | subTribus,subtribus,Species,Species 24 | superFamily,super-family,Species,Taxon 25 | superOrder,super-order,Species,owl:Thing 26 | superTribus,supertribus,Species,Species 27 | taxon,has 
taxon,Species,Taxon 28 | tribus,tribus,Species,Species 29 | woRMS,WoRMS,Species,owl:Thing 30 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/eukaryotes/sentence_and_template_generator.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/.pipeline_3/eukaryotes/sentence_and_template_generator.ods -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/fetch_ranks_sub.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='part-r-00000'): 5 | sub = open(filename,'r').readlines() 6 | diction={} 7 | 8 | print("Loading Rankings") 9 | for val in tqdm(sub): 10 | diction[val.split('\t')[0].strip()[1:-1].strip()] = float(val.split('\t')[-2].split('"')[1]) 11 | return diction 12 | 13 | if __name__ == "__main__": 14 | fetch_ranks() 15 | pass 16 | 17 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/generate_templates.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from get_properties import get_properties 3 | from generate_url import generate_url 4 | from sentence_and_template_generator import sentence_and_template_generator 5 | import os 6 | from fetch_ranks_sub import fetch_ranks 7 | import logging 8 | 9 | def generate_templates(label,project_name,depth=1,output_file="sentence_and_template_generator"): 10 | """ 11 | Funtion to generate templates | wrapper function for rest of the functions. 12 | """ 13 | val = generate_url(label) 14 | url = val[0] 15 | about = (val[1]) 16 | count =0 17 | vessel= [] 18 | 19 | diction = fetch_ranks("../utility/part-r-00000") 20 | if(not os.path.isdir(project_name)): 21 | os.makedirs(project_name) 22 | output_file = open(project_name+"/" + output_file, 'w') 23 | 24 | # Create a logger object 25 | logger = logging.getLogger() 26 | 27 | # Configure logger 28 | logging.basicConfig(filename=project_name+"/logfile.log", format='%(filename)s: %(message)s', filemode='w') 29 | 30 | # Setting threshold level 31 | logger.setLevel(logging.DEBUG) 32 | 33 | # Use the logging methods 34 | #logger.debug("This is a debug message") 35 | logger.info("This is a log file.") 36 | #logger.warning("This is a warning message") 37 | #logger.error("This is an error message") 38 | #logger.critical("This is a critical message") 39 | 40 | list_of_property_information = get_properties(url=url,project_name=project_name,output_file = "get_properties.csv") 41 | for property_line in list_of_property_information: 42 | count+=1 43 | prop = property_line.split(',') 44 | print("**************\n"+str(prop)) 45 | sentence_and_template_generator(log=logger,diction=diction,output_file=output_file,mother_ontology=about.strip().replace("http://dbpedia.org/ontology/","dbo:"),vessel=vessel,project_name=project_name ,prop=prop, suffix = " of ?",count = 2) 46 | output_file.close() 47 | 48 | if __name__ == "__main__": 49 | """ 50 | Section to parse the command line arguments. 
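Illustrative invocation (the label value is an example; the script also
expects the page-rank dump ../utility/part-r-00000 read by fetch_ranks above):

    python generate_templates.py --label place --project_name test --depth 1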
51 | """ 52 | parser = argparse.ArgumentParser() 53 | requiredNamed = parser.add_argument_group('Required Arguments') 54 | 55 | requiredNamed.add_argument('--label', dest='label', metavar='label', 56 | help='label: person, place etc.', required=True) 57 | requiredNamed.add_argument( 58 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 59 | requiredNamed.add_argument( 60 | '--depth', dest='depth', metavar='depth', help='Mention the depth you want to go in the knowledge graph (The number of questions will increase exponentially!), e.g. 2', required=False) 61 | args = parser.parse_args() 62 | label = args.label 63 | project_name = args.project_name 64 | depth = args.depth 65 | generate_templates(label=label,project_name=project_name,depth=depth) 66 | pass -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | - This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | - This code on execution creates a csv which contains all the properties, ontology, 16 | class related information and data types as field values in each row. 17 | - This function also returns a 2D list of the information mentioned above to the calling 18 | function 19 | """ 20 | page = urllib.request.urlopen(url) 21 | soup = BeautifulSoup(page, "html.parser") 22 | if(not os.path.isdir(project_name)): 23 | os.makedirs(project_name) 24 | output_file = open(project_name+"/" + output_file, 'w') 25 | fl = 0 26 | accum = [] 27 | for rows in tqdm(soup.find_all("tr")): 28 | x = rows.find_all("td") 29 | if len(x) <= 2: 30 | fl = 1 31 | continue 32 | if fl == 1: 33 | fl = 2 34 | continue 35 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 36 | label = rows.find_all("td")[1].get_text() 37 | dom = rows.find_all("td")[2].get_text() 38 | rng = rows.find_all("td")[3].get_text() 39 | URL_name = ((rows.find_all("td")[0].find('a').attrs['href'])) 40 | final = name + "," + label + "," + dom + "," + rng 41 | #+ ","+ URL_name.split(':')[-1] 42 | accum.append(final) 43 | output_file.write(final+"\n") 44 | output_file.close() 45 | return accum 46 | 47 | 48 | """ 49 | Name, Label, Domain, Range, URL_name 50 | """ 51 | 52 | if __name__ == "__main__": 53 | """ 54 | Section to parse the command line arguments. 
55 | """ 56 | parser = argparse.ArgumentParser() 57 | requiredNamed = parser.add_argument_group('Required Arguments') 58 | 59 | requiredNamed.add_argument('--url', dest='url', metavar='url', 60 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 61 | requiredNamed.add_argument( 62 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 63 | requiredNamed.add_argument( 64 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 65 | args = parser.parse_args() 66 | url = args.url 67 | output_file = args.out_put 68 | project_name = args.project_name 69 | get_properties(url = url, project_name= project_name, output_file = output_file) 70 | pass 71 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/rank_test/affiliation.csv: -------------------------------------------------------------------------------- 1 | administrator,administrator,Organisation,Person 2 | age,age,Agent,xsd:integer 3 | artPatron,patron (art),Agent,Artist 4 | ceo,chief executive officer,Organisation,Person 5 | chairperson,chairperson,Organisation,Person 6 | championships,championships,Agent,xsd:nonNegativeInteger 7 | chaplain,chaplain,Organisation,Person 8 | childOrganisation,child organisation,Organisation,Organisation 9 | denomination,denomination,Agent,owl:Thing 10 | discipline,discipline,Agent,owl:Thing 11 | endowment,endowment,Organisation,Currency 12 | formationDate,formation date,Organisation,xsd:date 13 | formationYear,formation year,Organisation,xsd:gYear 14 | foundationPlace,foundation place,Organisation,City 15 | generalCouncil,general council,Agent,TermOfOffice 16 | headquarter,headquarter,Organisation,PopulatedPlace 17 | hometown,home town,Agent,Settlement 18 | ideology,ideology,Agent,Ideology 19 | juniorSeason,junior season,Agent,owl:Thing 20 | leaderFunction,leaderFunction,Organisation,PersonFunction 21 | legalForm,legal form,Organisation,owl:Thing 22 | locationCity,location city,Organisation,City 23 | mainOrgan,main organ,Organisation,owl:Thing 24 | managerSeason,manager season,Agent,owl:Thing 25 | membership,membership,Organisation,rdf:langString 26 | mergedWith,merged with,Organisation,Organisation 27 | nationalSelection,national selection,Agent,owl:Thing 28 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 29 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 30 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 31 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 32 | organisationMember,organisation member,Organisation,OrganisationMember 33 | owns,owns,Agent,Thing 34 | parentOrganisation,parent organisation,Organisation,Organisation 35 | playerSeason,player season,Agent,owl:Thing 36 | product,product,Organisation,owl:Thing 37 | ranking,ranking,Organisation,xsd:positiveInteger 38 | regionServed,region served,Organisation,Place 39 | regionalCouncil,regional council,Agent,TermOfOffice 40 | revenue,revenue,Organisation,Currency 41 | roleInEvent,A Person's role in an event,Agent,Event 42 | season,season,Agent,owl:Thing 43 | secretaryGeneral,secretary,Organisation,Person 44 | service,service,Organisation,owl:Thing 45 | staff,staff,Organisation,xsd:nonNegativeInteger 46 | superintendent,superintendent,Organisation,Person 47 | trustee,trustee,Organisation,Person 48 | 
-------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/.pipeline_3/rank_test/sentence_and_template_generator.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/.pipeline_3/rank_test/sentence_and_template_generator.ods -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/.images/person_properties.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/.images/person_properties.png -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/usr/bin/python" 3 | } -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Eukaryotes/sentence_and_template_generator: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Eukaryotes/sentence_and_template_generator -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Eukaryotes/test.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Eukaryotes/test.csv -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Monument/get_properties.csv: -------------------------------------------------------------------------------- 1 | architect,architect,ArchitecturalStructure,Architect 2 | architectualBureau,architectual bureau,ArchitecturalStructure,Company 3 | architecturalStyle,architectural style,ArchitecturalStructure,owl:Thing 4 | buildingEndYear,building end year,ArchitecturalStructure,xsd:gYear 5 | buildingStartYear,building start year,ArchitecturalStructure,xsd:gYear 6 | construction,construction,ArchitecturalStructure,owl:Thing 7 | constructionMaterial,construction material,ArchitecturalStructure,owl:Thing 8 | currentlyUsedFor,currently used for,ArchitecturalStructure,xsd:string 9 | dateUnveiled,date unveiled,Monument,xsd:date 10 | demolitionDate,demolition date,ArchitecturalStructure,xsd:date 11 | demolitionYear,demolition year,ArchitecturalStructure,xsd:gYear 12 | features,features,ArchitecturalStructure,Work 13 | groupCommemorated,group commemorated,Monument,xsd:string 14 | initiallyUsedFor,initally used for,ArchitecturalStructure,xsd:string 15 | maintainedBy,maintained by,ArchitecturalStructure,owl:Thing 16 | rebuildingDate,rebuilding date,ArchitecturalStructure,xsd:date 17 | rebuildingYear,rebuilding year,ArchitecturalStructure,xsd:gYear 18 | reopeningDate,reopening date,ArchitecturalStructure,xsd:date 19 | reopeningYear,reopening year,ArchitecturalStructure,xsd:gYear 20 | 
tenant,tenant,ArchitecturalStructure,Organisation 21 | visitorStatisticsAsOf,visitor statistics as of,ArchitecturalStructure,xsd:gYear 22 | visitorsPerDay,visitors per day,ArchitecturalStructure,xsd:nonNegativeInteger 23 | visitorsPerYear,visitors per year,ArchitecturalStructure,xsd:nonNegativeInteger 24 | visitorsPercentageChange,visitor percentage change,ArchitecturalStructure,xsd:double 25 | visitorsTotal,visitors total,ArchitecturalStructure,xsd:nonNegativeInteger 26 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Monument/tenant.csv: -------------------------------------------------------------------------------- 1 | administrator,administrator,Organisation,Person 2 | age,age,Agent,xsd:integer 3 | artPatron,patron (art),Agent,Artist 4 | ceo,chief executive officer,Organisation,Person 5 | chairperson,chairperson,Organisation,Person 6 | championships,championships,Agent,xsd:nonNegativeInteger 7 | chaplain,chaplain,Organisation,Person 8 | childOrganisation,child organisation,Organisation,Organisation 9 | denomination,denomination,Agent,owl:Thing 10 | discipline,discipline,Agent,owl:Thing 11 | endowment,endowment,Organisation,Currency 12 | formationDate,formation date,Organisation,xsd:date 13 | formationYear,formation year,Organisation,xsd:gYear 14 | foundationPlace,foundation place,Organisation,City 15 | generalCouncil,general council,Agent,TermOfOffice 16 | headquarter,headquarter,Organisation,PopulatedPlace 17 | hometown,home town,Agent,Settlement 18 | ideology,ideology,Agent,Ideology 19 | juniorSeason,junior season,Agent,owl:Thing 20 | leaderFunction,leaderFunction,Organisation,PersonFunction 21 | legalForm,legal form,Organisation,owl:Thing 22 | locationCity,location city,Organisation,City 23 | mainOrgan,main organ,Organisation,owl:Thing 24 | managerSeason,manager season,Agent,owl:Thing 25 | membership,membership,Organisation,rdf:langString 26 | mergedWith,merged with,Organisation,Organisation 27 | nationalSelection,national selection,Agent,owl:Thing 28 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 29 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 30 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 31 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 32 | organisationMember,organisation member,Organisation,OrganisationMember 33 | owns,owns,Agent,Thing 34 | parentOrganisation,parent organisation,Organisation,Organisation 35 | playerSeason,player season,Agent,owl:Thing 36 | product,product,Organisation,owl:Thing 37 | ranking,ranking,Organisation,xsd:positiveInteger 38 | regionServed,region served,Organisation,Place 39 | regionalCouncil,regional council,Agent,TermOfOffice 40 | revenue,revenue,Organisation,Currency 41 | roleInEvent,A Person's role in an event,Agent,Event 42 | season,season,Agent,owl:Thing 43 | secretaryGeneral,secretary,Organisation,Person 44 | service,service,Organisation,owl:Thing 45 | staff,staff,Organisation,xsd:nonNegativeInteger 46 | superintendent,superintendent,Organisation,Person 47 | trustee,trustee,Organisation,Person 48 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Monument/test.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Monument/test.csv -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Organisation/get_properties.csv: -------------------------------------------------------------------------------- 1 | administrator,administrator,Organisation,Person 2 | age,age,Agent,xsd:integer 3 | artPatron,patron (art),Agent,Artist 4 | ceo,chief executive officer,Organisation,Person 5 | chairperson,chairperson,Organisation,Person 6 | championships,championships,Agent,xsd:nonNegativeInteger 7 | chaplain,chaplain,Organisation,Person 8 | childOrganisation,child organisation,Organisation,Organisation 9 | denomination,denomination,Agent,owl:Thing 10 | discipline,discipline,Agent,owl:Thing 11 | endowment,endowment,Organisation,Currency 12 | formationDate,formation date,Organisation,xsd:date 13 | formationYear,formation year,Organisation,xsd:gYear 14 | foundationPlace,foundation place,Organisation,City 15 | generalCouncil,general council,Agent,TermOfOffice 16 | headquarter,headquarter,Organisation,PopulatedPlace 17 | hometown,home town,Agent,Settlement 18 | ideology,ideology,Agent,Ideology 19 | juniorSeason,junior season,Agent,owl:Thing 20 | leaderFunction,leaderFunction,Organisation,PersonFunction 21 | legalForm,legal form,Organisation,owl:Thing 22 | locationCity,location city,Organisation,City 23 | mainOrgan,main organ,Organisation,owl:Thing 24 | managerSeason,manager season,Agent,owl:Thing 25 | membership,membership,Organisation,rdf:langString 26 | mergedWith,merged with,Organisation,Organisation 27 | nationalSelection,national selection,Agent,owl:Thing 28 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 29 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 30 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 31 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 32 | organisationMember,organisation member,Organisation,OrganisationMember 33 | owns,owns,Agent,Thing 34 | parentOrganisation,parent organisation,Organisation,Organisation 35 | playerSeason,player season,Agent,owl:Thing 36 | product,product,Organisation,owl:Thing 37 | ranking,ranking,Organisation,xsd:positiveInteger 38 | regionServed,region served,Organisation,Place 39 | regionalCouncil,regional council,Agent,TermOfOffice 40 | revenue,revenue,Organisation,Currency 41 | roleInEvent,A Person's role in an event,Agent,Event 42 | season,season,Agent,owl:Thing 43 | secretaryGeneral,secretary,Organisation,Person 44 | service,service,Organisation,owl:Thing 45 | staff,staff,Organisation,xsd:nonNegativeInteger 46 | superintendent,superintendent,Organisation,Person 47 | trustee,trustee,Organisation,Person 48 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/alma mater.csv: -------------------------------------------------------------------------------- 1 | actingHeadteacher,acting headteacher,EducationalInstitution,Person 2 | administrator,administrator,Organisation,Person 3 | alumni,alumni,EducationalInstitution,Person 4 | assistantPrincipal,assistant principal,EducationalInstitution,owl:Thing 5 | brinCode,BRIN code,EducationalInstitution,xsd:string 6 | campusType,campus type,EducationalInstitution,rdf:langString 7 | ceo,chief executive 
officer,Organisation,Person 8 | chairperson,chairperson,Organisation,Person 9 | chaplain,chaplain,Organisation,Person 10 | childOrganisation,child organisation,Organisation,Organisation 11 | closed,closed,EducationalInstitution,xsd:date 12 | custodian,custodian,EducationalInstitution,Person 13 | dean,dean,EducationalInstitution,Person 14 | educationSystem,education system,EducationalInstitution,owl:Thing 15 | endowment,endowment,Organisation,Currency 16 | facultySize,faculty size,EducationalInstitution,xsd:nonNegativeInteger 17 | formationDate,formation date,Organisation,xsd:date 18 | formationYear,formation year,Organisation,xsd:gYear 19 | foundationPlace,foundation place,Organisation,City 20 | head,head,EducationalInstitution,Person 21 | headquarter,headquarter,Organisation,PopulatedPlace 22 | leaderFunction,leaderFunction,Organisation,PersonFunction 23 | legalForm,legal form,Organisation,owl:Thing 24 | locationCity,location city,Organisation,City 25 | mainOrgan,main organ,Organisation,owl:Thing 26 | membership,membership,Organisation,rdf:langString 27 | mergedWith,merged with,Organisation,Organisation 28 | nationalRanking,national ranking,EducationalInstitution,xsd:positiveInteger 29 | numberOfAcademicStaff,number of academic staff,EducationalInstitution,xsd:nonNegativeInteger 30 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 31 | numberOfGraduateStudents,number of graduate students,EducationalInstitution,xsd:nonNegativeInteger 32 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 33 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 34 | numberOfStudents,number of students,EducationalInstitution,xsd:nonNegativeInteger 35 | numberOfUndergraduateStudents,number of undergraduate students,EducationalInstitution,xsd:nonNegativeInteger 36 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 37 | offeredClasses,offered classes,EducationalInstitution,xsd:string 38 | officialSchoolColour,official school colour,EducationalInstitution,xsd:string 39 | organisationMember,organisation member,Organisation,OrganisationMember 40 | parentOrganisation,parent organisation,Organisation,Organisation 41 | principal,principal,EducationalInstitution,Person 42 | product,product,Organisation,owl:Thing 43 | ranking,ranking,Organisation,xsd:positiveInteger 44 | rector,rector,EducationalInstitution,Person 45 | regionServed,region served,Organisation,Place 46 | revenue,revenue,Organisation,Currency 47 | secretaryGeneral,secretary,Organisation,Person 48 | service,service,Organisation,owl:Thing 49 | staff,staff,Organisation,xsd:nonNegativeInteger 50 | superintendent,superintendent,Organisation,Person 51 | trustee,trustee,Organisation,Person 52 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/career station.csv: -------------------------------------------------------------------------------- 1 | end,end,TimePeriod,xsd:date 2 | numberOfGoals,number of goals scored,CareerStation,xsd:integer 3 | numberOfMatches,number of matches or caps,CareerStation,xsd:integer 4 | start,start,TimePeriod,xsd:date 5 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/college.csv: -------------------------------------------------------------------------------- 1 | actingHeadteacher,acting headteacher,EducationalInstitution,Person 2 | 
administrator,administrator,Organisation,Person 3 | alumni,alumni,EducationalInstitution,Person 4 | assistantPrincipal,assistant principal,EducationalInstitution,owl:Thing 5 | brinCode,BRIN code,EducationalInstitution,xsd:string 6 | campusType,campus type,EducationalInstitution,rdf:langString 7 | ceo,chief executive officer,Organisation,Person 8 | chairperson,chairperson,Organisation,Person 9 | chaplain,chaplain,Organisation,Person 10 | childOrganisation,child organisation,Organisation,Organisation 11 | closed,closed,EducationalInstitution,xsd:date 12 | custodian,custodian,EducationalInstitution,Person 13 | dean,dean,EducationalInstitution,Person 14 | educationSystem,education system,EducationalInstitution,owl:Thing 15 | endowment,endowment,Organisation,Currency 16 | facultySize,faculty size,EducationalInstitution,xsd:nonNegativeInteger 17 | formationDate,formation date,Organisation,xsd:date 18 | formationYear,formation year,Organisation,xsd:gYear 19 | foundationPlace,foundation place,Organisation,City 20 | head,head,EducationalInstitution,Person 21 | headquarter,headquarter,Organisation,PopulatedPlace 22 | leaderFunction,leaderFunction,Organisation,PersonFunction 23 | legalForm,legal form,Organisation,owl:Thing 24 | locationCity,location city,Organisation,City 25 | mainOrgan,main organ,Organisation,owl:Thing 26 | membership,membership,Organisation,rdf:langString 27 | mergedWith,merged with,Organisation,Organisation 28 | nationalRanking,national ranking,EducationalInstitution,xsd:positiveInteger 29 | numberOfAcademicStaff,number of academic staff,EducationalInstitution,xsd:nonNegativeInteger 30 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 31 | numberOfGraduateStudents,number of graduate students,EducationalInstitution,xsd:nonNegativeInteger 32 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 33 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 34 | numberOfStudents,number of students,EducationalInstitution,xsd:nonNegativeInteger 35 | numberOfUndergraduateStudents,number of undergraduate students,EducationalInstitution,xsd:nonNegativeInteger 36 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 37 | offeredClasses,offered classes,EducationalInstitution,xsd:string 38 | officialSchoolColour,official school colour,EducationalInstitution,xsd:string 39 | organisationMember,organisation member,Organisation,OrganisationMember 40 | parentOrganisation,parent organisation,Organisation,Organisation 41 | principal,principal,EducationalInstitution,Person 42 | product,product,Organisation,owl:Thing 43 | ranking,ranking,Organisation,xsd:positiveInteger 44 | rector,rector,EducationalInstitution,Person 45 | regionServed,region served,Organisation,Place 46 | revenue,revenue,Organisation,Currency 47 | secretaryGeneral,secretary,Organisation,Person 48 | service,service,Organisation,owl:Thing 49 | staff,staff,Organisation,xsd:nonNegativeInteger 50 | superintendent,superintendent,Organisation,Person 51 | trustee,trustee,Organisation,Person 52 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/employer.csv: -------------------------------------------------------------------------------- 1 | administrator,administrator,Organisation,Person 2 | age,age,Agent,xsd:integer 3 | artPatron,patron (art),Agent,Artist 4 | ceo,chief executive officer,Organisation,Person 5 | chairperson,chairperson,Organisation,Person 6 | 
championships,championships,Agent,xsd:nonNegativeInteger 7 | chaplain,chaplain,Organisation,Person 8 | childOrganisation,child organisation,Organisation,Organisation 9 | denomination,denomination,Agent,owl:Thing 10 | discipline,discipline,Agent,owl:Thing 11 | endowment,endowment,Organisation,Currency 12 | formationDate,formation date,Organisation,xsd:date 13 | formationYear,formation year,Organisation,xsd:gYear 14 | foundationPlace,foundation place,Organisation,City 15 | generalCouncil,general council,Agent,TermOfOffice 16 | headquarter,headquarter,Organisation,PopulatedPlace 17 | hometown,home town,Agent,Settlement 18 | ideology,ideology,Agent,Ideology 19 | juniorSeason,junior season,Agent,owl:Thing 20 | leaderFunction,leaderFunction,Organisation,PersonFunction 21 | legalForm,legal form,Organisation,owl:Thing 22 | locationCity,location city,Organisation,City 23 | mainOrgan,main organ,Organisation,owl:Thing 24 | managerSeason,manager season,Agent,owl:Thing 25 | membership,membership,Organisation,rdf:langString 26 | mergedWith,merged with,Organisation,Organisation 27 | nationalSelection,national selection,Agent,owl:Thing 28 | numberOfEmployees,number of employees,Organisation,xsd:nonNegativeInteger 29 | numberOfLocations,number of locations,Organisation,xsd:nonNegativeInteger 30 | numberOfStaff,number of staff,Organisation,xsd:nonNegativeInteger 31 | numberOfVolunteers,number of volunteers,Organisation,xsd:nonNegativeInteger 32 | organisationMember,organisation member,Organisation,OrganisationMember 33 | owns,owns,Agent,Thing 34 | parentOrganisation,parent organisation,Organisation,Organisation 35 | playerSeason,player season,Agent,owl:Thing 36 | product,product,Organisation,owl:Thing 37 | ranking,ranking,Organisation,xsd:positiveInteger 38 | regionServed,region served,Organisation,Place 39 | regionalCouncil,regional council,Agent,TermOfOffice 40 | revenue,revenue,Organisation,Currency 41 | roleInEvent,A Person's role in an event,Agent,Event 42 | season,season,Agent,owl:Thing 43 | secretaryGeneral,secretary,Organisation,Person 44 | service,service,Organisation,owl:Thing 45 | staff,staff,Organisation,xsd:nonNegativeInteger 46 | superintendent,superintendent,Organisation,Person 47 | trustee,trustee,Organisation,Person 48 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/ideology.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/Person/ideology.csv -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/eliminator.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | 4 | def eliminator(): 5 | """ 6 | The function remove the templates which are considered as less popular 7 | based on the proposed ranking mechanism, the input files should be pre processed 8 | and a TRUE or FALSE should be added as the last column. 9 | 10 | This function just removes the entries with FALSE as the last entry in a row 11 | and create a file name new_train.csv to be used for futher purposes. 
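    Illustrative usage (the input file name is a placeholder, not a file shipped with
    this repository):

        python eliminator.py --location <templates file with a TRUE/FALSE last column>

    Rows whose last column is TRUE are kept: the last two columns are dropped and the
    remaining fields are re-joined with ';' before being written to new_train.csv;
    rows ending in FALSE are discarded.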
12 | """ 13 | lines = open(location,'r').readlines() 14 | print(len(lines)) 15 | accum = [] 16 | nspm_ready = open("new_train.csv",'w') 17 | for line in tqdm(lines): 18 | values = line.split(",") 19 | if(len(values)<8): 20 | print("Input file is of wrong format, please add the corect bolean values as the last column entry to use this function") 21 | print(values[-1]) 22 | if(values[-1]=="TRUE\n"): 23 | accum.append(";".join(values[:-2])+"\n") 24 | nspm_ready.write(accum[-1]) 25 | nspm_ready.close() 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | requiredNamed = parser.add_argument_group('Required Arguments') 30 | 31 | requiredNamed.add_argument('--location', dest='location', metavar='location', 32 | help='location of the file to be pruned.', required=True) 33 | args = parser.parse_args() 34 | location = args.location 35 | eliminator(location) 36 | pass 37 | 38 | 39 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/fetch_ranks.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='../utility/wikidata.rank'): 5 | """ 6 | The function loads rank from a supplied position. 7 | """ 8 | sub = open(filename,'r').readlines() 9 | diction={} 10 | 11 | print("Loading Rankings") 12 | for val in tqdm(sub): 13 | diction[val.split('\t')[0]] = float(val.split('\t')[1]) 14 | return diction 15 | 16 | if __name__ == "__main__": 17 | fetch_ranks() 18 | pass 19 | 20 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/fetch_ranks_sub.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='part-r-00000'): 5 | """ 6 | The function loads ranks from a supplied location, 7 | The ranks fileshould belong to the subjective 3d format 8 | of saving ranks. 9 | """ 10 | sub = open(filename,'r').readlines() 11 | diction={} 12 | 13 | print("Loading Rankings") 14 | for val in tqdm(sub): 15 | diction[val.split('\t')[0].strip()[1:-1].strip()] = float(val.split('\t')[-2].split('"')[1]) 16 | return diction 17 | 18 | if __name__ == "__main__": 19 | fetch_ranks() 20 | pass 21 | 22 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/generate_templates.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from get_properties import get_properties 3 | from generate_url import generate_url 4 | from sentence_and_template_generator import sentence_and_template_generator 5 | import os 6 | from fetch_ranks_sub import fetch_ranks 7 | import logging 8 | 9 | def generate_templates(label,project_name,depth=1,output_file="sentence_and_template_generator"): 10 | """ 11 | The function acts as a wrapper for the whole package of supplied source code. 
12 | """ 13 | val = generate_url(label) 14 | url = val[0] 15 | about = (val[1]) 16 | count =0 17 | vessel= [] 18 | depth=int(depth) 19 | diction = fetch_ranks("../utility/part-r-00000") 20 | if(not os.path.isdir(project_name)): 21 | os.makedirs(project_name) 22 | output_file = open(project_name+"/" + output_file, 'w') 23 | test_set = open(project_name+"/" + "test.csv", 'w') 24 | prop_dic = {} 25 | for iterator in range(depth): 26 | prop_dic[iterator] = [] 27 | # Create a logger object 28 | logger = logging.getLogger() 29 | 30 | # Configure logger 31 | logging.basicConfig(filename=project_name+"/logfile.log", format='%(filename)s: %(message)s', filemode='w') 32 | 33 | # Setting threshold level 34 | logger.setLevel(logging.DEBUG) 35 | 36 | # Use the logging methods 37 | #logger.debug("This is a debug message") 38 | logger.info("This is a log file.") 39 | #logger.warning("This is a warning message") 40 | #logger.error("This is an error message") 41 | #logger.critical("This is a critical message") 42 | 43 | list_of_property_information = get_properties(url=url,project_name=project_name,output_file = "get_properties.csv") 44 | for property_line in list_of_property_information: 45 | count+=1 46 | prop = property_line.split(',') 47 | print("**************\n"+str(prop)) 48 | sentence_and_template_generator(original_count=depth,prop_dic=prop_dic,test_set=test_set,log=logger,diction=diction,output_file=output_file,mother_ontology=about.strip().replace("http://dbpedia.org/ontology/","dbo:"),vessel=vessel,project_name=project_name ,prop=prop, suffix = " of ?",count = depth) 49 | output_file.close() 50 | 51 | if __name__ == "__main__": 52 | """ 53 | Section to parse the command line arguments. 54 | """ 55 | parser = argparse.ArgumentParser() 56 | requiredNamed = parser.add_argument_group('Required Arguments') 57 | 58 | requiredNamed.add_argument('--label', dest='label', metavar='label', 59 | help='label: person, place etc.', required=True) 60 | requiredNamed.add_argument( 61 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 62 | requiredNamed.add_argument( 63 | '--depth', dest='depth', metavar='depth', help='Mention the depth you want to go in the knowledge graph (The number of questions will increase exponentially!), e.g. 2', required=False) 64 | args = parser.parse_args() 65 | label = args.label 66 | project_name = args.project_name 67 | depth = args.depth 68 | generate_templates(label=label,project_name=project_name,depth=depth) 69 | pass -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv"): 12 | """ 13 | - This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | - This code on execution creates a csv which contains all the properties, ontology, 16 | class related information and data types as field values in each row. 
17 | - This function also returns a 2D list of the information mentioned above to the calling 18 | function 19 | """ 20 | page = urllib.request.urlopen(url) 21 | soup = BeautifulSoup(page, "html.parser") 22 | if(not os.path.isdir(project_name)): 23 | os.makedirs(project_name) 24 | output_file = open(project_name+"/" + output_file, 'w') 25 | fl = 0 26 | accum = [] 27 | for rows in tqdm(soup.find_all("tr")): 28 | x = rows.find_all("td") 29 | if len(x) <= 2: 30 | fl = 1 31 | continue 32 | if fl == 1: 33 | fl = 2 34 | continue 35 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 36 | label = rows.find_all("td")[1].get_text() 37 | dom = rows.find_all("td")[2].get_text() 38 | rng = rows.find_all("td")[3].get_text() 39 | URL_name = ((rows.find_all("td")[0].find('a').attrs['href'])) 40 | final = name + "," + label + "," + dom + "," + rng 41 | #+ ","+ URL_name.split(':')[-1] 42 | accum.append(final) 43 | output_file.write(final+"\n") 44 | output_file.close() 45 | return accum 46 | 47 | 48 | """ 49 | Name, Label, Domain, Range, URL_name 50 | """ 51 | 52 | if __name__ == "__main__": 53 | """ 54 | Section to parse the command line arguments. 55 | """ 56 | parser = argparse.ArgumentParser() 57 | requiredNamed = parser.add_argument_group('Required Arguments') 58 | 59 | requiredNamed.add_argument('--url', dest='url', metavar='url', 60 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 61 | requiredNamed.add_argument( 62 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 63 | requiredNamed.add_argument( 64 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 65 | args = parser.parse_args() 66 | url = args.url 67 | output_file = args.out_put 68 | project_name = args.project_name 69 | get_properties(url = url, project_name= project_name, output_file = output_file) 70 | pass 71 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/pipeline_3_with_controlled_test_set/question_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def question_generator(query): 5 | pass 6 | 7 | 8 | 9 | if __name__ == "__main__": 10 | """ 11 | Section to parse the command line arguments. 
12 | """ 13 | parser = argparse.ArgumentParser() 14 | requiredNamed = parser.add_argument_group('Required Arguments') 15 | 16 | requiredNamed.add_argument('--url', dest='url', metavar='url', 17 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 18 | requiredNamed.add_argument( 19 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 20 | requiredNamed.add_argument( 21 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 22 | args = parser.parse_args() 23 | url = args.url 24 | output_file = args.out_put 25 | project_name = args.project_name 26 | pass -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/.~lock.question_form.csv#: -------------------------------------------------------------------------------- 1 | ,petrichor,DragonFeaster,24.06.2019 07:05,file:///home/petrichor/.config/libreoffice/4; -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/Test_Fixer/readme.md: -------------------------------------------------------------------------------- 1 | # Dataset thresholder 2 | 3 | This code creates a test set making sure the following constraints are 4 | followed: 5 | - The vocabulary in the test set has been learned in a separate context in the 6 | train set. 7 | - Frequency Thresholding: The vocabulary in the test set is present in the train 8 | set for a given number of times. 9 | 10 | To run the code, download the files from [here](https://nspm-models.s3.eu-west-2.amazonaws.com/misc/anand-pipeline_3-Test_Fixer-files.zip), then run the following command: 11 | 12 | ```python 13 | python text_fixer.py 14 | ``` 15 | 16 | Minimum requirements: 17 | 18 | - `train.sparql` file containing the SPARQL queries of the training set. 19 | - `old_test.sparql` The test set containing all the test SPARQL queries. 20 | - `vocab.sparql` Vocabulary of the training set. 21 | 22 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/compare/compare.py: -------------------------------------------------------------------------------- 1 | """ 2 | The comparer will compare 2 files and determine the following: 3 | - In a line by line inspection, how many lines were not exactly the same. 4 | - A dictionary containing the number of errors in the matched lines like: 5 | { 6 | 0: 34044, 7 | 1: 36629, 8 | 2: 16682, 9 | 3: 4291, 10 | 7: 82, 11 | 8: 173, 12 | 11: 18, 13 | 12: 22, 14 | 'Wrong number of tokens': 22051 15 | } 16 | """ 17 | 18 | from tqdm import tqdm 19 | 20 | ref = open("test.sparql",'r').readlines() 21 | test = open("output_test",'r').readlines() 22 | diction_error = {} 23 | counter = 0 24 | for val in tqdm(range(len(ref))): 25 | error_count = 0 26 | ref_s = ref[val].split(" ") 27 | test_s = test[val].split(" ") 28 | """ print("```") 29 | print("reference:"+ref[val]) 30 | print("test:"+test[val]) 31 | print("```") """ 32 | if (ref_s != test_s): 33 | counter+=1 34 | try: 35 | for count in range(len(ref_s)): 36 | if(ref_s[count] == test_s[count]): 37 | continue 38 | else: 39 | error_count += 1 40 | #print("ref:"+ref_s[count]+"
") 41 | #print("test:"+test_s[count]+"
") 42 | except: 43 | error_count = "Wrong number of tokens" 44 | #print("Wrong number of tokens") 45 | if(error_count not in diction_error.keys()): 46 | diction_error[(error_count)] = 1 47 | diction_error[(error_count)] += 1 48 | #print("\n----\n") 49 | print(counter) 50 | print(diction_error) -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/compare/readme.md: -------------------------------------------------------------------------------- 1 | # Compare SPARQL files 2 | 3 | The comparer will compare 2 files and determine the following: 4 | - In a line by line inspection, how many lines were not exactly the same. 5 | - A dictionary containing the number of errors in the matched lines like: 6 | { 7 | 0: 34044, 8 | 1: 36629, 9 | 2: 16682, 10 | 3: 4291, 11 | 7: 82, 12 | 8: 173, 13 | 11: 18, 14 | 12: 22, 15 | 'Wrong number of tokens': 22051 16 | } 17 | - To run the code please use the following: 18 | ``` 19 | python compare.py 20 | ``` 21 | Minimum requirements: 22 | - `` and `` the 2 files to be compared. -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/labels.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiberAI/NSpM/28d61796baafc78a858ec305626d95fad02caefc/gsoc/anand/pipeline_3/utility/labels.json -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/new_extractor_fromGraphDBpediaEmbeddings/breaker.sh: -------------------------------------------------------------------------------- 1 | echo "Creating data_fragments folder" 2 | mkdir data_fragments 3 | cd data_fragments 4 | echo "Usage: ./breaker.sh " 5 | echo "Example: ./breaker.sh 1000MB pageRank.txt" 6 | echo "It will take some time, please remain calm." 7 | split -b $1 ../$2 -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/new_extractor_fromGraphDBpediaEmbeddings/embedding_extractor.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | import os 4 | index = open("index.csv").readlines() 5 | diction = {} 6 | missed_counter = 0 7 | print(""" 8 | Loading the index information from the index.csv file. 9 | """) 10 | 11 | for line in tqdm(range(len(index))): 12 | index[line] = index[line].split('\t') 13 | key = index[line][0].strip() 14 | diction[key] = {} 15 | diction[key]['file'] = index[line][1] 16 | diction[key]['line'] = index[line][2].strip() 17 | """ 18 | This part of the code creates the a json file containing the information which can be loaded easily. 19 | """ 20 | """ 21 | with open("data_file.json", "w") as write_file: 22 | json.dump(diction, write_file) 23 | """ 24 | 25 | file_diction = {} 26 | 27 | a = [f for f in os.listdir("data_fragments")] 28 | 29 | for val in (a): 30 | file_diction[val.strip()] = [] 31 | 32 | vocab = open("vocab.sparql",'r').readlines() 33 | filename = [] 34 | dict_keys = diction.keys() 35 | 36 | print(""" 37 | Checking and accumulating information for words obtained from the vocabulary from the index thus loaded. 38 | """) 39 | 40 | for word in tqdm(vocab): 41 | word = word.strip() 42 | if(word in dict_keys ): 43 | file_diction[diction[word]["file"]].append(diction[word]) 44 | else: 45 | missed_counter+=1 46 | 47 | 48 | print(""" 49 | Loading information from the broken files to extract the required embeddings. 
50 | """) 51 | accum = [] 52 | for files in tqdm(a): 53 | file_reader = open("data_fragments/"+files).readlines() 54 | for words_in_file in file_diction[files.strip()]: 55 | accum.append(file_reader[int(words_in_file["line"].strip())].strip()) 56 | 57 | print(""" 58 | Writing the extracted embeddings in a file for future use. 59 | """) 60 | final = ("\n".join(accum)).replace('\t',' ') 61 | final = final.replace("http://dbpedia.org/resource/","dbr_") 62 | final = final.replace("http://dbpedia.org/ontology/","dbo_") 63 | open("new_vocbulary.csv",'w').write(final) 64 | 65 | print("Missed words: "+str(missed_counter)) 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/new_extractor_fromGraphDBpediaEmbeddings/indexer.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code extracts all the embeddings from the pageRank.txt and records the following information in the following manner. 3 | 4 | \t\t 5 | 6 | This index will contain information regarding the position of all the words so that the corresponding embeddings can be extracted easily. Withput having to query through the whole embedding file. 7 | """ 8 | 9 | import sys 10 | import os 11 | from tqdm import tqdm 12 | 13 | a = [f for f in os.listdir("data_fragments/")] 14 | for files in tqdm(a): 15 | #print(files) 16 | lines = open("data_fragments/"+files).readlines() 17 | writer = open("index.csv",'a') 18 | for line in range(len(lines)): 19 | lines[line] = lines[line].split("\t") 20 | word = lines[line][0] 21 | if "http://dbpedia.org/resource/" in (word): 22 | word = word.replace("http://dbpedia.org/resource/","dbr_") 23 | if "http://dbpedia.org/ontology/" in (word): 24 | word = word.replace("http://dbpedia.org/ontology/","dbo_") 25 | writer.write('\t'.join([word,files,str(line)])+'\n') 26 | writer.close() -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/new_extractor_fromGraphDBpediaEmbeddings/readme.md: -------------------------------------------------------------------------------- 1 | # SPARQL embedding extractor 2 | ## With Significant decrease in time taken compared to older model: within ~1 minute. 3 | 4 | - The code needs the big embedding file to be downloaded from the following link: [https://zenodo.org/record/1320038#.XT8CeHUzbEG](https://zenodo.org/record/1320038#.XT8CeHUzbEG) 5 | - Run the following utility to make the process faster by breaking the files into smaller files. 6 | - If the files fail to load, then decrease the size of file in the script command. (breaker.sh) 7 | 8 | The bash script: 9 | ```bash 10 | echo "Creating data_fragments folder" 11 | mkdir data_fragments 12 | cd data_fragments 13 | echo "Usage: ./breaker.sh " 14 | echo "Example: ./breaker.sh 1000MB pageRank.txt" 15 | echo "It will take some time, please remain calm." 16 | split -b $1 ../$2 17 | ``` 18 | 19 | ## How to use? | Different components of this utility, 20 | - For the utility to run the pageRank.txt files need to br present in this directory. 21 | - First the `breaker.sh` script should be run as per the instructions stated above. 22 | - After running the breaker.sh script run the following command on the terminal: 23 | ```bash 24 | python indexer.py 25 | ``` 26 | - This will create an index for all the words present in the `pageRank.txt` file. 
27 | - After this we will require a vocab.sparql file which is the vocabulary (list of words for which you want to extract the embeddings.) 28 | - Copy the vocab.sparql in this directory. 29 | - Run the `embedding_extractor.py` code using the following command: 30 | ```bash 31 | python embedding_extractor.py 32 | ``` 33 | - The new embeddings will be created with the name `new_vocabulary.csv`: which is an embedding file to be used in NMT. The file has the following format: 34 | ``` 35 | 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/old_extractor_from_GraphDBpediaEmbeddings/en_extract_embed.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | reader = open("pageRank.txt",'r') 3 | vocab = open("vocab.sparql",'r').readlines() 4 | count = 0 5 | temp = "" 6 | dict_vocab = [] 7 | vocab_count = 0 8 | 9 | while(True): 10 | line = reader.readline() 11 | if(line == temp): 12 | break 13 | count+=1 14 | temp = line 15 | 16 | #print(line) 17 | line = line.split("\t") 18 | """ if "http://dbpedia.org/resource/" in (line[0]): 19 | line[0] = line[0].replace("http://dbpedia.org/resource/","dbr_") 20 | if "http://dbpedia.org/ontology/" in (line[0]): 21 | line[0] = line[0].replace("http://dbpedia.org/ontology/","dbo_") """ 22 | for words in vocab: 23 | if(words.strip() == line[0]): 24 | vocab_count+=1 25 | dict_vocab.append(words.strip() +" " +" ".join(line[1:])) 26 | #print(words.strip()) 27 | break 28 | print(count) 29 | print(vocab_count) 30 | reader.close() 31 | 32 | writer = open("new_vocab.sparql",'w').write("\n".join(dict_vocab)) 33 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/old_extractor_from_GraphDBpediaEmbeddings/readme.md: -------------------------------------------------------------------------------- 1 | # Old and slow embedding extractor 2 | 3 | - Please download the pageRank.txt file from the following link: https://zenodo.org/record/1320038#.XT8CeHUzbEG. 4 | - Run the extractor code in the following way: 5 | 6 | ``` 7 | For SPARQL: python sparql_extract_embed.py 8 | For English: python english_extract_embed.py 9 | ``` 10 | 11 | ## Note: 12 | 13 | It works for file with the following format: 14 | ``` 15 | word 16 | word 17 | word 18 | word 19 | word 20 | word 21 | word 22 | word 23 | ``` 24 | 25 | - The SPARQL prefixed python code replaces DBpedia urls with their shortforms like dbo, dbr. 26 | - The en prefixed python code runs on the word without any modifications. 
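Both extractors normalise DBpedia URIs to the short forms used in the vocabulary files. A minimal sketch of that step is shown below; the function name is illustrative and not part of the repository, but the two replacements mirror the ones hard-coded in `sparql_extract_embed.py` and `indexer.py`.

```python
def shorten_uri(token: str) -> str:
    # Replace the full DBpedia namespaces with the dbr_/dbo_ short forms
    # expected by the SPARQL vocabulary files.
    token = token.replace("http://dbpedia.org/resource/", "dbr_")
    token = token.replace("http://dbpedia.org/ontology/", "dbo_")
    return token

# Example: shorten_uri("http://dbpedia.org/ontology/birthPlace") -> "dbo_birthPlace"
```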
27 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/old_extractor_from_GraphDBpediaEmbeddings/sparql_extract_embed.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | reader = open("pageRank.txt",'r') 3 | vocab = open("vocab.sparql",'r').readlines() 4 | count = 0 5 | temp = "" 6 | dict_vocab = [] 7 | vocab_count = 0 8 | expected_count= 5774165 9 | for val in tqdm(range(expected_count)): 10 | line = reader.readline() 11 | if(line == temp): 12 | break 13 | count+=1 14 | temp = line 15 | 16 | #print(line) 17 | line = line.split("\t") 18 | if "http://dbpedia.org/resource/" in (line[0]): 19 | line[0] = line[0].replace("http://dbpedia.org/resource/","dbr_") 20 | if "http://dbpedia.org/ontology/" in (line[0]): 21 | line[0] = line[0].replace("http://dbpedia.org/ontology/","dbo_") 22 | for words in vocab: 23 | if(words.strip() == line[0]): 24 | vocab_count+=1 25 | dict_vocab.append(words.strip() +" " +" ".join(line[1:])) 26 | #print(words.strip()) 27 | break 28 | print(count) 29 | print(vocab_count) 30 | reader.close() 31 | 32 | writer = open("new_vocab.sparql",'w').write("\n".join(dict_vocab)) 33 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/interpreter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Interpreter module 5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html 8 | https://arxiv.org/abs/1708.07624 9 | 10 | Version 0.1.0-akaha 11 | 12 | """ 13 | import sys 14 | import re 15 | 16 | from generator_utils import decode,fix_URI 17 | 18 | def interpreter(val): 19 | reload(sys) 20 | sys.setdefaultencoding("utf-8") 21 | encoded_sparql = val 22 | decoded_sparql = decode(encoded_sparql) 23 | decoded_sparql = fix_URI(decoded_sparql) 24 | return( decoded_sparql) 25 | 26 | if __name__ == '__main__': 27 | reload(sys) 28 | sys.setdefaultencoding("utf-8") 29 | encoded_sparql = sys.argv[1] 30 | decoded_sparql = decode(encoded_sparql) 31 | decoded_sparql = fix_URI(decoded_sparql) 32 | print( decoded_sparql) 33 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/qald_json_gerbil_input.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import f1_score 2 | from interpreter import interpreter 3 | import os 4 | import numpy as np 5 | from nltk.corpus import stopwords 6 | import urllib 7 | from bs4 import BeautifulSoup 8 | from tqdm import tqdm 9 | from collections import OrderedDict 10 | import json 11 | 12 | base = {"dataset":{"id": "stuff"}} 13 | base["questions"] = [] 14 | question_lines = open('qald_json/test.en', 'r').readlines() 15 | lines = open('qald_json/test.sparql', 'r').readlines() 16 | lines = list(map(interpreter, tuple(lines))) 17 | 18 | for valu in range(len(lines)): 19 | lines[valu] = lines[valu].replace("limit\n","limit 1\n") 20 | 21 | #print("".join(lines)) 22 | #print(len(lines)) 23 | 24 | import urllib2 25 | contents = urllib2.urlopen 26 | accum = [] 27 | count = 0 28 | stop = set(stopwords.words('english')) 29 | for valu in tqdm(range(len(lines))): 30 | count+=1 31 | query = urllib.quote(lines[valu]) 32 | url2 = 
"https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query+"&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+" 33 | #url2 = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query+"&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+" 34 | #page = urllib2.urlopen(url2) 35 | #soup = BeautifulSoup(page, "html.parser") 36 | try: 37 | page = urllib2.urlopen(url2) 38 | except: 39 | print(url2) 40 | continue 41 | soup = BeautifulSoup(page, "html.parser") 42 | js_dic = json.loads(str(soup)) 43 | bindings = js_dic["results"]["bindings"] 44 | #print (bindings) 45 | answer = [] 46 | #print("************") 47 | """ for rows in (soup.find_all("tr")): 48 | for td in rows.find_all("td"): 49 | answer.append(td.getText()) 50 | """ 51 | que = {} 52 | que["id"] = str(valu) 53 | que["answertype"] = "resource" # Check 54 | que["aggregation"] = False 55 | que["onlydbo"] = True 56 | que["hybrid"] = False 57 | que["question"] = [{"language":"en", "string":question_lines[valu][:-1], "keywords" : " ".join([i for i in question_lines[valu].lower().split() if i not in stop] )}] 58 | que["query"] = {"sparql":lines[valu][:-1]} 59 | que["answers"] = [] 60 | anc_accum = [] 61 | #answer_unit = js_dic 62 | answer_unit = {"head": {"vars": ["uri"]}} 63 | for ans in bindings: 64 | ans = ans["x"] 65 | if("dbpedia" in ans): 66 | temp = {"uri": {"type": ans["type"], "value": ans["value"]}} 67 | else: 68 | temp = {"uri": {"type": ans["type"], "value": ans["value"]}} 69 | anc_accum.append(temp) 70 | answer_unit["results"] = {} 71 | answer_unit["results"]["bindings"] = anc_accum 72 | que["answers"].append(answer_unit) 73 | accum.append(que) 74 | 75 | """ if(count>10): 76 | break """ 77 | 78 | base["questions"] = accum 79 | 80 | import json 81 | with open('qald_json/data.json', 'w') as outfile: 82 | json.dump(OrderedDict(base), outfile, ensure_ascii=False, indent=2) 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/readme.md: -------------------------------------------------------------------------------- 1 | # QALD JSON format generator 2 | 3 | - Runs on Python 2.7 4 | 5 | - It requires the interpreter function to run this. Thus to use this please copy this folder to the main folder and copy the python file to the man folder, for your ease please run the following code on the terminal : 6 | ``` 7 | ./shifter.sh 8 | ``` 9 | - The QALD format generator is a program that generates the input for the gerbil portal. 10 | - The function requires the english and it's translated version to be present in the this folder with the following file names: `test.en` and `test.sparql`. 11 | - The question and corresponding SPARQL form should have the same line number. 12 | - The code can be run by running the `qald_json_gerbil_input.py` present outside this folder. An example of the output json file is as follows: 13 | 14 | ```json 15 | { 16 | "questions": [{ 17 | "id": "1", 18 | "question": [{ 19 | "language": "en", 20 | "string": "Which German cities have more than 250000 inhabitants?" 21 | }], 22 | "query": { 23 | "sparql": "SELECT DISTINCT ?uri WHERE { { ?uri . } UNION { ?uri . } ?uri . ?uri ?population . 
FILTER ( ?population > 250000 ) } " 24 | }, 25 | "answers": [{ 26 | "head": { 27 | "vars": [ 28 | "uri" 29 | ] 30 | }, 31 | "results": { 32 | "bindings": [{ 33 | "uri": { 34 | "type": "uri", 35 | "value": "http://dbpedia.org/resource/Bonn" 36 | } 37 | }] 38 | } 39 | }] 40 | }] 41 | } 42 | ``` 43 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/shifter.sh: -------------------------------------------------------------------------------- 1 | cd ../ 2 | cp -r qald_json ../../../../ 3 | cd ../../../../qald_json 4 | cp qald_json_gerbil_input.py ../ 5 | cp interpreter.py ../ 6 | echo "Files shifted to the main folder." -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/qald_json/test.en: -------------------------------------------------------------------------------- 1 | What is the binomial authority of species of university of maryland arboretum & botanical garden ? 2 | What is the binomial authority of species of cyclura cychlura figginsi ? 3 | What is the number of binomial authority of species of cyclura cychlura figginsi ? 4 | What is the number of binomial authority of species of university of maryland arboretum & botanical garden ? 5 | What is the conservation status of species of crotalus mitchellii muertensis ? 6 | What is the conservation status of species of cyclura cychlura figginsi ? 7 | What is the number of conservation status of species of crotalus mitchellii muertensis ? 8 | What is the number of conservation status of species of cyclura cychlura figginsi ? 9 | What is the conservation status system of species of cyclura cychlura figginsi ? 10 | What is the conservation status system of species of crotalus mitchellii muertensis ? 11 | What is the number of conservation status system of species of cyclura cychlura figginsi ? 12 | What is the number of conservation status system of species of crotalus mitchellii muertensis ? 13 | What is the division of species of university of maryland arboretum & botanical garden ? 14 | What is the number of division of species of university of maryland arboretum & botanical garden ? 15 | What is the family of species of crotalus mitchellii muertensis ? 16 | What is the family of species of university of maryland arboretum & botanical garden ? 17 | What is the family of species of cyclura cychlura figginsi ? 18 | What is the genus of species of university of maryland arboretum & botanical garden ? 19 | What is the number of genus of species of university of maryland arboretum & botanical garden ? 20 | What is the kingdom of species of cyclura cychlura figginsi ? 21 | What is the kingdom of species of university of maryland arboretum & botanical garden ? 22 | What is the kingdom of species of crotalus mitchellii muertensis ? 23 | What is the number of kingdom of species of cyclura cychlura figginsi ? 24 | What is the number of kingdom of species of university of maryland arboretum & botanical garden ? 25 | What is the order (taxonomy) of species of cyclura cychlura figginsi ? 26 | What is the order (taxonomy) of species of university of maryland arboretum & botanical garden ? 27 | What is the number of order (taxonomy) of species of university of maryland arboretum & botanical garden ? 28 | What is the number of order (taxonomy) of species of cyclura cychlura figginsi ? 29 | What is the phylum of species of cyclura cychlura figginsi ? 30 | What is the phylum of species of crotalus mitchellii muertensis ? 
31 | What is the number of phylum of species of cyclura cychlura figginsi ? 32 | What is the number of phylum of species of crotalus mitchellii muertensis ? 33 | What is the synonym of species of crotalus mitchellii muertensis ? 34 | What is the number of synonym of species of crotalus mitchellii muertensis ? 35 | What is the Link from a Wikipage to an external page of species of crotalus mitchellii muertensis ? 36 | What is the number of Link from a Wikipage to an external page of species of crotalus mitchellii muertensis ? 37 | What is the Wikipage page ID of species of university of maryland arboretum & botanical garden ? 38 | What is the Wikipage page ID of species of crotalus mitchellii muertensis ? 39 | What is the Wikipage page ID of species of cyclura cychlura figginsi ? 40 | What is the Wikipage redirect of species of black-chinned emperor tamarin ? 41 | What is the number of Wikipage redirect of species of black-chinned emperor tamarin ? 42 | What is the Wikipage revision ID of species of cyclura cychlura figginsi ? 43 | What is the Wikipage revision ID of species of crotalus mitchellii muertensis ? 44 | What is the Wikipage revision ID of species of university of maryland arboretum & botanical garden ? 45 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/question_form.csv: -------------------------------------------------------------------------------- 1 | When is the ,Who is the ,What is the ,Where is the ,What is the number of ,Which one is the oldest based on ,Which one the highest based on,Which one the highest based on 2 | select ?x ,select ?x ,select ?x ,select ?x ,select count(*) as ?x ,select distinct(?x) ,select distinct(?x),select distinct(?x) 3 | } , } , } , } , } , } order by ?x limit 1 ,} order by ?x limit 1,} order by ?x limit 1 4 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/readme.md: -------------------------------------------------------------------------------- 1 | # Utilities 2 | 3 | Download the SubjectiveEye3D dataset here. The link: [https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz](https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz) 4 | 5 | ## Note 6 | 7 | Apart from the main project a number of utilities were created various tasks from full fledge code to few lines of script. A brief of those utilities is as follows. I encourage you to feel free to use them for evaluating, pre-processing and analyse different aspects of the project. The utilities are present in `pepeline_3/utility` 8 | 9 | - Comparer 10 | - Relavent embedding extractor 11 | - QALD JSON geberator to enable use in GERBIL 12 | - Test fixer for thresholding the question present in the general test set. 13 | - Vocab extractor from previous tensorflow models. 14 | 15 | Specific instructions for running each of the utilities is provided in their respective directories. 16 | 17 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/vocab_extractor_from_model/embedding_extractor.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import tensorflow as tf 3 | import numpy as np 4 | """ 5 | - The following code when run with proper model location is capable of extracting the trained embeddings of a given model. 
6 | - The embeddings are present in the form: 7 | - The embedding decoder outputs sparql language embeddings 8 | - The embedding encoder outputs english language embeddings 9 | """ 10 | def restore_session(self, session): 11 | saver = tf.train.import_meta_graph('./translate.ckpt-32000.meta') 12 | saver.restore(session, './translate.ckpt-32000') 13 | 14 | 15 | def test_word2vec(): 16 | opts = Options() 17 | with tf.Graph().as_default(), tf.Session() as session: 18 | with tf.device("/cpu:0"): 19 | model = Word2Vec(opts, session) 20 | model.restore_session(session) 21 | model.get_embedding("assistance") 22 | accum = [] 23 | with tf.Session() as sess: 24 | saver = tf.train.import_meta_graph('translate.ckpt-32000.meta') 25 | print("***************") 26 | print(saver.restore(sess, "translate.ckpt-32000")) 27 | print(tf.all_variables()) 28 | lis = (sess.run(('embeddings/decoder/embedding_decoder:0'))) 29 | print(np.shape(lis)) 30 | decode = open('vocab.sparql','r').readlines() 31 | embed = open('embed_vocab.sparql','w') 32 | if(len(decode) == np.shape(lis)[0]): 33 | for dec in range(len(decode)): 34 | accum.append([decode[dec][:-1]]+list(lis[dec,:])) 35 | temp = ' '.join(str(v) for v in accum[-1]) 36 | #print(temp) 37 | embed.write(temp+'\n') 38 | embed.close() 39 | 40 | 41 | lis = (sess.run(('embeddings/encoder/embedding_encoder:0'))) 42 | print(np.shape(lis)) 43 | decode = open('vocab.en','r').readlines() 44 | embed = open('embed_vocab.en','w') 45 | if(len(decode) == np.shape(lis)[0]): 46 | for dec in range(len(decode)): 47 | accum.append([decode[dec][:-1]]+list(lis[dec,:])) 48 | temp = ' '.join(str(v) for v in accum[-1]) 49 | #print(temp) 50 | embed.write(temp+'\n') 51 | embed.close() 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /gsoc/anand/pipeline_3/utility/vocab_extractor_from_model/readme.md: -------------------------------------------------------------------------------- 1 | # Embedding Extractor 2 | 3 | - Download the files from [here](https://nspm-models.s3.eu-west-2.amazonaws.com/misc/anand-pipeline_3-vocab_extractor_from_model-files.zip) 4 | - The following code when run with proper model location is capable of extracting the trained embeddings of a given model. 5 | - The embeddings are present in the form: ` ` 6 | - The embedding decoder outputs sparql language embeddings 7 | - The embedding encoder outputs english language embeddings 8 | 9 | -------------------------------------------------------------------------------- /gsoc/anand/readme.md: -------------------------------------------------------------------------------- 1 | # A Neural QA Model for DBpedia 2 | ## Abstract 3 | With booming amount of information being continuously added to the internet, organising the facts becomes a very difficult task. Currently DBpedia hosts billions of such data points and corresponding relations in the RDF format. 4 | 5 | Extracting data from such data sources requires a query to be made in SPARQL and the response to the query is a link that contains the information pertaining to the answer or the answer itself. 6 | 7 | Accessing such data is difficult for a lay user, who does not know how to write a query. This proposal tries to built upon a System :(​ https://github.com/AKSW/NSpM/tree/master ​) — which tries to make this humongous linked data available to a larger user base in their natural languages(now restricted to English) by improving, adding and amending upon the existing codebase. 
8 | 9 | The primary objective of the project is to be able to translate any natural language question into a valid SPARQL query. 10 | 11 | >> You can find the supporting blogs at: https://anandpanchbhai.com/A-Neural-QA-Model-for-DBpedia/ 12 | 13 | ## Pipeline 1 14 | 15 | The code is a completely automated and fixed version of what was done by the previous developer working on the project. The instructions for running it are provided inside the `pipeline_1` folder. 16 | 17 | ## Pipeline 3 18 | 19 | Pipeline 3 refers to the newest code implementation that was introduced as part of GSoC'19 by Anand Panchbhai. The highlights of this pipeline include the following: 20 | 21 | - A fast, automated and custom question set generator for DBpedia which can generate questions for any class. 22 | - It also generates test sets to test different compositionality-based queries. 23 | - A ranking of the questions is also generated as part of the project, which helps us understand which questions might be more natural than others. A higher rank signifies a higher probability of the question being natural. 24 | - This pipeline was used to create test sets on which further experiments were done, and the corresponding results are as follows: 25 | 26 | ### Eukaryotes 27 | 28 | With the following configuration: 29 | 30 | | Size | Layers | Dropout | Attention Mechanism | Embeddings | BLEU | Accuracy | 31 | |------ | ----------| -------- | ------------------- |------------|-----|---------| 32 | |128 | 2 | 0.7 | Scaled Luong | Yes: SPARQL (RDF2VEC), English Previous embeddings | 93 | 63| 33 | 34 | A grid search was done on this dataset to determine the best possible hyperparameters for the NMT model pertaining to the English-to-SPARQL conversion. The grid search stats can be found at [Grid Search](https://anandpanchbhai.com/A-Neural-QA-Model-for-DBpedia/static/GridSearch/GridSearch.html). 35 | 36 | ### Person 37 | 38 | After successfully completing the grid search on the Eukaryotes data, we moved on to checking the viability of the model on other ontologies, namely Person. The dataset was humongous compared to what we had dealt with earlier: the train set contained 302277 queries and the test set after thresholding contained 113982 queries. We were still able to get some interesting results for the best model configuration obtained from the grid search done earlier: 39 | 40 | With the following configuration: 41 | 42 | | Size | Layers | Dropout | Attention Mechanism | Embeddings | BLEU | Accuracy | 43 | |------ | ----------| -------- | ------------------- |------------|-----|---------| 44 | |128 | 2 | 0.7 | Scaled Luong | Yes: SPARQL (RDF2VEC), English Previous embeddings | 80 | 40| 45 | 46 | The graphs are as follows: 47 | 48 | ![Test BLEU](.images/test-bleu.png) 49 | BLEU scores for the Person test set. 50 | ![Test Accuracy](.images/test-accuracy.png) 51 | Accuracy scores for the Person test set. 52 | 53 | 54 | ## Utilities 55 | 56 | Apart from the main project, a number of utilities were created for various tasks, ranging from full-fledged code to a few lines of script. A brief overview of those utilities follows; feel free to use them for evaluating, pre-processing and analysing different aspects of the project. The utilities are present in `pipeline_3/utility`: 57 | 58 | - Comparer 59 | - Relevant embedding extractor 60 | - QALD JSON generator to enable use in GERBIL 61 | - Test fixer for thresholding the questions present in the general test set. 62 | - Vocab extractor from previous TensorFlow models.
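For reference, the best configuration reported in the tables above roughly corresponds to an invocation like the one below. This is only a sketch: it assumes the standard `tensorflow/nmt` toolkit and its usual flags, and the data prefixes and output directory are illustrative placeholders, not paths from this repository.

```bash
python -m nmt.nmt \
  --attention=scaled_luong \
  --num_units=128 --num_layers=2 --dropout=0.7 \
  --src=en --tgt=sparql \
  --vocab_prefix=data/person/vocab \
  --train_prefix=data/person/train \
  --dev_prefix=data/person/dev \
  --test_prefix=data/person/test \
  --out_dir=person_model \
  --metrics=bleu
```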
63 | 64 | Specific instructions for running each of the utilities are provided in their respective directories. 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/README.md: -------------------------------------------------------------------------------- 1 | 2 | # One-Command Pipeline # 3 | To run the complete pipeline, please use the command: 4 | 5 | ```bash 6 | ./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer] [$4 Integer] [$5 Integer] 7 | ``` 8 | 9 | | Parameter | Description | Type | Default | 10 | | :--------:|-------------|------|--------:| 11 | | $1 | The project's name | String | Required | 12 | | $2 | Dimension of the GloVe embeddings | Integer {50,100,200,300} | 300 | 13 | | $3 | Number of units in the LSTM cells | Integer | 512 | 14 | | $4 | Training steps | Integer | 60000 | 15 | | $5 | Examples per template | Integer | 600 | 16 | 17 | Examples: 18 | 19 | ```bash 20 | ./pipeline.sh Project1 21 | ``` 22 | ```bash 23 | ./pipeline.sh Project2 300 512 60000 600 24 | ``` 25 | 26 | 27 | # Code Notes # 28 | ## Paraphrase Questions 29 | After the creation of the templates and the elimination of the never-asked queries, the questions are passed to the Paraphraser. 30 | - `paraphrase_questions.py`: This paraphrases a question template and returns several possible candidates with their scores (potentially textual similarity, POS tags, etc.). The main pipeline then selects templates according to a strategy and adds them to the template set. 31 | - `textual_similarity.py`: This calculates the similarity scores between the candidates and the original question template. 32 | 33 | To test the Paraphraser, run: 34 | ```bash 35 | python paraphrase_questions.py --sentence "what is your name ?"
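# Note: the paraphraser relies on a pre-trained T5 model; in the full pipeline,
# batch_paraphrase.py downloads it via get_pretrained_model(const.URL) on first use.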
36 | ``` 37 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/batch_paraphrase.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import tensorflow as tf 4 | tf.compat.v1.enable_eager_execution() 5 | from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced 6 | from constant import Constant 7 | 8 | const = Constant() 9 | seperator = "\t" 10 | 11 | const.URL = "https://datascience-models-ramsri.s3.amazonaws.com/t5_paraphraser.zip" 12 | 13 | def batch_paraphrase(templates_path, model_dir): 14 | folder_path = get_pretrained_model(const.URL) 15 | set_seed(42) 16 | tokenizer, device, model = prepare_model(folder_path) 17 | dir = os.path.realpath(templates_path) 18 | with open(dir, "r") as lines: 19 | with open(dir + "_paraphrased", "w") as w: 20 | for line in lines: 21 | prop = line.strip("\n").split(seperator) 22 | question = prop[3] 23 | paraphrased_candidates = paraphrase_questions(tokenizer, device, model, question) 24 | paraphrased = pick_final_sentence(question, paraphrased_candidates) 25 | advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir) 26 | w.write(line) 27 | print("Original", line) 28 | # for i, candidate in enumerate(paraphrased_candidates): 29 | # new_prop = prop[:-1] 30 | # new_prop[3] = candidate 31 | # new_prop.append("Paraphrased {}\n".format(i)) 32 | # print(new_prop) 33 | # new_line = seperator.join(new_prop) 34 | # 35 | # w.write(new_line) 36 | new_prop = prop[:-1] 37 | new_prop[3] = paraphrased 38 | new_prop.append("Paraphrased \n") 39 | new_line = seperator.join(new_prop) 40 | w.write(new_line) 41 | print("Paraphrase", new_line) 42 | 43 | new_prop = prop[:-1] 44 | new_prop[3] = advanced 45 | new_prop.append("Paraphrased advanced\n") 46 | new_line = seperator.join(new_prop) 47 | w.write(new_line) 48 | print("Advanced", new_line) 49 | 50 | 51 | if __name__=="__main__": 52 | parser = argparse.ArgumentParser() 53 | requiredNamed = parser.add_argument_group('Required Arguments') 54 | 55 | requiredNamed.add_argument('--templates', dest='templates', metavar='templates file', 56 | help='templates file', required=True) 57 | requiredNamed.add_argument('--model', dest='model', metavar='model_dir', 58 | help='path of directory of the fine-tuned model', required=True) 59 | 60 | 61 | args = parser.parse_args() 62 | templates_path = args.templates 63 | model_dir = args.model 64 | batch_paraphrase(templates_path, model_dir) 65 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/bert_classifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch #version==1.6.0 3 | from transformers import BertTokenizer 4 | from transformers import BertForSequenceClassification 5 | import numpy as np 6 | 7 | 8 | 9 | def load_model(model_dir, device): 10 | model = BertForSequenceClassification.from_pretrained(model_dir) 11 | tokenizer = BertTokenizer.from_pretrained(model_dir) 12 | 13 | # Copy the model to the GPU. 14 | model.to(device) 15 | return model, tokenizer 16 | 17 | def encode(test_set, tokenizer): 18 | input_ids = [] 19 | attention_masks = [] 20 | for sent in test_set[:10]: 21 | encoded_dict = tokenizer.encode_plus( 22 | sent[0], 23 | sent[1], # Sentence to encode. 
24 | add_special_tokens=True, # Add '[CLS]' and '[SEP]' 25 | max_length=64, # Pad & truncate all sentences. 26 | truncation=True, 27 | pad_to_max_length=True, 28 | return_attention_mask=True, # Construct attn. masks. 29 | return_tensors='pt', # Return pytorch tensors. 30 | ) 31 | input_ids.append(encoded_dict['input_ids']) 32 | 33 | # And its attention mask (simply differentiates padding from non-padding). 34 | attention_masks.append(encoded_dict['attention_mask']) 35 | input_ids = torch.cat(input_ids, dim=0) 36 | attention_masks = torch.cat(attention_masks, dim=0) 37 | return input_ids, attention_masks 38 | 39 | def predict(device, test_set, model, tokenizer): 40 | input_ids, attention_masks = encode(test_set, tokenizer) 41 | model.eval() 42 | # Tracking variables 43 | predictions = [] 44 | # Add batch to GPU 45 | b_input_ids = input_ids.to(device) 46 | b_input_mask = attention_masks.to(device) 47 | # Telling the model not to compute or store gradients, saving memory and 48 | # speeding up prediction 49 | with torch.no_grad(): 50 | # Forward pass, calculate logit predictions 51 | outputs = model(b_input_ids, token_type_ids=None, 52 | attention_mask=b_input_mask) 53 | logits = outputs[0] 54 | # Move logits and labels to CPU 55 | logits = logits.detach().cpu().numpy() 56 | # Store predictions and true labels 57 | predictions.append(logits) 58 | pred_labels = [] 59 | print(predictions) 60 | for i in range(len(predictions[0])): 61 | # get the highest score of logits to be the class 62 | # the result will be 0,1,2 so it should be -1 63 | pred_labels.append(np.argmax(predictions[0][i]).flatten()[0]-1) 64 | return pred_labels 65 | 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser() 68 | requiredNamed = parser.add_argument_group('Required Arguments') 69 | 70 | requiredNamed.add_argument('--model', dest='model', metavar='model folder', 71 | help='Bert fine-tuned model\'s folder path', required=True) 72 | # requiredNamed.add_argument('--testset', dest='testset', metavar='testset', 73 | # help='A list: [[Origin, Paraphrase1],[O, P2]..]', required=True) 74 | 75 | 76 | # If there's a GPU available... 77 | if torch.cuda.is_available(): 78 | 79 | # Tell PyTorch to use the GPU. 80 | device = torch.device("cuda") 81 | 82 | print('There are %d GPU(s) available.' % torch.cuda.device_count()) 83 | 84 | print('We will use the GPU:', torch.cuda.get_device_name(0)) 85 | 86 | # If not... 87 | else: 88 | print('No GPU available, using the CPU instead.') 89 | device = torch.device("cpu") 90 | 91 | 92 | testset = [['When is the birth date of
?', 'When is the birthday of ?'], 93 | ["When is the birth date of ?", "When was born ?"], 94 | ["When is the birth date of ?", "Where does come from ?"], 95 | ["When is the birth date of ?","What is the birth name of ?"], 96 | ["What is the ingredient of ?","What is the Ingredient for ?"], 97 | ["What is the ingredient of ?","What is 's ingredient ?"]] 98 | args = parser.parse_args() 99 | model = args.model 100 | # testset = args.testset 101 | model, tokenizer = load_model(model, device) 102 | pred_labels = predict(device, testset, model, tokenizer) 103 | for i, pair in enumerate(testset): 104 | print(" ".join(pair), pred_labels[i]) -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/eliminator.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | 4 | def eliminator(location): 5 | """ 6 | The function removes the templates which are considered less popular 7 | based on the proposed ranking mechanism; the input file should be pre-processed 8 | and a TRUE or FALSE should be added as the last column. 9 | 10 | This function just removes the entries with FALSE as the last entry in a row 11 | and creates a file named new_train.csv to be used for further purposes. 12 | """ 13 | lines = open(location,'r').readlines() 14 | print(len(lines)) 15 | accum = [] 16 | nspm_ready = open("new_train.csv",'w') 17 | for line in tqdm(lines): 18 | values = line.split(",") 19 | if(len(values)<8): 20 | print("Input file is of wrong format, please add the correct boolean values as the last column entry to use this function") 21 | print(values[-1]) 22 | if(values[-1]=="TRUE\n"): 23 | accum.append(";".join(values[:-2])+"\n") 24 | nspm_ready.write(accum[-1]) 25 | nspm_ready.close() 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | requiredNamed = parser.add_argument_group('Required Arguments') 30 | 31 | requiredNamed.add_argument('--location', dest='location', metavar='location', 32 | help='location of the file to be pruned.', required=True) 33 | args = parser.parse_args() 34 | location = args.location 35 | eliminator(location) 36 | pass 37 | 38 | 39 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/fetch_ranks.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='../utility/wikidata.rank'): 5 | """ 6 | The function loads ranks from a supplied location. 7 | """ 8 | sub = open(filename,'r').readlines() 9 | diction={} 10 | 11 | print("Loading Rankings") 12 | for val in tqdm(sub): 13 | diction[val.split('\t')[0]] = float(val.split('\t')[1]) 14 | return diction 15 | 16 | if __name__ == "__main__": 17 | fetch_ranks() 18 | pass 19 | 20 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/fetch_ranks_sub.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import numpy as np 3 | 4 | def fetch_ranks(filename='part-r-00000'): 5 | """ 6 | The function loads ranks from a supplied location. 7 | The ranks file should belong to the SubjectiveEye3D format 8 | of saving ranks.
9 | """ 10 | sub = open(filename,'r').readlines() 11 | diction={} 12 | 13 | print("Loading Rankings") 14 | for val in tqdm(sub): 15 | diction[val.split('\t')[0].strip()[1:-1].strip()] = float(val.split('\t')[-2].split('"')[1]) 16 | return diction 17 | 18 | if __name__ == "__main__": 19 | fetch_ranks() 20 | pass 21 | 22 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/get_properties.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import sys 4 | import csv 5 | import io 6 | import argparse 7 | import os 8 | from bs4 import BeautifulSoup 9 | from tqdm import tqdm 10 | 11 | def get_properties(url, project_name="test_project", output_file = "get_properties.csv", multi=False): 12 | """ 13 | - This function extracts the information regarding : [Name, Label, Domain, Range] from a page like this : 14 | http://mappings.dbpedia.org/server/ontology/classes/Place and saves it in a file in CSV format. 15 | - This code on execution creates a csv which contains all the properties, ontology, 16 | class related information and data types as field values in each row. 17 | - This function also returns a 2D list of the information mentioned above to the calling 18 | function 19 | """ 20 | page = urllib.request.urlopen(url) 21 | soup = BeautifulSoup(page, "html.parser") 22 | if(not os.path.isdir(project_name)): 23 | os.makedirs(project_name) 24 | if multi: 25 | output_file = open(project_name + "/" + output_file, 'a', encoding="utf-8") 26 | else: 27 | output_file = open(project_name+"/" + output_file, 'w', encoding="utf-8") 28 | fl = 0 29 | accum = [] 30 | for rows in tqdm(soup.find_all("tr")): 31 | x = rows.find_all("td") 32 | if len(x) <= 2: 33 | fl = 1 34 | continue 35 | if fl == 1: 36 | fl = 2 37 | continue 38 | name = rows.find_all("td")[0].get_text().replace(" (edit)", "") 39 | label = rows.find_all("td")[1].get_text() 40 | dom = rows.find_all("td")[2].get_text() 41 | rng = rows.find_all("td")[3].get_text() 42 | if rows.find_all("td")[0].find('a'): 43 | URL_name = ((rows.find_all("td")[0].find('a').attrs['href'])) 44 | 45 | final = name + "," + label + "," + dom + "," + rng 46 | #+ ","+ URL_name.split(':')[-1] 47 | accum.append(final) 48 | output_file.write(final+"\n") 49 | output_file.close() 50 | return accum 51 | 52 | 53 | """ 54 | Name, Label, Domain, Range, URL_name 55 | """ 56 | 57 | if __name__ == "__main__": 58 | """ 59 | Section to parse the command line arguments. 
60 | """ 61 | parser = argparse.ArgumentParser() 62 | requiredNamed = parser.add_argument_group('Required Arguments') 63 | 64 | requiredNamed.add_argument('--url', dest='url', metavar='url', 65 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 66 | requiredNamed.add_argument( 67 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 68 | requiredNamed.add_argument( 69 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 70 | args = parser.parse_args() 71 | url = args.url 72 | output_file = args.out_put 73 | project_name = args.project_name 74 | get_properties(url = url, project_name= project_name, output_file = output_file) 75 | pass 76 | -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/question_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def question_generator(query): 5 | pass 6 | 7 | 8 | 9 | if __name__ == "__main__": 10 | """ 11 | Section to parse the command line arguments. 12 | """ 13 | parser = argparse.ArgumentParser() 14 | requiredNamed = parser.add_argument_group('Required Arguments') 15 | 16 | requiredNamed.add_argument('--url', dest='url', metavar='url', 17 | help='Webpage URL: eg-http://mappings.dbpedia.org/server/ontology/classes/Place', required=True) 18 | requiredNamed.add_argument( 19 | '--output_file', dest='out_put', metavar='out_put', help='temp.csv', required=True) 20 | requiredNamed.add_argument( 21 | '--project_name', dest='project_name', metavar='project_name', help='test', required=True) 22 | args = parser.parse_args() 23 | url = args.url 24 | output_file = args.out_put 25 | project_name = args.project_name 26 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/pipeline/textual_similarity.py: -------------------------------------------------------------------------------- 1 | import math, argparse 2 | import numpy as np 3 | import tensorflow as tf 4 | import tensorflow_hub as hub 5 | import os 6 | import nltk 7 | nltk.download('punkt') 8 | nltk.download('averaged_perceptron_tagger') 9 | nltk.download('universal_tagset') 10 | from nltk import pos_tag, word_tokenize, RegexpParser 11 | 12 | from constant import Constant 13 | const = Constant() 14 | const.MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder-large/5" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"] 15 | # os.environ['TFHUB_CACHE_DIR'] = '/tmp/tfhub_modules' 16 | print('start') 17 | embed = hub.load(const.MODULE_URL) 18 | 19 | def similarities(sentence, paraphrases): 20 | vectors = embed([sentence] + paraphrases) 21 | cosine_similarities = [] 22 | for v2 in vectors[1:]: 23 | cosine_similarities.append(cosine_similarity(np.array(vectors[0]), np.array(v2))) 24 | 25 | return cosine_similarities 26 | 27 | def similarity(sentence,paraphrase): 28 | vectors = embed([sentence, paraphrase]) 29 | return cosine_similarity(vectors[0], vectors[1]) 30 | 31 | # def cosine_similarities(v1, vectors): 32 | # #Calculate semantic similarity between two original v1 and paraphrased vectors 33 | # similarities = [] 34 | # for v2 in vectors: 35 | # similarities.append(cosine_similarity(v1,v2)) 36 | # return similarities 37 | 38 | 39 | def cosine_similarity(v1, v2): 40 | # Calculate semantic similarity between two sentence 
vectors 41 | mag1 = np.linalg.norm(v1) 42 | mag2 = np.linalg.norm(v2) 43 | if (not mag1) or (not mag2): 44 | return 0 45 | return np.dot(v1, v2) / (mag1 * mag2) 46 | 47 | 48 | def prof_similarity(v1, v2): 49 | #Calculate the semantic similarity based on the angular distance 50 | cosine = cosine_similarity(v1, v2) 51 | prof_similarity = 1 - math.acos(cosine) / math.pi 52 | return prof_similarity 53 | 54 | def minDistance(s1, s2): 55 | """ 56 | :type s1: str 57 | :type s2: str 58 | :rtype: int 59 | """ 60 | 61 | len1 = len(s1) 62 | len2 = len(s2) 63 | dp = [[0 for _ in range(len2 + 1)] for _ in range(len1 + 1)] 64 | for i in range(len1 + 1): 65 | for j in range(len2 + 1): 66 | if i > 0 and j == 0: 67 | dp[i][j] = dp[i - 1][j] + 1 68 | elif j > 0 and i == 0: 69 | dp[i][j] = dp[i][j - 1] + 1 70 | elif j > 0 and i > 0: 71 | res1 = dp[i - 1][j] + 1 72 | res2 = dp[i][j - 1] + 1 73 | res3 = not s1[i - 1] == s2[j - 1] and dp[i - 1][j - 1] + 1 or dp[i - 1][j - 1] 74 | dp[i][j] = min(res1, res2, res3) 75 | return dp[len1][len2] 76 | 77 | 78 | def words_distance(sentence1, sentence2): 79 | return minDistance(word_tokenize(sentence1), word_tokenize(sentence2)) 80 | 81 | 82 | def tags_distance(sentence1, sentence2): 83 | tagged1 = pos_tag(word_tokenize(sentence1), tagset='universal') 84 | tagged2 = pos_tag(word_tokenize(sentence2), tagset='universal') 85 | tags1 = [j for i, j in tagged1] 86 | tags2 = [j for i, j in tagged2] 87 | return minDistance(tags1, tags2) 88 | 89 | 90 | from collections import Counter 91 | 92 | def count_NNP(sentence): 93 | tokens = word_tokenize(sentence) 94 | tagged = pos_tag(tokens) 95 | tag = [j for i, j in tagged] 96 | result = Counter(tag) 97 | return result["NNP"] 98 | 99 | def has_NNP(sentence, num): 100 | tokens = word_tokenize(sentence) 101 | tagged = pos_tag(tokens) 102 | tag = [j for i, j in tagged] 103 | result = Counter(tag) 104 | return result["NNP"] <= num 105 | 106 | 107 | 108 | if __name__=="__main__": 109 | parser = argparse.ArgumentParser() 110 | requiredNamed = parser.add_argument_group('Required Arguments') 111 | 112 | requiredNamed.add_argument('--s1', dest='sentence1', metavar='sentence1', 113 | help='a sentence', required=True) 114 | requiredNamed.add_argument('--s2', dest='sentence2', metavar='sentence2', 115 | help='another sentence', required=True) 116 | args = parser.parse_args() 117 | s1 = args.sentence1 118 | s2 = args.sentence2 119 | print("cosine similarity:", similarity(s1, s2), "Edit distance: ", edit_distance(s1,s2)) 120 | pass 121 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/GloVe/glove_finetune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import numpy as np 4 | from collections import Counter 5 | from nltk.corpus import brown 6 | from mittens import GloVe, Mittens 7 | from sklearn.feature_extraction import stop_words 8 | from sklearn.feature_extraction.text import CountVectorizer 9 | 10 | 11 | def glove2dict(glove_filename): 12 | with open(glove_filename, encoding='utf-8') as f: 13 | reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE) 14 | embed = {line[0]: np.array(list(map(float, line[1:]))) 15 | for line in reader} 16 | return embed 17 | 18 | def batch_finetune(finetune_glove, batch_word, dimension): 19 | oov = [token for token in batch_word if token not in finetune_glove.keys()] 20 | 21 | en_doc = [' '.join(batch_word)] 22 | 23 | corp_vocab = list(set(oov)) 24 | cv = 
CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab) 25 | X = cv.fit_transform(en_doc) 26 | Xc = (X.T * X) 27 | Xc.setdiag(0) 28 | coocc_ar = Xc.toarray() 29 | 30 | mittens_model = Mittens(n=dimension, max_iter=1800) 31 | new_embeddings = mittens_model.fit( 32 | coocc_ar, 33 | vocab=corp_vocab, 34 | initial_embedding_dict=finetune_glove) 35 | 36 | newglove = dict(zip(corp_vocab, new_embeddings)) 37 | finetune_glove.update(newglove) 38 | return finetune_glove 39 | 40 | def calculate_unknown(finetune_glove,dimension): 41 | vecs = np.zeros((len(finetune_glove), dimension), dtype=np.float32) 42 | for i, key in enumerate(finetune_glove): 43 | vecs[i] = np.array(finetune_glove[key], dtype=np.float32) 44 | unknown = np.mean(vecs, axis=0) 45 | return unknown 46 | 47 | def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300): 48 | word_en = [] 49 | with open(project_path+"/data.en", "r") as lines: 50 | for sentence in lines: 51 | sentence = sentence.strip("\n") 52 | sentence = " " + sentence + " " 53 | for word in sentence.split(): 54 | word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?")) 55 | print(len(word_en), word_en[:20]) 56 | 57 | vocab_en = list(set(word_en) - set(["", ""])) 58 | 59 | pre_glove = glove2dict(glove_path) 60 | stride = 700000 61 | start = 0 62 | end = start+stride 63 | finetune_glove = pre_glove.copy() 64 | while end"] = unk 73 | with open(project_path+"/embed.en", "w") as w: 74 | for word in finetune_glove: 75 | w.write(word + " " + str(list(finetune_glove[word])).replace("[", "").replace("]", "").replace(",", "") + "\n") 76 | 77 | if __name__=="__main__": 78 | parser = argparse.ArgumentParser() 79 | requiredNamed = parser.add_argument_group('Required Arguments') 80 | 81 | requiredNamed.add_argument('--path', dest='path', metavar='path', 82 | help='path of project that contains the data..eb/sparql files', required=True) 83 | requiredNamed.add_argument('--dimension', dest='dimension', metavar='dimension', 84 | help='path of project that contains the data..eb/sparql files', required=False) 85 | 86 | args = parser.parse_args() 87 | path = args.path 88 | dimension = args.dimension 89 | 90 | if dimension: 91 | if dimension <= 50: 92 | dimension = 50 93 | elif dimension <= 100: 94 | dimension = 100 95 | elif dimension <= 200: 96 | dimension = 200 97 | else: 98 | dimension = 300 99 | finetune_glove(path,"glove.6B/glove.6B."+dimension+"d.txt", dimension=dimension) 100 | else: 101 | finetune_glove(path, "glove.6B/glove.6B.300d.txt") 102 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline of Benchmark # 2 | This benchmark pipeline uses irbench as a local tool to calculate metrics on final answers. 3 | 4 | ## To run the code 5 | Firstly, you need to follow the instructions of NSpM to train the NMT model. Let's say that the model is trained on `monument_300`. 
6 | 7 | Then, run our pipeline to generate the answers JSON file: 8 | ```bash 9 | python benchmark.py --model <modelId> --test <testset file name> [--answer <answer file name>] 10 | ``` 11 | For example, using [this file](https://nspm-models.s3.eu-west-2.amazonaws.com/misc/qald-9-train-multilingual.qald.json): 12 | ```bash 13 | python benchmark.py --model monument_300 --test qald-9-train-multilingual.qald.json 14 | ``` 15 | 16 | Finally, evaluate our answers using irbench. 17 | Remember to clone the irbench project and download their release jar file: 18 | ```bash 19 | java -jar irbench-v0.0.1-beta.2.jar -evaluate "qald-9-train-multilingual" "/answer.qald.json" "f-score" 20 | ``` 21 | For other configuration details, please visit their [site](https://github.com/AKSW/irbench); some tips on the JDK version are given in my [blogs](https://baiblanc.github.io/2020/06/23/GSOC-Week-Three/) -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from tqdm import tqdm 4 | from interpreter import interprete 5 | from extract_questions import read_json, write_to_ask 6 | from retrieve_answers import read_sparqls, retrieve 7 | from reconstruct_json import construct_json 8 | 9 | 10 | 11 | def benchmark(trained_model, test_set, answer_file="answers.json"): 12 | # Deconstruct the questions and infos from test set 13 | questions_info, questions = read_json(test_set) 14 | 15 | # Write the questions to to_ask.txt 16 | write_to_ask(questions) 17 | 18 | # Use Interpreter to interprete them into decoded queries stored in output_decoded.txt 19 | interprete(trained_model) 20 | 21 | # Use the sparql endpoint (http://dbpedia.org/sparql) to retrieve answers of the queries 22 | sparqls = read_sparqls() 23 | answers = [] 24 | print("Retrieving answers of queries via SPARQL endpoint") 25 | for query in tqdm(sparqls): 26 | try: 27 | answer_group = retrieve(query) 28 | except: 29 | answer_group = [] 30 | answers.append(answer_group) 31 | 32 | json_file = construct_json(test_set.replace(".qald.json",""), questions_info, questions, sparqls, answers) 33 | path = "../gsoc/zheyuan/utility/benchmark/" 34 | with open(path+"answers-"+test_set, "w") as f: 35 | # js = json.dumps(json_file, indent=4, separators=(',', ':')) 36 | json.dump(json_file, f, indent=4, separators=(', ', ': ')) 37 | 38 | 39 | 40 | if __name__ == "__main__": 41 | """ 42 | Section to parse the command line arguments.
43 | """ 44 | parser = argparse.ArgumentParser() 45 | requiredNamed = parser.add_argument_group('Required Arguments') 46 | 47 | requiredNamed.add_argument('--model', dest='model', metavar='[modelId]', 48 | help='the trained model', required=True) 49 | requiredNamed.add_argument('--test', dest='test', metavar='[testset file name]', 50 | help='the testing qald set file name', required=True) 51 | requiredNamed.add_argument('--answer', dest='answer', metavar='[answer file name]', 52 | help='the answers of qald dataset file name', required=False) 53 | args = parser.parse_args() 54 | trained_model = args.model 55 | test_set = args.test 56 | answer_file = args.answer 57 | benchmark(trained_model,test_set, answer_file) 58 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/extract_questions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def read_json(file): 5 | with open(file,'r') as load_f: 6 | load_dict = json.load(load_f) 7 | dataset_id = load_dict['dataset']['id'] 8 | questions = {} 9 | questions_info = [] 10 | 11 | info_names = ['id','answertype','aggregation','onlydbo','hybrid'] 12 | for question_dict in load_dict['questions']: 13 | id = "" 14 | question_infos = {} 15 | for key in question_dict: 16 | value = question_dict[key] 17 | 18 | 19 | if key == "id": 20 | id = value 21 | if key in info_names: 22 | question_infos[key] = value 23 | 24 | elif key == "question": 25 | for question in value: 26 | if question['language'] == "en": 27 | questions[id] = question["string"] 28 | questions_info.append(question_infos) 29 | 30 | return questions_info, questions 31 | def write_to_ask(questions): 32 | with open('to_ask1.txt', 'w') as write_f: 33 | for key in questions: 34 | question = questions[key] 35 | write_f.write(question.lower().replace("?"," ?")+"\n") 36 | 37 | if __name__ == "__main__": 38 | """ 39 | Section to parse the command line arguments. 40 | """ 41 | parser = argparse.ArgumentParser() 42 | requiredNamed = parser.add_argument_group('Required Arguments') 43 | 44 | requiredNamed.add_argument('--path', dest='path', metavar='[path]', 45 | help='the test set\'s file path', required=True) 46 | args = parser.parse_args() 47 | path = args.path 48 | questions_info, questions = read_json(path) 49 | print(questions_info, questions) 50 | # write_to_ask(questions) 51 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/interpreter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def interprete(trained_model_folder): 4 | os.chdir('../../../../nmt') 5 | os.system('pwd') 6 | print('start') 7 | folder_name = 'data/'+trained_model_folder 8 | print('python -m nmt.nmt --vocab_prefix=../' + folder_name + '/vocab --out_dir=../' + folder_name + '_model --inference_input_file=../gsoc/zheyuan/utility/benchmark/to_ask1.txt --inference_output_file=../gsoc/zheyuan/utility/benchmark/output1.txt --src=en --tgt=sparql | tail -n4') 9 | os.system( 10 | 'python -m nmt.nmt --vocab_prefix=../' + folder_name + '/vocab --out_dir=../' + folder_name + '_model --inference_input_file=../gsoc/zheyuan/utility/benchmark/to_ask1.txt --inference_output_file=../gsoc/zheyuan/utility/benchmark/output1.txt --src=en --tgt=sparql | tail -n4') 11 | 12 | os.system('''if [ $? 
-eq 0 ] 13 | then 14 | echo "" 15 | echo "ANSWER IN SPARQL SEQUENCE:" 16 | ENCODED="$(cat ../gsoc/zheyuan/utility/benchmark/output1.txt)" 17 | python ../interpreter.py "${ENCODED}" > ../gsoc/zheyuan/utility/benchmark/output_decoded1.txt 18 | cat ../gsoc/zheyuan/utility/benchmark/output_decoded1.txt 19 | echo "" 20 | fi''') 21 | print('end') 22 | 23 | if __name__ == "__main__": 24 | """ 25 | Section to test the Interpreter. 26 | """ 27 | interprete('monument_300') 28 | pass 29 | 30 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/reconstruct_json.py: -------------------------------------------------------------------------------- 1 | def construct_json(dataset_id,infos, questions, sparqls, answers): 2 | qald_test_answers_dict = {} 3 | qald_test_answers_dict["dataset"] = {'id':dataset_id} 4 | qald_test_answers_dict['questions'] = [] 5 | print(len(answers)) 6 | for index,info in enumerate(infos): 7 | 8 | question_dict = info 9 | 10 | id = info["id"] 11 | question = questions[id] 12 | question_dict["question"] = [{ 13 | "language" : "en", 14 | "string" : question 15 | }] 16 | question_dict["query"] = {"sparql" : sparqls[index]} 17 | question_dict["answers"] = answers[index] 18 | print(answers[index]) 19 | qald_test_answers_dict['questions'].append(question_dict) 20 | return qald_test_answers_dict 21 | 22 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/benchmark/retrieve_answers.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import urllib.request 5 | import urllib.parse 6 | from bs4 import BeautifulSoup 7 | 8 | def read_sparqls(): 9 | os.system("pwd") 10 | sparqls = [] 11 | file_path = "../gsoc/zheyuan/utility/benchmark/output_decoded1.txt" 12 | with open(file_path, 'r') as lines: 13 | for line in lines: 14 | sparqls.append(line) 15 | return sparqls 16 | 17 | def retrieve(query): 18 | try: # python3 19 | query = urllib.parse.quote_plus(query) 20 | except: # python2 21 | query = urllib.quote_plus(query) 22 | url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=" + query + "&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+" 23 | page = urllib.request.urlopen(url) 24 | soup = BeautifulSoup(page, "html.parser") 25 | total = len(soup.find_all("tr")) 26 | answers = [] 27 | 28 | for rows in (soup.find_all("tr")): 29 | answer_dict = { 30 | "head": { 31 | "vars": ["uri"] 32 | }, "results": { 33 | "bindings": [] 34 | } 35 | } 36 | for td in rows.find_all("a"): 37 | for a in td: 38 | uri = { 39 | "uri": { 40 | "type": "uri", 41 | "value": a 42 | } 43 | } 44 | 45 | 46 | answer_dict["results"]["bindings"].append(uri) 47 | 48 | for td in rows.find_all("pre"): 49 | for pre in td: 50 | # Eliminate the answer if it is longer than 50(not a URI nor a simple literal) 51 | if len(pre) <= 50: 52 | uri = { 53 | "uri": { 54 | "type": "uri", 55 | "value": a 56 | } 57 | } 58 | 59 | answer_dict["results"]["bindings"].append(uri) 60 | if answer_dict["results"]["bindings"]: 61 | answers.append(answer_dict) 62 | 63 | if not answers: 64 | return [{ 65 | "head" : { 66 | "vars" : [ "date" ] 67 | }, 68 | "results" : { } 69 | }] 70 | return answers 71 | 72 | 73 | if __name__ == "__main__": 74 | """ 75 | Section to parse the command line arguments. 
76 | """ 77 | # parser = argparse.ArgumentParser() 78 | # requiredNamed = parser.add_argument_group('Required Arguments') 79 | # 80 | # requiredNamed.add_argument('--query', dest='query', metavar='query', 81 | # help='query of SPARQL', required=True) 82 | # args = parser.parse_args() 83 | # query = args.query 84 | answer_groups = [] 85 | i = 1 86 | with open("../output_decoded.txt", 'r') as lines: 87 | for line in lines: 88 | i+=1 89 | try: 90 | answer_group = retrieve(line) 91 | except: 92 | answer_group=[] 93 | answer_groups.append(answer_group) 94 | 95 | print(len(answer_groups), answer_groups) 96 | 97 | 98 | pass -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/queryFilter.py: -------------------------------------------------------------------------------- 1 | import re 2 | from tqdm import tqdm 3 | from bs4 import BeautifulSoup 4 | import urllib.request 5 | 6 | labels = ['publisher', 'leaderName', 'mayor', 'country', 'musicComposer', 'routeEnd', 'starring', 'targetAirport', 'timeZone', 'origin', 'architect', 'team', 'Holiday', 'party', 'language', 'activeYearsEndDate', 'Protein', 'founder', 'foundingDate', 'governmentType', 'deathDate', 'type', 'birthName', 'vicePresident', 'knownFor', 'birthYear', 'crosses', 'city', 'height', 'ingredient', 'spouse', 'battle', 'child', 'location', 'doctoralAdvisor', 'portrayer', 'wineRegion', 'influenced', 'Beverage', 'developer', 'programmingLanguage', 'completionDate', 'budget', 'Organisation', 'numberOfPages', 'Sport', 'deathCause', 'growingGrape', 'product', 'capital', 'bandMember', 'largestCity', 'director', 'mission', 'ethnicGroup', 'officialLanguage', 'leader', 'foundationPlace', 'writer', 'date', 'abbreviation', 'dissolutionDate', 'successor', 'runtime', 'sourceCountry', 'maximumDepth', 'numberOfLocations', 'currency', 'state', 'birthDate', 'series', 'firstAscentPerson', 'composer', 'creator', 'influencedBy', 'almaMater', 'presenter', 'editor', 'discoverer', 'areaTotal', 'restingPlace', 'deathPlace', 'class', 'populationTotal', 'alias', 'owner', 'author', 'birthPlace', 'award'] 7 | ontologies = [] 8 | for label in labels: 9 | if re.match(r'^[A-Z].*', label): 10 | print(label) 11 | ontologies.append(label) 12 | # ontologies = '[Holiday,Protein,Beverage,Organisation,Sport]' 13 | properties = list(set(labels) ^ set(ontologies)) 14 | print(properties) 15 | domains = set() 16 | for property in properties: 17 | url = "http://dbpedia.org/ontology/"+property 18 | page = urllib.request.urlopen(url) 19 | soup = BeautifulSoup(page, "html.parser") 20 | for a in soup.find_all(rel="rdfs:domain"): 21 | domains.add(re.search(r'dbo:([a-zA-Z]+)', a.text).group(1)) 22 | 23 | print(re.search(r'dbo:([a-zA-Z]+)', a.text).group(1)) 24 | print(list(domains | set(ontologies))) 25 | # '[Airline, MeanOfTransportation, Mountain, Organisation, Beverage, Food, Software, Film, Protein, WrittenWork, Work, TelevisionShow, Band, ArchitecturalStructure, PopulatedPlace, Sport, Stream, Person, RouteOfTransportation, Place, Bridge, FictionalCharacter, Holiday, WineRegion, Scientist, Grape]' 26 | # python multi_generate_templates.py --label '[Airline, MeanOfTransportation, Mountain, Organisation, Beverage, Food, Software, Film, Protein, WrittenWork, Work, TelevisionShow, Band, ArchitecturalStructure, PopulatedPlace, Sport, Stream, Person, RouteOfTransportation, Place, Bridge, FictionalCharacter, Holiday, WineRegion, Scientist, Grape]'--project_name test --depth 1 --multi True 
-------------------------------------------------------------------------------- /gsoc/zheyuan/utility/question_form.csv: -------------------------------------------------------------------------------- 1 | When is the ,Who is the ,What is the ,Where is the ,What is the number of ,Which one is the oldest based on ,Which one the highest based on ,Which one the highest based on 2 | select ?x ,select ?x ,select ?x ,select ?x ,select count(*) as ?x ,select distinct(?x) ,select distinct(?x) ,select distinct(?x) 3 | } , } , } , } , } , } order by ?x limit 1 ,} order by ?x limit 1,} order by ?x limit 1 4 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/readme.md: -------------------------------------------------------------------------------- 1 | # Utilities 2 | 3 | Download the SubjectiveEye3D dataset here. The link: [https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz](https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz) 4 | 5 | ```bash 6 | wget https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz 7 | gzip -d part-r-00000.gz 8 | ``` 9 | -------------------------------------------------------------------------------- /gsoc/zheyuan/utility/vocab_creator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from tqdm import tqdm 3 | 4 | def english_vocab(project_path): 5 | print("Creating english vocabulary") 6 | vocab_en = [] 7 | word_en = [] 8 | with open(project_path+"/data.en", "r") as lines: 9 | for sentence in tqdm(lines): 10 | sentence = sentence.strip("\n") 11 | for word in sentence.split(): 12 | word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?")) 13 | 14 | vocab_en = list(set(word_en)) 15 | try: 16 | vocab_en.remove("") 17 | except: 18 | print("There is no \'\' in vocab_en") 19 | with open(project_path+"/vocab.en", "w") as w: 20 | for vocab in vocab_en: 21 | 22 | w.write(vocab.strip() + "\n") 23 | 24 | def sparql_vocab(project_path): 25 | print("Creating SPARQL vocabulary") 26 | 27 | vocab_sparql = [] 28 | with open(project_path+"/data.sparql", "r") as lines: 29 | for sentence in tqdm(lines): 30 | sentence = sentence.strip("\n") 31 | for word in sentence.split(): 32 | if word == "dbr_Flying_Legend": 33 | print(sentence) 34 | vocab_sparql.append(word) 35 | vocab_sparql = list(set(vocab_sparql)) 36 | with open(project_path+"/vocab.sparql", "w") as w: 37 | for vocab in vocab_sparql: 38 | w.write(vocab.strip() + "\n") 39 | 40 | def add_s_tokens(path): 41 | with open(path+"/data.sparql", "r") as lines: 42 | with open(path+"/../../GloVe/data_s.sparql", "w") as w: 43 | for line in lines: 44 | new_line = " " + line.strip() + " \n" 45 | w.write(new_line) 46 | 47 | if __name__=="__main__": 48 | parser = argparse.ArgumentParser() 49 | requiredNamed = parser.add_argument_group('Required Arguments') 50 | 51 | requiredNamed.add_argument('--path', dest='path', metavar='path', 52 | help='path of project that contains the data.en/sparql files', required=True) 53 | args = parser.parse_args() 54 | path = args.path 55 | english_vocab(path) 56 | sparql_vocab(path) 57 | add_s_tokens(path) 58 | pass -------------------------------------------------------------------------------- /nspm/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | -------------------------------------------------------------------------------- /nspm/data_gen.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Data generation. 5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 2.0.0 10 | 11 | """ 12 | import argparse 13 | import tensorflow as tf 14 | from sklearn.model_selection import train_test_split 15 | 16 | from prepare_dataset import load_dataset, convert 17 | 18 | 19 | global output_direc 20 | 21 | 22 | def merging_datafile(input_dir, output_dir): 23 | input_diren = input_dir + '/data.en' 24 | input_dirspq = input_dir + '/data.sparql' 25 | output_dir += '/data.txt' 26 | file1 = open(input_diren, 'r', encoding="utf8") 27 | Lines1 = file1.readlines() 28 | file2 = open(input_dirspq, 'r', encoding="utf8") 29 | Lines2 = file2.readlines() 30 | s = [] 31 | for i in range(len(Lines1)): 32 | s.append(Lines1[i].replace('\n', " ") + "\t " + Lines2[i]) 33 | 34 | filef = open(output_dir, 'w', encoding="utf8") 35 | filef.writelines(s) 36 | file1.close() 37 | file2.close() 38 | filef.close() 39 | return output_dir 40 | 41 | 42 | def data_gen(input_dir, output_dir): 43 | 44 | output_direc = merging_datafile(input_dir, output_dir) 45 | 46 | input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(output_direc) 47 | 48 | # Calculate max_length of the target tensors 49 | max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1] 50 | input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2) 51 | 52 | # Show length 53 | print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)) 54 | 55 | print("Input Language; index to word mapping") 56 | convert(inp_lang, input_tensor_train[0]) 57 | print() 58 | print("Target Language; index to word mapping") 59 | convert(targ_lang, target_tensor_train[0]) 60 | buffer_size = len(input_tensor_train) 61 | batch_size = 16 62 | steps_per_epoch = len(input_tensor_train) // batch_size 63 | embedding_dim = 256 64 | units = 1024 65 | vocab_inp_size = len(inp_lang.word_index) + 1 66 | vocab_tar_size = len(targ_lang.word_index) + 1 67 | 68 | dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(buffer_size) 69 | dataset = dataset.batch(batch_size, drop_remainder=True) 70 | example_input_batch, example_target_batch = next(iter(dataset)) 71 | 72 | return dataset, vocab_inp_size, vocab_tar_size, embedding_dim, units, batch_size, example_input_batch, steps_per_epoch, targ_lang, max_length_targ, max_length_inp, inp_lang, targ_lang 73 | 74 | 75 | if __name__ == '__main__': 76 | 77 | parser = argparse.ArgumentParser() 78 | requiredNamed = parser.add_argument_group('required named arguments') 79 | requiredNamed.add_argument( 80 | '--input', dest='input', metavar='inputDirectory', help='dataset directory', required=True) 81 | requiredNamed.add_argument( 82 | '--output', dest='output', metavar='outputDirectory', help='dataset directory', required=True) 83 | requiredNamed.add_argument( 84 | '--inputstr', dest='inputstr', metavar='inputString', help='Input string for translation', required=False) 85 | args = parser.parse_args() 86 | input_dir = args.input 87 | output_dir = args.output 88 | 89 | data_gen(input_dir, output_dir) 90 | -------------------------------------------------------------------------------- /nspm/filter_dataset.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Script to filter data by a given criterion. 5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 2.0.0 10 | 11 | """ 12 | import argparse 13 | import collections 14 | import json 15 | import os 16 | import sys 17 | 18 | from generator_utils import encode, save_cache, extract_encoded_entities 19 | import importlib 20 | 21 | 22 | if __name__ == '__main__': 23 | parser = argparse.ArgumentParser() 24 | requiredNamed = parser.add_argument_group('required named arguments') 25 | requiredNamed.add_argument('--dataset', dest='dataset', metavar='data_300.en', help='dataset', required=True) 26 | requiredNamed.add_argument('--used_resources', dest='used_resources', metavar='used_resources.json', help='json file', required=True) 27 | requiredNamed.add_argument('--minimum', dest='minimum', metavar='15', help='minimum number of occurence', required=True) 28 | requiredNamed.add_argument('--comp', dest='comp', metavar='all|any', help='require minimum for all/any resources in the query', required=True) 29 | args = parser.parse_args() 30 | 31 | dataset_file = args.dataset 32 | used_resources_file = args.used_resources 33 | MINIMUM = int(args.minimum) 34 | COMP = any if args.comp == 'any' else all 35 | 36 | importlib.reload(sys) 37 | sys.setdefaultencoding("utf-8") 38 | 39 | 40 | dataset_root, _ = os.path.splitext(dataset_file) 41 | used_resources_root, _ = os.path.splitext(used_resources_file) 42 | filtered_sparql_file = '{}_filtered_{:d}_{}.sparql'.format(dataset_root, MINIMUM, COMP.__name__) 43 | filtered_en_file = '{}_filtered_{:d}_{}.en'.format(dataset_root, MINIMUM, COMP.__name__) 44 | 45 | used_resources = collections.Counter(json.loads(open(used_resources_file).read())) 46 | filtered_resources = [elem_cnt for elem_cnt in list(used_resources.items()) if elem_cnt[1] >= MINIMUM] 47 | save_cache('{}_filter_{:d}.json'.format(used_resources_root, MINIMUM), collections.Counter(dict(filtered_resources))) 48 | valid_encoded_resources = [encode(elem_cnt1[0]) for elem_cnt1 in filtered_resources] 49 | check = lambda encoded_entity : encoded_entity in valid_encoded_resources 50 | 51 | valid_lines = [] 52 | filtered_queries = [] 53 | with open(dataset_root+'.sparql', 'r') as sparql_file: 54 | for linenumber, line in enumerate(sparql_file): 55 | entities = extract_encoded_entities(line) 56 | valid = COMP(list(map(check, entities))) 57 | if valid: 58 | filtered_queries.append(line) 59 | valid_lines.append(linenumber) 60 | 61 | filtered_questions = [] 62 | with open(dataset_root+'.en', 'r') as en_file: 63 | for linenumber, line in enumerate(en_file): 64 | if linenumber in valid_lines: 65 | filtered_questions.append(line) 66 | 67 | with open(filtered_en_file, 'w') as filtered: 68 | filtered.writelines(filtered_questions) 69 | 70 | with open(filtered_sparql_file, 'w') as filtered: 71 | filtered.writelines(filtered_queries) 72 | -------------------------------------------------------------------------------- /nspm/generator_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Generator test unit. 
5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 1.0.0 10 | 11 | """ 12 | import generator 13 | import generator_utils 14 | import operator 15 | 16 | 17 | def test_extract_variables(): 18 | query = 'select distinct(?x) ?y where { ?x a C . ?x a ?y}' 19 | query2 = 'select distinct ?a where' 20 | 21 | result = generator_utils.extract_variables(query) 22 | result2 = generator_utils.extract_variables(query2) 23 | 24 | assert result == ['x', 'y'] 25 | assert result2 == ['a'] 26 | 27 | 28 | def test_single_resource_sort(): 29 | matches = [{'usages': [17]}, {'usages': [0]}, {'usages': [3]}, {'usages': [2]}, {'usages': [1]}] 30 | 31 | result = sorted(matches, key=generator.prioritize_usage) 32 | 33 | assert list(map(operator.itemgetter(0), list(map(operator.itemgetter('usages'), result)))) == [17, 3, 2, 1, 0 ] 34 | 35 | 36 | def test_couple_resource_sort(): 37 | matches = [{'usages': [17, 2]}, {'usages': [0, 0]}, {'usages': [3, 2]}, {'usages': [2, 2]}, {'usages': [1, 2]}] 38 | 39 | result = sorted(matches, key=generator.prioritize_usage) 40 | 41 | assert list(map(operator.itemgetter('usages'), result)) == [[17, 2], [3, 2], [2, 2], [1, 2], [0, 0]] 42 | 43 | 44 | def test_encoding(): 45 | original = 'SELECT ?city WHERE { ?m skos:broader dbc:Cities_in_Germany . ?city dct:subject ?m . ?city dbo:areaTotal ?area . ?b dbo:artist dbr:John_Halsey_(musician) } order by asc (?area)' 46 | expected_encoding = 'SELECT var_city WHERE brack_open var_m skos_broader dbc_Cities_in_Germany sep_dot var_city dct_subject var_m sep_dot var_city dbo_areaTotal var_area sep_dot var_b dbo_artist dbr_John_Halsey_ attr_open musician attr_close brack_close _oba_ var_area ' 47 | 48 | result = generator_utils.encode(original) 49 | 50 | assert result == expected_encoding 51 | assert str.strip(generator_utils.decode(result)) == original 52 | 53 | 54 | def test_shorten_query(): 55 | shorten = generator_utils.shorten_query 56 | 57 | assert shorten('ORDER BY var_area') == '_oba_ var_area' 58 | assert shorten('order by asc par_open var_area par_close') == '_oba_ var_area' 59 | assert shorten('order by desc attr_open var_area attr_close') == '_obd_ var_area' 60 | 61 | 62 | def test_normalize_predicates(): 63 | alt = 'dbp_placeOfBirth' 64 | 65 | assert generator_utils.normalize_predicates(alt) == 'dbo_birthPlace' -------------------------------------------------------------------------------- /nspm/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Dataset preparation. 5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 2.0.0 10 | 11 | """ 12 | import tensorflow as tf 13 | 14 | import unicodedata 15 | import re 16 | import io 17 | 18 | 19 | def unicode_to_ascii(s): 20 | return ''.join(c for c in unicodedata.normalize('NFD', s) 21 | if unicodedata.category(c) != 'Mn') 22 | 23 | 24 | def preprocess_sentence(w): 25 | w = unicode_to_ascii(w.strip()) 26 | 27 | # creating a space between a word and the punctuation following it 28 | # eg: "he is a boy." => "he is a boy ." 
29 | # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation 30 | w = re.sub(r"([?.!,¿])", r" \1 ", w) 31 | w = re.sub(r'[" "]+', " ", w) 32 | 33 | # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",") 34 | w = re.sub(r"[^a-zA-Z?.!,¿_]+", " ", w) 35 | 36 | w = w.strip() 37 | 38 | # adding a start and an end token to the sentence 39 | # so that the model know when to start and stop predicting. 40 | w = ' ' + w + ' ' 41 | return w 42 | 43 | 44 | def create_dataset(path, num_examples): 45 | lines = io.open(path, encoding='UTF-8').read().strip().split('\n') 46 | 47 | word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]] 48 | 49 | return zip(*word_pairs) 50 | 51 | 52 | def tokenize(lang): 53 | lang_tokenizer = tf.keras.preprocessing.text.Tokenizer( 54 | filters='',lower=False) 55 | lang_tokenizer.fit_on_texts(lang) 56 | 57 | tensor = lang_tokenizer.texts_to_sequences(lang) 58 | 59 | tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, 60 | padding='post') 61 | 62 | return tensor, lang_tokenizer 63 | 64 | 65 | def load_dataset(path, num_examples=None): 66 | # creating cleaned input, output pairs 67 | inp_lang_wp, targ_lang_wp = create_dataset(path, num_examples) 68 | 69 | input_tensor, inp_lang = tokenize(inp_lang_wp) 70 | target_tensor, targ_lang = tokenize(targ_lang_wp) 71 | 72 | return input_tensor, target_tensor, inp_lang, targ_lang 73 | 74 | 75 | def convert(lang, tensor): 76 | for t in tensor: 77 | if t!=0: 78 | print ("%d ----> %s" % (t, lang.index_word[t])) 79 | 80 | 81 | def merging_datafile(input_dir,output_dir): 82 | input_diren=input_dir+'/data.en' 83 | input_dirspq=input_dir+'/data.sparql' 84 | output_dir+='/data.txt' 85 | file1 = open(input_diren,'r',encoding="utf8") 86 | Lines1 = file1.readlines() 87 | file2 = open(input_dirspq,'r',encoding="utf8") 88 | Lines2 = file2.readlines() 89 | s=[] 90 | for i in range(len(Lines1)): 91 | s.append(Lines1[i].replace('\n'," ")+"\t "+Lines2[i]) 92 | 93 | filef = open(output_dir,'w',encoding="utf8") 94 | filef.writelines(s) 95 | file1.close() 96 | file2.close() 97 | filef.close() 98 | return output_dir 99 | -------------------------------------------------------------------------------- /nspm/split_in_train_dev_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | Neural SPARQL Machines - Script to split data into train, dev, and test sets. 
5 | 6 | 'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017 7 | https://arxiv.org/abs/1708.07624 8 | 9 | Version 2.0.0 10 | 11 | """ 12 | import argparse 13 | #!/usr/bin/env python 14 | import random 15 | import os 16 | import io 17 | 18 | TRAINING_PERCENTAGE = 80 19 | TEST_PERCENTAGE = 10 20 | DEV_PERCENTAGE = 10 21 | 22 | if __name__ == '__main__': 23 | parser = argparse.ArgumentParser() 24 | requiredNamed = parser.add_argument_group('required named arguments') 25 | requiredNamed.add_argument('--lines', dest='lines', metavar='lines', 26 | help='total number of lines (wc -l )', required=True) 27 | requiredNamed.add_argument('--dataset', dest='dataset', 28 | metavar='dataset.sparql', help='sparql dataset file', required=True) 29 | args = parser.parse_args() 30 | 31 | lines = int(args.lines) 32 | dataset_file = os.path.splitext(args.dataset)[0] 33 | sparql_file = dataset_file + '.sparql' 34 | en_file = dataset_file + '.en' 35 | 36 | random.seed() 37 | 38 | test_and_dev_percentage = sum([TEST_PERCENTAGE, DEV_PERCENTAGE]) 39 | number_of_test_and_dev_examples = int( 40 | lines * test_and_dev_percentage / 100) 41 | number_of_dev_examples = int( 42 | number_of_test_and_dev_examples * DEV_PERCENTAGE / test_and_dev_percentage) 43 | 44 | dev_and_test = random.sample(range(lines), number_of_test_and_dev_examples) 45 | dev = random.sample(dev_and_test, number_of_dev_examples) 46 | with io.open(sparql_file, encoding="utf-8") as original_sparql, io.open(en_file, encoding="utf-8") as original_en: 47 | sparql = original_sparql.readlines() 48 | english = original_en.readlines() 49 | 50 | dev_sparql_lines = [] 51 | dev_en_lines = [] 52 | train_sparql_lines = [] 53 | train_en_lines = [] 54 | test_sparql_lines = [] 55 | test_en_lines = [] 56 | 57 | for i in range(len(sparql)): 58 | sparql_line = sparql[i] 59 | en_line = english[i] 60 | if i in dev_and_test: 61 | if i in dev: 62 | dev_sparql_lines.append(sparql_line) 63 | dev_en_lines.append(en_line) 64 | else: 65 | test_sparql_lines.append(sparql_line) 66 | test_en_lines.append(en_line) 67 | else: 68 | train_sparql_lines.append(sparql_line) 69 | train_en_lines.append(en_line) 70 | 71 | with io.open('train.sparql', 'w', encoding="utf-8") as train_sparql, io.open('train.en', 'w', encoding="utf-8") as train_en, \ 72 | io.open('dev.sparql', 'w', encoding="utf-8") as dev_sparql, io.open('dev.en', 'w', encoding="utf-8") as dev_en, \ 73 | io.open('test.sparql', 'w', encoding="utf-8") as test_sparql, io.open('test.en', 'w', encoding="utf-8") as test_en: 74 | 75 | train_sparql.writelines(train_sparql_lines) 76 | train_en.writelines(train_en_lines) 77 | dev_sparql.writelines(dev_sparql_lines) 78 | dev_en.writelines(dev_en_lines) 79 | test_sparql.writelines(test_sparql_lines) 80 | test_en.writelines(test_en_lines) 81 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.9.0 2 | astor==0.8.1 3 | backports.functools-lru-cache==1.6.1 4 | backports.weakref==1.0rc1 5 | beautifulsoup4==4.8.2 6 | bleach==3.3.0 7 | enum34==1.1.9 8 | funcsigs==1.0.2 9 | gast==0.2.2 10 | grpcio==1.27.2 11 | h5py==2.10.0 12 | html5lib==0.9999999 13 | Markdown==3.1.1 14 | mock==3.0.5 15 | numpy==1.19.3 16 | protobuf==3.11.3 17 | pytest==6.2.2 18 | rdflib==5.0.0 19 | six==1.14.0 20 | soupsieve==1.9.5 21 | tensorboard==2.1.0 22 | tensorflow==2.4.0 23 | termcolor==1.1.0 24 | tqdm==4.56.0 25 | Werkzeug==1.0.1 26 | airML==0.0.3 
27 | -------------------------------------------------------------------------------- /test/interpreter_airml_test.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import unittest 3 | 4 | 5 | class TestAirML(unittest.TestCase): 6 | def test_airml_input_args_with_valid_kn(self): 7 | process = subprocess.Popen( 8 | ['python3', 'interpreter.py', "--airml", "http://nspm.org/art", "--output", "test", "--inputstr", 9 | '"yuncken freeman has architected in how many cities?"'], stdout=subprocess.PIPE) 10 | output, err = process.communicate() 11 | output = output.decode("utf-8") 12 | self.assertTrue("http://nspm.org/art KB installed." in output) 13 | self.assertTrue("Predicted translation:" in output) 14 | 15 | def test_airml_input_args_with_invalid_kn(self): 16 | process = subprocess.Popen( 17 | ['python3', 'interpreter.py', "--airml", "http://nspm.org/arts", "--output", "test", "--inputstr", 18 | '"yuncken freeman has architected in how many cities?"'], stdout=subprocess.PIPE) 19 | output, err = process.communicate() 20 | output = output.decode("utf-8") 21 | self.assertTrue("Predicted translation:" not in output) 22 | 23 | def test_airml_without_input_arg(self): 24 | process = subprocess.Popen( 25 | ['python3', 'interpreter.py', "--output", "test", "--inputstr", 26 | '"yuncken freeman has architected in how many cities?"'], stdout=subprocess.PIPE) 27 | output, err = process.communicate() 28 | output = output.decode("utf-8") 29 | self.assertTrue("--input or --airml argument should be provided to load the model." in output) 30 | self.assertTrue("Predicted translation:" not in output) 31 | 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | --------------------------------------------------------------------------------