├── 04.generate_knowledge_graph
│   ├── 16.paperfieldsofstudy.py
│   ├── 11.paperurls.py
│   ├── 17.paperrecommendations.py
│   ├── 20.paperabstracts.py
│   ├── 22.paperfieldsofstudynew.py
│   ├── 06.paperauthoraffiliations.py
│   ├── 08.paperreferences.py
│   ├── 21.papertag.py
│   ├── 12.entityrelatedentities.py
│   ├── 13.fieldofstudychildren.py
│   ├── 24.paperauthoraffiliations_disambiguated.py
│   ├── 25.authororcid.py
│   ├── 14.fieldofstudyextendedattributes.py
│   ├── 07.paperextendedattributes.py
│   ├── 09.paperresources.py
│   ├── 19.papercitationcontexts.py
│   ├── 04.conferenceseries.py
│   ├── 18.relatedfieldsofstudy.py
│   ├── 02.authors.py
│   ├── 23.authors_disambiguated.py
│   ├── 15.fieldsofstudy.py
│   ├── 05.journals.py
│   ├── 01.affiliations.py
│   ├── 03.conferenceinstances.py
│   ├── 10.papers.py
│   └── OWL file.xml
├── 03.statistical_analysis
│   ├── 13.paper_types.py
│   ├── 01.paper_by_year.py
│   ├── 08.author_activity.py
│   ├── 03.field_of_study_over_time.py
│   ├── 05.paper_by_year.py
│   ├── 10.number_of_authors_over_time.py
│   ├── 04.field_of_study_over_time_custom.py
│   ├── 02.average_author_coauthor.py
│   ├── 07.reference_citation_by_year.py
│   ├── 12.author_interdisciplinary_chord_graph.ipynb
│   ├── 09.author_fos.py
│   ├── 00.count_properties.py
│   └── 06.paper_citation_reference.py
├── 01.field_of_study_classification
│   ├── 02.extract_labels.py
│   ├── 03.generate_low_level_fos.py
│   ├── 05.generate_journal_fos_3.py
│   ├── 05.generate_journal_fos_1.py
│   ├── 05.generate_journal_fos_2.py
│   ├── 10.assign_fos_to_paper.py
│   ├── 01.tokenize_abstracts.py
│   ├── 11.keyword_extraction.py
│   ├── 07.generate_training_evaluation_data_sets.py
│   ├── 09.classification.py
│   ├── 04.generate_high_level_fos_1.py
│   ├── 08.training.py
│   ├── 00.create_abstract.py
│   ├── 04.generate_high_level_fos_2.py
│   └── 06.evaluate_with_journal_label.py
├── 02.knowledge_graph_embeddings
│   ├── 04.train_embedding.sh
│   ├── 00.prepare_author_input_graph.py
│   ├── 02.prepare_paper_input_graph.py
│   ├── 01.prepare_data_authors.py
│   └── 03.prepare_data_papers.py
├── 00.entity_resolution
│   ├── 02.extract_author_with_paper_id.py
│   ├── 03.extract_paper_with_author_id.py
│   ├── 01.extract_paper_id_with_doi.py
│   ├── 16.sort_doi.py
│   ├── 06.add_to_authors_paper_id.py
│   ├── 14.recreate_files.py
│   ├── 10.add_to_authors_year.py
│   ├── 07.add_to_authors_doi.py
│   ├── 17.doi_merge_orcid.py
│   ├── 05.paper_id_merge_author_ids.py
│   ├── 04.author_id_merge_paper_id.py
│   ├── 09.add_to_authors_titles.py
│   ├── 00.prepare_paper_references.py
│   ├── 12.add_to_authors_references.py
│   ├── 00.execute.sh
│   ├── 18.add_to_authors_orcid.py
│   ├── 11.add_to_authors_journal_and_conference.py
│   ├── 08.add_to_authors_coauthors.py
│   ├── 15.extract_orcid_data.py
│   ├── 19.disambiguation_evaluation.py
│   └── 13.disambiguation_data.py
└── README.md

/04.generate_knowledge_graph/16.paperfieldsofstudy.py:
--------------------------------------------------------------------------------
1 | with open("PaperFieldsOfStudy.txt", "r") as f:
2 |     with open("16.PaperFieldsOfStudy.nt", "w") as g:
3 |         for line in f:
4 |             PaperId, FieldOfStudyId, Score = line.strip("\n").split("\t")
5 |             g.write(f' .\n')
--------------------------------------------------------------------------------
/04.generate_knowledge_graph/11.paperurls.py:
--------------------------------------------------------------------------------
1 | with open("PaperUrls.txt", "r") as f:
2 |     with open("11.PaperUrls.nt", "w") as g:
3 |         for line in f:
4 |             PaperId = line.split("\t")[0]
5 |             SourceUrl = line.split("\t")[2]
6 |             g.write(f' "{SourceUrl}"^^ .\n')
7 | 
--------------------------------------------------------------------------------
/04.generate_knowledge_graph/17.paperrecommendations.py:
-------------------------------------------------------------------------------- 1 | with open("PaperRecommendations.txt", "r") as f: 2 | with open("17.PaperRecommendations.nt", "w") as g: 3 | for line in f: 4 | PaperId, RecommendedPaperId, Score = line.strip("\n").split("\t") 5 | g.write(f' .\n') -------------------------------------------------------------------------------- /03.statistical_analysis/13.paper_types.py: -------------------------------------------------------------------------------- 1 | type_dict = {} 2 | #Add file path for Papers.txt 3 | with open("Papers.txt", "r") as f: 4 | with open("13.paper_types.txt", "w") as g: 5 | for line in f: 6 | doctype = line.split("\t")[3] 7 | try: 8 | type_dict[doctype] += 1 9 | except KeyError: 10 | type_dict[doctype] = 1 11 | g.write(str(type_dict)) -------------------------------------------------------------------------------- /04.generate_knowledge_graph/20.paperabstracts.py: -------------------------------------------------------------------------------- 1 | with open("01.field_of_study_classification/00.paper_abstracts.txt", "r") as f: 2 | with open("20.PaperAbstracts.nt", "w") as g: 3 | for line in f: 4 | PaperId, PaperAbstract = line.strip("\n").split("\t") 5 | g.write(f' "{PaperAbstract}"^^ .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/22.paperfieldsofstudynew.py: -------------------------------------------------------------------------------- 1 | with open("01.field_of_study_classification/10.paperid_with_fos.txt", "r") as f: 2 | with open("22.PaperFieldsOfStudyNew.nt", "w") as g: 3 | for line in f: 4 | PaperId, FieldOfStudy = line.strip("\n").split("\t") 5 | g.write(f' .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/06.paperauthoraffiliations.py: -------------------------------------------------------------------------------- 1 | with open("PaperAuthorAffiliations.txt", "r") as f: 2 | with open("06.PaperAuthorAffiliations.nt", "w") as g: 3 | for line in f: 4 | PaperId = line.split("\t")[0] 5 | AuthorId = line.split("\t")[1] 6 | g.write(f' .\n') 7 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/08.paperreferences.py: -------------------------------------------------------------------------------- 1 | with open("PaperReferences.txt", "r") as f: 2 | with open("08.PaperReferences.nt", "w") as g: 3 | for line in f: 4 | PaperId = line.split("\t")[0].strip() 5 | PaperReferenceId = line.split("\t")[1].strip() 6 | g.write(f' .\n') 7 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/21.papertag.py: -------------------------------------------------------------------------------- 1 | with open("01.field_of_study_classification/11.paper_keywords.txt", "r") as f: 2 | with open("21.PaperTags.nt", "w") as g: 3 | for line in f: 4 | PaperId = line.split("\t")[0] 5 | PaperTag = line.strip("\n").split("\t")[3] 6 | g.write(f' "{PaperTag}"^^ .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/12.entityrelatedentities.py: -------------------------------------------------------------------------------- 1 | with open("EntityRelatedEntities.txt", "r") as f: 2 | with open("12.EntityRelatedEntities.nt", "w") as g: 3 | for line in f: 4 | EntityId = line.split("\t")[0] 5 | RelatedEntityId = line.split("\t")[2] 6 | g.write(f' .\n') 7 | 
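The conversion scripts in 04.generate_knowledge_graph all follow the same pattern: stream one tab-separated MAG dump file and emit one or more N-Triples statements per row. A minimal sketch of that pattern is shown below; the namespace constants and property names are illustrative assumptions only (the repository's actual vocabulary is defined in its OWL file), and string literals such as abstracts or tags need their quotes and backslashes escaped to remain valid N-Triples.

# Sketch of the shared TSV-to-N-Triples pattern; the URIs below are placeholders, not the project's.
EX_ENTITY = "http://example.org/entity/"      # hypothetical entity namespace
EX_PROPERTY = "http://example.org/property/"  # hypothetical property namespace
XSD_STRING = "http://www.w3.org/2001/XMLSchema#string"

def escape_literal(value: str) -> str:
    # Backslashes and double quotes must be escaped inside N-Triples string literals.
    return value.replace("\\", "\\\\").replace('"', '\\"')

with open("PaperFieldsOfStudy.txt", "r") as f, open("example_output.nt", "w") as g:
    for line in f:
        PaperId, FieldOfStudyId, Score = line.rstrip("\n").split("\t")
        g.write(f"<{EX_ENTITY}{PaperId}> <{EX_PROPERTY}hasFieldOfStudy> <{EX_ENTITY}{FieldOfStudyId}> .\n")
        if Score != "":
            g.write(f'<{EX_ENTITY}{PaperId}> <{EX_PROPERTY}fieldOfStudyScore> "{escape_literal(Score)}"^^<{XSD_STRING}> .\n')
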
-------------------------------------------------------------------------------- /01.field_of_study_classification/02.extract_labels.py: -------------------------------------------------------------------------------- 1 | #Add path to FieldsOfStudy.txt 2 | with open("FieldsOfStudy.txt", "r") as f: 3 | with open("02.labels.txt", "w") as g: 4 | index = 0 5 | for line in f: 6 | id = line.split("\t")[0] 7 | name = line.split("\t")[3] 8 | level = line.split("\t")[5] 9 | if level == "0": 10 | g.write(f"{id}\t{name}\t{index}\n") 11 | index += 1 -------------------------------------------------------------------------------- /03.statistical_analysis/01.paper_by_year.py: -------------------------------------------------------------------------------- 1 | year_dict = {} 2 | 3 | #Add file path to Papers.txt 4 | with open("Papers.txt", "r") as inp: 5 | for line in inp: 6 | year = line.split("\t")[7] 7 | try: 8 | year_dict[year] += 1 9 | except KeyError: 10 | year_dict[year] = 1 11 | with open("01.paper_year_distribution.txt", "w") as outp: 12 | for item in year_dict: 13 | outp.write(f"{item}\t{year_dict[item]}\n") 14 | -------------------------------------------------------------------------------- /01.field_of_study_classification/03.generate_low_level_fos.py: -------------------------------------------------------------------------------- 1 | fos_set = set() 2 | with open("02.labels.txt", "r") as f: 3 | for line in f: 4 | fos_set.add(line.split("\t")[0]) 5 | 6 | #Add path to sorted PaperFieldsOfStudy.txt 7 | with open("SortedPaperFieldsOfStudy.txt", "r") as f: 8 | with open("03.papers_with_direct_labels.txt", "w") as g: 9 | for line in f: 10 | if line.split("\t")[1] in fos_set: 11 | g.write(line) 12 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/13.fieldofstudychildren.py: -------------------------------------------------------------------------------- 1 | with open("FieldOfStudyChildren.txt", "r") as f: 2 | with open("13.FieldOfStudyChildren.nt", "w") as g: 3 | for line in f: 4 | FieldOfStudyId = line.split("\t")[0] 5 | ChildFieldOfStudyId = line.split("\t")[1].strip() 6 | g.write(f' .\n') 7 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/24.paperauthoraffiliations_disambiguated.py: -------------------------------------------------------------------------------- 1 | with open("00.entity_resolution/14.PaperAuthorAffiliations_new.txt", "r") as f: 2 | with open("24.PaperAuthorAffiliations_disambiguated.nt", "w") as g: 3 | for line in f: 4 | PaperId = line.split("\t")[0] 5 | AuthorId = line.split("\t")[1] 6 | g.write(f' .\n') 7 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/25.authororcid.py: -------------------------------------------------------------------------------- 1 | with open("00.entity_resolution/18.authors_with_orcid.txt", "r") as f: 2 | with open("25.AuthorORCID.nt", "w") as g: 3 | for line in f: 4 | AuthorId = line.split("\t")[0] 5 | ORCID = line.strip("\n").split("\t")[17] 6 | if not ORCID == "": 7 | g.write(f' <{ORCID}>^^ .\n') 8 | -------------------------------------------------------------------------------- /02.knowledge_graph_embeddings/04.train_embedding.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DGLBACKEND=pytorch dglke_train --model_name TransE_l2 --data_path 02.knowledge_graph_embeddings --dataset mag_author --data_files 
01.author_entities.dict 01.author_relations.dict 01.author_train.tsv 01.author_valid.tsv 01.author_test.tsv --format udd_hrt --batch_size 1000 --neg_sample_size 1000 --hidden_dim 100 --gamma 19.9 --lr 0.25 --max_step 1000000 --log_interval 100 --batch_size_eval 1000 --neg_sample_size_eval 1000 -adv --regularization_coef 1.00E-09 --gpu 0 1 2 3 4 5 6 7 --valid --test --mix_cpu_gpu 3 | -------------------------------------------------------------------------------- /02.knowledge_graph_embeddings/00.prepare_author_input_graph.py: -------------------------------------------------------------------------------- 1 | pred_list = ["", "", "", "", ""] 2 | 3 | #Add file path to Authors.nt 4 | with open("00.authors_input.txt", "w") as g: 5 | with open("Authors.nt", "r") as f: 6 | for line in f: 7 | pred = line.split(" ")[1] 8 | if pred in pred_list: 9 | g.write(line) 10 | 11 | -------------------------------------------------------------------------------- /00.entity_resolution/02.extract_author_with_paper_id.py: -------------------------------------------------------------------------------- 1 | line_count = 0 2 | 3 | print("Starting...") 4 | 5 | #Add path to PaperAuthorAffiliations.txt 6 | with open("PaperAuthorAffiliations.txt", "r") as inp: 7 | with open("02.author_id_with_paper_id.txt", "w") as outp: 8 | for line in inp: 9 | line_count += 1 10 | 11 | paper_id = line.split("\t")[0].strip() 12 | author_id = line.split("\t")[1].strip() 13 | outp.write(author_id + "\t" + paper_id + "\n") 14 | 15 | print(line_count) 16 | 17 | print("Finished.") 18 | -------------------------------------------------------------------------------- /00.entity_resolution/03.extract_paper_with_author_id.py: -------------------------------------------------------------------------------- 1 | line_count = 0 2 | 3 | print("Starting...") 4 | 5 | #Add path to PaperAuthorAffiliations.txt 6 | with open("PaperAuthorAffiliations.txt", "r") as inp: 7 | with open("03.paper_id_with_author_id.txt", "w") as outp: 8 | for line in inp: 9 | line_count += 1 10 | 11 | paper_id = line.split("\t")[0].strip() 12 | author_id = line.split("\t")[1].strip() 13 | outp.write(paper_id + "\t" + author_id + "\n") 14 | 15 | print(line_count) 16 | 17 | print("Finished.") 18 | -------------------------------------------------------------------------------- /01.field_of_study_classification/05.generate_journal_fos_3.py: -------------------------------------------------------------------------------- 1 | paper_label_dict = {} 2 | with open("05.paper_journal_labels.txt", "r") as f: 3 | for line in f: 4 | paper_label_dict[line.split("\t")[0].strip()] = line.split("\t")[1].strip() 5 | 6 | with open("00.paper_abstracts.txt", "r") as f: 7 | with open("05.papers_with_journal_labels.txt", "w") as g: 8 | for line in f: 9 | items = line.strip().split("\t") 10 | if items[0] in paper_label_dict: 11 | g.write(items[0] + "\t" + items[1] + "\t" + paper_label_dict[items[0]] + "\n") 12 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/14.fieldofstudyextendedattributes.py: -------------------------------------------------------------------------------- 1 | with open("FieldOfStudyExtendedAttributes.txt", "r") as f: 2 | with open("14.FieldOfStudyExtendedAttributes.nt", "w") as g: 3 | for line in f: 4 | FieldOfStudyId, AttributeType, AttributeValue = line.strip("\n").split("\t") 5 | g.write(f' "{AttributeType}"^^ .\n') 6 | g.write(f' "{AttributeValue}"^^ .\n') 
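The dglke_train call in 02.knowledge_graph_embeddings/04.train_embedding.sh expects a user-defined dataset (--format udd_hrt): an entity dictionary, a relation dictionary, and train/valid/test files whose columns are head, relation, tail. Below is a rough sketch of how such inputs could be derived from the filtered triples in 00.authors_input.txt; the 90/5/5 split and the dictionary layout are assumptions made here, and the repository's 01.prepare_data_authors.py / 03.prepare_data_papers.py may do this differently.

import random

# Sketch: turn filtered N-Triples into DGL-KE "udd_hrt" inputs (head \t relation \t tail).
# Split ratios and the "<id>\t<name>" dictionary layout are assumptions.
entities, relations, triples = {}, {}, []

with open("00.authors_input.txt", "r") as f:
    for line in f:
        parts = line.rstrip(" .\n").split(" ", 2)
        if len(parts) != 3:
            continue
        head, rel, tail = parts
        entities.setdefault(head, len(entities))
        entities.setdefault(tail, len(entities))
        relations.setdefault(rel, len(relations))
        triples.append((head, rel, tail))

random.shuffle(triples)
n_valid = n_test = len(triples) // 20  # assumed 90/5/5 split
valid = triples[:n_valid]
test = triples[n_valid:n_valid + n_test]
train = triples[n_valid + n_test:]

with open("01.author_entities.dict", "w") as f:
    for name, idx in entities.items():
        f.write(f"{idx}\t{name}\n")
with open("01.author_relations.dict", "w") as f:
    for name, idx in relations.items():
        f.write(f"{idx}\t{name}\n")
for filename, split in [("01.author_train.tsv", train), ("01.author_valid.tsv", valid), ("01.author_test.tsv", test)]:
    with open(filename, "w") as f:
        for h, r, t in split:
            f.write(f"{h}\t{r}\t{t}\n")
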
-------------------------------------------------------------------------------- /01.field_of_study_classification/05.generate_journal_fos_1.py: -------------------------------------------------------------------------------- 1 | labels_mapping = {} 2 | with open("02.labels.txt", "r") as f: 3 | for line in f: 4 | labels_mapping[line.split("\t")[1].strip().lower()] = line.split("\t")[2].strip() 5 | 6 | labels_list = [*labels_mapping.keys()] 7 | 8 | #Add path to Journals.txt 9 | with open("Journals.txt", "r") as f: 10 | with open("05.journals_label.txt", "w") as g: 11 | for line in f: 12 | journal = line.split("\t")[3].strip().lower() 13 | for label in labels_list: 14 | if " " + label + " " in journal or " " + label + "s " in journal: 15 | g.write(line.strip() + "\t" + labels_mapping[label] + "\n") 16 | -------------------------------------------------------------------------------- /01.field_of_study_classification/05.generate_journal_fos_2.py: -------------------------------------------------------------------------------- 1 | journal_labels = {} 2 | with open("05.journals_label.txt", "r") as f: 3 | for line in f: 4 | journal_labels[line.split("\t")[0].strip()] = line.split("\t")[10].strip() 5 | 6 | #Add path to Papers.txt 7 | line_count = 1 8 | with open("Papers.txt", "r") as f: 9 | with open("05.paper_journal_labels.txt", "w") as g: 10 | for line in f: 11 | print(line_count) 12 | paper_id = line.split("\t")[0].strip() 13 | journal_id = line.split("\t")[10].strip() 14 | if journal_id in journal_labels: 15 | g.write(paper_id + "\t" + journal_labels[journal_id] + "\n") 16 | line_count += 1 17 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/07.paperextendedattributes.py: -------------------------------------------------------------------------------- 1 | with open("PaperExtendedAttributes.txt", "r") as f: 2 | with open("07.PaperExtendedAttributes.nt", "w") as g: 3 | for line in f: 4 | PaperId, AttributeType, AttributeValue = line.strip("\n").split("\t") 5 | if not AttributeType == "": 6 | g.write(f' "{AttributeType}"^^ .\n') 7 | if not AttributeValue == "": 8 | g.write(f' "{AttributeValue}"^^ .\n') 9 | -------------------------------------------------------------------------------- /01.field_of_study_classification/10.assign_fos_to_paper.py: -------------------------------------------------------------------------------- 1 | def label_fos(input_file, output_file): 2 | with open("02.labels.txt", "r") as f: 3 | label_dict = {} 4 | for line in f: 5 | label_dict[line.strip().split("\t")[2]] = line.split("\t")[0] 6 | with open(f"{input_file}.txt", "r") as f: 7 | label_list = [] 8 | for line in f: 9 | label_list.append(line.strip()) 10 | with open(output_file, "a") as f: 11 | with open(input_file, "r") as g: 12 | for index, line in enumerate(g): 13 | paperid = line.split("\t")[0] 14 | f.write(f"{paperid}\t{label_dict[label_list[index]]}\n") 15 | 16 | f = open("10.paperid_with_fos.txt", "w") 17 | f.close() 18 | 19 | label_fos("01.tokenized_abstracts.txt", "10.paperid_with_fos.txt") 20 | -------------------------------------------------------------------------------- /00.entity_resolution/01.extract_paper_id_with_doi.py: -------------------------------------------------------------------------------- 1 | list_of_paper_id = [] 2 | line_count = 0 3 | 4 | print("Starting...") 5 | 6 | #Add path to Papers.txt 7 | with open("Papers.txt", "r") as inp: 8 | for line in inp: 9 | line_count += 1 10 | 11 | paper_id = line.split("\t")[0].strip() 12 
| doi = line.split("\t")[2].strip() 13 | if not doi == "": 14 | list_of_paper_id.append((int(paper_id), doi)) 15 | 16 | print(line_count) 17 | 18 | print("Start sorting...") 19 | list_of_paper_id.sort(key=lambda tup: tup[0]) 20 | print("Finished sorting.") 21 | 22 | print("Start writing...") 23 | with open("01.paper_id_with_doi_sorted.txt", "w") as outp: 24 | for item in list_of_paper_id: 25 | outp.write(str(item[0]) + "\t" + item[1] + "\n") 26 | 27 | print("Finished.") 28 | -------------------------------------------------------------------------------- /03.statistical_analysis/08.author_activity.py: -------------------------------------------------------------------------------- 1 | years_dict = {} 2 | for year in range(1800, 2022): 3 | years_dict[year] = set() 4 | 5 | with open("00.entity_resolution/10.authors_with_year.txt", "r") as f: 6 | for line in f: 7 | authorid = line.split("\t")[0] 8 | years = line.split("\t")[-1].strip() 9 | if not years == "": 10 | f1 = lambda x: x 11 | f2 = lambda x: max(x-1, 1800) 12 | f3 = lambda x: min(x+1, 2021) 13 | years_list = list(map(int, years.split(","))) 14 | active_years = [f(year) for year in years_list for f in (f1, f2, f3)] 15 | for year in active_years: 16 | years_dict[year].add(authorid) 17 | with open("08.author_activity.txt", "w") as f: 18 | for item in years_dict: 19 | f.write(f"{item}\t{len(years_dict[item])}\n") 20 | -------------------------------------------------------------------------------- /03.statistical_analysis/03.field_of_study_over_time.py: -------------------------------------------------------------------------------- 1 | with open("01.field_of_study_classification/03.papers_with_direct_labels.txt", "r") as f: 2 | fos_dict = {} 3 | for line in f: 4 | paperid = line.split("\t")[0] 5 | fos = line.split("\t")[1] 6 | fos_dict[paperid] = fos 7 | 8 | with open("01.field_of_study_classification/02.labels.txt", "r") as f: 9 | labels = set() 10 | for line in f: 11 | labels.add(line.split("\t")[0]) 12 | 13 | for item in labels: 14 | f = open(f"03.field_of_study_over_time/{item}.txt", "w") 15 | f.close() 16 | 17 | #Add file path for Papers.txt 18 | with open("Papers.txt", "r") as f: 19 | for line in f: 20 | paperid = line.split("\t")[0] 21 | try: 22 | with open(f"03.field_of_study_over_time/{fos_dict[paperid]}.txt", "a") as g: 23 | g.write(line) 24 | except KeyError: 25 | pass 26 | 27 | 28 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/09.paperresources.py: -------------------------------------------------------------------------------- 1 | with open("PaperResources.txt", "r") as f: 2 | with open("09.PaperResources.nt", "w") as g: 3 | for line in f: 4 | PaperId, ResourceType, ResourceUrl, SourceUrl, RelationshipType = line.strip("\n").split("\t") 5 | if not ResourceType == "": 6 | g.write(f' "{ResourceType}"^^ .\n') 7 | if not ResourceUrl == "": 8 | g.write(f' "{ResourceUrl}"^^ .\n') 9 | if not RelationshipType == "": 10 | g.write(f' "{RelationshipType}"^^ .\n') -------------------------------------------------------------------------------- /03.statistical_analysis/05.paper_by_year.py: -------------------------------------------------------------------------------- 1 | def by_year(input_file): 2 | year_dict = {} 3 | with open(f"{input_file}.txt", "r") as inp: 4 | for line in inp: 5 | year = line.split("\t")[7] 6 | try: 7 | year_dict[year] += 1 8 | except KeyError: 9 | year_dict[year] = 1 10 | with open(f"{input_file}_output.txt", "w") as outp: 11 | for item in 
year_dict: 12 | outp.write(f"{item}\t{year_dict[item]}\n") 13 | 14 | by_year(121332964) 15 | by_year(138885662) 16 | by_year(144133560) 17 | by_year(17744445) 18 | by_year(205649164) 19 | by_year(41008148) 20 | by_year(95457728) 21 | by_year(127313418) 22 | by_year(142362112) 23 | by_year(15744967) 24 | by_year(185592680) 25 | by_year(33923547) 26 | by_year(71924100) 27 | by_year(127413603) 28 | by_year(144024400) 29 | by_year(162324750) 30 | by_year(192562407) 31 | by_year(39432304) 32 | by_year(86803240) 33 | -------------------------------------------------------------------------------- /00.entity_resolution/16.sort_doi.py: -------------------------------------------------------------------------------- 1 | list_of_doi_orcid = [] 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("15.orcid_title_doi.txt", "r") as inp: 7 | for line in inp: 8 | print(line_count) 9 | 10 | try: 11 | orcid, name, title, doi = map(str.strip, line.split("\t")) 12 | if not doi == "" and not name == "": 13 | list_of_doi_orcid.append((doi.replace("(", "").replace(")", "").replace("http://dx.doi.org/", ""), name.strip(), orcid.strip())) 14 | except ValueError: 15 | pass 16 | 17 | line_count += 1 18 | 19 | print("Start sorting...") 20 | list_of_doi_orcid.sort(key=lambda tup: tup[0]) 21 | print("Finished sorting.") 22 | 23 | with open("16.doi_orcid_sorted.txt", "w") as outp: 24 | for item in list_of_doi_orcid: 25 | for name in item[1].split(";"): 26 | outp.write(item[0] + "\t" + name + "\t" + item[2] + "\n") 27 | 28 | print("Finished.") 29 | -------------------------------------------------------------------------------- /03.statistical_analysis/10.number_of_authors_over_time.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | 3 | with open("00.entity_resolution/05.paper_id_with_merged_author_id.txt", "r") as f: 4 | paper_dict = {} 5 | for line in f: 6 | paperid = line.split("\t")[0] 7 | authors = len(line.strip().split("\t")[1].split(",")) 8 | paper_dict[paperid] = authors 9 | 10 | 11 | #Add file path for Papers.txt 12 | with open("Papers.txt", "r") as f: 13 | year_dict = {year: [] for year in range(1800, 2022)} 14 | for line in f: 15 | paperid = line.split("\t")[0] 16 | year = line.split("\t")[7] 17 | if not year == "": 18 | try: 19 | year_dict[int(year)].append(paper_dict[paperid]) 20 | except KeyError: 21 | pass 22 | 23 | with open("10.author_number_by_year.txt", "w") as f: 24 | for item in year_dict: 25 | f.write(f"{item}\t{statistics.mean(year_dict[item])}\t{max(year_dict[item])}\n") 26 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/19.papercitationcontexts.py: -------------------------------------------------------------------------------- 1 | with open("PaperCitationContexts.txt", "r") as f: 2 | with open("19.PaperCitationContexts.nt", "w") as g: 3 | for line in f: 4 | PaperId, PaperReferenceId, CitationContext = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | g.write(f' .\n') 7 | g.write(f' .\n') 8 | g.write(f' "{CitationContext}"^^ .\n') -------------------------------------------------------------------------------- /01.field_of_study_classification/01.tokenize_abstracts.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | from nltk.stem import PorterStemmer 4 | from nltk.tokenize import word_tokenize 5 | import string 6 | 7 | nltk.download('stopwords') 8 | 
nltk.download('punkt') 9 | 10 | stop_words = set(stopwords.words('english')) 11 | 12 | line_count = 1 13 | 14 | with open("00.paper_abstracts.txt", "r") as inp: 15 | with open("01.tokenized_abstracts.txt", "w") as outp: 16 | for line in inp: 17 | print(line_count) 18 | abstract = line.split("\t")[1].strip() 19 | temp_abstract = abstract.translate(str.maketrans('', '', string.punctuation)) 20 | word_tokens = word_tokenize(temp_abstract) 21 | ps = PorterStemmer() 22 | tokenized_abstract = [ps.stem(w) for w in word_tokens if not w in stop_words] 23 | outp.write(line.split("\t")[0] + "\t" + " ".join(tokenized_abstract) + "\n") 24 | line_count += 1 25 | -------------------------------------------------------------------------------- /00.entity_resolution/06.add_to_authors_paper_id.py: -------------------------------------------------------------------------------- 1 | dict_of_paperids = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("04.author_id_with_merged_paper_id.txt", "r") as inp: 7 | for line in inp: 8 | print("Loading: " + str(line_count)) 9 | 10 | author_id = line.split("\t")[0].strip() 11 | paper_ids = line.split("\t")[1].strip() 12 | dict_of_paperids[author_id] = paper_ids 13 | 14 | line_count += 1 15 | 16 | line_count = 1 17 | 18 | #Add path to Authors.txt 19 | with open("00.Authors.txt", "r") as inp: 20 | with open("06.authors_with_paper_id.txt", "w") as outp: 21 | for line in inp: 22 | print("Searching: " + str(line_count)) 23 | 24 | current_author = line.split("\t")[0].strip() 25 | try: 26 | outp.write(line.strip("\n") + "\t" + dict_of_paperids[current_author] + "\n") 27 | except KeyError: 28 | outp.write(line.strip("\n") + "\t\n") 29 | 30 | line_count += 1 31 | 32 | print("Finished.") 33 | -------------------------------------------------------------------------------- /01.field_of_study_classification/11.keyword_extraction.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import pytextrank 3 | import multiprocessing 4 | 5 | nlp = spacy.load("en_core_web_sm") 6 | tr = pytextrank.TextRank() 7 | nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True) 8 | 9 | def extraction(item): 10 | output = "" 11 | paperid = item[0] 12 | text = item[1] 13 | doc = nlp(text) 14 | for p in doc._.phrases[:5]: 15 | output += f"{paperid}\t{p.rank:.4f}\t{p.count:5d}\t{p.text}\n" 16 | return output 17 | 18 | line_count = 1 19 | with open(f"00.paper_abstracts.txt", "r") as f: 20 | abstracts = [] 21 | for line in f: 22 | print("Loading: " + str(line_count)) 23 | paperid = line.strip().split("\t")[0] 24 | text = line.strip().replace(paperid, "").replace("\t", " ") 25 | abstracts.append((paperid, text)) 26 | line_count += 1 27 | 28 | with open(f"11.paper_keywords.txt", "w") as f: 29 | p = multiprocessing.Pool(8) 30 | for result in p.imap(extraction, abstracts): 31 | f.write(result) 32 | 33 | -------------------------------------------------------------------------------- /03.statistical_analysis/04.field_of_study_over_time_custom.py: -------------------------------------------------------------------------------- 1 | with open("/pfs/work7/workspace/scratch/utdkf-ws_lin-0/3.field_of_study/3.mag_journal_labels/bert/0.data/paperid_with_fos.txt", "r") as f: 2 | fos_dict = {} 3 | for line in f: 4 | paperid = line.split("\t")[0] 5 | fos = line.split("\t")[1].strip() 6 | fos_dict[paperid] = fos 7 | 8 | with open("/pfs/work7/workspace/scratch/utdkf-ws_lin-0/3.field_of_study/3.mag_journal_labels/00.labels.txt", "r") as f: 9 | labels 
= set() 10 | for line in f: 11 | labels.add(line.split("\t")[0]) 12 | 13 | for item in labels: 14 | f = open(f"04.field_of_study_over_time/{item}.txt", "w") 15 | f.close() 16 | 17 | with open("/pfs/work7/workspace/scratch/utdkf-ws_lin-0/0.data/0.mag_20200619/mag/Papers.txt", "r") as f: 18 | for line in f: 19 | paperid = line.split("\t")[0] 20 | try: 21 | with open(f"04.field_of_study_over_time/{fos_dict[paperid]}.txt", "a") as g: 22 | g.write(line) 23 | except KeyError: 24 | pass 25 | 26 | 27 | -------------------------------------------------------------------------------- /03.statistical_analysis/02.average_author_coauthor.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | 3 | with open("00.entity_resolution/05.paper_id_with_merged_author_id.txt", "r") as f: 4 | author_count = [len(line.strip().split("\t")[1].split(",")) for line in f] 5 | with open("00.entity_resolution/04.author_id_with_merged_paper_id.txt", "r") as f: 6 | paper_count = [len(line.strip().split("\t")[1].split(",")) for line in f] 7 | with open("00.entity_resolution/08.authors_with_co_authors.txt", "r") as f: 8 | coauthor_count = [len(line.strip("\n").split("\t")[11].split(",")) for line in f] 9 | 10 | with open("02.author_paper_average.txt", "w") as f: 11 | f.write(f"Average author per paper: {statistics.mean(author_count)}\n") 12 | f.write(f"Maximum author per paper: {max(author_count)}\n") 13 | f.write(f"Average paper per author: {statistics.mean(paper_count)}\n") 14 | f.write(f"Maximum paper per author: {max(paper_count)}\n") 15 | f.write(f"Average coauthor per author: {statistics.mean(coauthor_count)}\n") 16 | f.write(f"Maximum coauthor per author: {max(coauthor_count)}\n") 17 | -------------------------------------------------------------------------------- /00.entity_resolution/14.recreate_files.py: -------------------------------------------------------------------------------- 1 | id_mapping = {} 2 | with open("13.all_positives.txt", "r") as f: 3 | for line in f: 4 | id_mapping[line.split("\t")[0].strip()] = line.split("\t")[1].strip() 5 | line_count = 1 6 | 7 | #Add path to PaperAuthorAffiliations.txt 8 | with open("PaperAuthorAffiliations.txt", "r") as f: 9 | with open("14.PaperAuthorAffiliations_new.txt", "w") as g: 10 | for line in f: 11 | print("PaperAuthorAffiliations: " + str(line_count)) 12 | author_id = line.split("\t")[1].strip() 13 | if author_id in id_mapping: 14 | g.write(line.replace(author_id, id_mapping[author_id])) 15 | else: 16 | g.write(line) 17 | line_count += 1 18 | line_count = 1 19 | with open("13.disambiguated_file.txt", "r") as f: 20 | with open("14.Authors_new.txt", "w") as g: 21 | for line in f: 22 | print("Authors: " + str(line_count)) 23 | items = line.strip().split("\t") 24 | g.write("\t".join(items[0:9]) + "\n") 25 | line_count += 1 26 | -------------------------------------------------------------------------------- /01.field_of_study_classification/07.generate_training_evaluation_data_sets.py: -------------------------------------------------------------------------------- 1 | with open("05.papers_with_journal_labels.txt", "r") as f: 2 | with open("train.csv", "w") as g: 3 | with open("eval.csv", "w") as h: 4 | fos_dict = {} 5 | line_count = 1 6 | for line in f: 7 | print(line_count) 8 | fos = line.split("\t")[2].strip() 9 | try: 10 | if fos_dict[fos] < 2000: 11 | h.write(line.split("\t")[1] + " ," + fos + "\n") 12 | fos_dict[fos] += 1 13 | elif fos_dict[fos] < 22000: 14 | g.write(line.split("\t")[1] + " ," + fos + "\n") 
15 | fos_dict[fos] += 1 16 | else: 17 | pass 18 | except KeyError: 19 | h.write(line.split("\t")[1] + " ," + fos + "\n") 20 | fos_dict[fos] = 1 21 | line_count += 1 22 | for item in fos_dict: 23 | print(str(item) + "\t" + str(fos_dict[item]-2000)) 24 | -------------------------------------------------------------------------------- /00.entity_resolution/10.add_to_authors_year.py: -------------------------------------------------------------------------------- 1 | dict_of_years = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | #Add path to Papers.txt 7 | with open("Papers.txt", "r") as inp: 8 | for line in inp: 9 | print("Loading: " + str(line_count)) 10 | 11 | paper_id = line.split("\t")[0].strip() 12 | year = line.split("\t")[7].strip() 13 | dict_of_years[paper_id] = year 14 | 15 | line_count += 1 16 | 17 | line_count = 1 18 | 19 | with open("09.authors_with_titles.txt", "r") as inp: 20 | with open("10.authors_with_year.txt", "w") as outp: 21 | for line in inp: 22 | print("Searching: " + str(line_count)) 23 | 24 | paper_ids = line.split("\t")[9].strip().split(",") 25 | years = set() 26 | for paper_id in paper_ids: 27 | try: 28 | years.add(dict_of_years[paper_id]) 29 | except KeyError: 30 | pass 31 | outp.write(line.strip("\n") + "\t" + ",".join(years) + "\n") 32 | 33 | line_count += 1 34 | 35 | print("Finished.") 36 | 37 | -------------------------------------------------------------------------------- /00.entity_resolution/07.add_to_authors_doi.py: -------------------------------------------------------------------------------- 1 | dict_of_dois = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("01.paper_id_with_doi_sorted.txt", "r") as inp: 7 | for line in inp: 8 | print("Loading: " + str(line_count)) 9 | 10 | paper_id = line.split("\t")[0].strip() 11 | doi = line.split("\t")[1].strip() 12 | dict_of_dois[paper_id] = doi 13 | 14 | line_count += 1 15 | 16 | line_count = 1 17 | 18 | with open("06.authors_with_paper_id.txt", "r") as inp: 19 | with open("07.authors_with_paper_doi.txt", "w") as outp: 20 | for line in inp: 21 | print("Searching: " + str(line_count)) 22 | 23 | paper_ids = line.split("\t")[9].strip() 24 | current_dois = [] 25 | for current_id in paper_ids.split(","): 26 | try: 27 | current_dois.append(dict_of_dois[current_id]) 28 | except KeyError: 29 | pass 30 | outp.write(line.strip("\n") + "\t" + ",".join(current_dois) + "\n") 31 | 32 | line_count += 1 33 | 34 | print("Finished.") 35 | -------------------------------------------------------------------------------- /00.entity_resolution/17.doi_merge_orcid.py: -------------------------------------------------------------------------------- 1 | line_count = 1 2 | 3 | print("Starting...") 4 | 5 | with open("16.doi_orcid_sorted.txt", "r") as inp: 6 | with open("17.doi_with_merged_orcid.txt", "w") as outp: 7 | current_doi = "" 8 | current_orcids = "" 9 | for line in inp: 10 | print(line_count) 11 | 12 | doi = line.split("\t")[0].strip() 13 | name = line.split("\t")[1].strip() 14 | orcid = line.split("\t")[2].strip() 15 | 16 | if not name.strip() == "": 17 | if doi == current_doi: 18 | current_orcids += (";" + (name + "," + orcid)) 19 | elif current_doi == "": 20 | current_doi = doi 21 | current_orcids = name + "," + orcid 22 | else: 23 | outp.write(current_doi + "\t" + current_orcids + "\n") 24 | current_doi = doi 25 | current_orcids = name + "," + orcid 26 | 27 | line_count += 1 28 | 29 | outp.write(current_doi + "\t" + current_orcids) 30 | 31 | print("Finished.") 32 | 
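Scripts 06 through 12 of the entity-resolution pipeline enrich the author table in the same way: load a paper-keyed lookup into memory, stream the current authors file, and append one more tab-separated column. A generic version of that step could look like the sketch below; the helper name and its defaults are illustrative, and the repository itself uses one dedicated script per column.

def append_column(authors_in, authors_out, lookup_file, key_index=9, sep=","):
    # Hypothetical helper generalizing the add_to_authors_* scripts: each author row
    # carries a comma-separated list of paper ids in column key_index; paper ids
    # missing from the lookup simply contribute nothing.
    lookup = {}
    with open(lookup_file, "r") as f:
        for line in f:
            parts = line.rstrip("\n").split("\t")
            if len(parts) >= 2:
                lookup[parts[0]] = parts[1]
    with open(authors_in, "r") as inp, open(authors_out, "w") as outp:
        for line in inp:
            keys = line.split("\t")[key_index].strip().split(",")
            values = [lookup[k] for k in keys if k in lookup]
            outp.write(line.rstrip("\n") + "\t" + sep.join(values) + "\n")

# The DOI step (07.add_to_authors_doi.py) is roughly equivalent to:
# append_column("06.authors_with_paper_id.txt", "07.authors_with_paper_doi.txt",
#               "01.paper_id_with_doi_sorted.txt")
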
-------------------------------------------------------------------------------- /01.field_of_study_classification/09.classification.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from simpletransformers.classification import ClassificationModel 3 | import os 4 | import pickle 5 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 6 | 7 | print("Loading model...") 8 | model = ClassificationModel('bert', 'scibert_20000_input_2_epoch/outputs/best_model', num_labels=19, use_cuda=True, args={"fp16": False, "n_gpu": 4, "eval_batch_size": 4096}) 9 | print("Finished loading model...") 10 | 11 | def classify(inp, model): 12 | print(f"Loading input {inp}...") 13 | with open(f"{inp}", "r") as f: 14 | input_list = [] 15 | for line in f: 16 | input_list.append(line.split(",")[0]) 17 | print("Finished loading input.") 18 | print(f"Classification in progress for {inp}...") 19 | with open(f"{inp}labeled.txt", "w") as g: 20 | predictions, raw_outputs = model.predict(input_list) 21 | for item in predictions: 22 | g.write(f"{item}\n") 23 | print("Finished classification") 24 | 25 | #If necessary, split the input file into multiple parts and classify in sequence 26 | classify("01.tokenized_abstracts.txt", model) 27 | -------------------------------------------------------------------------------- /00.entity_resolution/05.paper_id_merge_author_ids.py: -------------------------------------------------------------------------------- 1 | line_count = 1 2 | 3 | print("Starting...") 4 | 5 | with open("03.paper_id_with_author_id_sorted.txt", "r") as inp: 6 | with open("05.paper_id_with_merged_author_id.txt", "w") as outp: 7 | current_paper_id = "" 8 | current_authors = "" 9 | 10 | for line in inp: 11 | print(line_count) 12 | 13 | paper_id = line.split("\t")[0].strip() 14 | author_id = line.split("\t")[1].strip() 15 | 16 | if paper_id == current_paper_id: 17 | current_authors += (";" + author_id) 18 | elif current_paper_id == "": 19 | current_paper_id = paper_id 20 | current_authors = author_id 21 | else: 22 | list_of_authors = ",".join(current_authors.split(";")).strip(",") 23 | outp.write(current_paper_id + "\t" + list_of_authors + "\n") 24 | current_paper_id = paper_id 25 | current_authors = author_id 26 | 27 | line_count += 1 28 | 29 | outp.write(current_paper_id + "\t" + list_of_authors + "\n") 30 | 31 | print("Finished.") 32 | -------------------------------------------------------------------------------- /00.entity_resolution/04.author_id_merge_paper_id.py: -------------------------------------------------------------------------------- 1 | line_count = 1 2 | 3 | print("Starting...") 4 | 5 | with open("02.author_id_with_paper_id_sorted.txt", "r") as inp: 6 | with open("04.author_id_with_merged_paper_id.txt", "w") as outp: 7 | current_author_id = "" 8 | current_papers = "" 9 | 10 | for line in inp: 11 | print(line_count) 12 | 13 | paper_id = line.split("\t")[1].strip() 14 | author_id = line.split("\t")[0].strip() 15 | 16 | if author_id == current_author_id: 17 | current_papers += (";" + paper_id) 18 | elif current_author_id == "": 19 | current_author_id = author_id 20 | current_papers = paper_id 21 | else: 22 | list_of_papers = ",".join(current_papers.split(";")).strip(",") 23 | outp.write(current_author_id + "\t" + list_of_papers + "\n") 24 | current_author_id = author_id 25 | current_papers = paper_id 26 | 27 | line_count += 1 28 | 29 | outp.write(current_author_id + "\t" + list_of_papers + "\n") 30 | 31 | print("Finished.") 32 | 
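The merge scripts (00.prepare_paper_references.py, 04.author_id_merge_paper_id.py, 05.paper_id_merge_author_ids.py) collapse a sorted two-column file into one line per key by comparing each row with the previous one. The same single pass can be written with itertools.groupby, which only works because 00.execute.sh sorts the input on the key column first; this is an equivalent sketch, not the code the pipeline actually runs.

from itertools import groupby

# Equivalent of 04.author_id_merge_paper_id.py: the input must already be sorted
# on the first column, otherwise groupby produces fragmented groups.
def merge_sorted_pairs(infile, outfile):
    with open(infile, "r") as inp, open(outfile, "w") as outp:
        rows = (line.rstrip("\n").split("\t") for line in inp)
        for key, group in groupby(rows, key=lambda row: row[0]):
            values = ",".join(row[1].strip() for row in group)
            outp.write(f"{key}\t{values}\n")

merge_sorted_pairs("02.author_id_with_paper_id_sorted.txt",
                   "04.author_id_with_merged_paper_id.txt")
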
-------------------------------------------------------------------------------- /00.entity_resolution/09.add_to_authors_titles.py: -------------------------------------------------------------------------------- 1 | dict_of_titles = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | #Add path to Papers.txt 7 | with open("Papers.txt", "r") as inp: 8 | for line in inp: 9 | print("Loading: " + str(line_count)) 10 | 11 | paper_id = line.split("\t")[0].strip() 12 | paper_title = line.split("\t")[4].strip() 13 | book_title = line.split("\t")[6].strip() 14 | dict_of_titles[paper_id] = paper_title + book_title 15 | 16 | line_count += 1 17 | 18 | line_count = 1 19 | 20 | with open("08.authors_with_co_authors.txt", "r") as inp: 21 | with open("09.authors_with_titles.txt", "w") as outp: 22 | for line in inp: 23 | print("Searching: " + str(line_count)) 24 | 25 | paper_ids = line.split("\t")[9].strip().split(",") 26 | titles = [] 27 | for paper_id in paper_ids: 28 | try: 29 | titles.append(dict_of_titles[paper_id]) 30 | except KeyError: 31 | pass 32 | outp.write(line.strip("\n") + "\t" + ",".join(titles) + "\n") 33 | 34 | line_count += 1 35 | 36 | print("Finished.") 37 | 38 | -------------------------------------------------------------------------------- /00.entity_resolution/00.prepare_paper_references.py: -------------------------------------------------------------------------------- 1 | line_count = 1 2 | 3 | print("Starting...") 4 | 5 | #Add path to PaperReferences.txt 6 | with open("PaperReferences.txt", "r") as inp: 7 | with open("00.paper_id_with_merged_references.txt", "w") as outp: 8 | current_paper_id = "" 9 | current_references = "" 10 | 11 | for line in inp: 12 | print(line_count) 13 | 14 | paper_id = line.split("\t")[0].strip() 15 | author_id = line.split("\t")[1].strip() 16 | 17 | if paper_id == current_paper_id: 18 | current_references += (";" + author_id) 19 | elif current_paper_id == "": 20 | current_paper_id = paper_id 21 | current_references = author_id 22 | else: 23 | list_of_references = ",".join(current_references.split(";")).strip(",") 24 | outp.write(current_paper_id + "\t" + list_of_references + "\n") 25 | current_paper_id = paper_id 26 | current_references = author_id 27 | 28 | line_count += 1 29 | 30 | outp.write(current_paper_id + "\t" + list_of_references + "\n") 31 | 32 | print("Finished.") 33 | -------------------------------------------------------------------------------- /00.entity_resolution/12.add_to_authors_references.py: -------------------------------------------------------------------------------- 1 | dict_of_references = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("00.paper_id_with_merged_references.txt", "r") as inp: 7 | for line in inp: 8 | print("Loading: " + str(line_count)) 9 | 10 | paper_id = line.split("\t")[0].strip() 11 | references = line.split("\t")[1].strip() 12 | dict_of_references[paper_id] = references 13 | line_count += 1 14 | 15 | line_count = 1 16 | found = 0 17 | not_found = 0 18 | 19 | with open("11.authors_with_journal_and_conference.txt", "r") as inp: 20 | with open("12.authors_with_references.txt", "w") as outp: 21 | for line in inp: 22 | print("Searching: " + str(line_count)) 23 | 24 | paper_ids = line.split("\t")[9].strip().split(",") 25 | references = set() 26 | for paper_id in paper_ids: 27 | try: 28 | references.update(dict_of_references[paper_id].split(",")) 29 | except KeyError: 30 | pass 31 | outp.write(line.strip("\n") + "\t" + ",".join(references) + "\n") 32 | 33 | line_count += 1 34 | 35 | 
print("Finished.") 36 | -------------------------------------------------------------------------------- /00.entity_resolution/00.execute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python 00.prepare_paper_references.py 3 | python 01.extract_paper_id_with_doi.py 4 | python 02.extract_author_with_paper_id.py 5 | 6 | sort -n -t$'\t' -k1 02.author_id_with_paper_id.txt > 02.author_id_with_paper_id_sorted.txt 7 | 8 | python 03.extract_paper_with_author_id.py 9 | 10 | sort -n -t$'\t' -k1 03.paper_id_with_author_id.txt > 03.paper_id_with_author_id_sorted.txt 11 | 12 | python 04.author_id_merge_paper_id.py 13 | python 05.paper_id_merge_author_ids.py 14 | python 06.add_to_authors_paper_id.py 15 | python 07.add_to_authors_doi.py 16 | python 08.add_to_authors_coauthors.py 17 | python 09.add_to_authors_titles.py 18 | python 10.add_to_authors_year.py 19 | python 11.add_to_authors_journal_and_conference.py 20 | python 12.add_to_authors_references.py 21 | 22 | mkdir sort 23 | split -l 5000000 -d 12.authors_with_references.txt sort/sort_file 24 | cd sort 25 | for file in sort_file*; do 26 | echo $file 27 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 -o $file $file 28 | done 29 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 sort_file* > ../12.authors_with_references_sorted.txt 30 | cd .. 31 | rm -r sort 32 | 33 | python 13.disambiguation_data.py 34 | python 14.recreate_files -------------------------------------------------------------------------------- /03.statistical_analysis/07.reference_citation_by_year.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | 3 | paper_references = {} 4 | paper_citations = {} 5 | #Add file path for Papers.txt 6 | with open("Papers.txt", "r") as f: 7 | for line in f: 8 | year = line.split("\t")[7] 9 | references = int(line.split("\t")[18]) 10 | citations = int(line.split("\t")[19]) 11 | if not year == "": 12 | try: 13 | paper_references[year].append(references) 14 | except KeyError: 15 | paper_references[year] = [references] 16 | try: 17 | paper_citations[year].append(citations) 18 | except KeyError: 19 | paper_citations[year] = [citations] 20 | 21 | with open("07.paper_references_by_year.txt", "w") as f: 22 | for item in paper_references: 23 | f.write(f"{item}\t{sum(paper_references[item])}\t{len(paper_references[item])}\t{statistics.mean(paper_references[item])}\t{statistics.median(paper_references[item])}\t{max(paper_references[item])}\n") 24 | 25 | with open("07.paper_citations_by_year.txt", "w") as f: 26 | for item in paper_citations: 27 | f.write(f"{item}\t{sum(paper_citations[item])}\t{len(paper_citations[item])}\t{statistics.mean(paper_citations[item])}\t{statistics.median(paper_citations[item])}\t{max(paper_citations[item])}\n") 28 | -------------------------------------------------------------------------------- /01.field_of_study_classification/04.generate_high_level_fos_1.py: -------------------------------------------------------------------------------- 1 | fos_dict = {} 2 | paper_labels = set() 3 | 4 | with open("02.labels.txt", "r") as f: 5 | for line in f: 6 | paper_labels.add(line.split("\t")[0].strip()) 7 | 8 | def find_top_fos(fos, fos_dict, paper_labels): 9 | output = {fos} 10 | while any(f in fos_dict for f in output): 11 | for f in {f for f in output if f in fos_dict}: 12 | output.update(fos_dict[f]) 13 | output.remove(f) 14 | return output.intersection(paper_labels) 15 | 16 | line_count = 1 17 | #Add path to FieldOfStudyChildren.txt 18 
| with open("FieldOfStudyChildren.txt", "r") as f: 19 | for line in f: 20 | child_fos = line.split("\t")[1].strip() 21 | parent_fos = line.split("\t")[0].strip() 22 | if child_fos in fos_dict: 23 | fos_dict[child_fos].add(parent_fos) 24 | else: 25 | fos_dict[child_fos] = {parent_fos} 26 | 27 | #Add path to sorted PaperFieldsOfStudy.txt 28 | with open("SortedPaperFieldsOfStudy", "r") as f: 29 | with open("04.papers_top_level_labels.txt", "w") as g: 30 | for line in f: 31 | print(line_count) 32 | fos = line.split("\t")[1].strip() 33 | for f in find_top_fos(fos, fos_dict, paper_labels): 34 | g.write(line.replace(fos, f)) 35 | line_count += 1 36 | 37 | -------------------------------------------------------------------------------- /02.knowledge_graph_embeddings/02.prepare_paper_input_graph.py: -------------------------------------------------------------------------------- 1 | pred_list = ["", "", "", "", "", "", "", "", "", "", "", ""] 2 | 3 | with open("02.papers_input.txt", "w") as g: 4 | #Add file path to Papers.nt 5 | with open("Papers.nt", "r") as f: 6 | for line in f: 7 | pred = line.split(" ")[1] 8 | if pred in pred_list: 9 | g.write(line) 10 | 11 | #Add file path to Journals.nt 12 | with open("Journals.nt", "r") as f: 13 | for line in f: 14 | pred = line.split(" ")[1] 15 | if pred in pred_list: 16 | g.write(line) 17 | 18 | #Add file path to ConferenceSeries.nt 19 | with open("ConferenceSeries.nt", "r") as f: 20 | for line in f: 21 | pred = line.split(" ")[1] 22 | if pred in pred_list: 23 | g.write(line) 24 | 25 | -------------------------------------------------------------------------------- /01.field_of_study_classification/08.training.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from simpletransformers.classification import ClassificationModel 3 | from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report 4 | import os 5 | import pickle 6 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 7 | 8 | def f1_multiclass(labels, preds): 9 | return f1_score(labels, preds, average='micro') 10 | def report(labels, preds): 11 | return classification_report(labels, preds) 12 | 13 | train_df = pd.read_csv('train.csv', header=None, dtype={"0":str, "1":str}) 14 | eval_df = pd.read_csv('eval.csv', header=None, dtype={"0":str, "1":str}) 15 | 16 | #change the chosen model accordingly 17 | model = ClassificationModel('bert', 'allenai/scibert_scivocab_uncased', num_labels=19, use_cuda=True, args={"fp16": False, "n_gpu": 4, "num_train_epochs": 2, "evaluate_during_training": True}) 18 | 19 | model.train_model(train_df, eval_df=eval_df) 20 | 21 | result, model_outputs, wrong_predictions = model.eval_model(eval_df, f1=f1_multiclass, acc=accuracy_score, classification_report=report) 22 | 23 | with open("results.txt", "w") as f: 24 | f.write(str(result)) 25 | with open("model_outputs.txt", "w") as f: 26 | f.write(str(model_outputs)) 27 | with open("wrong_predictions.txt", "w") as f: 28 | f.write(str(wrong_predictions)) 29 | 30 | model_name = "bert_model.sav" 31 | pickle.dump(model, open(model_name, "wb")) 32 | 33 | -------------------------------------------------------------------------------- /00.entity_resolution/18.add_to_authors_orcid.py: -------------------------------------------------------------------------------- 1 | from pyjarowinkler import distance 2 | 3 | dict_of_orcids = {} 4 | line_count = 1 5 | 6 | print("Starting...") 7 | 8 | with open("17.doi_with_merged_orcid.txt", "r") as inp: 9 | for line 
in inp: 10 | print("Loading: " + str(line_count)) 11 | 12 | doi = line.split("\t")[0].strip() 13 | orcid = line.split("\t")[1].strip() 14 | dict_of_orcids[doi] = orcid 15 | 16 | line_count += 1 17 | 18 | line_count = 1 19 | 20 | with open("12.authors_with_references_sorted.txt", "r") as inp: 21 | with open("18.authors_with_orcid.txt", "w") as outp: 22 | for line in inp: 23 | print("Searching: " + str(line_count)) 24 | 25 | name = line.split("\t")[3].strip() 26 | dois = line.split("\t")[10].strip().split(";") 27 | orcids = set() 28 | for doi in dois: 29 | try: 30 | found_orcids = dict_of_orcids[doi].split(";") 31 | for orcid in found_orcids: 32 | if distance.get_jaro_distance(str.lower(name), str.lower(orcid.split(",")[0]), winkler=True, scaling=0.1)>0.9: 33 | orcids.update([orcid.split(",")[1].strip()]) 34 | except KeyError: 35 | pass 36 | outp.write(line.strip("\n") + "\t" + ",".join(orcids).strip() + "\n") 37 | 38 | line_count += 1 39 | 40 | print("Finished.") 41 | 42 | -------------------------------------------------------------------------------- /01.field_of_study_classification/00.create_abstract.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open("00.paper_abstracts.txt", "w") as output: 4 | #Add path to PaperAbstractsInvertedIndex.txt.1 5 | with open("PaperAbstractsInvertedIndex.txt.1") as f: 6 | for line in f: 7 | paper_id, inverted_index = line.strip().split("\t") 8 | index_length = json.loads(inverted_index)["IndexLength"] 9 | indexes = json.loads(inverted_index)["InvertedIndex"] 10 | sentence_list = [" "]*index_length 11 | for word in indexes: 12 | word_index = list(indexes[word]) 13 | for index in word_index: 14 | sentence_list[index] = word.replace("\n", " ").replace("\r", "").replace("\t", " ") 15 | output.write(paper_id + "\t" + " ".join(sentence_list) + "\n") 16 | #Add path to PaperAbstractsInvertedIndex.txt.2 17 | with open("PaperAbstractsInvertedIndex.txt.2") as f: 18 | for line in f: 19 | paper_id, inverted_index = line.strip().split("\t") 20 | index_length = json.loads(inverted_index)["IndexLength"] 21 | indexes = json.loads(inverted_index)["InvertedIndex"] 22 | sentence_list = [" "]*index_length 23 | for word in indexes: 24 | word_index = list(indexes[word]) 25 | for index in word_index: 26 | sentence_list[index] = word.replace("\n", " ").replace("\r", "").replace("\t", " ") 27 | output.write(paper_id + "\t" + " ".join(sentence_list) + "\n") 28 | -------------------------------------------------------------------------------- /00.entity_resolution/11.add_to_authors_journal_and_conference.py: -------------------------------------------------------------------------------- 1 | dict_of_journals = {} 2 | dict_of_conferences = {} 3 | line_count = 1 4 | 5 | print("Starting...") 6 | print("Loading lists...") 7 | #Add path to Papers.txt 8 | with open("Papers.txt", "r") as inp: 9 | for line in inp: 10 | print("Loading: " + str(line_count)) 11 | 12 | paper_id = line.split("\t")[0].strip() 13 | journal = line.split("\t")[11].strip() 14 | conference = line.split("\t")[12].strip() 15 | dict_of_journals[paper_id] = journal 16 | dict_of_conferences[paper_id] = conference 17 | 18 | line_count += 1 19 | 20 | line_count = 1 21 | 22 | with open("10.authors_with_year.txt", "r") as inp: 23 | with open("11.authors_with_journal_and_conference.txt", "w") as outp: 24 | for line in inp: 25 | print("Searching: " + str(line_count)) 26 | 27 | paper_ids = line.split("\t")[9].strip().split(",") 28 | journals = set() 29 | 
conferences = set() 30 | for paper_id in paper_ids: 31 | try: 32 | journals.add(dict_of_journals[paper_id]) 33 | except KeyError: 34 | pass 35 | try: 36 | conferences.add(dict_of_journals[paper_id]) 37 | except KeyError: 38 | pass 39 | outp.write(line.strip("\n") + "\t" + ",".join(journals).strip(",") + "\t" + ",".join(conferences).strip(",") + "\n") 40 | 41 | line_count += 1 42 | 43 | print("Finished.") 44 | -------------------------------------------------------------------------------- /01.field_of_study_classification/04.generate_high_level_fos_2.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | line_count = 1 4 | with open("04.papers_top_level_labels.txt", "r") as f: 5 | with open("04.papers_with_indirect_labels.txt", "w") as g: 6 | current_paper = "" 7 | paper_dict = {} 8 | paper_count = {} 9 | for line in f: 10 | print(line_count) 11 | paperid = line.split("\t")[0] 12 | fos = int(line.split("\t")[1]) 13 | score = float(line.split("\t")[2].strip()) 14 | if current_paper == "": 15 | current_paper = paperid 16 | paper_dict[fos] = score 17 | paper_count[fos] = 1 18 | elif paperid == current_paper: 19 | try: 20 | paper_dict[fos] += score 21 | paper_count[fos] += 1 22 | except KeyError: 23 | paper_dict[fos] = score 24 | paper_count[fos] = 1 25 | else: 26 | g.write(current_paper + "\t" + str(max(paper_dict.items(), key=operator.itemgetter(1))[0]) + "\t" + str(max(paper_dict.items(), key=operator.itemgetter(1))[1]) + "\n") 27 | current_paper = paperid 28 | paper_dict.clear() 29 | paper_count.clear() 30 | paper_dict[fos] = score 31 | paper_count[fos] = 1 32 | line_count += 1 33 | for item in paper_dict: 34 | g.write(current_paper + "\t" + str(item) + "\t" + str(paper_dict[item]/max((paper_count[item]-1), 1)) + "\n") 35 | -------------------------------------------------------------------------------- /03.statistical_analysis/12.author_interdisciplinary_chord_graph.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from chord import Chord" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "f = open('08.author_fos_matrix.txt', 'r+')\n", 19 | "matrix = f.readline()\n", 20 | "f.close()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 5, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "names = [\"Computer Science\", \"Biology\", \"Political Science\", \"Materials Science\", \"Geography\", \"Chemistry\", \"Economics\", \"Mathematics\", \"Geology\", \"Engineering\", \"Physics\", \"Sociology\", \"Business\", \"Medicine\", \"Psychology\", \"Art\", \"History\", \"Philosophy\", \"Environmental science\"]\n", 30 | "Chord(matrix, names, wrap_labels = False, width=3600).to_html()" 31 | ] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.7.7" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 4 55 | } 56 | 
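The chord-diagram notebook reads the author/field-of-study matrix back with readline(), which yields the text that was written out with str(matrix), i.e. a Python-literal string rather than a nested list. If the plotting step needs an actual list of lists, the string can be parsed back first; a small sketch (the 19x19 shape matches the top-level label set used elsewhere):

import ast

# The matrix file was serialized with str(matrix), so parse it back into a nested
# list of ints before building the chord diagram.
with open("08.author_fos_matrix.txt", "r") as f:
    matrix = ast.literal_eval(f.readline())
assert len(matrix) == 19 and all(len(row) == 19 for row in matrix)
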
-------------------------------------------------------------------------------- /00.entity_resolution/08.add_to_authors_coauthors.py: -------------------------------------------------------------------------------- 1 | dict_of_coauthors = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("05.paper_id_with_merged_author_id.txt", "r") as inp: 7 | for line in inp: 8 | print("Loading: " + str(line_count)) 9 | paper_id = line.split("\t")[0].strip() 10 | author_ids = line.split("\t")[1].strip() 11 | dict_of_coauthors[paper_id] = author_ids 12 | line_count+= 1 13 | 14 | number_of_papers = 0 15 | number_of_papers_with_coauthors = 0 16 | number_of_mismatches = 0 17 | line_count = 1 18 | 19 | with open("07.authors_with_paper_doi.txt", "r") as inp: 20 | with open("08.authors_with_co_authors.txt", "w") as outp: 21 | for line in inp: 22 | print("Searching: " + str(line_count)) 23 | author_id = line.split("\t")[0].strip() 24 | paper_ids = line.split("\t")[9].strip().split(",") 25 | co_authors = set() 26 | for paper_id in paper_ids: 27 | number_of_papers += 1 28 | try: 29 | co_authors.update(dict_of_coauthors[paper_id].split(",")) 30 | number_of_papers_with_coauthors += 1 31 | try: 32 | co_authors.remove(author_id) 33 | except KeyError: 34 | number_of_mismatches += 1 35 | except KeyError: 36 | pass 37 | outp.write(line.strip("\n") + "\t" + ",".join(co_authors) + "\n") 38 | 39 | line_count += 1 40 | 41 | print("Total number of papers: " + str(number_of_papers) + ". With coauthors :" + str(number_of_papers_with_coauthors) + " . With number of mismatches: " + str(number_of_mismatches)) 42 | 43 | print("Finished.") 44 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/04.conferenceseries.py: -------------------------------------------------------------------------------- 1 | with open("ConferenceSeries.txt", "r") as f: 2 | with open("04.ConferenceSeries.nt", "w") as g: 3 | for line in f: 4 | ConferenceSeriesId, Rank, NormalizedName, DisplayName, PaperCount, PaperFamilyCount, CitationCount, CreatedDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not DisplayName == "": 9 | g.write(f' "{DisplayName}"^^ .\n') 10 | if not PaperCount == "": 11 | g.write(f' "{PaperCount}"^^ .\n') 12 | if not PaperFamilyCount == "": 13 | g.write(f' "{PaperFamilyCount}"^^ .\n') 14 | if not CitationCount == "": 15 | g.write(f' "{CitationCount}"^^ .\n') 16 | if not CreatedDate == "": 17 | g.write(f' "{CreatedDate}"^^ .\n') -------------------------------------------------------------------------------- /01.field_of_study_classification/06.evaluate_with_journal_label.py: -------------------------------------------------------------------------------- 1 | paper_label_dict = {} 2 | label_total_count = {} 3 | label_matching_count = {} 4 | with open("02.labels.txt", "r") as f: 5 | for line in f: 6 | paper_label_dict[line.split("\t")[0].strip()] = line.split("\t")[2].strip() 7 | label_total_count[line.split("\t")[2].strip()] = 0 8 | label_matching_count[line.split("\t")[2].strip()] = 0 9 | 10 | line_count = 1 11 | journal_label = {} 12 | mag_label = {} 13 | with open("05.papers_with_journal_labels.txt", "r") as f: 14 | for line in f: 15 | print("Loading Journal: " + str(line_count)) 16 | paper_id = line.split("\t")[0] 17 | label = line.strip().split("\t")[2] 18 | journal_label[paper_id] = label 19 | line_count += 1 20 | 21 | line_count = 1 22 | #Edit the following data path depending on which 
labels you want to evaluate 23 | with open("03.papers_with_direct_labels.txt", "r") as f: 24 | for line in f: 25 | print("Loading MAG: " + str(line_count)) 26 | paper_id = line.split("\t")[0] 27 | label = line.split("\t")[1] 28 | mag_label[paper_id] = label 29 | line_count += 1 30 | 31 | line_count = 1 32 | total_count = 0 33 | matching = 0 34 | for item in journal_label: 35 | print("Comparing: " + str(line_count)) 36 | try: 37 | if journal_label[item] == mag_label[item]: 38 | matching += 1 39 | label_matching_count[journal_label[item]] += 1 40 | total_count += 1 41 | label_total_count[journal_label[item]] += 1 42 | except KeyError: 43 | pass 44 | line_count += 1 45 | 46 | print("Total: " + str(total_count)) 47 | print("Matching: " + str(matching)) 48 | for item in label_matching_count: 49 | print("Label: " + item + " Total: " + str(label_total_count[item]) + " Matching: " + str(label_matching_count[item])) 50 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/18.relatedfieldsofstudy.py: -------------------------------------------------------------------------------- 1 | with open("RelatedFieldOfStudy.txt", "r") as f: 2 | with open("18.RelatedFieldOfStudy.nt", "w") as g: 3 | for line in f: 4 | FieldOfStudy1, Type1, FieldOfStudy2, Type2, Rank = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if Type1 == "disease": 7 | if Type2 == "disease_cause": 8 | g.write(f' .\n') 9 | if Type2 == "medical_treatment": 10 | g.write(f' .\n') 11 | if Type2 == "symptom": 12 | g.write(f' .\n') 13 | elif Type1 =="medical_treatment": 14 | if Type2 == "disease_cause": 15 | g.write(f' .\n') 16 | if Type2 == "symptom": 17 | g.write(f' .\n') 18 | elif Type1 =="symptom": 19 | if Type2 == "disease_cause": 20 | g.write(f' .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/02.authors.py: -------------------------------------------------------------------------------- 1 | with open("Authors.txt", "r") as f: 2 | with open("02.Authors.nt", "w") as g: 3 | for line in f: 4 | AuthorId, Rank, NormalizedName, DisplayName, LastKnownAffiliationId, PaperCount, PaperFamilyCount, CitationCount, CreateDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not LastKnownAffiliationId == "": 9 | g.write(f' .\n') 10 | if not DisplayName == "": 11 | g.write(f' "{DisplayName}"^^ .\n') 12 | if not PaperCount == "": 13 | g.write(f' "{PaperCount}"^^ .\n') 14 | if not PaperFamilyCount == "": 15 | g.write(f' "{PaperFamilyCount}"^^ .\n') 16 | if not CitationCount == "": 17 | g.write(f' "{CitationCount}"^^ .\n') 18 | if not CreateDate == "": 19 | g.write(f' "{CreateDate}"^^ .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/23.authors_disambiguated.py: -------------------------------------------------------------------------------- 1 | with open("00.entity_resolution/14.Authors_new.txt", "r") as f: 2 | with open("23.Authors_disambiguated.nt", "w") as g: 3 | for line in f: 4 | AuthorId, Rank, NormalizedName, DisplayName, LastKnownAffiliationId, PaperCount, PaperFamilyCount, CitationCount, CreateDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not LastKnownAffiliationId == "": 9 | g.write(f' .\n') 10 | if not DisplayName == "": 11 | g.write(f' "{DisplayName}"^^ .\n') 12 | if not PaperCount == "": 13 | g.write(f' 
"{PaperCount}"^^ .\n') 14 | if not PaperFamilyCount == "": 15 | g.write(f' "{PaperFamilyCount}"^^ .\n') 16 | if not CitationCount == "": 17 | g.write(f' "{CitationCount}"^^ .\n') 18 | if not CreateDate == "": 19 | g.write(f' "{CreateDate}"^^ .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/15.fieldsofstudy.py: -------------------------------------------------------------------------------- 1 | with open("FieldsOfStudy.txt", "r") as f: 2 | with open("15.FieldsOfStudy.nt", "w") as g: 3 | for line in f: 4 | FieldsOfStudyId, Rank, NormalizedName, DisplayName, MainType, Level, PaperCount, PaperFamilyCount, CitationCount, CreateDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not DisplayName == "": 9 | g.write(f' "{DisplayName}"^^ .\n') 10 | if not Level == "": 11 | g.write(f' "{Level}"^^ .\n') 12 | if not PaperCount == "": 13 | g.write(f' "{PaperCount}"^^ .\n') 14 | if not PaperFamilyCount == "": 15 | g.write(f' "{PaperFamilyCount}"^^ .\n') 16 | if not CitationCount == "": 17 | g.write(f' "{CitationCount}"^^ .\n') 18 | if not CreateDate == "": 19 | g.write(f' "{CreateDate}"^^ .\n') -------------------------------------------------------------------------------- /03.statistical_analysis/09.author_fos.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | with open("01.field_of_study_classification/10.paperid_with_fos.txt", "r") as f: 4 | line_count = 1 5 | fos_dict = {} 6 | for line in f: 7 | paperid = line.split("\t")[0] 8 | fos = line.split("\t")[1].strip() 9 | fos_dict[paperid] = fos 10 | print(f"Loading one: {line_count}.") 11 | line_count += 1 12 | 13 | with open("01.field_of_study_classification/02.labels.txt", "r") as f: 14 | label_mapping = {} 15 | labels = [] 16 | line_count = 1 17 | for line in f: 18 | label_mapping[line.split("\t")[0]] = int(line.strip().split("\t")[2]) 19 | labels.append(line.split("\t")[0]) 20 | print(f"Loading two: {line_count}.") 21 | line_count += 1 22 | 23 | with open("00.entity_resolution/06.authors_with_paper_id.txt", "r") as f: 24 | with open("09.author_fos.txt", "w") as g: 25 | line_count = 1 26 | matrix = [[0 for x in range(19)] for y in range(19)] 27 | for line in f: 28 | print(line_count) 29 | authorid = line.split("\t")[0] 30 | paperids = line.split("\t")[-1].strip("\n").split(",") 31 | fos_list = [fos_dict[paperid] for paperid in paperids if paperid in fos_dict] 32 | if fos_list: 33 | author_dict = {fos: 0 for fos in labels} 34 | for item in fos_list: 35 | author_dict[item] += 1 36 | fos_string = ",".join(map(str, [*author_dict.values()])) 37 | g.write(f"{authorid}\t{fos_string}\n") 38 | fos_set = set(fos_list) 39 | if len(fos_set) > 1: 40 | fos_combinations = list(itertools.permutations(fos_set, 2)) 41 | for combination in fos_combinations: 42 | matrix[label_mapping[combination[0]]][label_mapping[combination[1]]] += 1 43 | line_count += 1 44 | with open("09.author_fos_matrix.txt", "w") as h: 45 | h.write(str(matrix)) 46 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/05.journals.py: -------------------------------------------------------------------------------- 1 | with open("Journals.txt", "r") as f: 2 | with open("05.Journals.nt", "w") as g: 3 | for line in f: 4 | JournalId, Rank, NormalizedName, DisplayName, Issn, Publisher, Webpage, PaperCount, PaperFamilyCount, CitationCount, CreatedDate = 
line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not DisplayName == "": 9 | g.write(f' "{DisplayName}"^^ .\n') 10 | if not Issn == "": 11 | g.write(f' "{Issn}"^^ .\n') 12 | if not Publisher == "": 13 | g.write(f' "{Publisher}"^^ .\n') 14 | if not Webpage == "": 15 | g.write(f' "{Webpage}"^^ .\n') 16 | if not PaperCount == "": 17 | g.write(f' "{PaperCount}"^^ .\n') 18 | if not PaperFamilyCount == "": 19 | g.write(f' "{PaperFamilyCount}"^^ .\n') 20 | if not CitationCount == "": 21 | g.write(f' "{CitationCount}"^^ .\n') 22 | if not CreatedDate == "": 23 | g.write(f' "{CreatedDate}"^^ .\n') -------------------------------------------------------------------------------- /03.statistical_analysis/00.count_properties.py: -------------------------------------------------------------------------------- 1 | def count_occurence(list_of_objects): 2 | occurence_dictionary = {} 3 | for item in list_of_objects: 4 | if item in occurence_dictionary: 5 | occurence_dictionary[item] += 1 6 | else: 7 | occurence_dictionary[item] = 1 8 | return occurence_dictionary 9 | 10 | papers_properties = ["PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle", "BookTitle", "Year", "Date", "OnlineDate", "Publisher", "JournalId", "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue", "FirstPage", "LastPage", "ReferenceCount", "CitationCount", "EstimatedCitation", "OriginalVenue", "FamilyId", "CreatedDate"] 11 | authors_properties = ["AuthorId", "Rank", "NormalizedName", "DisplayName", "LastKnownAffiliationId", "PaperCount", "PaperFamilyCount", "CitationCount", "CreateDate"] 12 | 13 | papers_properties_dict = {} 14 | authors_properties_dict = {} 15 | 16 | 17 | for item in papers_properties: 18 | papers_properties_dict[item] = 0 19 | 20 | #Add file path to Papers.txt 21 | with open("Papers.txt", "r") as inp: 22 | line_count = 0 23 | for line in inp: 24 | line_count += 1 25 | entries = line.split("\t") 26 | for index, entry in enumerate(entries): 27 | if not entry.strip() == "": 28 | papers_properties_dict[papers_properties[index]] += 1 29 | print("Paper: " + str(line_count)) 30 | with open("00.papers_statistics.txt", "w") as outp: 31 | outp.write("Total papers: " + str(line_count) + "\n") 32 | for item in papers_properties_dict: 33 | outp.write(item + "\t" + str(papers_properties_dict[item]) + "\n") 34 | 35 | 36 | for item in authors_properties: 37 | authors_properties_dict[item] = 0 38 | 39 | list_of_paper_count = [] 40 | list_of_citation_count = [] 41 | 42 | #Add file path to Authors.txt 43 | with open("Authors.txt", "r") as inp: 44 | line_count = 0 45 | for line in inp: 46 | line_count += 1 47 | entries = line.split("\t") 48 | list_of_paper_count.append(entries[5]) 49 | list_of_citation_count.append(entries[6]) 50 | for index, entry in enumerate(entries): 51 | if not entry.strip() == "": 52 | authors_properties_dict[authors_properties[index]] += 1 53 | print("Authors: " + str(line_count)) 54 | 55 | with open("00.authors_statistics.txt", "w") as outp: 56 | outp.write("Total Authors: " + str(line_count)) 57 | for item in authors_properties_dict: 58 | outp.write(item + "\t" + str(authors_properties_dict[item]) + "\n") 59 | print("\nNow calculating paper count...") 60 | outp.write("Counter for paper count: " + str(count_occurence(list_of_paper_count)) + "\n") 61 | print("Finished.\nNow calculating citation count...") 62 | outp.write("Counter for citation count: " + str(count_occurence(list_of_citation_count))) 63 | print("Finished.") 64 | 
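A purely illustrative equivalent of the count_occurence helper above, using the standard library (this snippet is not part of the repository):
````
from collections import Counter

def count_occurence(list_of_objects):
    # Equivalent to the manual dictionary count used in 00.count_properties.py
    return dict(Counter(list_of_objects))

print(count_occurence(["1", "2", "2", "5"]))  # {'1': 1, '2': 2, '5': 1}
````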
-------------------------------------------------------------------------------- /04.generate_knowledge_graph/01.affiliations.py: -------------------------------------------------------------------------------- 1 | with open("Affiliations.txt", "r") as f: 2 | with open("01.Affiliations.nt", "w") as g: 3 | for line in f: 4 | AffiliationId, Rank, NormalizedName, DisplayName, GridId, OfficialPage, WikiPage, PaperCount, PaperFamilyCount, CitationCount, Latitude, Longitude, CreatedDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not DisplayName == "": 9 | g.write(f' "{DisplayName}"^^ .\n') 10 | if not GridId == "": 11 | g.write(f' .\n') 12 | if not OfficialPage == "": 13 | g.write(f' <{OfficialPage}> .\n') 14 | if not WikiPage == "": 15 | g.write(f' <{WikiPage}> .\n') 16 | if not WikiPage == "": 17 | g.write(f' <{WikiPage}> .\n') 18 | if not PaperCount == "": 19 | g.write(f' "{PaperCount}"^^ .\n') 20 | if not PaperFamilyCount == "": 21 | g.write(f' "{PaperFamilyCount}"^^ .\n') 22 | if not CitationCount == "": 23 | g.write(f' "{CitationCount}"^^ .\n') 24 | if not Latitude == "": 25 | g.write(f' "{Latitude}"^^ .\n') 26 | if not Longitude == "": 27 | g.write(f' "{Longitude}"^^ .\n') 28 | if not CreatedDate == "": 29 | g.write(f' "{CreatedDate}"^^ .') -------------------------------------------------------------------------------- /02.knowledge_graph_embeddings/01.prepare_data_authors.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | line_count = 1 4 | entity_count = 0 5 | relations_count = 0 6 | entity_dict = {} 7 | relations_dict = {} 8 | with open("00.authors_input.txt", "r") as f: 9 | for line in f: 10 | print("Dictionary: " + str(line_count)) 11 | if len(line.split(" ")) > 1: 12 | sub = line.split(" ")[0].rstrip(">").replace("").split("/")[-1].replace("22-rdf-syntax-ns#", "").replace("org#", "") 14 | obj = " ".join(line.split(" ")[2:]).strip().rstrip(".").strip().replace("^^", "").replace("^^", "").replace("^^", "").rstrip(">").replace(" 1: 41 | sub = line.split(" ")[0].rstrip(">").replace("").split("/")[-1].replace("22-rdf-syntax-ns#", "").replace("org#", "") 43 | obj = " ".join(line.split(" ")[2:]).strip().rstrip(".").strip().replace("^^", "").replace("^^", "").replace("^^", "").rstrip(">").replace(" 1: 12 | sub = line.split(" ")[0].rstrip(">").replace("").split("/")[-1].replace("22-rdf-syntax-ns#", "") 14 | obj = " ".join(line.split(" ")[2:]).strip().rstrip(".").strip().replace("^^", "").replace("^^", "").replace("^^", "").rstrip(">").replace(" 1: 41 | sub = line.split(" ")[0].rstrip(">").replace("").split("/")[-1].replace("22-rdf-syntax-ns#", "") 43 | obj = " ".join(line.split(" ")[2:]).strip().rstrip(".").strip().replace("^^", "").replace("^^", "").replace("^^", "").rstrip(">").replace(" .\n') 6 | if not DisplayName == "": 7 | g.write(f' "{DisplayName}"^^ .\n') 8 | if not ConferenceInstanceId == "": 9 | g.write(f' .\n') 10 | if not Location == "": 11 | g.write(f' <{Location}> .\n') 12 | if not OfficialUrl == "": 13 | g.write(f' <{OfficialUrl}> .\n') 14 | if not StartDate == "": 15 | g.write(f' "{StartDate}"^^ .\n') 16 | if not EndDate == "": 17 | g.write(f' "{EndDate}"^^ .\n') 18 | if not AbstractRegistrationDate == "": 19 | g.write(f' "{AbstractRegistrationDate}"^^ .\n') 20 | if not SubmissionDeadlineDate == "": 21 | g.write(f' "{SubmissionDeadlineDate}"^^ .\n') 22 | if not NotificationDueDate == "": 23 | g.write(f' "{NotificationDueDate}"^^ .\n') 24 | if not 
FinalVersionDueDate == "": 25 | g.write(f' "{FinalVersionDueDate}"^^ .\n') 26 | if not PageCount == "": 27 | g.write(f' "{PageCount}"^^ .\n') 28 | if not PaperFamilyCount == "": 29 | g.write(f' "{PaperFamilyCount}"^^ .\n') 30 | if not CitationCount == "": 31 | g.write(f' "{CitationCount}"^^ .\n') 32 | if not CreatedDate == "": 33 | g.write(f' "{CreatedDate}"^^ .\n') 34 | 35 | -------------------------------------------------------------------------------- /00.entity_resolution/15.extract_orcid_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | 4 | #Add directory to ORCID files 5 | directory = "ORCID_files" 6 | 7 | print("Starting...") 8 | 9 | with open("15.orcid_title_doi.txt", "w") as outp: 10 | for folder in os.listdir(directory): 11 | print(folder) 12 | 13 | folder_path = os.path.join(directory, folder) 14 | if os.path.isdir(folder_path): 15 | for subfolder in os.listdir(folder_path): 16 | subfolder_path = os.path.join(folder_path, subfolder) 17 | for subsubfolder in os.listdir(subfolder_path): 18 | subsubfolder_path = os.path.join(subfolder_path, subsubfolder) 19 | orcid = subsubfolder_path.split("/")[-1] 20 | 21 | real_name = "" 22 | educations_path = os.path.join(subsubfolder_path, "educations") 23 | employments_path = os.path.join(subsubfolder_path, "employments") 24 | works_path = os.path.join(subsubfolder_path, "works") 25 | 26 | try: 27 | for education_file in os.listdir(educations_path): 28 | education_file_path = os.path.join(educations_path, education_file) 29 | tree = ET.parse(education_file_path) 30 | root = tree.getroot() 31 | name = root.find("{http://www.orcid.org/ns/common}source") 32 | if name is not None: 33 | name2 = name.find("{http://www.orcid.org/ns/common}source-name") 34 | if name2 is not None: 35 | real_name = name2.text 36 | 37 | except OSError: 38 | try: 39 | for employment_file in os.listdir(employments_path): 40 | employments_file_path = os.path.join(employments_path, employment_file) 41 | tree = ET.parse(employments_file_path) 42 | root = tree.getroot() 43 | name = root.find("{http://www.orcid.org/ns/common}source") 44 | if name is not None: 45 | name2 = name.find("{http://www.orcid.org/ns/common}source-name") 46 | if name2 is not None: 47 | real_name = name2.text 48 | except OSError: 49 | pass 50 | 51 | try: 52 | for work_file in os.listdir(works_path): 53 | work_file_path = os.path.join(works_path, work_file) 54 | tree = ET.parse(work_file_path) 55 | root = tree.getroot() 56 | 57 | title = root.find("{http://www.orcid.org/ns/work}title") 58 | if title is not None: 59 | real_title = title.find("{http://www.orcid.org/ns/common}title").text.replace("\t", " ").replace("\\", " ") 60 | 61 | real_doi = "" 62 | doi = root.find("{http://www.orcid.org/ns/common}external-ids") 63 | if doi is not None: 64 | dois = doi.findall("{http://www.orcid.org/ns/common}external-id") 65 | for doi in dois: 66 | if doi.find("{http://www.orcid.org/ns/common}external-id-type").text == "doi": 67 | real_doi = doi.find("{http://www.orcid.org/ns/common}external-id-value").text 68 | 69 | outp.write("\t".join([orcid, real_name, real_title, real_doi]) + "\n") 70 | 71 | except OSError: 72 | pass 73 | 74 | print("Finished.") -------------------------------------------------------------------------------- /03.statistical_analysis/06.paper_citation_reference.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | 3 | paper_references = [] 4 | 
paper_citations = [] 5 | #Add file path for Papers.txt 6 | with open("Papers.txt", "r") as f: 7 | for line in f: 8 | references = int(line.split("\t")[18]) 9 | citations = int(line.split("\t")[19]) 10 | paper_references.append(references) 11 | paper_citations.append(citations) 12 | 13 | paper_references_filtered = list(filter(lambda num: num != 0, paper_references)) 14 | paper_citations_filtered = list(filter(lambda num: num != 0, paper_citations)) 15 | 16 | with open("06.paper_references_citations_general.txt", "w") as f: 17 | f.write(f"Average number of references per paper: {statistics.mean(paper_references)}\n") 18 | f.write(f"Median number of references per paper: {statistics.median(paper_references_filtered)}\n") 19 | f.write(f"Maximum number of references per paper: {max(paper_references)}\n") 20 | f.write(f"Minimum number of references per paper: {min(paper_references)}\n") 21 | f.write(f"Paper with references: {len(paper_references_filtered)}\n") 22 | f.write(f"Average number of references per paper filtered: {statistics.mean(paper_references_filtered)}\n") 23 | f.write(f"Average number of citations per paper: {statistics.mean(paper_citations)}\n") 24 | f.write(f"Median number of citations per paper: {statistics.median(paper_citations_filtered)}\n") 25 | f.write(f"Maximum number of citations per paper: {max(paper_citations)}\n") 26 | f.write(f"Minimum number of citations per paper: {min(paper_citations)}\n") 27 | f.write(f"Paper with citations: {len(paper_citations_filtered)}\n") 28 | f.write(f"Average number of citations per paper filtered: {statistics.mean(paper_citations_filtered)}\n") 29 | 30 | paper_references = {} 31 | paper_citations = {} 32 | #Add file path for Papers.txt 33 | with open("Papers.txt", "r") as f: 34 | for line in f: 35 | papertype = line.split("\t")[3].lower() 36 | references = int(line.split("\t")[18]) 37 | citations = int(line.split("\t")[19]) 38 | try: 39 | paper_references[papertype].append(references) 40 | paper_citations[papertype].append(citations) 41 | except KeyError: 42 | paper_references[papertype] = [references] 43 | paper_citations[papertype] = [citations] 44 | 45 | paper_references_filtered = {papertype: list(filter(lambda num: num != 0, paper_references[papertype])) for papertype in paper_references} 46 | paper_citations_filtered = {papertype: list(filter(lambda num: num != 0, paper_citations[papertype])) for papertype in paper_citations} 47 | 48 | with open("06.paper_references_citations_detailed.txt", "w") as f: 49 | for papertype in paper_references: 50 | f.write(f"Average number of references per paper {papertype}: {statistics.mean(paper_references[papertype])}\n") 51 | f.write(f"Median number of references per paper {papertype}: {statistics.median(paper_references_filtered[papertype])}\n") 52 | f.write(f"Maximum number of references per paper {papertype}: {max(paper_references[papertype])}\n") 53 | f.write(f"Minimum number of references per paper {papertype}: {min(paper_references[papertype])}\n") 54 | f.write(f"Paper with references {papertype}: {len(paper_references_filtered[papertype])}\n") 55 | f.write(f"Average number of references per paper filtered {papertype}: {statistics.mean(paper_references_filtered[papertype])}\n") 56 | f.write(f"Average number of citations per paper {papertype}: {statistics.mean(paper_citations[papertype])}\n") 57 | f.write(f"Median number of citations per paper {papertype}: {statistics.median(paper_citations_filtered[papertype])}\n") 58 | f.write(f"Maximum number of citations per paper {papertype}: 
{max(paper_citations[papertype])}\n") 59 | f.write(f"Minimum number of citations per paper {papertype}: {min(paper_citations[papertype])}\n") 60 | f.write(f"Paper with citations {papertype}: {len(paper_citations_filtered[papertype])}\n") 61 | f.write(f"Average number of citations per paper filtered {papertype}: {statistics.mean(paper_citations_filtered[papertype])}\n\n") 62 | 63 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/10.papers.py: -------------------------------------------------------------------------------- 1 | with open("Papers.txt", "r") as f: 2 | with open("10.Papers.nt", "w") as g: 3 | for line in f: 4 | PaperId, Rank, Doi, DocType, PaperTitle, OriginalTitle, BookTitle, Year, Date, OnlineDate, Publisher, JournalId, ConferenceSeriesId, ConferenceInstanceId, Volume, Issue, FirstPage, LastPage, ReferenceCount, CitationCount, EstimatedCitation, OriginalVenue, FamilyId, CreatedDate = line.strip("\n").split("\t") 5 | if DocType == "Journal": 6 | g.write(f' .\n') 7 | elif DocType == "Conference": 8 | g.write(f' .\n') 9 | elif DocType == "Book": 10 | g.write(f' .\n') 11 | elif DocType == "BookChapter": 12 | g.write(f' .\n') 13 | elif DocType == "Patent": 14 | g.write(f' .\n') 15 | 16 | if not Rank == "": 17 | g.write(f' "{Rank}"^^ .\n') 18 | if not Doi == "": 19 | g.write(f' "{Doi}"^^ .\n') 20 | if not OriginalTitle == "": 21 | g.write(f' "{OriginalTitle}"^^ .\n') 22 | if not BookTitle == "": 23 | g.write(f' "{BookTitle}"^^ .\n') 24 | if not Date == "": 25 | g.write(f' "{Date}"^^ .\n') 26 | if not Publisher == "": 27 | g.write(f' "{Publisher}"^^ .\n') 28 | if not JournalId == "": 29 | g.write(f' " .\n') 30 | if not ConferenceSeriesId == "": 31 | g.write(f' " .\n') 32 | if not ConferenceInstanceId == "": 33 | g.write(f' " .\n') 34 | if not Volume == "": 35 | g.write(f' "{Volume}"^^ .\n') 36 | if not Issue == "": 37 | g.write(f' "{Issue}"^^ .\n') 38 | if not FirstPage == "": 39 | g.write(f' "{FirstPage}"^^ .\n') 40 | if not LastPage == "": 41 | g.write(f' "{LastPage}"^^ .\n') 42 | if not ReferenceCount == "": 43 | g.write(f' "{ReferenceCount}"^^ .\n') 44 | if not CitationCount == "": 45 | g.write(f' "{CitationCount}"^^ .\n') 46 | if not EstimatedCitation == "": 47 | g.write(f' "{EstimatedCitation}"^^ .\n') 48 | if not FamilyId == "": 49 | g.write(f' "{FamilyId}"^^ .\n') 50 | if not CreatedDate == "": 51 | g.write(f' "{CreatedDate}"^^ .\n') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Enhancing the Microsoft Academic Knowledge Graph** 2 | Code for the Master Thesis "Enhancing the Microsoft Academic Knowledge Graph" 3 |

4 | 5 | **Entity Resolution** 6 | 7 | Required packages: 8 | [pyjarowinkler](https://pypi.org/project/pyjarowinkler/) 9 | 10 | Code for data preparation + disambiguation + recreating files (or use execute.sh) 11 | ```` 12 | python 00.prepare_paper_references.py 13 | python 01.extract_paper_id_with_doi.py 14 | python 02.extract_author_with_paper_id.py 15 | 16 | LANG=en_US.UTF-8 LC_ALL=C sort -n -t$'\t' -k1 02.author_id_with_paper_id.txt > 02.author_id_with_paper_id_sorted.txt 17 | 18 | python 03.extract_paper_with_author_id.py 19 | 20 | LANG=en_US.UTF-8 LC_ALL=C sort -n -t$'\t' -k1 03.paper_id_with_author_id.txt > 03.paper_id_with_author_id_sorted.txt 21 | 22 | python 04.author_id_merge_paper_id.py 23 | python 05.paper_id_merge_author_ids.py 24 | python 06.add_to_authors_paper_id.py 25 | python 07.add_to_authors_doi.py 26 | python 08.add_to_authors_coauthors.py 27 | python 09.add_to_authors_titles.py 28 | python 10.add_to_authors_year.py 29 | python 11.add_to_authors_journal_and_conference.py 30 | python 12.add_to_authors_references.py 31 | 32 | mkdir sort 33 | split -l 5000000 -d 12.authors_with_references.txt sort/sort_file 34 | cd sort 35 | for file in sort_file*; do 36 | echo $file 37 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 -o $file $file 38 | done 39 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 sort_file* > ../12.authors_with_references_sorted.txt 40 | cd .. 41 | rm -r sort 42 | 43 | python 13.disambiguation_data.py 44 | python 14.recreate_files.py 45 | ```` 46 | 47 | Edit the following data paths for MAG files: 48 | * 00.prepare_paper_references.py: path to PaperReferences.txt 49 | * 01.extract_paper_id_with_doi.py: path to Papers.txt 50 | * 02.extract_author_with_paper_id.py: path to PaperAuthorAffiliations.txt 51 | * 03.extract_paper_with_author_id.py: path to PaperAuthorAffiliations.txt 52 | * 06.add_to_authors_paper_id.py: path to Authors.txt 53 | * 09.add_to_authors_titles.py: path to Papers.txt 54 | * 10.add_to_authors_year.py: path to Papers.txt 55 | * 11.add_to_authors_journal_and_conference.py: path to Papers.txt 56 | * 14.recreate_files.py: path to PaperAuthorAffiliations.txt 57 | 58 | Files 15-19 are used for evaluation in the Thesis. 59 |
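For orientation, the disambiguation step above (12.authors_with_references_sorted.txt → 13.disambiguation_data.py) only compares authors whose normalized names fall into the same block. A minimal sketch of that blocking idea with pyjarowinkler follows; the names are invented, while the threshold (0.95), the scaling factor (0.1) and the comparison call are the ones used in 13.disambiguation_data.py:
````
from pyjarowinkler import distance

# Illustrative input only; the pipeline reads 12.authors_with_references_sorted.txt
names = ["john smith", "john smyth", "jon smith", "maria garcia", "zhang wei"]
threshold_blocking = 0.95   # value used in 13.disambiguation_data.py
scaling_factor = 0.1

blocks = []
current_block = [names[0]]
for name in names[1:]:
    # Near-identical consecutive names end up in the same block
    if distance.get_jaro_distance(name.lower(), current_block[-1].lower(),
                                  winkler=True, scaling=scaling_factor) > threshold_blocking:
        current_block.append(name)
    else:
        blocks.append(current_block)
        current_block = [name]
blocks.append(current_block)
print(blocks)  # pairwise scoring then runs only inside each block
````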

60 | 61 | **Field of Study Classification** 62 | 63 | Required packages: 64 | [NLTK](http://www.nltk.org/), [Pandas](https://pypi.org/project/pandas/), [scikit-learn](https://scikit-learn.org/stable/index.html), [simpletransformers](https://pypi.org/project/simpletransformers/), [spaCy](https://spacy.io/), [pytextrank](https://pypi.org/project/pytextrank/) 65 | 66 | * files 00 and 01 are used to convert MAG paper abstracts from Inverted Indexes to Full Texts (see the sketch after this list) 67 | * file 02 is used to extract field of study labels from the MAG (all 19 low-level FoS), edit the path to FieldsOfStudy.txt accordingly 68 | * files 03-06 require a sorted version of the PaperFieldsOfStudy.txt file, which can be done with the following code: 69 | ```` 70 | mkdir sort 71 | split -l 5000000 -d PaperFieldsOfStudy.txt sort/sort_file 72 | cd sort 73 | for file in sort_file*; do 74 | echo $file 75 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 -o $file $file 76 | done 77 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 sort_file* > ../SortedPaperFieldsOfStudy.txt 78 | cd .. 79 | rm -r sort 80 | ```` 81 | * execute file 03 to generate the data set using direct labels, edit path to the sorted PaperFieldsOfStudy.txt accordingly 82 | * execute both 04 files in sequence to generate the data set using indirect labels, edit paths to FieldOfStudyChildren.txt and the sorted PaperFieldsOfStudy.txt accordingly 83 | * execute all three 05 files in order to generate the data set using journal labels 84 | * use file 06 in order to evaluate MAG labels from data sets generated by 03 and 04, edit the file path accordingly 85 | * use file 07 to generate training and evaluation sets 86 | * use file 08 to train and evaluate the classifier, edit the model as well as hyperparameters accordingly 87 | * use file 09 to classify MAG papers 88 | * use file 10 to match extracted labels with MAG papers 89 | * use file 11 to extract keywords 90 |
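Sketch referenced above: MAG distributes abstracts as inverted indexes, i.e. a JSON object with an IndexLength and an InvertedIndex that maps each token to its positions. Rebuilding the full text, which is what files 00 and 01 are used for, roughly amounts to the following (toy input, not repository code):
````
import json

def inverted_index_to_text(indexed_abstract: str) -> str:
    """Rebuild a plain-text abstract from a MAG-style inverted-index JSON string."""
    data = json.loads(indexed_abstract)
    words = [""] * data["IndexLength"]
    for word, positions in data["InvertedIndex"].items():
        for position in positions:
            words[position] = word
    return " ".join(words)

toy = '{"IndexLength": 4, "InvertedIndex": {"Knowledge": [0], "graphs": [1], "are": [2], "useful": [3]}}'
print(inverted_index_to_text(toy))  # Knowledge graphs are useful
````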

91 | 92 | **Knowledge Graph Embeddings** 93 | 94 | Required packages: 95 | [DGL-KE](https://aws-dglke.readthedocs.io/en/latest/install.html) 96 | 97 | * use files 00 and 01 to generate input files for training author embeddings, add file path to input graph Authors.nt 98 | * use files 02 and 03 to generate input files for training paper embeddings, add file path to input graph Papers.nt, Journals.nt, and ConferenceSeries.nt 99 | * execute 04 or the following console command for training embeddings, edit file paths, data sets, and hyperparameters accordingly 100 | ```` 101 | DGLBACKEND=pytorch dglke_train --model_name TransE_l2 --data_path 02.knowledge_graph_embeddings --dataset mag_author --data_files 01.author_entities.dict 01.author_relations.dict 01.author_train.tsv 01.author_valid.tsv 01.author_test.tsv --format udd_hrt --batch_size 1000 --neg_sample_size 1000 --hidden_dim 100 --gamma 19.9 --lr 0.25 --max_step 1000000 --log_interval 100 --batch_size_eval 1000 --neg_sample_size_eval 1000 -adv --regularization_coef 1.00E-09 --gpu 0 1 2 3 4 5 6 7 --valid --test --mix_cpu_gpu 102 | ```` 103 |
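Once training has finished, the learned vectors can be inspected with NumPy. The checkpoint path and file names below are assumptions (DGL-KE typically writes an entity embedding .npy file under a ckpts/ run directory named after model and dataset); adjust them to the actual output of 04.train_embedding.sh:
````
import numpy as np

# Assumed checkpoint location; depends on the dglke_train run
entity_emb = np.load("ckpts/TransE_l2_mag_author_0/mag_author_TransE_l2_entity.npy")

def nearest_neighbours(index: int, k: int = 5):
    """Return the k entity indices most similar to `index` by cosine similarity."""
    normed = entity_emb / np.linalg.norm(entity_emb, axis=1, keepdims=True)
    scores = normed @ normed[index]
    return np.argsort(-scores)[1:k + 1]  # skip the entity itself

# Indices map back to author URIs via 01.author_entities.dict
print(nearest_neighbours(0))
````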

104 | 105 | **Statistical Analysis** 106 | Required packages: 107 | [Pandas](https://pypi.org/project/pandas/), [NumPy](https://numpy.org/), [seaborn](https://seaborn.pydata.org/), [matplotlib](https://matplotlib.org/), [chord](https://pypi.org/project/chord/) 108 | 109 | Includes files used to generate graphs and data for statistical analysis 110 | * file 00 is used to count entity properties, edit file paths for Authors.txt and Papers.txt 111 | * file 01 is used to calculate the number of papers published per year, edit file path for Papers.txt (see the sketch after this list) 112 | * file 02 is used to generate data for table 25, uses files created during entity resolution, edit file paths accordingly 113 | * files 04 and 05 are used to calculate data for figures 08, 09, 10, 11, 12 and 13; create a folder named 04.field_of_study_over_time beforehand, use file 04 to split the data by individual field of study, and use file 05 to generate time data for each field of study 114 | * file 06 is used to generate data for table 27, edit file path for Papers.txt 115 | * file 07 is used to generate data for figures 04 and 05, edit file path for Papers.txt 116 | * file 08 is used to generate data for figure 03, uses files generated during entity resolution, edit file paths accordingly 117 | * files 09 and 12 are used to generate data for figure 15; file 09 uses data generated during entity resolution and field of study classification and generates a matrix which is loaded by file 12 to generate the chord graph 118 | * file 10 is used to generate data for figure 06, uses a file generated during entity resolution, edit file path for Papers.txt 119 | * file 11 is used to generate figure 14 120 | * file 13 is used to generate data for table 26 121 |
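The sketch referenced for file 01 above (illustrative only; it assumes Year is the 8th tab-separated column of Papers.txt, as listed in 00.count_properties.py, and that matplotlib is installed):
````
from collections import Counter
import matplotlib.pyplot as plt

year_counts = Counter()
with open("Papers.txt", "r") as f:          # edit file path for Papers.txt
    for line in f:
        year = line.split("\t")[7].strip()  # Year column, per 00.count_properties.py
        if year:
            year_counts[int(year)] += 1

years = sorted(year_counts)
plt.bar(years, [year_counts[year] for year in years])
plt.xlabel("Year")
plt.ylabel("Number of papers")
plt.savefig("papers_by_year.png")           # illustrative output name
````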

122 | 123 | **Knowledge Graph Creation** 124 | Includes files used to generate the MAKG as well as the ontology file 125 | * files 00-20 create RDF representations of existing MAG files, edit file paths accordingly 126 | * file 21 uses our extracted keywords for each paper, edit file path accordingly 127 | * file 22 uses our field of study labels for papers, edit file path accordingly 128 | * files 23 and 24 use our disambiguated author and paperauthoraffiliation files, edit file paths accordingly 129 | * file 25 links MAG authors (undisambiguated, though a disambiguated authors input can be created using our provided files) to their ORCIDs, requires the ORCID file generated during entity resolution, edit file path accordingly 130 | -------------------------------------------------------------------------------- /00.entity_resolution/19.disambiguation_evaluation.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | from datetime import datetime 4 | from pyjarowinkler import distance 5 | 6 | 7 | def compare_affiliation(author1, author2): 8 | affiliation1 = author1.split("\t")[4].strip() 9 | affiliation2 = author2.split("\t")[4].strip() 10 | if affiliation1 == "" or affiliation2 == "": 11 | return False 12 | else: 13 | return affiliation1 == affiliation2 14 | 15 | 16 | def compare_coauthors(author1, author2): 17 | coauthors1 = set(author1.split("\t")[11].strip().split(",")) 18 | coauthors2 = set(author2.split("\t")[11].strip().split(",")) 19 | if len(coauthors1) == 0 or len(coauthors2) == 0: 20 | return 0 21 | else: 22 | return len(coauthors1.intersection(coauthors2)) 23 | 24 | 25 | def most_frequent(List): 26 | return sorted(set(List), key=List.count, reverse=True)[:10] 27 | 28 | 29 | def compare_titles(author1, author2): 30 | titles1 = author1.split("\t")[12].strip().replace(";", ",").split(",") 31 | titles2 = author2.split("\t")[12].strip().replace(";", ",").split(",") 32 | if len(titles1) == 0 or len(titles2) == 0: 33 | return 0 34 | else: 35 | most_freq1 = set(most_frequent(titles1)) 36 | most_freq2 = set(most_frequent(titles2)) 37 | return len(most_freq1.intersection(most_freq2)) 38 | 39 | 40 | def compare_years(author1, author2): 41 | if author1.split("\t")[13].strip() == "" or author2.split("\t")[13].strip() == "": 42 | return False 43 | else: 44 | years1 = set(map(int, author1.split("\t")[13].strip().split(","))) 45 | years2 = set(map(int, author2.split("\t")[13].strip().split(","))) 46 | min_years1 = min(years1) 47 | max_years1 = max(years1) 48 | min_years2 = min(years2) 49 | max_years2 = max(years2) 50 | return abs(min_years1 - max_years2) < 10 or abs(min_years2 - max_years1) < 10 51 | 52 | 53 | def compare_journals(author1, author2): 54 | journals1 = set(author1.split("\t")[14].strip().split(",")) 55 | journals2 = set(author2.split("\t")[14].strip().split(",")) 56 | if len(journals1) == 0 or len(journals2) == 0: 57 | return 0 58 | else: 59 | return len(journals1.intersection(journals2)) 60 | 61 | 62 | def compare_conferences(author1, author2): 63 | conferences1 = set(author1.split("\t")[15].strip().split(",")) 64 | conferences2 = set(author2.split("\t")[15].strip().split(",")) 65 | if len(conferences1) == 0 or len(conferences2) == 0: 66 | return 0 67 | else: 68 | return len(conferences1.intersection(conferences2)) 69 | 70 | 71 | def self_references(author1, author2): 72 | paperids1 = set(author1.split("\t")[9].strip().split(",")) 73 | paperids2 = set(author2.split("\t")[9].strip().split(",")) 74 | references1 =
set(author1.split("\t")[16].strip().split(",")) 75 | references2 = set(author2.split("\t")[16].strip().split(",")) 76 | return max(len(paperids1.intersection(references2)), len(paperids2.intersection(references1))) 77 | 78 | 79 | def common_references(author1, author2): 80 | references1 = set(author1.split("\t")[16].strip().split(",")) 81 | references2 = set(author2.split("\t")[16].strip().split(",")) 82 | if len(references1) == 0 or len(references2) == 0: 83 | return 0 84 | else: 85 | return len(references1.intersection(references2)) 86 | 87 | 88 | def compare_authors(author1, author2): 89 | score_affiliation = 0 90 | score_coauthors = 0 91 | score_titles = 0 92 | score_years = 0 93 | score_journals = 0 94 | score_conferences = 0 95 | score_self_reference = 0 96 | score_references = 0 97 | score = 0 98 | 99 | if compare_affiliation(author1, author2): 100 | score_affiliation += 5 101 | 102 | if compare_coauthors(author1, author2) == 1: 103 | score_coauthors += 3 104 | elif compare_coauthors(author1, author2) == 2: 105 | score_coauthors += 5 106 | elif compare_coauthors(author1, author2) > 2: 107 | score_coauthors += 8 108 | 109 | if compare_titles(author1, author2) == 1: 110 | score_titles += 3 111 | elif compare_titles(author1, author2) == 2: 112 | score_titles += 5 113 | elif compare_titles(author1, author2) >= 3: 114 | score_titles += 8 115 | 116 | if compare_years(author1, author2): 117 | score_years += 3 118 | 119 | if compare_journals(author1, author2) >= 1: 120 | score_journals += 4 121 | 122 | if compare_conferences(author1, author2) >= 1: 123 | score_conferences += 4 124 | 125 | if self_references(author1, author2) >= 1: 126 | score_self_reference += 8 127 | 128 | if common_references(author1, author2) == 1: 129 | score_references += 2 130 | elif common_references(author1, author2) == 2: 131 | score_references += 3 132 | elif common_references(author1, author2) >= 3: 133 | score_references += 5 134 | 135 | return [score_affiliation, score_coauthors, score_titles, score_years, score_journals, score_conferences, score_self_reference, score_references] 136 | 137 | 138 | with open("12.authors_with_references_sorted.txt", "r") as inp: 139 | with open("19.results_evaluation.txt", "w") as outp: 140 | with open("19.all_false_positives.txt", "w") as outp2: 141 | true_positive = 0 142 | true_negative = 0 143 | false_positive = 0 144 | false_negative = 0 145 | 146 | previous_name = "" 147 | current_authors = [] 148 | true_positives_values = [0,0,0,0,0,0,0,0] 149 | true_negatives_values = [0,0,0,0,0,0,0,0] 150 | false_positives_values = [0,0,0,0,0,0,0,0] 151 | false_negatives_values = [0,0,0,0,0,0,0,0] 152 | line_count = 1 153 | 154 | for line in inp: 155 | print(line_count) 156 | name = line.split("\t")[2].strip() 157 | if previous_name == "" and len(current_authors) < 500: 158 | previous_name = name 159 | current_authors.append(line) 160 | elif distance.get_jaro_distance(str.lower(name), str.lower(previous_name), winkler=True, scaling=0.1) > 0.97 and len(current_authors) < 500: 161 | previous_name = name 162 | current_authors.append(line) 163 | else: 164 | comparisons = list(itertools.combinations(current_authors, 2)) 165 | for item in comparisons: 166 | if sum(compare_authors(item[0], item[1])) > 10: 167 | if item[0].split("\t")[16].strip() == item[1].split("\t")[16].strip(): 168 | true_positive += 1 169 | true_positives_values = [x + y for x, y in zip(true_positives_values, compare_authors(item[0], item[1]))] 170 | else: 171 | false_positive += 1 172 | false_positives_values = [x + y for 
x, y in zip(false_positives_values, compare_authors(item[0], item[1]))] 173 | outp2.write(item[0].strip() + "\t" + item[1].strip() + "\n") 174 | else: 175 | if item[0].split("\t")[16].strip() == item[1].split("\t")[16].strip(): 176 | false_negative += 1 177 | false_negatives_values = [x + y for x, y in zip(false_negatives_values, compare_authors(item[0], item[1]))] 178 | else: 179 | true_negative += 1 180 | true_negatives_values = [x + y for x, y in zip(true_negatives_values, compare_authors(item[0], item[1]))] 181 | previous_name = "" 182 | current_authors = [] 183 | 184 | line_count += 1 185 | 186 | total_comparisons = true_positive + false_positive + true_negative + false_negative 187 | total_positives = true_positive + false_negative 188 | total_negatives = true_negative + false_positive 189 | 190 | precision = true_positive / (true_positive + false_positive) 191 | recall = true_positive / (true_positive + false_negative) 192 | accuracy = (true_positive + true_negative) / (true_positive + false_positive + true_negative + false_negative) 193 | 194 | outp.write("Total comparisons: " + str(total_comparisons) + "\n") 195 | outp.write("Total positives: " + str(total_positives) + "\n") 196 | outp.write("Total negatives: " + str(total_negatives) + "\n\n") 197 | outp.write("True positives: " + str(true_positive) + "\n") 198 | outp.write("False positives: " + str(false_positive) + "\n") 199 | outp.write("True negatitves: " + str(true_negative) + "\n") 200 | outp.write("False negatives: " + str(false_negative) + "\n\n") 201 | outp.write("Precision: " + str(precision) + "\n") 202 | outp.write("Recall: " + str(recall) + "\n") 203 | outp.write("Accuracy: " + str(accuracy) + "\n\n") 204 | outp.write("Average true positive: " + str([value/max(true_positive, 1) for value in true_positives_values]) + "\n") 205 | outp.write("Average true negative: " + str([value/max(true_negative, 1) for value in true_negatives_values]) + "\n") 206 | outp.write("Average false positive: " + str([value/max(false_positive, 1) for value in false_positives_values]) + "\n") 207 | outp.write("Average false negative: " + str([value/max(false_negative, 1) for value in false_negatives_values]) + "\n") 208 | 209 | -------------------------------------------------------------------------------- /00.entity_resolution/13.disambiguation_data.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | from datetime import datetime 4 | from pyjarowinkler import distance 5 | 6 | #Parameters 7 | score_affiliation = 1 8 | score_coauthors_1 = 3 9 | score_coauthors_2 = 5 10 | score_coauthors_3 = 8 11 | score_titles_1 = 3 12 | score_titles_2 = 5 13 | score_titles_3 = 8 14 | score_years = 3 15 | score_journals = 3 16 | score_conferences = 3 17 | score_self_reference = 8 18 | score_references_1 = 2 19 | score_references_2 = 3 20 | score_references_3 = 5 21 | threshold_matching = 10 22 | threshold_blocking = 0.95 23 | scaling_factor = 0.1 24 | max_block_size = 500 25 | 26 | 27 | def compare_affiliation(author1, author2): 28 | affiliation1 = author1.split("\t")[4].strip() 29 | affiliation2 = author2.split("\t")[4].strip() 30 | if affiliation1 == "" or affiliation2 == "": 31 | return False 32 | else: 33 | return affiliation1 == affiliation2 34 | 35 | 36 | def compare_coauthors(author1, author2): 37 | coauthors1 = set(author1.split("\t")[11].strip().split(",")) 38 | coauthors2 = set(author2.split("\t")[11].strip().split(",")) 39 | if len(coauthors1) == 0 or len(coauthors2) == 0: 40 | 
return 0 41 | else: 42 | return len(coauthors1.intersection(coauthors2)) 43 | 44 | 45 | def most_frequent(List): 46 | return sorted(set(List), key=List.count, reverse=True)[:10] 47 | 48 | 49 | def compare_titles(author1, author2): 50 | titles1 = author1.split("\t")[12].strip().replace(";", ",").split(",") 51 | titles2 = author2.split("\t")[12].strip().replace(";", ",").split(",") 52 | if len(titles1) == 0 or len(titles2) == 0: 53 | return 0 54 | else: 55 | most_freq1 = set(most_frequent(titles1)) 56 | most_freq2 = set(most_frequent(titles2)) 57 | return len(most_freq1.intersection(most_freq2)) 58 | 59 | 60 | def compare_years(author1, author2): 61 | if author1.split("\t")[13].strip() == "" or author2.split("\t")[13].strip() == "": 62 | return False 63 | else: 64 | years1 = set(map(int, author1.split("\t")[13].strip().split(","))) 65 | years2 = set(map(int, author2.split("\t")[13].strip().split(","))) 66 | min_years1 = min(years1) 67 | max_years1 = max(years1) 68 | min_years2 = min(years2) 69 | max_years2 = max(years2) 70 | return abs(min_years1 - max_years2) < 10 or abs(min_years2 - max_years1) < 10 71 | 72 | 73 | def compare_journals(author1, author2): 74 | journals1 = set(author1.split("\t")[14].strip().split(",")) 75 | journals2 = set(author2.split("\t")[14].strip().split(",")) 76 | if len(journals1) == 0 or len(journals2) == 0: 77 | return 0 78 | else: 79 | return len(journals1.intersection(journals2)) 80 | 81 | 82 | def compare_conferences(author1, author2): 83 | conferences1 = set(author1.split("\t")[15].strip().split(",")) 84 | conferences2 = set(author2.split("\t")[15].strip().split(",")) 85 | if len(conferences1) == 0 or len(conferences2) == 0: 86 | return 0 87 | else: 88 | return len(conferences1.intersection(conferences2)) 89 | 90 | 91 | def self_references(author1, author2): 92 | paperids1 = set(author1.split("\t")[9].strip().split(",")) 93 | paperids2 = set(author2.split("\t")[9].strip().split(",")) 94 | references1 = set(author1.split("\t")[16].strip().split(",")) 95 | references2 = set(author2.split("\t")[16].strip().split(",")) 96 | if len(paperids1) == 0 or len(paperids2) == 0 or len(references1) == 0 or len(references2) == 0: 97 | return 0 98 | else: 99 | return max(len(paperids1.intersection(references2)), len(paperids2.intersection(references1))) 100 | 101 | 102 | def common_references(author1, author2): 103 | references1 = set(author1.split("\t")[16].strip().split(",")) 104 | references2 = set(author2.split("\t")[16].strip().split(",")) 105 | if len(references1) == 0 or len(references2) == 0: 106 | return 0 107 | else: 108 | return len(references1.intersection(references2)) 109 | 110 | 111 | def compare_authors(author1, author2): 112 | score = 0 113 | if compare_affiliation(author1, author2): 114 | score += score_affiliation 115 | 116 | if compare_coauthors(author1, author2) == 1: 117 | score += score_coauthors_1 118 | elif compare_coauthors(author1, author2) == 2: 119 | score += score_coauthors_2 120 | elif compare_coauthors(author1, author2) > 2: 121 | score += score_coauthors_3 122 | 123 | if compare_titles(author1, author2) == 1: 124 | score += score_titles_1 125 | elif compare_titles(author1, author2) == 2: 126 | score += score_titles_2 127 | elif compare_titles(author1, author2) >= 3: 128 | score += score_titles_3 129 | 130 | if compare_years(author1, author2): 131 | score += score_years 132 | 133 | if compare_journals(author1, author2) >= 1: 134 | score += score_journals 135 | 136 | if compare_conferences(author1, author2) >= 1: 137 | score += score_conferences 
138 | 139 | if self_references(author1, author2) >= 1: 140 | score += score_self_reference 141 | 142 | if common_references(author1, author2) == 1: 143 | score += score_references_1 144 | elif common_references(author1, author2) == 2: 145 | score += score_references_2 146 | elif common_references(author1, author2) >= 3: 147 | score += score_references_3 148 | 149 | return score 150 | 151 | 152 | def get_id(author): 153 | return author.split("\t")[0] 154 | 155 | 156 | def earlier_date(author1, author2): 157 | date_object1 = datetime.strptime(author1[8], "%Y-%m-%d") 158 | date_object2 = datetime.strptime(author2[8], "%Y-%m-%d") 159 | earliest = min(date_object1, date_object2) 160 | stringified = "-".join([str(earliest.year), 161 | str(earliest.month), str(earliest.day)]) 162 | return stringified 163 | 164 | 165 | def latest_affiliation(author1, author2): 166 | date_object1 = datetime.strptime(author1[8], "%Y-%m-%d") 167 | date_object2 = datetime.strptime(author2[8], "%Y-%m-%d") 168 | if date_object1 < date_object2: 169 | return author2[4] 170 | else: 171 | return author1[4] 172 | 173 | 174 | def add_paper_count(author1, author2): 175 | return str(int(author1[5]) + int(author2[5])) 176 | 177 | def add_paper_family_count(author1, author2): 178 | return str(int(author1[6]) + int(author2[6])) 179 | 180 | def add_citation_count(author1, author2): 181 | return str(int(author1[7]) + int(author2[7])) 182 | 183 | 184 | def merge_authors(tuple_of_authors): 185 | author1 = tuple_of_authors[0].strip("\n").split("\t") 186 | author2 = tuple_of_authors[1].strip("\n").split("\t") 187 | output = "\t".join(author1[0:4]) + "\t" + latest_affiliation(author1, author2) + "\t" + add_paper_count(author1, author2) + "\t" + add_paper_family_count(author1, author2) + "\t" + add_citation_count(author1, author2) + "\t" + earlier_date(author1, author2) + "\t" + (author1[9]+","+author2[9]).strip(",") + "\t" + (author1[10]+","+author2[10]).strip(",") + "\t" + (author1[11] +","+author2[11]).strip(",") + "\t" + (author1[12]+","+author2[12]).strip(",") + "\t" + (author1[13]+","+author2[13]).strip(",") + "\t" + (author1[14]+","+author2[14]).strip(",") + "\t" + (author1[15]+","+author2[15]).strip(",") + "\t" + (author1[16]+","+author2[16]).strip(",") 188 | return output 189 | 190 | 191 | def add_to_mapping(dict_of_maps, entry1, entry2): 192 | if entry2 not in dict_of_maps: 193 | dict_of_maps[entry1] = entry2 194 | return dict_of_maps 195 | else: 196 | return add_to_mapping(dict_of_maps, entry1, dict_of_maps[entry2]) 197 | 198 | 199 | def disambiguate(list_of_authors, result, positive, negative): 200 | author_dictionary = {get_id(author): author.strip("\n") for author in list_of_authors} 201 | author_list = [get_id(author) for author in list_of_authors] 202 | mapping = {} 203 | result = result.copy() 204 | comparisons = list(itertools.combinations(author_list, 2)) 205 | for item in comparisons: 206 | try: 207 | if compare_authors(author_dictionary[item[0]], author_dictionary[item[1]]) > threshold_matching: 208 | positive += 1 209 | if item[0] not in mapping: 210 | mapping = add_to_mapping(mapping, item[1], item[0]) 211 | result = add_to_mapping(result, item[1], item[0]) 212 | author_dictionary[item[0]] = merge_authors((author_dictionary[item[0]], author_dictionary[item[1]])) 213 | del author_dictionary[item[1]] 214 | else: 215 | author_dictionary[mapping[item[0]]] = merge_authors((author_dictionary[mapping[item[0]]], author_dictionary[item[1]])) 216 | mapping = add_to_mapping(mapping, item[1], item[0]) 217 | result = 
add_to_mapping(result, item[1], item[0]) 218 | del author_dictionary[item[1]] 219 | else: 220 | negative += 1 221 | except KeyError: 222 | pass 223 | return author_dictionary, result, positive, negative 224 | 225 | 226 | with open("12.authors_with_references_sorted.txt", "r") as inp: 227 | with open("13.results.txt", "w") as outp: 228 | with open("13.all_positives.txt", "w") as outp2: 229 | with open("13.disambiguated_file.txt", "w") as outp3: 230 | positive = 0 231 | negative = 0 232 | 233 | previous_name = "" 234 | current_authors = [] 235 | 236 | line_count = 1 237 | 238 | for line in inp: 239 | print("Disambiguation: " + str(line_count)) 240 | 241 | name = line.split("\t")[2].strip() 242 | if previous_name == "" and len(current_authors) < max_block_size: 243 | previous_name = name 244 | current_authors.append(line) 245 | elif distance.get_jaro_distance(str.lower(name), str.lower(previous_name), winkler=True, scaling=scaling_factor) > threshold_blocking and len(current_authors) < max_block_size: 246 | previous_name = name 247 | current_authors.append(line) 248 | else: 249 | result = {} 250 | authors, result, positive, negative = disambiguate(current_authors, result, positive, negative) 251 | previous_name = name 252 | current_authors = [line] 253 | for item in authors: 254 | outp3.write(authors[item] + "\n") 255 | for item in result: 256 | outp2.write(item + "\t" + result[item] + "\n") 257 | 258 | line_count += 1 259 | 260 | result = {} 261 | authors, result, positive, negative = disambiguate(current_authors, result, positive, negative) 262 | for item in authors: 263 | outp3.write(authors[item] + "\n") 264 | for item in result: 265 | outp2.write(item + "\t" + result[item] + "\n") 266 | 267 | total_comparisons = positive + negative 268 | 269 | outp.write("Total comparisons: " + str(total_comparisons) + "\n") 270 | outp.write("Total positives: " + str(positive) + ": " + str(positive/total_comparisons) + "\n") 271 | outp.write("Total negatives: " + str(negative) + ": " + str(negative/total_comparisons)) 272 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/OWL file.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 111 | 112 | 113 | 114 | Affiliation 115 | 116 | 117 | 118 | Author 119 | 120 | 121 | 122 | Conference Instance 123 | 124 | 125 | 126 | Conference Series 127 | 128 | 129 | 130 | Field of study 131 | 132 | 133 | 134 | Journal 135 | 136 | 137 | 138 | Citation 139 | 140 | 141 | 142 | Paper 143 | 144 | 145 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 
--------------------------------------------------------------------------------