├── 04.generate_knowledge_graph
│   ├── 16.paperfieldsofstudy.py
│   ├── 11.paperurls.py
│   ├── 17.paperrecommendations.py
│   ├── 20.paperabstracts.py
│   ├── 22.paperfieldsofstudynew.py
│   ├── 06.paperauthoraffiliations.py
│   ├── 08.paperreferences.py
│   ├── 21.papertag.py
│   ├── 12.entityrelatedentities.py
│   ├── 13.fieldofstudychildren.py
│   ├── 24.paperauthoraffiliations_disambiguated.py
│   ├── 25.authororcid.py
│   ├── 14.fieldofstudyextendedattributes.py
│   ├── 07.paperextendedattributes.py
│   ├── 09.paperresources.py
│   ├── 19.papercitationcontexts.py
│   ├── 04.conferenceseries.py
│   ├── 18.relatedfieldsofstudy.py
│   ├── 02.authors.py
│   ├── 23.authors_disambiguated.py
│   ├── 15.fieldsofstudy.py
│   ├── 05.journals.py
│   ├── 01.affiliations.py
│   ├── 03.conferenceinstances.py
│   ├── 10.papers.py
│   └── OWL file.xml
├── 03.statistical_analysis
│   ├── 13.paper_types.py
│   ├── 01.paper_by_year.py
│   ├── 08.author_activity.py
│   ├── 03.field_of_study_over_time.py
│   ├── 05.paper_by_year.py
│   ├── 10.number_of_authors_over_time.py
│   ├── 04.field_of_study_over_time_custom.py
│   ├── 02.average_author_coauthor.py
│   ├── 07.reference_citation_by_year.py
│   ├── 12.author_interdisciplinary_chord_graph.ipynb
│   ├── 09.author_fos.py
│   ├── 00.count_properties.py
│   └── 06.paper_citation_reference.py
├── 01.field_of_study_classification
│   ├── 02.extract_labels.py
│   ├── 03.generate_low_level_fos.py
│   ├── 05.generate_journal_fos_3.py
│   ├── 05.generate_journal_fos_1.py
│   ├── 05.generate_journal_fos_2.py
│   ├── 10.assign_fos_to_paper.py
│   ├── 01.tokenize_abstracts.py
│   ├── 11.keyword_extraction.py
│   ├── 07.generate_training_evaluation_data_sets.py
│   ├── 09.classification.py
│   ├── 04.generate_high_level_fos_1.py
│   ├── 08.training.py
│   ├── 00.create_abstract.py
│   ├── 04.generate_high_level_fos_2.py
│   └── 06.evaluate_with_journal_label.py
├── 02.knowledge_graph_embeddings
│   ├── 04.train_embedding.sh
│   ├── 00.prepare_author_input_graph.py
│   ├── 02.prepare_paper_input_graph.py
│   ├── 01.prepare_data_authors.py
│   └── 03.prepare_data_papers.py
├── 00.entity_resolution
│   ├── 02.extract_author_with_paper_id.py
│   ├── 03.extract_paper_with_author_id.py
│   ├── 01.extract_paper_id_with_doi.py
│   ├── 16.sort_doi.py
│   ├── 06.add_to_authors_paper_id.py
│   ├── 14.recreate_files.py
│   ├── 10.add_to_authors_year.py
│   ├── 07.add_to_authors_doi.py
│   ├── 17.doi_merge_orcid.py
│   ├── 05.paper_id_merge_author_ids.py
│   ├── 04.author_id_merge_paper_id.py
│   ├── 09.add_to_authors_titles.py
│   ├── 00.prepare_paper_references.py
│   ├── 12.add_to_authors_references.py
│   ├── 00.execute.sh
│   ├── 18.add_to_authors_orcid.py
│   ├── 11.add_to_authors_journal_and_conference.py
│   ├── 08.add_to_authors_coauthors.py
│   ├── 15.extract_orcid_data.py
│   ├── 19.disambiguation_evaluation.py
│   └── 13.disambiguation_data.py
└── README.md

/04.generate_knowledge_graph/16.paperfieldsofstudy.py:
--------------------------------------------------------------------------------
1 | with open("PaperFieldsOfStudy.txt", "r") as f:
2 |     with open("16.PaperFieldsOfStudy.nt", "w") as g:
3 |         for line in f:
4 |             PaperId, FieldOfStudyId, Score = line.strip("\n").split("\t")
5 |             g.write(f' .\n')
--------------------------------------------------------------------------------
/04.generate_knowledge_graph/11.paperurls.py:
--------------------------------------------------------------------------------
1 | with open("PaperUrls.txt", "r") as f:
2 |     with open("11.PaperUrls.nt", "w") as g:
3 |         for line in f:
4 |             PaperId = line.split("\t")[0]
5 |             SourceUrl = line.split("\t")[2]
6 |             g.write(f' "{SourceUrl}"^^ .\n')
7 | 
--------------------------------------------------------------------------------
/04.generate_knowledge_graph/17.paperrecommendations.py:
-------------------------------------------------------------------------------- 1 | with open("PaperRecommendations.txt", "r") as f: 2 | with open("17.PaperRecommendations.nt", "w") as g: 3 | for line in f: 4 | PaperId, RecommendedPaperId, Score = line.strip("\n").split("\t") 5 | g.write(f' .\n') -------------------------------------------------------------------------------- /03.statistical_analysis/13.paper_types.py: -------------------------------------------------------------------------------- 1 | type_dict = {} 2 | #Add file path for Papers.txt 3 | with open("Papers.txt", "r") as f: 4 | with open("13.paper_types.txt", "w") as g: 5 | for line in f: 6 | doctype = line.split("\t")[3] 7 | try: 8 | type_dict[doctype] += 1 9 | except KeyError: 10 | type_dict[doctype] = 1 11 | g.write(str(type_dict)) -------------------------------------------------------------------------------- /04.generate_knowledge_graph/20.paperabstracts.py: -------------------------------------------------------------------------------- 1 | with open("01.field_of_study_classification/00.paper_abstracts.txt", "r") as f: 2 | with open("20.PaperAbstracts.nt", "w") as g: 3 | for line in f: 4 | PaperId, PaperAbstract = line.strip("\n").split("\t") 5 | g.write(f' "{PaperAbstract}"^^ .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/22.paperfieldsofstudynew.py: -------------------------------------------------------------------------------- 1 | with open("01.field_of_study_classification/10.paperid_with_fos.txt", "r") as f: 2 | with open("22.PaperFieldsOfStudyNew.nt", "w") as g: 3 | for line in f: 4 | PaperId, FieldOfStudy = line.strip("\n").split("\t") 5 | g.write(f' .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/06.paperauthoraffiliations.py: -------------------------------------------------------------------------------- 1 | with open("PaperAuthorAffiliations.txt", "r") as f: 2 | with open("06.PaperAuthorAffiliations.nt", "w") as g: 3 | for line in f: 4 | PaperId = line.split("\t")[0] 5 | AuthorId = line.split("\t")[1] 6 | g.write(f' .\n') 7 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/08.paperreferences.py: -------------------------------------------------------------------------------- 1 | with open("PaperReferences.txt", "r") as f: 2 | with open("08.PaperReferences.nt", "w") as g: 3 | for line in f: 4 | PaperId = line.split("\t")[0].strip() 5 | PaperReferenceId = line.split("\t")[1].strip() 6 | g.write(f' .\n') 7 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/21.papertag.py: -------------------------------------------------------------------------------- 1 | with open("01.field_of_study_classification/11.paper_keywords.txt", "r") as f: 2 | with open("21.PaperTags.nt", "w") as g: 3 | for line in f: 4 | PaperId = line.split("\t")[0] 5 | PaperTag = line.strip("\n").split("\t")[3] 6 | g.write(f' "{PaperTag}"^^ .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/12.entityrelatedentities.py: -------------------------------------------------------------------------------- 1 | with open("EntityRelatedEntities.txt", "r") as f: 2 | with open("12.EntityRelatedEntities.nt", "w") as g: 3 | for line in f: 4 | EntityId = line.split("\t")[0] 5 | RelatedEntityId = line.split("\t")[2] 6 | g.write(f' .\n') 7 | 
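The conversion scripts in 04.generate_knowledge_graph all follow the same pattern: stream one tab-separated MAG dump file and emit one or more N-Triples statements per row. A minimal sketch of that pattern is shown below; the namespace constants and property names are illustrative assumptions only (the repository's actual vocabulary is defined in its OWL file), and string literals such as abstracts or tags need their quotes and backslashes escaped to remain valid N-Triples.

# Sketch of the shared TSV-to-N-Triples pattern; the URIs below are placeholders, not the project's.
EX_ENTITY = "http://example.org/entity/"      # hypothetical entity namespace
EX_PROPERTY = "http://example.org/property/"  # hypothetical property namespace
XSD_STRING = "http://www.w3.org/2001/XMLSchema#string"

def escape_literal(value: str) -> str:
    # Backslashes and double quotes must be escaped inside N-Triples string literals.
    return value.replace("\\", "\\\\").replace('"', '\\"')

with open("PaperFieldsOfStudy.txt", "r") as f, open("example_output.nt", "w") as g:
    for line in f:
        PaperId, FieldOfStudyId, Score = line.rstrip("\n").split("\t")
        g.write(f"<{EX_ENTITY}{PaperId}> <{EX_PROPERTY}hasFieldOfStudy> <{EX_ENTITY}{FieldOfStudyId}> .\n")
        if Score != "":
            g.write(f'<{EX_ENTITY}{PaperId}> <{EX_PROPERTY}fieldOfStudyScore> "{escape_literal(Score)}"^^<{XSD_STRING}> .\n')
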
-------------------------------------------------------------------------------- /01.field_of_study_classification/02.extract_labels.py: -------------------------------------------------------------------------------- 1 | #Add path to FieldsOfStudy.txt 2 | with open("FieldsOfStudy.txt", "r") as f: 3 | with open("02.labels.txt", "w") as g: 4 | index = 0 5 | for line in f: 6 | id = line.split("\t")[0] 7 | name = line.split("\t")[3] 8 | level = line.split("\t")[5] 9 | if level == "0": 10 | g.write(f"{id}\t{name}\t{index}\n") 11 | index += 1 -------------------------------------------------------------------------------- /03.statistical_analysis/01.paper_by_year.py: -------------------------------------------------------------------------------- 1 | year_dict = {} 2 | 3 | #Add file path to Papers.txt 4 | with open("Papers.txt", "r") as inp: 5 | for line in inp: 6 | year = line.split("\t")[7] 7 | try: 8 | year_dict[year] += 1 9 | except KeyError: 10 | year_dict[year] = 1 11 | with open("01.paper_year_distribution.txt", "w") as outp: 12 | for item in year_dict: 13 | outp.write(f"{item}\t{year_dict[item]}\n") 14 | -------------------------------------------------------------------------------- /01.field_of_study_classification/03.generate_low_level_fos.py: -------------------------------------------------------------------------------- 1 | fos_set = set() 2 | with open("02.labels.txt", "r") as f: 3 | for line in f: 4 | fos_set.add(line.split("\t")[0]) 5 | 6 | #Add path to sorted PaperFieldsOfStudy.txt 7 | with open("SortedPaperFieldsOfStudy.txt", "r") as f: 8 | with open("03.papers_with_direct_labels.txt", "w") as g: 9 | for line in f: 10 | if line.split("\t")[1] in fos_set: 11 | g.write(line) 12 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/13.fieldofstudychildren.py: -------------------------------------------------------------------------------- 1 | with open("FieldOfStudyChildren.txt", "r") as f: 2 | with open("13.FieldOfStudyChildren.nt", "w") as g: 3 | for line in f: 4 | FieldOfStudyId = line.split("\t")[0] 5 | ChildFieldOfStudyId = line.split("\t")[1].strip() 6 | g.write(f' .\n') 7 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/24.paperauthoraffiliations_disambiguated.py: -------------------------------------------------------------------------------- 1 | with open("00.entity_resolution/14.PaperAuthorAffiliations_new.txt", "r") as f: 2 | with open("24.PaperAuthorAffiliations_disambiguated.nt", "w") as g: 3 | for line in f: 4 | PaperId = line.split("\t")[0] 5 | AuthorId = line.split("\t")[1] 6 | g.write(f' .\n') 7 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/25.authororcid.py: -------------------------------------------------------------------------------- 1 | with open("00.entity_resolution/18.authors_with_orcid.txt", "r") as f: 2 | with open("25.AuthorORCID.nt", "w") as g: 3 | for line in f: 4 | AuthorId = line.split("\t")[0] 5 | ORCID = line.strip("\n").split("\t")[17] 6 | if not ORCID == "": 7 | g.write(f' <{ORCID}>^^ .\n') 8 | -------------------------------------------------------------------------------- /02.knowledge_graph_embeddings/04.train_embedding.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DGLBACKEND=pytorch dglke_train --model_name TransE_l2 --data_path 02.knowledge_graph_embeddings --dataset mag_author --data_files 
01.author_entities.dict 01.author_relations.dict 01.author_train.tsv 01.author_valid.tsv 01.author_test.tsv --format udd_hrt --batch_size 1000 --neg_sample_size 1000 --hidden_dim 100 --gamma 19.9 --lr 0.25 --max_step 1000000 --log_interval 100 --batch_size_eval 1000 --neg_sample_size_eval 1000 -adv --regularization_coef 1.00E-09 --gpu 0 1 2 3 4 5 6 7 --valid --test --mix_cpu_gpu 3 | -------------------------------------------------------------------------------- /02.knowledge_graph_embeddings/00.prepare_author_input_graph.py: -------------------------------------------------------------------------------- 1 | pred_list = ["", "", "", "", ""] 2 | 3 | #Add file path to Authors.nt 4 | with open("00.authors_input.txt", "w") as g: 5 | with open("Authors.nt", "r") as f: 6 | for line in f: 7 | pred = line.split(" ")[1] 8 | if pred in pred_list: 9 | g.write(line) 10 | 11 | -------------------------------------------------------------------------------- /00.entity_resolution/02.extract_author_with_paper_id.py: -------------------------------------------------------------------------------- 1 | line_count = 0 2 | 3 | print("Starting...") 4 | 5 | #Add path to PaperAuthorAffiliations.txt 6 | with open("PaperAuthorAffiliations.txt", "r") as inp: 7 | with open("02.author_id_with_paper_id.txt", "w") as outp: 8 | for line in inp: 9 | line_count += 1 10 | 11 | paper_id = line.split("\t")[0].strip() 12 | author_id = line.split("\t")[1].strip() 13 | outp.write(author_id + "\t" + paper_id + "\n") 14 | 15 | print(line_count) 16 | 17 | print("Finished.") 18 | -------------------------------------------------------------------------------- /00.entity_resolution/03.extract_paper_with_author_id.py: -------------------------------------------------------------------------------- 1 | line_count = 0 2 | 3 | print("Starting...") 4 | 5 | #Add path to PaperAuthorAffiliations.txt 6 | with open("PaperAuthorAffiliations.txt", "r") as inp: 7 | with open("03.paper_id_with_author_id.txt", "w") as outp: 8 | for line in inp: 9 | line_count += 1 10 | 11 | paper_id = line.split("\t")[0].strip() 12 | author_id = line.split("\t")[1].strip() 13 | outp.write(paper_id + "\t" + author_id + "\n") 14 | 15 | print(line_count) 16 | 17 | print("Finished.") 18 | -------------------------------------------------------------------------------- /01.field_of_study_classification/05.generate_journal_fos_3.py: -------------------------------------------------------------------------------- 1 | paper_label_dict = {} 2 | with open("05.paper_journal_labels.txt", "r") as f: 3 | for line in f: 4 | paper_label_dict[line.split("\t")[0].strip()] = line.split("\t")[1].strip() 5 | 6 | with open("00.paper_abstracts.txt", "r") as f: 7 | with open("05.papers_with_journal_labels.txt", "w") as g: 8 | for line in f: 9 | items = line.strip().split("\t") 10 | if items[0] in paper_label_dict: 11 | g.write(items[0] + "\t" + items[1] + "\t" + paper_label_dict[items[0]] + "\n") 12 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/14.fieldofstudyextendedattributes.py: -------------------------------------------------------------------------------- 1 | with open("FieldOfStudyExtendedAttributes.txt", "r") as f: 2 | with open("14.FieldOfStudyExtendedAttributes.nt", "w") as g: 3 | for line in f: 4 | FieldOfStudyId, AttributeType, AttributeValue = line.strip("\n").split("\t") 5 | g.write(f' "{AttributeType}"^^ .\n') 6 | g.write(f' "{AttributeValue}"^^ .\n') 
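The dglke_train call in 02.knowledge_graph_embeddings/04.train_embedding.sh expects a user-defined dataset (--format udd_hrt): an entity dictionary, a relation dictionary, and train/valid/test files whose columns are head, relation, tail. Below is a rough sketch of how such inputs could be derived from the filtered triples in 00.authors_input.txt; the 90/5/5 split and the dictionary layout are assumptions made here, and the repository's 01.prepare_data_authors.py / 03.prepare_data_papers.py may do this differently.

import random

# Sketch: turn filtered N-Triples into DGL-KE "udd_hrt" inputs (head \t relation \t tail).
# Split ratios and the "<id>\t<name>" dictionary layout are assumptions.
entities, relations, triples = {}, {}, []

with open("00.authors_input.txt", "r") as f:
    for line in f:
        parts = line.rstrip(" .\n").split(" ", 2)
        if len(parts) != 3:
            continue
        head, rel, tail = parts
        entities.setdefault(head, len(entities))
        entities.setdefault(tail, len(entities))
        relations.setdefault(rel, len(relations))
        triples.append((head, rel, tail))

random.shuffle(triples)
n_valid = n_test = len(triples) // 20  # assumed 90/5/5 split
valid = triples[:n_valid]
test = triples[n_valid:n_valid + n_test]
train = triples[n_valid + n_test:]

with open("01.author_entities.dict", "w") as f:
    for name, idx in entities.items():
        f.write(f"{idx}\t{name}\n")
with open("01.author_relations.dict", "w") as f:
    for name, idx in relations.items():
        f.write(f"{idx}\t{name}\n")
for filename, split in [("01.author_train.tsv", train), ("01.author_valid.tsv", valid), ("01.author_test.tsv", test)]:
    with open(filename, "w") as f:
        for h, r, t in split:
            f.write(f"{h}\t{r}\t{t}\n")
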
-------------------------------------------------------------------------------- /01.field_of_study_classification/05.generate_journal_fos_1.py: -------------------------------------------------------------------------------- 1 | labels_mapping = {} 2 | with open("02.labels.txt", "r") as f: 3 | for line in f: 4 | labels_mapping[line.split("\t")[1].strip().lower()] = line.split("\t")[2].strip() 5 | 6 | labels_list = [*labels_mapping.keys()] 7 | 8 | #Add path to Journals.txt 9 | with open("Journals.txt", "r") as f: 10 | with open("05.journals_label.txt", "w") as g: 11 | for line in f: 12 | journal = line.split("\t")[3].strip().lower() 13 | for label in labels_list: 14 | if " " + label + " " in journal or " " + label + "s " in journal: 15 | g.write(line.strip() + "\t" + labels_mapping[label] + "\n") 16 | -------------------------------------------------------------------------------- /01.field_of_study_classification/05.generate_journal_fos_2.py: -------------------------------------------------------------------------------- 1 | journal_labels = {} 2 | with open("05.journals_label.txt", "r") as f: 3 | for line in f: 4 | journal_labels[line.split("\t")[0].strip()] = line.split("\t")[10].strip() 5 | 6 | #Add path to Papers.txt 7 | line_count = 1 8 | with open("Papers.txt", "r") as f: 9 | with open("05.paper_journal_labels.txt", "w") as g: 10 | for line in f: 11 | print(line_count) 12 | paper_id = line.split("\t")[0].strip() 13 | journal_id = line.split("\t")[10].strip() 14 | if journal_id in journal_labels: 15 | g.write(paper_id + "\t" + journal_labels[journal_id] + "\n") 16 | line_count += 1 17 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/07.paperextendedattributes.py: -------------------------------------------------------------------------------- 1 | with open("PaperExtendedAttributes.txt", "r") as f: 2 | with open("07.PaperExtendedAttributes.nt", "w") as g: 3 | for line in f: 4 | PaperId, AttributeType, AttributeValue = line.strip("\n").split("\t") 5 | if not AttributeType == "": 6 | g.write(f' "{AttributeType}"^^ .\n') 7 | if not AttributeValue == "": 8 | g.write(f' "{AttributeValue}"^^ .\n') 9 | -------------------------------------------------------------------------------- /01.field_of_study_classification/10.assign_fos_to_paper.py: -------------------------------------------------------------------------------- 1 | def label_fos(input_file, output_file): 2 | with open("02.labels.txt", "r") as f: 3 | label_dict = {} 4 | for line in f: 5 | label_dict[line.strip().split("\t")[2]] = line.split("\t")[0] 6 | with open(f"{input_file}.txt", "r") as f: 7 | label_list = [] 8 | for line in f: 9 | label_list.append(line.strip()) 10 | with open(output_file, "a") as f: 11 | with open(input_file, "r") as g: 12 | for index, line in enumerate(g): 13 | paperid = line.split("\t")[0] 14 | f.write(f"{paperid}\t{label_dict[label_list[index]]}\n") 15 | 16 | f = open("10.paperid_with_fos.txt", "w") 17 | f.close() 18 | 19 | label_fos("01.tokenized_abstracts.txt", "10.paperid_with_fos.txt") 20 | -------------------------------------------------------------------------------- /00.entity_resolution/01.extract_paper_id_with_doi.py: -------------------------------------------------------------------------------- 1 | list_of_paper_id = [] 2 | line_count = 0 3 | 4 | print("Starting...") 5 | 6 | #Add path to Papers.txt 7 | with open("Papers.txt", "r") as inp: 8 | for line in inp: 9 | line_count += 1 10 | 11 | paper_id = line.split("\t")[0].strip() 12 
| doi = line.split("\t")[2].strip() 13 | if not doi == "": 14 | list_of_paper_id.append((int(paper_id), doi)) 15 | 16 | print(line_count) 17 | 18 | print("Start sorting...") 19 | list_of_paper_id.sort(key=lambda tup: tup[0]) 20 | print("Finished sorting.") 21 | 22 | print("Start writing...") 23 | with open("01.paper_id_with_doi_sorted.txt", "w") as outp: 24 | for item in list_of_paper_id: 25 | outp.write(str(item[0]) + "\t" + item[1] + "\n") 26 | 27 | print("Finished.") 28 | -------------------------------------------------------------------------------- /03.statistical_analysis/08.author_activity.py: -------------------------------------------------------------------------------- 1 | years_dict = {} 2 | for year in range(1800, 2022): 3 | years_dict[year] = set() 4 | 5 | with open("00.entity_resolution/10.authors_with_year.txt", "r") as f: 6 | for line in f: 7 | authorid = line.split("\t")[0] 8 | years = line.split("\t")[-1].strip() 9 | if not years == "": 10 | f1 = lambda x: x 11 | f2 = lambda x: max(x-1, 1800) 12 | f3 = lambda x: min(x+1, 2021) 13 | years_list = list(map(int, years.split(","))) 14 | active_years = [f(year) for year in years_list for f in (f1, f2, f3)] 15 | for year in active_years: 16 | years_dict[year].add(authorid) 17 | with open("08.author_activity.txt", "w") as f: 18 | for item in years_dict: 19 | f.write(f"{item}\t{len(years_dict[item])}\n") 20 | -------------------------------------------------------------------------------- /03.statistical_analysis/03.field_of_study_over_time.py: -------------------------------------------------------------------------------- 1 | with open("01.field_of_study_classification/03.papers_with_direct_labels.txt", "r") as f: 2 | fos_dict = {} 3 | for line in f: 4 | paperid = line.split("\t")[0] 5 | fos = line.split("\t")[1] 6 | fos_dict[paperid] = fos 7 | 8 | with open("01.field_of_study_classification/02.labels.txt", "r") as f: 9 | labels = set() 10 | for line in f: 11 | labels.add(line.split("\t")[0]) 12 | 13 | for item in labels: 14 | f = open(f"03.field_of_study_over_time/{item}.txt", "w") 15 | f.close() 16 | 17 | #Add file path for Papers.txt 18 | with open("Papers.txt", "r") as f: 19 | for line in f: 20 | paperid = line.split("\t")[0] 21 | try: 22 | with open(f"03.field_of_study_over_time/{fos_dict[paperid]}.txt", "a") as g: 23 | g.write(line) 24 | except KeyError: 25 | pass 26 | 27 | 28 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/09.paperresources.py: -------------------------------------------------------------------------------- 1 | with open("PaperResources.txt", "r") as f: 2 | with open("09.PaperResources.nt", "w") as g: 3 | for line in f: 4 | PaperId, ResourceType, ResourceUrl, SourceUrl, RelationshipType = line.strip("\n").split("\t") 5 | if not ResourceType == "": 6 | g.write(f' "{ResourceType}"^^ .\n') 7 | if not ResourceUrl == "": 8 | g.write(f' "{ResourceUrl}"^^ .\n') 9 | if not RelationshipType == "": 10 | g.write(f' "{RelationshipType}"^^ .\n') -------------------------------------------------------------------------------- /03.statistical_analysis/05.paper_by_year.py: -------------------------------------------------------------------------------- 1 | def by_year(input_file): 2 | year_dict = {} 3 | with open(f"{input_file}.txt", "r") as inp: 4 | for line in inp: 5 | year = line.split("\t")[7] 6 | try: 7 | year_dict[year] += 1 8 | except KeyError: 9 | year_dict[year] = 1 10 | with open(f"{input_file}_output.txt", "w") as outp: 11 | for item in 
year_dict: 12 | outp.write(f"{item}\t{year_dict[item]}\n") 13 | 14 | by_year(121332964) 15 | by_year(138885662) 16 | by_year(144133560) 17 | by_year(17744445) 18 | by_year(205649164) 19 | by_year(41008148) 20 | by_year(95457728) 21 | by_year(127313418) 22 | by_year(142362112) 23 | by_year(15744967) 24 | by_year(185592680) 25 | by_year(33923547) 26 | by_year(71924100) 27 | by_year(127413603) 28 | by_year(144024400) 29 | by_year(162324750) 30 | by_year(192562407) 31 | by_year(39432304) 32 | by_year(86803240) 33 | -------------------------------------------------------------------------------- /00.entity_resolution/16.sort_doi.py: -------------------------------------------------------------------------------- 1 | list_of_doi_orcid = [] 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("15.orcid_title_doi.txt", "r") as inp: 7 | for line in inp: 8 | print(line_count) 9 | 10 | try: 11 | orcid, name, title, doi = map(str.strip, line.split("\t")) 12 | if not doi == "" and not name == "": 13 | list_of_doi_orcid.append((doi.replace("(", "").replace(")", "").replace("http://dx.doi.org/", ""), name.strip(), orcid.strip())) 14 | except ValueError: 15 | pass 16 | 17 | line_count += 1 18 | 19 | print("Start sorting...") 20 | list_of_doi_orcid.sort(key=lambda tup: tup[0]) 21 | print("Finished sorting.") 22 | 23 | with open("16.doi_orcid_sorted.txt", "w") as outp: 24 | for item in list_of_doi_orcid: 25 | for name in item[1].split(";"): 26 | outp.write(item[0] + "\t" + name + "\t" + item[2] + "\n") 27 | 28 | print("Finished.") 29 | -------------------------------------------------------------------------------- /03.statistical_analysis/10.number_of_authors_over_time.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | 3 | with open("00.entity_resolution/05.paper_id_with_merged_author_id.txt", "r") as f: 4 | paper_dict = {} 5 | for line in f: 6 | paperid = line.split("\t")[0] 7 | authors = len(line.strip().split("\t")[1].split(",")) 8 | paper_dict[paperid] = authors 9 | 10 | 11 | #Add file path for Papers.txt 12 | with open("Papers.txt", "r") as f: 13 | year_dict = {year: [] for year in range(1800, 2022)} 14 | for line in f: 15 | paperid = line.split("\t")[0] 16 | year = line.split("\t")[7] 17 | if not year == "": 18 | try: 19 | year_dict[int(year)].append(paper_dict[paperid]) 20 | except KeyError: 21 | pass 22 | 23 | with open("10.author_number_by_year.txt", "w") as f: 24 | for item in year_dict: 25 | f.write(f"{item}\t{statistics.mean(year_dict[item])}\t{max(year_dict[item])}\n") 26 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/19.papercitationcontexts.py: -------------------------------------------------------------------------------- 1 | with open("PaperCitationContexts.txt", "r") as f: 2 | with open("19.PaperCitationContexts.nt", "w") as g: 3 | for line in f: 4 | PaperId, PaperReferenceId, CitationContext = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | g.write(f' .\n') 7 | g.write(f' .\n') 8 | g.write(f' "{CitationContext}"^^ .\n') -------------------------------------------------------------------------------- /01.field_of_study_classification/01.tokenize_abstracts.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | from nltk.stem import PorterStemmer 4 | from nltk.tokenize import word_tokenize 5 | import string 6 | 7 | nltk.download('stopwords') 8 | 
nltk.download('punkt') 9 | 10 | stop_words = set(stopwords.words('english')) 11 | 12 | line_count = 1 13 | 14 | with open("00.paper_abstracts.txt", "r") as inp: 15 | with open("01.tokenized_abstracts.txt", "w") as outp: 16 | for line in inp: 17 | print(line_count) 18 | abstract = line.split("\t")[1].strip() 19 | temp_abstract = abstract.translate(str.maketrans('', '', string.punctuation)) 20 | word_tokens = word_tokenize(temp_abstract) 21 | ps = PorterStemmer() 22 | tokenized_abstract = [ps.stem(w) for w in word_tokens if not w in stop_words] 23 | outp.write(line.split("\t")[0] + "\t" + " ".join(tokenized_abstract) + "\n") 24 | line_count += 1 25 | -------------------------------------------------------------------------------- /00.entity_resolution/06.add_to_authors_paper_id.py: -------------------------------------------------------------------------------- 1 | dict_of_paperids = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("04.author_id_with_merged_paper_id.txt", "r") as inp: 7 | for line in inp: 8 | print("Loading: " + str(line_count)) 9 | 10 | author_id = line.split("\t")[0].strip() 11 | paper_ids = line.split("\t")[1].strip() 12 | dict_of_paperids[author_id] = paper_ids 13 | 14 | line_count += 1 15 | 16 | line_count = 1 17 | 18 | #Add path to Authors.txt 19 | with open("00.Authors.txt", "r") as inp: 20 | with open("06.authors_with_paper_id.txt", "w") as outp: 21 | for line in inp: 22 | print("Searching: " + str(line_count)) 23 | 24 | current_author = line.split("\t")[0].strip() 25 | try: 26 | outp.write(line.strip("\n") + "\t" + dict_of_paperids[current_author] + "\n") 27 | except KeyError: 28 | outp.write(line.strip("\n") + "\t\n") 29 | 30 | line_count += 1 31 | 32 | print("Finished.") 33 | -------------------------------------------------------------------------------- /01.field_of_study_classification/11.keyword_extraction.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import pytextrank 3 | import multiprocessing 4 | 5 | nlp = spacy.load("en_core_web_sm") 6 | tr = pytextrank.TextRank() 7 | nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True) 8 | 9 | def extraction(item): 10 | output = "" 11 | paperid = item[0] 12 | text = item[1] 13 | doc = nlp(text) 14 | for p in doc._.phrases[:5]: 15 | output += f"{paperid}\t{p.rank:.4f}\t{p.count:5d}\t{p.text}\n" 16 | return output 17 | 18 | line_count = 1 19 | with open(f"00.paper_abstracts.txt", "r") as f: 20 | abstracts = [] 21 | for line in f: 22 | print("Loading: " + str(line_count)) 23 | paperid = line.strip().split("\t")[0] 24 | text = line.strip().replace(paperid, "").replace("\t", " ") 25 | abstracts.append((paperid, text)) 26 | line_count += 1 27 | 28 | with open(f"11.paper_keywords.txt", "w") as f: 29 | p = multiprocessing.Pool(8) 30 | for result in p.imap(extraction, abstracts): 31 | f.write(result) 32 | 33 | -------------------------------------------------------------------------------- /03.statistical_analysis/04.field_of_study_over_time_custom.py: -------------------------------------------------------------------------------- 1 | with open("/pfs/work7/workspace/scratch/utdkf-ws_lin-0/3.field_of_study/3.mag_journal_labels/bert/0.data/paperid_with_fos.txt", "r") as f: 2 | fos_dict = {} 3 | for line in f: 4 | paperid = line.split("\t")[0] 5 | fos = line.split("\t")[1].strip() 6 | fos_dict[paperid] = fos 7 | 8 | with open("/pfs/work7/workspace/scratch/utdkf-ws_lin-0/3.field_of_study/3.mag_journal_labels/00.labels.txt", "r") as f: 9 | labels 
= set() 10 | for line in f: 11 | labels.add(line.split("\t")[0]) 12 | 13 | for item in labels: 14 | f = open(f"04.field_of_study_over_time/{item}.txt", "w") 15 | f.close() 16 | 17 | with open("/pfs/work7/workspace/scratch/utdkf-ws_lin-0/0.data/0.mag_20200619/mag/Papers.txt", "r") as f: 18 | for line in f: 19 | paperid = line.split("\t")[0] 20 | try: 21 | with open(f"04.field_of_study_over_time/{fos_dict[paperid]}.txt", "a") as g: 22 | g.write(line) 23 | except KeyError: 24 | pass 25 | 26 | 27 | -------------------------------------------------------------------------------- /03.statistical_analysis/02.average_author_coauthor.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | 3 | with open("00.entity_resolution/05.paper_id_with_merged_author_id.txt", "r") as f: 4 | author_count = [len(line.strip().split("\t")[1].split(",")) for line in f] 5 | with open("00.entity_resolution/04.author_id_with_merged_paper_id.txt", "r") as f: 6 | paper_count = [len(line.strip().split("\t")[1].split(",")) for line in f] 7 | with open("00.entity_resolution/08.authors_with_co_authors.txt", "r") as f: 8 | coauthor_count = [len(line.strip("\n").split("\t")[11].split(",")) for line in f] 9 | 10 | with open("02.author_paper_average.txt", "w") as f: 11 | f.write(f"Average author per paper: {statistics.mean(author_count)}\n") 12 | f.write(f"Maximum author per paper: {max(author_count)}\n") 13 | f.write(f"Average paper per author: {statistics.mean(paper_count)}\n") 14 | f.write(f"Maximum paper per author: {max(paper_count)}\n") 15 | f.write(f"Average coauthor per author: {statistics.mean(coauthor_count)}\n") 16 | f.write(f"Maximum coauthor per author: {max(coauthor_count)}\n") 17 | -------------------------------------------------------------------------------- /00.entity_resolution/14.recreate_files.py: -------------------------------------------------------------------------------- 1 | id_mapping = {} 2 | with open("13.all_positives.txt", "r") as f: 3 | for line in f: 4 | id_mapping[line.split("\t")[0].strip()] = line.split("\t")[1].strip() 5 | line_count = 1 6 | 7 | #Add path to PaperAuthorAffiliations.txt 8 | with open("PaperAuthorAffiliations.txt", "r") as f: 9 | with open("14.PaperAuthorAffiliations_new.txt", "w") as g: 10 | for line in f: 11 | print("PaperAuthorAffiliations: " + str(line_count)) 12 | author_id = line.split("\t")[1].strip() 13 | if author_id in id_mapping: 14 | g.write(line.replace(author_id, id_mapping[author_id])) 15 | else: 16 | g.write(line) 17 | line_count += 1 18 | line_count = 1 19 | with open("13.disambiguated_file.txt", "r") as f: 20 | with open("14.Authors_new.txt", "w") as g: 21 | for line in f: 22 | print("Authors: " + str(line_count)) 23 | items = line.strip().split("\t") 24 | g.write("\t".join(items[0:9]) + "\n") 25 | line_count += 1 26 | -------------------------------------------------------------------------------- /01.field_of_study_classification/07.generate_training_evaluation_data_sets.py: -------------------------------------------------------------------------------- 1 | with open("05.papers_with_journal_labels.txt", "r") as f: 2 | with open("train.csv", "w") as g: 3 | with open("eval.csv", "w") as h: 4 | fos_dict = {} 5 | line_count = 1 6 | for line in f: 7 | print(line_count) 8 | fos = line.split("\t")[2].strip() 9 | try: 10 | if fos_dict[fos] < 2000: 11 | h.write(line.split("\t")[1] + " ," + fos + "\n") 12 | fos_dict[fos] += 1 13 | elif fos_dict[fos] < 22000: 14 | g.write(line.split("\t")[1] + " ," + fos + "\n") 
15 | fos_dict[fos] += 1 16 | else: 17 | pass 18 | except KeyError: 19 | h.write(line.split("\t")[1] + " ," + fos + "\n") 20 | fos_dict[fos] = 1 21 | line_count += 1 22 | for item in fos_dict: 23 | print(str(item) + "\t" + str(fos_dict[item]-2000)) 24 | -------------------------------------------------------------------------------- /00.entity_resolution/10.add_to_authors_year.py: -------------------------------------------------------------------------------- 1 | dict_of_years = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | #Add path to Papers.txt 7 | with open("Papers.txt", "r") as inp: 8 | for line in inp: 9 | print("Loading: " + str(line_count)) 10 | 11 | paper_id = line.split("\t")[0].strip() 12 | year = line.split("\t")[7].strip() 13 | dict_of_years[paper_id] = year 14 | 15 | line_count += 1 16 | 17 | line_count = 1 18 | 19 | with open("09.authors_with_titles.txt", "r") as inp: 20 | with open("10.authors_with_year.txt", "w") as outp: 21 | for line in inp: 22 | print("Searching: " + str(line_count)) 23 | 24 | paper_ids = line.split("\t")[9].strip().split(",") 25 | years = set() 26 | for paper_id in paper_ids: 27 | try: 28 | years.add(dict_of_years[paper_id]) 29 | except KeyError: 30 | pass 31 | outp.write(line.strip("\n") + "\t" + ",".join(years) + "\n") 32 | 33 | line_count += 1 34 | 35 | print("Finished.") 36 | 37 | -------------------------------------------------------------------------------- /00.entity_resolution/07.add_to_authors_doi.py: -------------------------------------------------------------------------------- 1 | dict_of_dois = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("01.paper_id_with_doi_sorted.txt", "r") as inp: 7 | for line in inp: 8 | print("Loading: " + str(line_count)) 9 | 10 | paper_id = line.split("\t")[0].strip() 11 | doi = line.split("\t")[1].strip() 12 | dict_of_dois[paper_id] = doi 13 | 14 | line_count += 1 15 | 16 | line_count = 1 17 | 18 | with open("06.authors_with_paper_id.txt", "r") as inp: 19 | with open("07.authors_with_paper_doi.txt", "w") as outp: 20 | for line in inp: 21 | print("Searching: " + str(line_count)) 22 | 23 | paper_ids = line.split("\t")[9].strip() 24 | current_dois = [] 25 | for current_id in paper_ids.split(","): 26 | try: 27 | current_dois.append(dict_of_dois[current_id]) 28 | except KeyError: 29 | pass 30 | outp.write(line.strip("\n") + "\t" + ",".join(current_dois) + "\n") 31 | 32 | line_count += 1 33 | 34 | print("Finished.") 35 | -------------------------------------------------------------------------------- /00.entity_resolution/17.doi_merge_orcid.py: -------------------------------------------------------------------------------- 1 | line_count = 1 2 | 3 | print("Starting...") 4 | 5 | with open("16.doi_orcid_sorted.txt", "r") as inp: 6 | with open("17.doi_with_merged_orcid.txt", "w") as outp: 7 | current_doi = "" 8 | current_orcids = "" 9 | for line in inp: 10 | print(line_count) 11 | 12 | doi = line.split("\t")[0].strip() 13 | name = line.split("\t")[1].strip() 14 | orcid = line.split("\t")[2].strip() 15 | 16 | if not name.strip() == "": 17 | if doi == current_doi: 18 | current_orcids += (";" + (name + "," + orcid)) 19 | elif current_doi == "": 20 | current_doi = doi 21 | current_orcids = name + "," + orcid 22 | else: 23 | outp.write(current_doi + "\t" + current_orcids + "\n") 24 | current_doi = doi 25 | current_orcids = name + "," + orcid 26 | 27 | line_count += 1 28 | 29 | outp.write(current_doi + "\t" + current_orcids) 30 | 31 | print("Finished.") 32 | 
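Scripts 06 through 12 of the entity-resolution pipeline enrich the author table in the same way: load a paper-keyed lookup into memory, stream the current authors file, and append one more tab-separated column. A generic version of that step could look like the sketch below; the helper name and its defaults are illustrative, and the repository itself uses one dedicated script per column.

def append_column(authors_in, authors_out, lookup_file, key_index=9, sep=","):
    # Hypothetical helper generalizing the add_to_authors_* scripts: each author row
    # carries a comma-separated list of paper ids in column key_index; paper ids
    # missing from the lookup simply contribute nothing.
    lookup = {}
    with open(lookup_file, "r") as f:
        for line in f:
            parts = line.rstrip("\n").split("\t")
            if len(parts) >= 2:
                lookup[parts[0]] = parts[1]
    with open(authors_in, "r") as inp, open(authors_out, "w") as outp:
        for line in inp:
            keys = line.split("\t")[key_index].strip().split(",")
            values = [lookup[k] for k in keys if k in lookup]
            outp.write(line.rstrip("\n") + "\t" + sep.join(values) + "\n")

# The DOI step (07.add_to_authors_doi.py) is roughly equivalent to:
# append_column("06.authors_with_paper_id.txt", "07.authors_with_paper_doi.txt",
#               "01.paper_id_with_doi_sorted.txt")
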
-------------------------------------------------------------------------------- /01.field_of_study_classification/09.classification.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from simpletransformers.classification import ClassificationModel 3 | import os 4 | import pickle 5 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 6 | 7 | print("Loading model...") 8 | model = ClassificationModel('bert', 'scibert_20000_input_2_epoch/outputs/best_model', num_labels=19, use_cuda=True, args={"fp16": False, "n_gpu": 4, "eval_batch_size": 4096}) 9 | print("Finished loading model...") 10 | 11 | def classify(inp, model): 12 | print(f"Loading input {inp}...") 13 | with open(f"{inp}", "r") as f: 14 | input_list = [] 15 | for line in f: 16 | input_list.append(line.split(",")[0]) 17 | print("Finished loading input.") 18 | print(f"Classification in progress for {inp}...") 19 | with open(f"{inp}labeled.txt", "w") as g: 20 | predictions, raw_outputs = model.predict(input_list) 21 | for item in predictions: 22 | g.write(f"{item}\n") 23 | print("Finished classification") 24 | 25 | #If necessary, split the input file into multiple parts and classify in sequence 26 | classify("01.tokenized_abstracts.txt", model) 27 | -------------------------------------------------------------------------------- /00.entity_resolution/05.paper_id_merge_author_ids.py: -------------------------------------------------------------------------------- 1 | line_count = 1 2 | 3 | print("Starting...") 4 | 5 | with open("03.paper_id_with_author_id_sorted.txt", "r") as inp: 6 | with open("05.paper_id_with_merged_author_id.txt", "w") as outp: 7 | current_paper_id = "" 8 | current_authors = "" 9 | 10 | for line in inp: 11 | print(line_count) 12 | 13 | paper_id = line.split("\t")[0].strip() 14 | author_id = line.split("\t")[1].strip() 15 | 16 | if paper_id == current_paper_id: 17 | current_authors += (";" + author_id) 18 | elif current_paper_id == "": 19 | current_paper_id = paper_id 20 | current_authors = author_id 21 | else: 22 | list_of_authors = ",".join(current_authors.split(";")).strip(",") 23 | outp.write(current_paper_id + "\t" + list_of_authors + "\n") 24 | current_paper_id = paper_id 25 | current_authors = author_id 26 | 27 | line_count += 1 28 | 29 | outp.write(current_paper_id + "\t" + list_of_authors + "\n") 30 | 31 | print("Finished.") 32 | -------------------------------------------------------------------------------- /00.entity_resolution/04.author_id_merge_paper_id.py: -------------------------------------------------------------------------------- 1 | line_count = 1 2 | 3 | print("Starting...") 4 | 5 | with open("02.author_id_with_paper_id_sorted.txt", "r") as inp: 6 | with open("04.author_id_with_merged_paper_id.txt", "w") as outp: 7 | current_author_id = "" 8 | current_papers = "" 9 | 10 | for line in inp: 11 | print(line_count) 12 | 13 | paper_id = line.split("\t")[1].strip() 14 | author_id = line.split("\t")[0].strip() 15 | 16 | if author_id == current_author_id: 17 | current_papers += (";" + paper_id) 18 | elif current_author_id == "": 19 | current_author_id = author_id 20 | current_papers = paper_id 21 | else: 22 | list_of_papers = ",".join(current_papers.split(";")).strip(",") 23 | outp.write(current_author_id + "\t" + list_of_papers + "\n") 24 | current_author_id = author_id 25 | current_papers = paper_id 26 | 27 | line_count += 1 28 | 29 | outp.write(current_author_id + "\t" + list_of_papers + "\n") 30 | 31 | print("Finished.") 32 | 
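The merge scripts (00.prepare_paper_references.py, 04.author_id_merge_paper_id.py, 05.paper_id_merge_author_ids.py) collapse a sorted two-column file into one line per key by comparing each row with the previous one. The same single pass can be written with itertools.groupby, which only works because 00.execute.sh sorts the input on the key column first; this is an equivalent sketch, not the code the pipeline actually runs.

from itertools import groupby

# Equivalent of 04.author_id_merge_paper_id.py: the input must already be sorted
# on the first column, otherwise groupby produces fragmented groups.
def merge_sorted_pairs(infile, outfile):
    with open(infile, "r") as inp, open(outfile, "w") as outp:
        rows = (line.rstrip("\n").split("\t") for line in inp)
        for key, group in groupby(rows, key=lambda row: row[0]):
            values = ",".join(row[1].strip() for row in group)
            outp.write(f"{key}\t{values}\n")

merge_sorted_pairs("02.author_id_with_paper_id_sorted.txt",
                   "04.author_id_with_merged_paper_id.txt")
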
-------------------------------------------------------------------------------- /00.entity_resolution/09.add_to_authors_titles.py: -------------------------------------------------------------------------------- 1 | dict_of_titles = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | #Add path to Papers.txt 7 | with open("Papers.txt", "r") as inp: 8 | for line in inp: 9 | print("Loading: " + str(line_count)) 10 | 11 | paper_id = line.split("\t")[0].strip() 12 | paper_title = line.split("\t")[4].strip() 13 | book_title = line.split("\t")[6].strip() 14 | dict_of_titles[paper_id] = paper_title + book_title 15 | 16 | line_count += 1 17 | 18 | line_count = 1 19 | 20 | with open("08.authors_with_co_authors.txt", "r") as inp: 21 | with open("09.authors_with_titles.txt", "w") as outp: 22 | for line in inp: 23 | print("Searching: " + str(line_count)) 24 | 25 | paper_ids = line.split("\t")[9].strip().split(",") 26 | titles = [] 27 | for paper_id in paper_ids: 28 | try: 29 | titles.append(dict_of_titles[paper_id]) 30 | except KeyError: 31 | pass 32 | outp.write(line.strip("\n") + "\t" + ",".join(titles) + "\n") 33 | 34 | line_count += 1 35 | 36 | print("Finished.") 37 | 38 | -------------------------------------------------------------------------------- /00.entity_resolution/00.prepare_paper_references.py: -------------------------------------------------------------------------------- 1 | line_count = 1 2 | 3 | print("Starting...") 4 | 5 | #Add path to PaperReferences.txt 6 | with open("PaperReferences.txt", "r") as inp: 7 | with open("00.paper_id_with_merged_references.txt", "w") as outp: 8 | current_paper_id = "" 9 | current_references = "" 10 | 11 | for line in inp: 12 | print(line_count) 13 | 14 | paper_id = line.split("\t")[0].strip() 15 | author_id = line.split("\t")[1].strip() 16 | 17 | if paper_id == current_paper_id: 18 | current_references += (";" + author_id) 19 | elif current_paper_id == "": 20 | current_paper_id = paper_id 21 | current_references = author_id 22 | else: 23 | list_of_references = ",".join(current_references.split(";")).strip(",") 24 | outp.write(current_paper_id + "\t" + list_of_references + "\n") 25 | current_paper_id = paper_id 26 | current_references = author_id 27 | 28 | line_count += 1 29 | 30 | outp.write(current_paper_id + "\t" + list_of_references + "\n") 31 | 32 | print("Finished.") 33 | -------------------------------------------------------------------------------- /00.entity_resolution/12.add_to_authors_references.py: -------------------------------------------------------------------------------- 1 | dict_of_references = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("00.paper_id_with_merged_references.txt", "r") as inp: 7 | for line in inp: 8 | print("Loading: " + str(line_count)) 9 | 10 | paper_id = line.split("\t")[0].strip() 11 | references = line.split("\t")[1].strip() 12 | dict_of_references[paper_id] = references 13 | line_count += 1 14 | 15 | line_count = 1 16 | found = 0 17 | not_found = 0 18 | 19 | with open("11.authors_with_journal_and_conference.txt", "r") as inp: 20 | with open("12.authors_with_references.txt", "w") as outp: 21 | for line in inp: 22 | print("Searching: " + str(line_count)) 23 | 24 | paper_ids = line.split("\t")[9].strip().split(",") 25 | references = set() 26 | for paper_id in paper_ids: 27 | try: 28 | references.update(dict_of_references[paper_id].split(",")) 29 | except KeyError: 30 | pass 31 | outp.write(line.strip("\n") + "\t" + ",".join(references) + "\n") 32 | 33 | line_count += 1 34 | 35 | 
print("Finished.") 36 | -------------------------------------------------------------------------------- /00.entity_resolution/00.execute.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python 00.prepare_paper_references.py 3 | python 01.extract_paper_id_with_doi.py 4 | python 02.extract_author_with_paper_id.py 5 | 6 | sort -n -t$'\t' -k1 02.author_id_with_paper_id.txt > 02.author_id_with_paper_id_sorted.txt 7 | 8 | python 03.extract_paper_with_author_id.py 9 | 10 | sort -n -t$'\t' -k1 03.paper_id_with_author_id.txt > 03.paper_id_with_author_id_sorted.txt 11 | 12 | python 04.author_id_merge_paper_id.py 13 | python 05.paper_id_merge_author_ids.py 14 | python 06.add_to_authors_paper_id.py 15 | python 07.add_to_authors_doi.py 16 | python 08.add_to_authors_coauthors.py 17 | python 09.add_to_authors_titles.py 18 | python 10.add_to_authors_year.py 19 | python 11.add_to_authors_journal_and_conference.py 20 | python 12.add_to_authors_references.py 21 | 22 | mkdir sort 23 | split -l 5000000 -d 12.authors_with_references.txt sort/sort_file 24 | cd sort 25 | for file in sort_file*; do 26 | echo $file 27 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 -o $file $file 28 | done 29 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 sort_file* > ../12.authors_with_references_sorted.txt 30 | cd .. 31 | rm -r sort 32 | 33 | python 13.disambiguation_data.py 34 | python 14.recreate_files -------------------------------------------------------------------------------- /03.statistical_analysis/07.reference_citation_by_year.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | 3 | paper_references = {} 4 | paper_citations = {} 5 | #Add file path for Papers.txt 6 | with open("Papers.txt", "r") as f: 7 | for line in f: 8 | year = line.split("\t")[7] 9 | references = int(line.split("\t")[18]) 10 | citations = int(line.split("\t")[19]) 11 | if not year == "": 12 | try: 13 | paper_references[year].append(references) 14 | except KeyError: 15 | paper_references[year] = [references] 16 | try: 17 | paper_citations[year].append(citations) 18 | except KeyError: 19 | paper_citations[year] = [citations] 20 | 21 | with open("07.paper_references_by_year.txt", "w") as f: 22 | for item in paper_references: 23 | f.write(f"{item}\t{sum(paper_references[item])}\t{len(paper_references[item])}\t{statistics.mean(paper_references[item])}\t{statistics.median(paper_references[item])}\t{max(paper_references[item])}\n") 24 | 25 | with open("07.paper_citations_by_year.txt", "w") as f: 26 | for item in paper_citations: 27 | f.write(f"{item}\t{sum(paper_citations[item])}\t{len(paper_citations[item])}\t{statistics.mean(paper_citations[item])}\t{statistics.median(paper_citations[item])}\t{max(paper_citations[item])}\n") 28 | -------------------------------------------------------------------------------- /01.field_of_study_classification/04.generate_high_level_fos_1.py: -------------------------------------------------------------------------------- 1 | fos_dict = {} 2 | paper_labels = set() 3 | 4 | with open("02.labels.txt", "r") as f: 5 | for line in f: 6 | paper_labels.add(line.split("\t")[0].strip()) 7 | 8 | def find_top_fos(fos, fos_dict, paper_labels): 9 | output = {fos} 10 | while any(f in fos_dict for f in output): 11 | for f in {f for f in output if f in fos_dict}: 12 | output.update(fos_dict[f]) 13 | output.remove(f) 14 | return output.intersection(paper_labels) 15 | 16 | line_count = 1 17 | #Add path to FieldOfStudyChildren.txt 18 
| with open("FieldOfStudyChildren.txt", "r") as f: 19 | for line in f: 20 | child_fos = line.split("\t")[1].strip() 21 | parent_fos = line.split("\t")[0].strip() 22 | if child_fos in fos_dict: 23 | fos_dict[child_fos].add(parent_fos) 24 | else: 25 | fos_dict[child_fos] = {parent_fos} 26 | 27 | #Add path to sorted PaperFieldsOfStudy.txt 28 | with open("SortedPaperFieldsOfStudy", "r") as f: 29 | with open("04.papers_top_level_labels.txt", "w") as g: 30 | for line in f: 31 | print(line_count) 32 | fos = line.split("\t")[1].strip() 33 | for f in find_top_fos(fos, fos_dict, paper_labels): 34 | g.write(line.replace(fos, f)) 35 | line_count += 1 36 | 37 | -------------------------------------------------------------------------------- /02.knowledge_graph_embeddings/02.prepare_paper_input_graph.py: -------------------------------------------------------------------------------- 1 | pred_list = ["", "", "", "", "", "", "", "", "", "", "", ""] 2 | 3 | with open("02.papers_input.txt", "w") as g: 4 | #Add file path to Papers.nt 5 | with open("Papers.nt", "r") as f: 6 | for line in f: 7 | pred = line.split(" ")[1] 8 | if pred in pred_list: 9 | g.write(line) 10 | 11 | #Add file path to Journals.nt 12 | with open("Journals.nt", "r") as f: 13 | for line in f: 14 | pred = line.split(" ")[1] 15 | if pred in pred_list: 16 | g.write(line) 17 | 18 | #Add file path to ConferenceSeries.nt 19 | with open("ConferenceSeries.nt", "r") as f: 20 | for line in f: 21 | pred = line.split(" ")[1] 22 | if pred in pred_list: 23 | g.write(line) 24 | 25 | -------------------------------------------------------------------------------- /01.field_of_study_classification/08.training.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from simpletransformers.classification import ClassificationModel 3 | from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report 4 | import os 5 | import pickle 6 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 7 | 8 | def f1_multiclass(labels, preds): 9 | return f1_score(labels, preds, average='micro') 10 | def report(labels, preds): 11 | return classification_report(labels, preds) 12 | 13 | train_df = pd.read_csv('train.csv', header=None, dtype={"0":str, "1":str}) 14 | eval_df = pd.read_csv('eval.csv', header=None, dtype={"0":str, "1":str}) 15 | 16 | #change the chosen model accordingly 17 | model = ClassificationModel('bert', 'allenai/scibert_scivocab_uncased', num_labels=19, use_cuda=True, args={"fp16": False, "n_gpu": 4, "num_train_epochs": 2, "evaluate_during_training": True}) 18 | 19 | model.train_model(train_df, eval_df=eval_df) 20 | 21 | result, model_outputs, wrong_predictions = model.eval_model(eval_df, f1=f1_multiclass, acc=accuracy_score, classification_report=report) 22 | 23 | with open("results.txt", "w") as f: 24 | f.write(str(result)) 25 | with open("model_outputs.txt", "w") as f: 26 | f.write(str(model_outputs)) 27 | with open("wrong_predictions.txt", "w") as f: 28 | f.write(str(wrong_predictions)) 29 | 30 | model_name = "bert_model.sav" 31 | pickle.dump(model, open(model_name, "wb")) 32 | 33 | -------------------------------------------------------------------------------- /00.entity_resolution/18.add_to_authors_orcid.py: -------------------------------------------------------------------------------- 1 | from pyjarowinkler import distance 2 | 3 | dict_of_orcids = {} 4 | line_count = 1 5 | 6 | print("Starting...") 7 | 8 | with open("17.doi_with_merged_orcid.txt", "r") as inp: 9 | for line 
in inp: 10 | print("Loading: " + str(line_count)) 11 | 12 | doi = line.split("\t")[0].strip() 13 | orcid = line.split("\t")[1].strip() 14 | dict_of_orcids[doi] = orcid 15 | 16 | line_count += 1 17 | 18 | line_count = 1 19 | 20 | with open("12.authors_with_references_sorted.txt", "r") as inp: 21 | with open("18.authors_with_orcid.txt", "w") as outp: 22 | for line in inp: 23 | print("Searching: " + str(line_count)) 24 | 25 | name = line.split("\t")[3].strip() 26 | dois = line.split("\t")[10].strip().split(";") 27 | orcids = set() 28 | for doi in dois: 29 | try: 30 | found_orcids = dict_of_orcids[doi].split(";") 31 | for orcid in found_orcids: 32 | if distance.get_jaro_distance(str.lower(name), str.lower(orcid.split(",")[0]), winkler=True, scaling=0.1)>0.9: 33 | orcids.update([orcid.split(",")[1].strip()]) 34 | except KeyError: 35 | pass 36 | outp.write(line.strip("\n") + "\t" + ",".join(orcids).strip() + "\n") 37 | 38 | line_count += 1 39 | 40 | print("Finished.") 41 | 42 | -------------------------------------------------------------------------------- /01.field_of_study_classification/00.create_abstract.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open("00.paper_abstracts.txt", "w") as output: 4 | #Add path to PaperAbstractsInvertedIndex.txt.1 5 | with open("PaperAbstractsInvertedIndex.txt.1") as f: 6 | for line in f: 7 | paper_id, inverted_index = line.strip().split("\t") 8 | index_length = json.loads(inverted_index)["IndexLength"] 9 | indexes = json.loads(inverted_index)["InvertedIndex"] 10 | sentence_list = [" "]*index_length 11 | for word in indexes: 12 | word_index = list(indexes[word]) 13 | for index in word_index: 14 | sentence_list[index] = word.replace("\n", " ").replace("\r", "").replace("\t", " ") 15 | output.write(paper_id + "\t" + " ".join(sentence_list) + "\n") 16 | #Add path to PaperAbstractsInvertedIndex.txt.2 17 | with open("PaperAbstractsInvertedIndex.txt.2") as f: 18 | for line in f: 19 | paper_id, inverted_index = line.strip().split("\t") 20 | index_length = json.loads(inverted_index)["IndexLength"] 21 | indexes = json.loads(inverted_index)["InvertedIndex"] 22 | sentence_list = [" "]*index_length 23 | for word in indexes: 24 | word_index = list(indexes[word]) 25 | for index in word_index: 26 | sentence_list[index] = word.replace("\n", " ").replace("\r", "").replace("\t", " ") 27 | output.write(paper_id + "\t" + " ".join(sentence_list) + "\n") 28 | -------------------------------------------------------------------------------- /00.entity_resolution/11.add_to_authors_journal_and_conference.py: -------------------------------------------------------------------------------- 1 | dict_of_journals = {} 2 | dict_of_conferences = {} 3 | line_count = 1 4 | 5 | print("Starting...") 6 | print("Loading lists...") 7 | #Add path to Papers.txt 8 | with open("Papers.txt", "r") as inp: 9 | for line in inp: 10 | print("Loading: " + str(line_count)) 11 | 12 | paper_id = line.split("\t")[0].strip() 13 | journal = line.split("\t")[11].strip() 14 | conference = line.split("\t")[12].strip() 15 | dict_of_journals[paper_id] = journal 16 | dict_of_conferences[paper_id] = conference 17 | 18 | line_count += 1 19 | 20 | line_count = 1 21 | 22 | with open("10.authors_with_year.txt", "r") as inp: 23 | with open("11.authors_with_journal_and_conference.txt", "w") as outp: 24 | for line in inp: 25 | print("Searching: " + str(line_count)) 26 | 27 | paper_ids = line.split("\t")[9].strip().split(",") 28 | journals = set() 29 | 
conferences = set() 30 | for paper_id in paper_ids: 31 | try: 32 | journals.add(dict_of_journals[paper_id]) 33 | except KeyError: 34 | pass 35 | try: 36 | conferences.add(dict_of_journals[paper_id]) 37 | except KeyError: 38 | pass 39 | outp.write(line.strip("\n") + "\t" + ",".join(journals).strip(",") + "\t" + ",".join(conferences).strip(",") + "\n") 40 | 41 | line_count += 1 42 | 43 | print("Finished.") 44 | -------------------------------------------------------------------------------- /01.field_of_study_classification/04.generate_high_level_fos_2.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | line_count = 1 4 | with open("04.papers_top_level_labels.txt", "r") as f: 5 | with open("04.papers_with_indirect_labels.txt", "w") as g: 6 | current_paper = "" 7 | paper_dict = {} 8 | paper_count = {} 9 | for line in f: 10 | print(line_count) 11 | paperid = line.split("\t")[0] 12 | fos = int(line.split("\t")[1]) 13 | score = float(line.split("\t")[2].strip()) 14 | if current_paper == "": 15 | current_paper = paperid 16 | paper_dict[fos] = score 17 | paper_count[fos] = 1 18 | elif paperid == current_paper: 19 | try: 20 | paper_dict[fos] += score 21 | paper_count[fos] += 1 22 | except KeyError: 23 | paper_dict[fos] = score 24 | paper_count[fos] = 1 25 | else: 26 | g.write(current_paper + "\t" + str(max(paper_dict.items(), key=operator.itemgetter(1))[0]) + "\t" + str(max(paper_dict.items(), key=operator.itemgetter(1))[1]) + "\n") 27 | current_paper = paperid 28 | paper_dict.clear() 29 | paper_count.clear() 30 | paper_dict[fos] = score 31 | paper_count[fos] = 1 32 | line_count += 1 33 | for item in paper_dict: 34 | g.write(current_paper + "\t" + str(item) + "\t" + str(paper_dict[item]/max((paper_count[item]-1), 1)) + "\n") 35 | -------------------------------------------------------------------------------- /03.statistical_analysis/12.author_interdisciplinary_chord_graph.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from chord import Chord" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "f = open('08.author_fos_matrix.txt', 'r+')\n", 19 | "matrix = f.readline()\n", 20 | "f.close()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 5, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "names = [\"Computer Science\", \"Biology\", \"Political Science\", \"Materials Science\", \"Geography\", \"Chemistry\", \"Economics\", \"Mathematics\", \"Geology\", \"Engineering\", \"Physics\", \"Sociology\", \"Business\", \"Medicine\", \"Psychology\", \"Art\", \"History\", \"Philosophy\", \"Environmental science\"]\n", 30 | "Chord(matrix, names, wrap_labels = False, width=3600).to_html()" 31 | ] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.7.7" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 4 55 | } 56 | 
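The chord-diagram notebook reads the author/field-of-study matrix back with readline(), which yields the text that was written out with str(matrix), i.e. a Python-literal string rather than a nested list. If the plotting step needs an actual list of lists, the string can be parsed back first; a small sketch (the 19x19 shape matches the top-level label set used elsewhere):

import ast

# The matrix file was serialized with str(matrix), so parse it back into a nested
# list of ints before building the chord diagram.
with open("08.author_fos_matrix.txt", "r") as f:
    matrix = ast.literal_eval(f.readline())
assert len(matrix) == 19 and all(len(row) == 19 for row in matrix)
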
-------------------------------------------------------------------------------- /00.entity_resolution/08.add_to_authors_coauthors.py: -------------------------------------------------------------------------------- 1 | dict_of_coauthors = {} 2 | line_count = 1 3 | 4 | print("Starting...") 5 | 6 | with open("05.paper_id_with_merged_author_id.txt", "r") as inp: 7 | for line in inp: 8 | print("Loading: " + str(line_count)) 9 | paper_id = line.split("\t")[0].strip() 10 | author_ids = line.split("\t")[1].strip() 11 | dict_of_coauthors[paper_id] = author_ids 12 | line_count+= 1 13 | 14 | number_of_papers = 0 15 | number_of_papers_with_coauthors = 0 16 | number_of_mismatches = 0 17 | line_count = 1 18 | 19 | with open("07.authors_with_paper_doi.txt", "r") as inp: 20 | with open("08.authors_with_co_authors.txt", "w") as outp: 21 | for line in inp: 22 | print("Searching: " + str(line_count)) 23 | author_id = line.split("\t")[0].strip() 24 | paper_ids = line.split("\t")[9].strip().split(",") 25 | co_authors = set() 26 | for paper_id in paper_ids: 27 | number_of_papers += 1 28 | try: 29 | co_authors.update(dict_of_coauthors[paper_id].split(",")) 30 | number_of_papers_with_coauthors += 1 31 | try: 32 | co_authors.remove(author_id) 33 | except KeyError: 34 | number_of_mismatches += 1 35 | except KeyError: 36 | pass 37 | outp.write(line.strip("\n") + "\t" + ",".join(co_authors) + "\n") 38 | 39 | line_count += 1 40 | 41 | print("Total number of papers: " + str(number_of_papers) + ". With coauthors :" + str(number_of_papers_with_coauthors) + " . With number of mismatches: " + str(number_of_mismatches)) 42 | 43 | print("Finished.") 44 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/04.conferenceseries.py: -------------------------------------------------------------------------------- 1 | with open("ConferenceSeries.txt", "r") as f: 2 | with open("04.ConferenceSeries.nt", "w") as g: 3 | for line in f: 4 | ConferenceSeriesId, Rank, NormalizedName, DisplayName, PaperCount, PaperFamilyCount, CitationCount, CreatedDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not DisplayName == "": 9 | g.write(f' "{DisplayName}"^^ .\n') 10 | if not PaperCount == "": 11 | g.write(f' "{PaperCount}"^^ .\n') 12 | if not PaperFamilyCount == "": 13 | g.write(f' "{PaperFamilyCount}"^^ .\n') 14 | if not CitationCount == "": 15 | g.write(f' "{CitationCount}"^^ .\n') 16 | if not CreatedDate == "": 17 | g.write(f' "{CreatedDate}"^^ .\n') -------------------------------------------------------------------------------- /01.field_of_study_classification/06.evaluate_with_journal_label.py: -------------------------------------------------------------------------------- 1 | paper_label_dict = {} 2 | label_total_count = {} 3 | label_matching_count = {} 4 | with open("02.labels.txt", "r") as f: 5 | for line in f: 6 | paper_label_dict[line.split("\t")[0].strip()] = line.split("\t")[2].strip() 7 | label_total_count[line.split("\t")[2].strip()] = 0 8 | label_matching_count[line.split("\t")[2].strip()] = 0 9 | 10 | line_count = 1 11 | journal_label = {} 12 | mag_label = {} 13 | with open("05.papers_with_journal_labels.txt", "r") as f: 14 | for line in f: 15 | print("Loading Journal: " + str(line_count)) 16 | paper_id = line.split("\t")[0] 17 | label = line.strip().split("\t")[2] 18 | journal_label[paper_id] = label 19 | line_count += 1 20 | 21 | line_count = 1 22 | #Edit the following data path depending on which 
labels you want to evaluate 23 | with open("03.papers_with_direct_labels.txt", "r") as f: 24 | for line in f: 25 | print("Loading MAG: " + str(line_count)) 26 | paper_id = line.split("\t")[0] 27 | label = line.split("\t")[1] 28 | mag_label[paper_id] = label 29 | line_count += 1 30 | 31 | line_count = 1 32 | total_count = 0 33 | matching = 0 34 | for item in journal_label: 35 | print("Comparing: " + str(line_count)) 36 | try: 37 | if journal_label[item] == mag_label[item]: 38 | matching += 1 39 | label_matching_count[journal_label[item]] += 1 40 | total_count += 1 41 | label_total_count[journal_label[item]] += 1 42 | except KeyError: 43 | pass 44 | line_count += 1 45 | 46 | print("Total: " + str(total_count)) 47 | print("Matching: " + str(matching)) 48 | for item in label_matching_count: 49 | print("Label: " + item + " Total: " + str(label_total_count[item]) + " Matching: " + str(label_matching_count[item])) 50 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/18.relatedfieldsofstudy.py: -------------------------------------------------------------------------------- 1 | with open("RelatedFieldOfStudy.txt", "r") as f: 2 | with open("18.RelatedFieldOfStudy.nt", "w") as g: 3 | for line in f: 4 | FieldOfStudy1, Type1, FieldOfStudy2, Type2, Rank = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if Type1 == "disease": 7 | if Type2 == "disease_cause": 8 | g.write(f' .\n') 9 | if Type2 == "medical_treatment": 10 | g.write(f' .\n') 11 | if Type2 == "symptom": 12 | g.write(f' .\n') 13 | elif Type1 =="medical_treatment": 14 | if Type2 == "disease_cause": 15 | g.write(f' .\n') 16 | if Type2 == "symptom": 17 | g.write(f' .\n') 18 | elif Type1 =="symptom": 19 | if Type2 == "disease_cause": 20 | g.write(f' .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/02.authors.py: -------------------------------------------------------------------------------- 1 | with open("Authors.txt", "r") as f: 2 | with open("02.Authors.nt", "w") as g: 3 | for line in f: 4 | AuthorId, Rank, NormalizedName, DisplayName, LastKnownAffiliationId, PaperCount, PaperFamilyCount, CitationCount, CreateDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not LastKnownAffiliationId == "": 9 | g.write(f' .\n') 10 | if not DisplayName == "": 11 | g.write(f' "{DisplayName}"^^ .\n') 12 | if not PaperCount == "": 13 | g.write(f' "{PaperCount}"^^ .\n') 14 | if not PaperFamilyCount == "": 15 | g.write(f' "{PaperFamilyCount}"^^ .\n') 16 | if not CitationCount == "": 17 | g.write(f' "{CitationCount}"^^ .\n') 18 | if not CreateDate == "": 19 | g.write(f' "{CreateDate}"^^ .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/23.authors_disambiguated.py: -------------------------------------------------------------------------------- 1 | with open("00.entity_resolution/14.Authors_new.txt", "r") as f: 2 | with open("23.Authors_disambiguated.nt", "w") as g: 3 | for line in f: 4 | AuthorId, Rank, NormalizedName, DisplayName, LastKnownAffiliationId, PaperCount, PaperFamilyCount, CitationCount, CreateDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not LastKnownAffiliationId == "": 9 | g.write(f' .\n') 10 | if not DisplayName == "": 11 | g.write(f' "{DisplayName}"^^ .\n') 12 | if not PaperCount == "": 13 | g.write(f' 
"{PaperCount}"^^ .\n') 14 | if not PaperFamilyCount == "": 15 | g.write(f' "{PaperFamilyCount}"^^ .\n') 16 | if not CitationCount == "": 17 | g.write(f' "{CitationCount}"^^ .\n') 18 | if not CreateDate == "": 19 | g.write(f' "{CreateDate}"^^ .\n') -------------------------------------------------------------------------------- /04.generate_knowledge_graph/15.fieldsofstudy.py: -------------------------------------------------------------------------------- 1 | with open("FieldsOfStudy.txt", "r") as f: 2 | with open("15.FieldsOfStudy.nt", "w") as g: 3 | for line in f: 4 | FieldsOfStudyId, Rank, NormalizedName, DisplayName, MainType, Level, PaperCount, PaperFamilyCount, CitationCount, CreateDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not DisplayName == "": 9 | g.write(f' "{DisplayName}"^^ .\n') 10 | if not Level == "": 11 | g.write(f' "{Level}"^^ .\n') 12 | if not PaperCount == "": 13 | g.write(f' "{PaperCount}"^^ .\n') 14 | if not PaperFamilyCount == "": 15 | g.write(f' "{PaperFamilyCount}"^^ .\n') 16 | if not CitationCount == "": 17 | g.write(f' "{CitationCount}"^^ .\n') 18 | if not CreateDate == "": 19 | g.write(f' "{CreateDate}"^^ .\n') -------------------------------------------------------------------------------- /03.statistical_analysis/09.author_fos.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | with open("01.field_of_study_classification/10.paperid_with_fos.txt", "r") as f: 4 | line_count = 1 5 | fos_dict = {} 6 | for line in f: 7 | paperid = line.split("\t")[0] 8 | fos = line.split("\t")[1].strip() 9 | fos_dict[paperid] = fos 10 | print(f"Loading one: {line_count}.") 11 | line_count += 1 12 | 13 | with open("01.field_of_study_classification/02.labels.txt", "r") as f: 14 | label_mapping = {} 15 | labels = [] 16 | line_count = 1 17 | for line in f: 18 | label_mapping[line.split("\t")[0]] = int(line.strip().split("\t")[2]) 19 | labels.append(line.split("\t")[0]) 20 | print(f"Loading two: {line_count}.") 21 | line_count += 1 22 | 23 | with open("00.entity_resolution/06.authors_with_paper_id.txt", "r") as f: 24 | with open("09.author_fos.txt", "w") as g: 25 | line_count = 1 26 | matrix = [[0 for x in range(19)] for y in range(19)] 27 | for line in f: 28 | print(line_count) 29 | authorid = line.split("\t")[0] 30 | paperids = line.split("\t")[-1].strip("\n").split(",") 31 | fos_list = [fos_dict[paperid] for paperid in paperids if paperid in fos_dict] 32 | if fos_list: 33 | author_dict = {fos: 0 for fos in labels} 34 | for item in fos_list: 35 | author_dict[item] += 1 36 | fos_string = ",".join(map(str, [*author_dict.values()])) 37 | g.write(f"{authorid}\t{fos_string}\n") 38 | fos_set = set(fos_list) 39 | if len(fos_set) > 1: 40 | fos_combinations = list(itertools.permutations(fos_set, 2)) 41 | for combination in fos_combinations: 42 | matrix[label_mapping[combination[0]]][label_mapping[combination[1]]] += 1 43 | line_count += 1 44 | with open("09.author_fos_matrix.txt", "w") as h: 45 | h.write(str(matrix)) 46 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/05.journals.py: -------------------------------------------------------------------------------- 1 | with open("Journals.txt", "r") as f: 2 | with open("05.Journals.nt", "w") as g: 3 | for line in f: 4 | JournalId, Rank, NormalizedName, DisplayName, Issn, Publisher, Webpage, PaperCount, PaperFamilyCount, CitationCount, CreatedDate = 
line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not DisplayName == "": 9 | g.write(f' "{DisplayName}"^^ .\n') 10 | if not Issn == "": 11 | g.write(f' "{Issn}"^^ .\n') 12 | if not Publisher == "": 13 | g.write(f' "{Publisher}"^^ .\n') 14 | if not Webpage == "": 15 | g.write(f' "{Webpage}"^^ .\n') 16 | if not PaperCount == "": 17 | g.write(f' "{PaperCount}"^^ .\n') 18 | if not PaperFamilyCount == "": 19 | g.write(f' "{PaperFamilyCount}"^^ .\n') 20 | if not CitationCount == "": 21 | g.write(f' "{CitationCount}"^^ .\n') 22 | if not CreatedDate == "": 23 | g.write(f' "{CreatedDate}"^^ .\n') -------------------------------------------------------------------------------- /03.statistical_analysis/00.count_properties.py: -------------------------------------------------------------------------------- 1 | def count_occurence(list_of_objects): 2 | occurence_dictionary = {} 3 | for item in list_of_objects: 4 | if item in occurence_dictionary: 5 | occurence_dictionary[item] += 1 6 | else: 7 | occurence_dictionary[item] = 1 8 | return occurence_dictionary 9 | 10 | papers_properties = ["PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle", "BookTitle", "Year", "Date", "OnlineDate", "Publisher", "JournalId", "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue", "FirstPage", "LastPage", "ReferenceCount", "CitationCount", "EstimatedCitation", "OriginalVenue", "FamilyId", "CreatedDate"] 11 | authors_properties = ["AuthorId", "Rank", "NormalizedName", "DisplayName", "LastKnownAffiliationId", "PaperCount", "PaperFamilyCount", "CitationCount", "CreateDate"] 12 | 13 | papers_properties_dict = {} 14 | authors_properties_dict = {} 15 | 16 | 17 | for item in papers_properties: 18 | papers_properties_dict[item] = 0 19 | 20 | #Add file path to Papers.txt 21 | with open("Papers.txt", "r") as inp: 22 | line_count = 0 23 | for line in inp: 24 | line_count += 1 25 | entries = line.split("\t") 26 | for index, entry in enumerate(entries): 27 | if not entry.strip() == "": 28 | papers_properties_dict[papers_properties[index]] += 1 29 | print("Paper: " + str(line_count)) 30 | with open("00.papers_statistics.txt", "w") as outp: 31 | outp.write("Total papers: " + str(line_count) + "\n") 32 | for item in papers_properties_dict: 33 | outp.write(item + "\t" + str(papers_properties_dict[item]) + "\n") 34 | 35 | 36 | for item in authors_properties: 37 | authors_properties_dict[item] = 0 38 | 39 | list_of_paper_count = [] 40 | list_of_citation_count = [] 41 | 42 | #Add file path to Authors.txt 43 | with open("Authors.txt", "r") as inp: 44 | line_count = 0 45 | for line in inp: 46 | line_count += 1 47 | entries = line.split("\t") 48 | list_of_paper_count.append(entries[5]) 49 | list_of_citation_count.append(entries[6]) 50 | for index, entry in enumerate(entries): 51 | if not entry.strip() == "": 52 | authors_properties_dict[authors_properties[index]] += 1 53 | print("Authors: " + str(line_count)) 54 | 55 | with open("00.authors_statistics.txt", "w") as outp: 56 | outp.write("Total Authors: " + str(line_count)) 57 | for item in authors_properties_dict: 58 | outp.write(item + "\t" + str(authors_properties_dict[item]) + "\n") 59 | print("\nNow calculating paper count...") 60 | outp.write("Counter for paper count: " + str(count_occurence(list_of_paper_count)) + "\n") 61 | print("Finished.\nNow calculating citation count...") 62 | outp.write("Counter for citation count: " + str(count_occurence(list_of_citation_count))) 63 | print("Finished.") 64 | 
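A purely illustrative equivalent of the count_occurence helper above, using the standard library (this snippet is not part of the repository):
````
from collections import Counter

def count_occurence(list_of_objects):
    # Equivalent to the manual dictionary count used in 00.count_properties.py
    return dict(Counter(list_of_objects))

print(count_occurence(["1", "2", "2", "5"]))  # {'1': 1, '2': 2, '5': 1}
````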
-------------------------------------------------------------------------------- /04.generate_knowledge_graph/01.affiliations.py: -------------------------------------------------------------------------------- 1 | with open("Affiliations.txt", "r") as f: 2 | with open("01.Affiliations.nt", "w") as g: 3 | for line in f: 4 | AffiliationId, Rank, NormalizedName, DisplayName, GridId, OfficialPage, WikiPage, PaperCount, PaperFamilyCount, CitationCount, Latitude, Longitude, CreatedDate = line.strip("\n").split("\t") 5 | g.write(f' .\n') 6 | if not Rank == "": 7 | g.write(f' "{Rank}"^^ .\n') 8 | if not DisplayName == "": 9 | g.write(f' "{DisplayName}"^^ .\n') 10 | if not GridId == "": 11 | g.write(f' .\n') 12 | if not OfficialPage == "": 13 | g.write(f' <{OfficialPage}> .\n') 14 | if not WikiPage == "": 15 | g.write(f' <{WikiPage}> .\n') 16 | if not WikiPage == "": 17 | g.write(f' <{WikiPage}> .\n') 18 | if not PaperCount == "": 19 | g.write(f' "{PaperCount}"^^ .\n') 20 | if not PaperFamilyCount == "": 21 | g.write(f' "{PaperFamilyCount}"^^ .\n') 22 | if not CitationCount == "": 23 | g.write(f' "{CitationCount}"^^ .\n') 24 | if not Latitude == "": 25 | g.write(f' "{Latitude}"^^ .\n') 26 | if not Longitude == "": 27 | g.write(f' "{Longitude}"^^ .\n') 28 | if not CreatedDate == "": 29 | g.write(f' "{CreatedDate}"^^ .') -------------------------------------------------------------------------------- /02.knowledge_graph_embeddings/01.prepare_data_authors.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | line_count = 1 4 | entity_count = 0 5 | relations_count = 0 6 | entity_dict = {} 7 | relations_dict = {} 8 | with open("00.authors_input.txt", "r") as f: 9 | for line in f: 10 | print("Dictionary: " + str(line_count)) 11 | if len(line.split(" ")) > 1: 12 | sub = line.split(" ")[0].rstrip(">").replace("").split("/")[-1].replace("22-rdf-syntax-ns#", "").replace("org#", "") 14 | obj = " ".join(line.split(" ")[2:]).strip().rstrip(".").strip().replace("^^", "").replace("^^", "").replace("^^", "").rstrip(">").replace(" 1: 41 | sub = line.split(" ")[0].rstrip(">").replace("").split("/")[-1].replace("22-rdf-syntax-ns#", "").replace("org#", "") 43 | obj = " ".join(line.split(" ")[2:]).strip().rstrip(".").strip().replace("^^", "").replace("^^", "").replace("^^", "").rstrip(">").replace(" 1: 12 | sub = line.split(" ")[0].rstrip(">").replace("").split("/")[-1].replace("22-rdf-syntax-ns#", "") 14 | obj = " ".join(line.split(" ")[2:]).strip().rstrip(".").strip().replace("^^", "").replace("^^", "").replace("^^", "").rstrip(">").replace(" 1: 41 | sub = line.split(" ")[0].rstrip(">").replace("").split("/")[-1].replace("22-rdf-syntax-ns#", "") 43 | obj = " ".join(line.split(" ")[2:]).strip().rstrip(".").strip().replace("^^", "").replace("^^", "").replace("^^", "").rstrip(">").replace(" .\n') 6 | if not DisplayName == "": 7 | g.write(f' "{DisplayName}"^^ .\n') 8 | if not ConferenceInstanceId == "": 9 | g.write(f' .\n') 10 | if not Location == "": 11 | g.write(f' <{Location}> .\n') 12 | if not OfficialUrl == "": 13 | g.write(f' <{OfficialUrl}> .\n') 14 | if not StartDate == "": 15 | g.write(f' "{StartDate}"^^ .\n') 16 | if not EndDate == "": 17 | g.write(f' "{EndDate}"^^ .\n') 18 | if not AbstractRegistrationDate == "": 19 | g.write(f' "{AbstractRegistrationDate}"^^ .\n') 20 | if not SubmissionDeadlineDate == "": 21 | g.write(f' "{SubmissionDeadlineDate}"^^ .\n') 22 | if not NotificationDueDate == "": 23 | g.write(f' "{NotificationDueDate}"^^ .\n') 24 | if not 
FinalVersionDueDate == "": 25 | g.write(f' "{FinalVersionDueDate}"^^ .\n') 26 | if not PageCount == "": 27 | g.write(f' "{PageCount}"^^ .\n') 28 | if not PaperFamilyCount == "": 29 | g.write(f' "{PaperFamilyCount}"^^ .\n') 30 | if not CitationCount == "": 31 | g.write(f' "{CitationCount}"^^ .\n') 32 | if not CreatedDate == "": 33 | g.write(f' "{CreatedDate}"^^ .\n') 34 | 35 | -------------------------------------------------------------------------------- /00.entity_resolution/15.extract_orcid_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | 4 | #Add directory to ORCID files 5 | directory = "ORCID_files" 6 | 7 | print("Starting...") 8 | 9 | with open("15.orcid_title_doi.txt", "w") as outp: 10 | for folder in os.listdir(directory): 11 | print(folder) 12 | 13 | folder_path = os.path.join(directory, folder) 14 | if os.path.isdir(folder_path): 15 | for subfolder in os.listdir(folder_path): 16 | subfolder_path = os.path.join(folder_path, subfolder) 17 | for subsubfolder in os.listdir(subfolder_path): 18 | subsubfolder_path = os.path.join(subfolder_path, subsubfolder) 19 | orcid = subsubfolder_path.split("/")[-1] 20 | 21 | real_name = "" 22 | educations_path = os.path.join(subsubfolder_path, "educations") 23 | employments_path = os.path.join(subsubfolder_path, "employments") 24 | works_path = os.path.join(subsubfolder_path, "works") 25 | 26 | try: 27 | for education_file in os.listdir(educations_path): 28 | education_file_path = os.path.join(educations_path, education_file) 29 | tree = ET.parse(education_file_path) 30 | root = tree.getroot() 31 | name = root.find("{http://www.orcid.org/ns/common}source") 32 | if name is not None: 33 | name2 = name.find("{http://www.orcid.org/ns/common}source-name") 34 | if name2 is not None: 35 | real_name = name2.text 36 | 37 | except OSError: 38 | try: 39 | for employment_file in os.listdir(employments_path): 40 | employments_file_path = os.path.join(employments_path, employment_file) 41 | tree = ET.parse(employments_file_path) 42 | root = tree.getroot() 43 | name = root.find("{http://www.orcid.org/ns/common}source") 44 | if name is not None: 45 | name2 = name.find("{http://www.orcid.org/ns/common}source-name") 46 | if name2 is not None: 47 | real_name = name2.text 48 | except OSError: 49 | pass 50 | 51 | try: 52 | for work_file in os.listdir(works_path): 53 | work_file_path = os.path.join(works_path, work_file) 54 | tree = ET.parse(work_file_path) 55 | root = tree.getroot() 56 | 57 | title = root.find("{http://www.orcid.org/ns/work}title") 58 | if title is not None: 59 | real_title = title.find("{http://www.orcid.org/ns/common}title").text.replace("\t", " ").replace("\\", " ") 60 | 61 | real_doi = "" 62 | doi = root.find("{http://www.orcid.org/ns/common}external-ids") 63 | if doi is not None: 64 | dois = doi.findall("{http://www.orcid.org/ns/common}external-id") 65 | for doi in dois: 66 | if doi.find("{http://www.orcid.org/ns/common}external-id-type").text == "doi": 67 | real_doi = doi.find("{http://www.orcid.org/ns/common}external-id-value").text 68 | 69 | outp.write("\t".join([orcid, real_name, real_title, real_doi]) + "\n") 70 | 71 | except OSError: 72 | pass 73 | 74 | print("Finished.") -------------------------------------------------------------------------------- /03.statistical_analysis/06.paper_citation_reference.py: -------------------------------------------------------------------------------- 1 | import statistics 2 | 3 | paper_references = [] 4 | 
paper_citations = [] 5 | #Add file path for Papers.txt 6 | with open("Papers.txt", "r") as f: 7 | for line in f: 8 | references = int(line.split("\t")[18]) 9 | citations = int(line.split("\t")[19]) 10 | paper_references.append(references) 11 | paper_citations.append(citations) 12 | 13 | paper_references_filtered = list(filter(lambda num: num != 0, paper_references)) 14 | paper_citations_filtered = list(filter(lambda num: num != 0, paper_citations)) 15 | 16 | with open("06.paper_references_citations_general.txt", "w") as f: 17 | f.write(f"Average number of references per paper: {statistics.mean(paper_references)}\n") 18 | f.write(f"Median number of references per paper: {statistics.median(paper_references_filtered)}\n") 19 | f.write(f"Maximum number of references per paper: {max(paper_references)}\n") 20 | f.write(f"Minimum number of references per paper: {min(paper_references)}\n") 21 | f.write(f"Paper with references: {len(paper_references_filtered)}\n") 22 | f.write(f"Average number of references per paper filtered: {statistics.mean(paper_references_filtered)}\n") 23 | f.write(f"Average number of citations per paper: {statistics.mean(paper_citations)}\n") 24 | f.write(f"Median number of citations per paper: {statistics.median(paper_citations_filtered)}\n") 25 | f.write(f"Maximum number of citations per paper: {max(paper_citations)}\n") 26 | f.write(f"Minimum number of citations per paper: {min(paper_citations)}\n") 27 | f.write(f"Paper with citations: {len(paper_citations_filtered)}\n") 28 | f.write(f"Average number of citations per paper filtered: {statistics.mean(paper_citations_filtered)}\n") 29 | 30 | paper_references = {} 31 | paper_citations = {} 32 | #Add file path for Papers.txt 33 | with open("Papers.txt", "r") as f: 34 | for line in f: 35 | papertype = line.split("\t")[3].lower() 36 | references = int(line.split("\t")[18]) 37 | citations = int(line.split("\t")[19]) 38 | try: 39 | paper_references[papertype].append(references) 40 | paper_citations[papertype].append(citations) 41 | except KeyError: 42 | paper_references[papertype] = [references] 43 | paper_citations[papertype] = [citations] 44 | 45 | paper_references_filtered = {papertype: list(filter(lambda num: num != 0, paper_references[papertype])) for papertype in paper_references} 46 | paper_citations_filtered = {papertype: list(filter(lambda num: num != 0, paper_citations[papertype])) for papertype in paper_citations} 47 | 48 | with open("06.paper_references_citations_detailed.txt", "w") as f: 49 | for papertype in paper_references: 50 | f.write(f"Average number of references per paper {papertype}: {statistics.mean(paper_references[papertype])}\n") 51 | f.write(f"Median number of references per paper {papertype}: {statistics.median(paper_references_filtered[papertype])}\n") 52 | f.write(f"Maximum number of references per paper {papertype}: {max(paper_references[papertype])}\n") 53 | f.write(f"Minimum number of references per paper {papertype}: {min(paper_references[papertype])}\n") 54 | f.write(f"Paper with references {papertype}: {len(paper_references_filtered[papertype])}\n") 55 | f.write(f"Average number of references per paper filtered {papertype}: {statistics.mean(paper_references_filtered[papertype])}\n") 56 | f.write(f"Average number of citations per paper {papertype}: {statistics.mean(paper_citations[papertype])}\n") 57 | f.write(f"Median number of citations per paper {papertype}: {statistics.median(paper_citations_filtered[papertype])}\n") 58 | f.write(f"Maximum number of citations per paper {papertype}: 
{max(paper_citations[papertype])}\n") 59 | f.write(f"Minimum number of citations per paper {papertype}: {min(paper_citations[papertype])}\n") 60 | f.write(f"Paper with citations {papertype}: {len(paper_citations_filtered[papertype])}\n") 61 | f.write(f"Average number of citations per paper filtered {papertype}: {statistics.mean(paper_citations_filtered[papertype])}\n\n") 62 | 63 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/10.papers.py: -------------------------------------------------------------------------------- 1 | with open("Papers.txt", "r") as f: 2 | with open("10.Papers.nt", "w") as g: 3 | for line in f: 4 | PaperId, Rank, Doi, DocType, PaperTitle, OriginalTitle, BookTitle, Year, Date, OnlineDate, Publisher, JournalId, ConferenceSeriesId, ConferenceInstanceId, Volume, Issue, FirstPage, LastPage, ReferenceCount, CitationCount, EstimatedCitation, OriginalVenue, FamilyId, CreatedDate = line.strip("\n").split("\t") 5 | if DocType == "Journal": 6 | g.write(f' .\n') 7 | elif DocType == "Conference": 8 | g.write(f' .\n') 9 | elif DocType == "Book": 10 | g.write(f' .\n') 11 | elif DocType == "BookChapter": 12 | g.write(f' .\n') 13 | elif DocType == "Patent": 14 | g.write(f' .\n') 15 | 16 | if not Rank == "": 17 | g.write(f' "{Rank}"^^ .\n') 18 | if not Doi == "": 19 | g.write(f' "{Doi}"^^ .\n') 20 | if not OriginalTitle == "": 21 | g.write(f' "{OriginalTitle}"^^ .\n') 22 | if not BookTitle == "": 23 | g.write(f' "{BookTitle}"^^ .\n') 24 | if not Date == "": 25 | g.write(f' "{Date}"^^ .\n') 26 | if not Publisher == "": 27 | g.write(f' "{Publisher}"^^ .\n') 28 | if not JournalId == "": 29 | g.write(f' " .\n') 30 | if not ConferenceSeriesId == "": 31 | g.write(f' " .\n') 32 | if not ConferenceInstanceId == "": 33 | g.write(f' " .\n') 34 | if not Volume == "": 35 | g.write(f' "{Volume}"^^ .\n') 36 | if not Issue == "": 37 | g.write(f' "{Issue}"^^ .\n') 38 | if not FirstPage == "": 39 | g.write(f' "{FirstPage}"^^ .\n') 40 | if not LastPage == "": 41 | g.write(f' "{LastPage}"^^ .\n') 42 | if not ReferenceCount == "": 43 | g.write(f' "{ReferenceCount}"^^ .\n') 44 | if not CitationCount == "": 45 | g.write(f' "{CitationCount}"^^ .\n') 46 | if not EstimatedCitation == "": 47 | g.write(f' "{EstimatedCitation}"^^ .\n') 48 | if not FamilyId == "": 49 | g.write(f' "{FamilyId}"^^ .\n') 50 | if not CreatedDate == "": 51 | g.write(f' "{CreatedDate}"^^ .\n') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Enhancing the Microsoft Academic Knowledge Graph** 2 | Code for the Master Thesis "Enhancing the Microsoft Academic Knowledge Graph" 3 |

4 | 5 | **Entity Resolution** 6 | 7 | Required packages: 8 | [pyjarowinkler](https://pypi.org/project/pyjarowinkler/) 9 | 10 | Code for data preparation + disambiguation + recreating files (or use execute.sh) 11 | ```` 12 | python 00.prepare_paper_references.py 13 | python 01.extract_paper_id_with_doi.py 14 | python 02.extract_author_with_paper_id.py 15 | 16 | LANG=en_US.UTF-8 LC_ALL=C sort -n -t$'\t' -k1 02.author_id_with_paper_id.txt > 02.author_id_with_paper_id_sorted.txt 17 | 18 | python 03.extract_paper_with_author_id.py 19 | 20 | LANG=en_US.UTF-8 LC_ALL=C sort -n -t$'\t' -k1 03.paper_id_with_author_id.txt > 03.paper_id_with_author_id_sorted.txt 21 | 22 | python 04.author_id_merge_paper_id.py 23 | python 05.paper_id_merge_author_ids.py 24 | python 06.add_to_authors_paper_id.py 25 | python 07.add_to_authors_doi.py 26 | python 08.add_to_authors_coauthors.py 27 | python 09.add_to_authors_titles.py 28 | python 10.add_to_authors_year.py 29 | python 11.add_to_authors_journal_and_conference.py 30 | python 12.add_to_authors_references.py 31 | 32 | mkdir sort 33 | split -l 5000000 -d 12.authors_with_references.txt sort/sort_file 34 | cd sort 35 | for file in sort_file*; do 36 | echo $file 37 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 -o $file $file 38 | done 39 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 sort_file* > ../12.authors_with_references_sorted.txt 40 | cd .. 41 | rm -r sort 42 | 43 | python 13.disambiguation_data.py 44 | python 14.recreate_files.py 45 | ```` 46 | 47 | Edit the following data paths for MAG files: 48 | * 00.prepare_paper_references.py: path to PaperReferences.txt 49 | * 01.extract_paper_id_with_doi.py: path to Papers.txt 50 | * 02.extract_author_with_paper_id.py: path to PaperAuthorAffiliations.txt 51 | * 03.extract_paper_with_author_id.py: path to PaperAuthorAffiliations.txt 52 | * 06.add_to_authors_paper_id.py: path to Authors.txt 53 | * 09.add_to_authors_titles.py: path to Papers.txt 54 | * 10.add_to_authors_year.py: path to Papers.txt 55 | * 11.add_to_authors_journal_and_conference.py: path to Papers.txt 56 | * 14.recreate_files.py: path to PaperAuthorAffiliations.txt 57 | 58 | Files 15-19 are used for evaluation in the Thesis. 59 |
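For orientation, the disambiguation step above (12.authors_with_references_sorted.txt → 13.disambiguation_data.py) only compares authors whose normalized names fall into the same block. A minimal sketch of that blocking idea with pyjarowinkler follows; the names are invented, while the threshold (0.95), the scaling factor (0.1) and the comparison call are the ones used in 13.disambiguation_data.py:
````
from pyjarowinkler import distance

# Illustrative input only; the pipeline reads 12.authors_with_references_sorted.txt
names = ["john smith", "john smyth", "jon smith", "maria garcia", "zhang wei"]
threshold_blocking = 0.95   # value used in 13.disambiguation_data.py
scaling_factor = 0.1

blocks = []
current_block = [names[0]]
for name in names[1:]:
    # Near-identical consecutive names end up in the same block
    if distance.get_jaro_distance(name.lower(), current_block[-1].lower(),
                                  winkler=True, scaling=scaling_factor) > threshold_blocking:
        current_block.append(name)
    else:
        blocks.append(current_block)
        current_block = [name]
blocks.append(current_block)
print(blocks)  # pairwise scoring then runs only inside each block
````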

60 | 61 | **Field of Study Classification** 62 | 63 | Required packages: 64 | [NLTK](http://www.nltk.org/), [Pandas](https://pypi.org/project/pandas/), [scikit-learn](https://scikit-learn.org/stable/index.html), [simpletransformers](https://pypi.org/project/simpletransformers/), [spaCy](https://spacy.io/), [pytextrank](https://pypi.org/project/pytextrank/) 65 | 66 | * files 00 and 01 are used to convert MAG paper abstracts from Inverted Indexes to Full Texts (see the sketch after this list) 67 | * file 02 is used to extract field of study labels from the MAG (all 19 low-level FoS), edit the path to FieldsOfStudy.txt accordingly 68 | * files 03-06 require a sorted version of the PaperFieldsOfStudy.txt file, which can be done with the following code: 69 | ```` 70 | mkdir sort 71 | split -l 5000000 -d PaperFieldsOfStudy.txt sort/sort_file 72 | cd sort 73 | for file in sort_file*; do 74 | echo $file 75 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 -o $file $file 76 | done 77 | LANG=en_US.UTF-8 LC_ALL=C sort -t$'\t' -k3 sort_file* > ../SortedPaperFieldsOfStudy.txt 78 | cd .. 79 | rm -r sort 80 | ```` 81 | * execute file 03 to generate the data set using direct labels, edit path to the sorted PaperFieldsOfStudy.txt accordingly 82 | * execute both 04 files in sequence to generate the data set using indirect labels, edit paths to FieldOfStudyChildren.txt and the sorted PaperFieldsOfStudy.txt accordingly 83 | * execute all three 05 files in order to generate the data set using journal labels 84 | * use file 06 in order to evaluate MAG labels from data sets generated by 03 and 04, edit the file path accordingly 85 | * use file 07 to generate training and evaluation sets 86 | * use file 08 to train and evaluate the classifier, edit the model as well as hyperparameters accordingly 87 | * use file 09 to classify MAG papers 88 | * use file 10 to match extracted labels with MAG papers 89 | * use file 11 to extract keywords 90 |
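Sketch referenced above: MAG distributes abstracts as inverted indexes, i.e. a JSON object with an IndexLength and an InvertedIndex that maps each token to its positions. Rebuilding the full text, which is what files 00 and 01 are used for, roughly amounts to the following (toy input, not repository code):
````
import json

def inverted_index_to_text(indexed_abstract: str) -> str:
    """Rebuild a plain-text abstract from a MAG-style inverted-index JSON string."""
    data = json.loads(indexed_abstract)
    words = [""] * data["IndexLength"]
    for word, positions in data["InvertedIndex"].items():
        for position in positions:
            words[position] = word
    return " ".join(words)

toy = '{"IndexLength": 4, "InvertedIndex": {"Knowledge": [0], "graphs": [1], "are": [2], "useful": [3]}}'
print(inverted_index_to_text(toy))  # Knowledge graphs are useful
````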

91 | 92 | **Knowledge Graph Embeddings** 93 | 94 | Required packages: 95 | [DGL-KE](https://aws-dglke.readthedocs.io/en/latest/install.html) 96 | 97 | * use files 00 and 01 to generate input files for training author embeddings, add file path to input graph Authors.nt 98 | * use files 02 and 03 to generate input files for training paper embeddings, add file path to input graph Papers.nt, Journals.nt, and ConferenceSeries.nt 99 | * execute 04 or the following console command for training embeddings, edit file paths, data sets, and hyperparameters accordingly 100 | ```` 101 | DGLBACKEND=pytorch dglke_train --model_name TransE_l2 --data_path 02.knowledge_graph_embeddings --dataset mag_author --data_files 01.author_entities.dict 01.author_relations.dict 01.author_train.tsv 01.author_valid.tsv 01.author_test.tsv --format udd_hrt --batch_size 1000 --neg_sample_size 1000 --hidden_dim 100 --gamma 19.9 --lr 0.25 --max_step 1000000 --log_interval 100 --batch_size_eval 1000 --neg_sample_size_eval 1000 -adv --regularization_coef 1.00E-09 --gpu 0 1 2 3 4 5 6 7 --valid --test --mix_cpu_gpu 102 | ```` 103 |
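Once training has finished, the learned vectors can be inspected with NumPy. The checkpoint path and file names below are assumptions (DGL-KE typically writes an entity embedding .npy file under a ckpts/ run directory named after model and dataset); adjust them to the actual output of 04.train_embedding.sh:
````
import numpy as np

# Assumed checkpoint location; depends on the dglke_train run
entity_emb = np.load("ckpts/TransE_l2_mag_author_0/mag_author_TransE_l2_entity.npy")

def nearest_neighbours(index: int, k: int = 5):
    """Return the k entity indices most similar to `index` by cosine similarity."""
    normed = entity_emb / np.linalg.norm(entity_emb, axis=1, keepdims=True)
    scores = normed @ normed[index]
    return np.argsort(-scores)[1:k + 1]  # skip the entity itself

# Indices map back to author URIs via 01.author_entities.dict
print(nearest_neighbours(0))
````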

104 | 105 | **Statistical Analysis** 106 | Required packages: 107 | [Pandas](https://pypi.org/project/pandas/), [NumPy](https://numpy.org/), [seaborn](https://seaborn.pydata.org/), [matplotlib](https://matplotlib.org/), [chord](https://pypi.org/project/chord/) 108 | 109 | Includes files used to generate graphs and data for statistical analysis 110 | * file 00 is used to count entity properties, edit file paths for Authors.txt and Papers.txt 111 | * file 01 is used to calculate the number of papers published per year, edit file path for Papers.txt (see the sketch after this list) 112 | * file 02 is used to generate data for table 25, uses files created during entity resolution, edit file paths accordingly 113 | * files 04 and 05 are used to calculate data for figures 08, 09, 10, 11, 12 and 13; create a folder named 04.field_of_study_over_time beforehand, use file 04 to split the data by individual field of study, and use file 05 to generate time data for each field of study 114 | * file 06 is used to generate data for table 27, edit file path for Papers.txt 115 | * file 07 is used to generate data for figures 04 and 05, edit file path for Papers.txt 116 | * file 08 is used to generate data for figure 03, uses files generated during entity resolution, edit file paths accordingly 117 | * files 09 and 12 are used to generate data for figure 15; file 09 uses data generated during entity resolution and field of study classification and generates a matrix which is loaded by file 12 to generate the chord graph 118 | * file 10 is used to generate data for figure 06, uses a file generated during entity resolution, edit file path for Papers.txt 119 | * file 11 is used to generate figure 14 120 | * file 13 is used to generate data for table 26 121 |
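The sketch referenced for file 01 above (illustrative only; it assumes Year is the 8th tab-separated column of Papers.txt, as listed in 00.count_properties.py, and that matplotlib is installed):
````
from collections import Counter
import matplotlib.pyplot as plt

year_counts = Counter()
with open("Papers.txt", "r") as f:          # edit file path for Papers.txt
    for line in f:
        year = line.split("\t")[7].strip()  # Year column, per 00.count_properties.py
        if year:
            year_counts[int(year)] += 1

years = sorted(year_counts)
plt.bar(years, [year_counts[year] for year in years])
plt.xlabel("Year")
plt.ylabel("Number of papers")
plt.savefig("papers_by_year.png")           # illustrative output name
````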

122 | 123 | **Knowledge Graph Creation** 124 | Includes files used to generate the MAKG as well as the ontology file 125 | * files 00-20 create RDF representations of existing MAG files, edit file paths accordingly 126 | * file 21 uses our extracted keywords for each paper, edit file path accordingly 127 | * file 22 uses our field of study labels for papers, edit file path accordingly 128 | * files 23 and 24 use our disambiguated author and paperauthoraffiliation files, edit file paths accordingly 129 | * file 25 links MAG authors (undisambiguated, though a disambiguated authors input can be created using our provided files) to their ORCIDs, requires the ORCID file generated during entity resolution, edit file path accordingly 130 | -------------------------------------------------------------------------------- /00.entity_resolution/19.disambiguation_evaluation.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | from datetime import datetime 4 | from pyjarowinkler import distance 5 | 6 | 7 | def compare_affiliation(author1, author2): 8 | affiliation1 = author1.split("\t")[4].strip() 9 | affiliation2 = author2.split("\t")[4].strip() 10 | if affiliation1 == "" or affiliation2 == "": 11 | return False 12 | else: 13 | return affiliation1 == affiliation2 14 | 15 | 16 | def compare_coauthors(author1, author2): 17 | coauthors1 = set(author1.split("\t")[11].strip().split(",")) 18 | coauthors2 = set(author2.split("\t")[11].strip().split(",")) 19 | if len(coauthors1) == 0 or len(coauthors2) == 0: 20 | return 0 21 | else: 22 | return len(coauthors1.intersection(coauthors2)) 23 | 24 | 25 | def most_frequent(List): 26 | return sorted(set(List), key=List.count, reverse=True)[:10] 27 | 28 | 29 | def compare_titles(author1, author2): 30 | titles1 = author1.split("\t")[12].strip().replace(";", ",").split(",") 31 | titles2 = author2.split("\t")[12].strip().replace(";", ",").split(",") 32 | if len(titles1) == 0 or len(titles2) == 0: 33 | return 0 34 | else: 35 | most_freq1 = set(most_frequent(titles1)) 36 | most_freq2 = set(most_frequent(titles2)) 37 | return len(most_freq1.intersection(most_freq2)) 38 | 39 | 40 | def compare_years(author1, author2): 41 | if author1.split("\t")[13].strip() == "" or author2.split("\t")[13].strip() == "": 42 | return False 43 | else: 44 | years1 = set(map(int, author1.split("\t")[13].strip().split(","))) 45 | years2 = set(map(int, author2.split("\t")[13].strip().split(","))) 46 | min_years1 = min(years1) 47 | max_years1 = max(years1) 48 | min_years2 = min(years2) 49 | max_years2 = max(years2) 50 | return abs(min_years1 - max_years2) < 10 or abs(min_years2 - max_years1) < 10 51 | 52 | 53 | def compare_journals(author1, author2): 54 | journals1 = set(author1.split("\t")[14].strip().split(",")) 55 | journals2 = set(author2.split("\t")[14].strip().split(",")) 56 | if len(journals1) == 0 or len(journals2) == 0: 57 | return 0 58 | else: 59 | return len(journals1.intersection(journals2)) 60 | 61 | 62 | def compare_conferences(author1, author2): 63 | conferences1 = set(author1.split("\t")[15].strip().split(",")) 64 | conferences2 = set(author2.split("\t")[15].strip().split(",")) 65 | if len(conferences1) == 0 or len(conferences2) == 0: 66 | return 0 67 | else: 68 | return len(conferences1.intersection(conferences2)) 69 | 70 | 71 | def self_references(author1, author2): 72 | paperids1 = set(author1.split("\t")[9].strip().split(",")) 73 | paperids2 = set(author2.split("\t")[9].strip().split(",")) 74 | references1 =
set(author1.split("\t")[16].strip().split(",")) 75 | references2 = set(author2.split("\t")[16].strip().split(",")) 76 | return max(len(paperids1.intersection(references2)), len(paperids2.intersection(references1))) 77 | 78 | 79 | def common_references(author1, author2): 80 | references1 = set(author1.split("\t")[16].strip().split(",")) 81 | references2 = set(author2.split("\t")[16].strip().split(",")) 82 | if len(references1) == 0 or len(references2) == 0: 83 | return 0 84 | else: 85 | return len(references1.intersection(references2)) 86 | 87 | 88 | def compare_authors(author1, author2): 89 | score_affiliation = 0 90 | score_coauthors = 0 91 | score_titles = 0 92 | score_years = 0 93 | score_journals = 0 94 | score_conferences = 0 95 | score_self_reference = 0 96 | score_references = 0 97 | score = 0 98 | 99 | if compare_affiliation(author1, author2): 100 | score_affiliation += 5 101 | 102 | if compare_coauthors(author1, author2) == 1: 103 | score_coauthors += 3 104 | elif compare_coauthors(author1, author2) == 2: 105 | score_coauthors += 5 106 | elif compare_coauthors(author1, author2) > 2: 107 | score_coauthors += 8 108 | 109 | if compare_titles(author1, author2) == 1: 110 | score_titles += 3 111 | elif compare_titles(author1, author2) == 2: 112 | score_titles += 5 113 | elif compare_titles(author1, author2) >= 3: 114 | score_titles += 8 115 | 116 | if compare_years(author1, author2): 117 | score_years += 3 118 | 119 | if compare_journals(author1, author2) >= 1: 120 | score_journals += 4 121 | 122 | if compare_conferences(author1, author2) >= 1: 123 | score_conferences += 4 124 | 125 | if self_references(author1, author2) >= 1: 126 | score_self_reference += 8 127 | 128 | if common_references(author1, author2) == 1: 129 | score_references += 2 130 | elif common_references(author1, author2) == 2: 131 | score_references += 3 132 | elif common_references(author1, author2) >= 3: 133 | score_references += 5 134 | 135 | return [score_affiliation, score_coauthors, score_titles, score_years, score_journals, score_conferences, score_self_reference, score_references] 136 | 137 | 138 | with open("12.authors_with_references_sorted.txt", "r") as inp: 139 | with open("19.results_evaluation.txt", "w") as outp: 140 | with open("19.all_false_positives.txt", "w") as outp2: 141 | true_positive = 0 142 | true_negative = 0 143 | false_positive = 0 144 | false_negative = 0 145 | 146 | previous_name = "" 147 | current_authors = [] 148 | true_positives_values = [0,0,0,0,0,0,0,0] 149 | true_negatives_values = [0,0,0,0,0,0,0,0] 150 | false_positives_values = [0,0,0,0,0,0,0,0] 151 | false_negatives_values = [0,0,0,0,0,0,0,0] 152 | line_count = 1 153 | 154 | for line in inp: 155 | print(line_count) 156 | name = line.split("\t")[2].strip() 157 | if previous_name == "" and len(current_authors) < 500: 158 | previous_name = name 159 | current_authors.append(line) 160 | elif distance.get_jaro_distance(str.lower(name), str.lower(previous_name), winkler=True, scaling=0.1) > 0.97 and len(current_authors) < 500: 161 | previous_name = name 162 | current_authors.append(line) 163 | else: 164 | comparisons = list(itertools.combinations(current_authors, 2)) 165 | for item in comparisons: 166 | if sum(compare_authors(item[0], item[1])) > 10: 167 | if item[0].split("\t")[16].strip() == item[1].split("\t")[16].strip(): 168 | true_positive += 1 169 | true_positives_values = [x + y for x, y in zip(true_positives_values, compare_authors(item[0], item[1]))] 170 | else: 171 | false_positive += 1 172 | false_positives_values = [x + y for 
x, y in zip(false_positives_values, compare_authors(item[0], item[1]))] 173 | outp2.write(item[0].strip() + "\t" + item[1].strip() + "\n") 174 | else: 175 | if item[0].split("\t")[16].strip() == item[1].split("\t")[16].strip(): 176 | false_negative += 1 177 | false_negatives_values = [x + y for x, y in zip(false_negatives_values, compare_authors(item[0], item[1]))] 178 | else: 179 | true_negative += 1 180 | true_negatives_values = [x + y for x, y in zip(true_negatives_values, compare_authors(item[0], item[1]))] 181 | previous_name = "" 182 | current_authors = [] 183 | 184 | line_count += 1 185 | 186 | total_comparisons = true_positive + false_positive + true_negative + false_negative 187 | total_positives = true_positive + false_negative 188 | total_negatives = true_negative + false_positive 189 | 190 | precision = true_positive / (true_positive + false_positive) 191 | recall = true_positive / (true_positive + false_negative) 192 | accuracy = (true_positive + true_negative) / (true_positive + false_positive + true_negative + false_negative) 193 | 194 | outp.write("Total comparisons: " + str(total_comparisons) + "\n") 195 | outp.write("Total positives: " + str(total_positives) + "\n") 196 | outp.write("Total negatives: " + str(total_negatives) + "\n\n") 197 | outp.write("True positives: " + str(true_positive) + "\n") 198 | outp.write("False positives: " + str(false_positive) + "\n") 199 | outp.write("True negatitves: " + str(true_negative) + "\n") 200 | outp.write("False negatives: " + str(false_negative) + "\n\n") 201 | outp.write("Precision: " + str(precision) + "\n") 202 | outp.write("Recall: " + str(recall) + "\n") 203 | outp.write("Accuracy: " + str(accuracy) + "\n\n") 204 | outp.write("Average true positive: " + str([value/max(true_positive, 1) for value in true_positives_values]) + "\n") 205 | outp.write("Average true negative: " + str([value/max(true_negative, 1) for value in true_negatives_values]) + "\n") 206 | outp.write("Average false positive: " + str([value/max(false_positive, 1) for value in false_positives_values]) + "\n") 207 | outp.write("Average false negative: " + str([value/max(false_negative, 1) for value in false_negatives_values]) + "\n") 208 | 209 | -------------------------------------------------------------------------------- /00.entity_resolution/13.disambiguation_data.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | from datetime import datetime 4 | from pyjarowinkler import distance 5 | 6 | #Parameters 7 | score_affiliation = 1 8 | score_coauthors_1 = 3 9 | score_coauthors_2 = 5 10 | score_coauthors_3 = 8 11 | score_titles_1 = 3 12 | score_titles_2 = 5 13 | score_titles_3 = 8 14 | score_years = 3 15 | score_journals = 3 16 | score_conferences = 3 17 | score_self_reference = 8 18 | score_references_1 = 2 19 | score_references_2 = 3 20 | score_references_3 = 5 21 | threshold_matching = 10 22 | threshold_blocking = 0.95 23 | scaling_factor = 0.1 24 | max_block_size = 500 25 | 26 | 27 | def compare_affiliation(author1, author2): 28 | affiliation1 = author1.split("\t")[4].strip() 29 | affiliation2 = author2.split("\t")[4].strip() 30 | if affiliation1 == "" or affiliation2 == "": 31 | return False 32 | else: 33 | return affiliation1 == affiliation2 34 | 35 | 36 | def compare_coauthors(author1, author2): 37 | coauthors1 = set(author1.split("\t")[11].strip().split(",")) 38 | coauthors2 = set(author2.split("\t")[11].strip().split(",")) 39 | if len(coauthors1) == 0 or len(coauthors2) == 0: 40 | 
return 0 41 | else: 42 | return len(coauthors1.intersection(coauthors2)) 43 | 44 | 45 | def most_frequent(List): 46 | return sorted(set(List), key=List.count, reverse=True)[:10] 47 | 48 | 49 | def compare_titles(author1, author2): 50 | titles1 = author1.split("\t")[12].strip().replace(";", ",").split(",") 51 | titles2 = author2.split("\t")[12].strip().replace(";", ",").split(",") 52 | if len(titles1) == 0 or len(titles2) == 0: 53 | return 0 54 | else: 55 | most_freq1 = set(most_frequent(titles1)) 56 | most_freq2 = set(most_frequent(titles2)) 57 | return len(most_freq1.intersection(most_freq2)) 58 | 59 | 60 | def compare_years(author1, author2): 61 | if author1.split("\t")[13].strip() == "" or author2.split("\t")[13].strip() == "": 62 | return False 63 | else: 64 | years1 = set(map(int, author1.split("\t")[13].strip().split(","))) 65 | years2 = set(map(int, author2.split("\t")[13].strip().split(","))) 66 | min_years1 = min(years1) 67 | max_years1 = max(years1) 68 | min_years2 = min(years2) 69 | max_years2 = max(years2) 70 | return abs(min_years1 - max_years2) < 10 or abs(min_years2 - max_years1) < 10 71 | 72 | 73 | def compare_journals(author1, author2): 74 | journals1 = set(author1.split("\t")[14].strip().split(",")) 75 | journals2 = set(author2.split("\t")[14].strip().split(",")) 76 | if len(journals1) == 0 or len(journals2) == 0: 77 | return 0 78 | else: 79 | return len(journals1.intersection(journals2)) 80 | 81 | 82 | def compare_conferences(author1, author2): 83 | conferences1 = set(author1.split("\t")[15].strip().split(",")) 84 | conferences2 = set(author2.split("\t")[15].strip().split(",")) 85 | if len(conferences1) == 0 or len(conferences2) == 0: 86 | return 0 87 | else: 88 | return len(conferences1.intersection(conferences2)) 89 | 90 | 91 | def self_references(author1, author2): 92 | paperids1 = set(author1.split("\t")[9].strip().split(",")) 93 | paperids2 = set(author2.split("\t")[9].strip().split(",")) 94 | references1 = set(author1.split("\t")[16].strip().split(",")) 95 | references2 = set(author2.split("\t")[16].strip().split(",")) 96 | if len(paperids1) == 0 or len(paperids2) == 0 or len(references1) == 0 or len(references2) == 0: 97 | return 0 98 | else: 99 | return max(len(paperids1.intersection(references2)), len(paperids2.intersection(references1))) 100 | 101 | 102 | def common_references(author1, author2): 103 | references1 = set(author1.split("\t")[16].strip().split(",")) 104 | references2 = set(author2.split("\t")[16].strip().split(",")) 105 | if len(references1) == 0 or len(references2) == 0: 106 | return 0 107 | else: 108 | return len(references1.intersection(references2)) 109 | 110 | 111 | def compare_authors(author1, author2): 112 | score = 0 113 | if compare_affiliation(author1, author2): 114 | score += score_affiliation 115 | 116 | if compare_coauthors(author1, author2) == 1: 117 | score += score_coauthors_1 118 | elif compare_coauthors(author1, author2) == 2: 119 | score += score_coauthors_2 120 | elif compare_coauthors(author1, author2) > 2: 121 | score += score_coauthors_3 122 | 123 | if compare_titles(author1, author2) == 1: 124 | score += score_titles_1 125 | elif compare_titles(author1, author2) == 2: 126 | score += score_titles_2 127 | elif compare_titles(author1, author2) >= 3: 128 | score += score_titles_3 129 | 130 | if compare_years(author1, author2): 131 | score += score_years 132 | 133 | if compare_journals(author1, author2) >= 1: 134 | score += score_journals 135 | 136 | if compare_conferences(author1, author2) >= 1: 137 | score += score_conferences 
138 | 139 | if self_references(author1, author2) >= 1: 140 | score += score_self_reference 141 | 142 | if common_references(author1, author2) == 1: 143 | score += score_references_1 144 | elif common_references(author1, author2) == 2: 145 | score += score_references_2 146 | elif common_references(author1, author2) >= 3: 147 | score += score_references_3 148 | 149 | return score 150 | 151 | 152 | def get_id(author): 153 | return author.split("\t")[0] 154 | 155 | 156 | def earlier_date(author1, author2): 157 | date_object1 = datetime.strptime(author1[8], "%Y-%m-%d") 158 | date_object2 = datetime.strptime(author2[8], "%Y-%m-%d") 159 | earliest = min(date_object1, date_object2) 160 | stringified = "-".join([str(earliest.year), 161 | str(earliest.month), str(earliest.day)]) 162 | return stringified 163 | 164 | 165 | def latest_affiliation(author1, author2): 166 | date_object1 = datetime.strptime(author1[8], "%Y-%m-%d") 167 | date_object2 = datetime.strptime(author2[8], "%Y-%m-%d") 168 | if date_object1 < date_object2: 169 | return author2[4] 170 | else: 171 | return author1[4] 172 | 173 | 174 | def add_paper_count(author1, author2): 175 | return str(int(author1[5]) + int(author2[5])) 176 | 177 | def add_paper_family_count(author1, author2): 178 | return str(int(author1[6]) + int(author2[6])) 179 | 180 | def add_citation_count(author1, author2): 181 | return str(int(author1[7]) + int(author2[7])) 182 | 183 | 184 | def merge_authors(tuple_of_authors): 185 | author1 = tuple_of_authors[0].strip("\n").split("\t") 186 | author2 = tuple_of_authors[1].strip("\n").split("\t") 187 | output = "\t".join(author1[0:4]) + "\t" + latest_affiliation(author1, author2) + "\t" + add_paper_count(author1, author2) + "\t" + add_paper_family_count(author1, author2) + "\t" + add_citation_count(author1, author2) + "\t" + earlier_date(author1, author2) + "\t" + (author1[9]+","+author2[9]).strip(",") + "\t" + (author1[10]+","+author2[10]).strip(",") + "\t" + (author1[11] +","+author2[11]).strip(",") + "\t" + (author1[12]+","+author2[12]).strip(",") + "\t" + (author1[13]+","+author2[13]).strip(",") + "\t" + (author1[14]+","+author2[14]).strip(",") + "\t" + (author1[15]+","+author2[15]).strip(",") + "\t" + (author1[16]+","+author2[16]).strip(",") 188 | return output 189 | 190 | 191 | def add_to_mapping(dict_of_maps, entry1, entry2): 192 | if entry2 not in dict_of_maps: 193 | dict_of_maps[entry1] = entry2 194 | return dict_of_maps 195 | else: 196 | return add_to_mapping(dict_of_maps, entry1, dict_of_maps[entry2]) 197 | 198 | 199 | def disambiguate(list_of_authors, result, positive, negative): 200 | author_dictionary = {get_id(author): author.strip("\n") for author in list_of_authors} 201 | author_list = [get_id(author) for author in list_of_authors] 202 | mapping = {} 203 | result = result.copy() 204 | comparisons = list(itertools.combinations(author_list, 2)) 205 | for item in comparisons: 206 | try: 207 | if compare_authors(author_dictionary[item[0]], author_dictionary[item[1]]) > threshold_matching: 208 | positive += 1 209 | if item[0] not in mapping: 210 | mapping = add_to_mapping(mapping, item[1], item[0]) 211 | result = add_to_mapping(result, item[1], item[0]) 212 | author_dictionary[item[0]] = merge_authors((author_dictionary[item[0]], author_dictionary[item[1]])) 213 | del author_dictionary[item[1]] 214 | else: 215 | author_dictionary[mapping[item[0]]] = merge_authors((author_dictionary[mapping[item[0]]], author_dictionary[item[1]])) 216 | mapping = add_to_mapping(mapping, item[1], item[0]) 217 | result = 
add_to_mapping(result, item[1], item[0]) 218 | del author_dictionary[item[1]] 219 | else: 220 | negative += 1 221 | except KeyError: 222 | pass 223 | return author_dictionary, result, positive, negative 224 | 225 | 226 | with open("12.authors_with_references_sorted.txt", "r") as inp: 227 | with open("13.results.txt", "w") as outp: 228 | with open("13.all_positives.txt", "w") as outp2: 229 | with open("13.disambiguated_file.txt", "w") as outp3: 230 | positive = 0 231 | negative = 0 232 | 233 | previous_name = "" 234 | current_authors = [] 235 | 236 | line_count = 1 237 | 238 | for line in inp: 239 | print("Disambiguation: " + str(line_count)) 240 | 241 | name = line.split("\t")[2].strip() 242 | if previous_name == "" and len(current_authors) < max_block_size: 243 | previous_name = name 244 | current_authors.append(line) 245 | elif distance.get_jaro_distance(str.lower(name), str.lower(previous_name), winkler=True, scaling=scaling_factor) > threshold_blocking and len(current_authors) < max_block_size: 246 | previous_name = name 247 | current_authors.append(line) 248 | else: 249 | result = {} 250 | authors, result, positive, negative = disambiguate(current_authors, result, positive, negative) 251 | previous_name = name 252 | current_authors = [line] 253 | for item in authors: 254 | outp3.write(authors[item] + "\n") 255 | for item in result: 256 | outp2.write(item + "\t" + result[item] + "\n") 257 | 258 | line_count += 1 259 | 260 | result = {} 261 | authors, result, positive, negative = disambiguate(current_authors, result, positive, negative) 262 | for item in authors: 263 | outp3.write(authors[item] + "\n") 264 | for item in result: 265 | outp2.write(item + "\t" + result[item] + "\n") 266 | 267 | total_comparisons = positive + negative 268 | 269 | outp.write("Total comparisons: " + str(total_comparisons) + "\n") 270 | outp.write("Total positives: " + str(positive) + ": " + str(positive/total_comparisons) + "\n") 271 | outp.write("Total negatives: " + str(negative) + ": " + str(negative/total_comparisons)) 272 | -------------------------------------------------------------------------------- /04.generate_knowledge_graph/OWL file.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 111 | 112 | 113 | 114 | Affiliation 115 | 116 | 117 | 118 | Author 119 | 120 | 121 | 122 | Conference Instance 123 | 124 | 125 | 126 | Conference Series 127 | 128 | 129 | 130 | Field of study 131 | 132 | 133 | 134 | Journal 135 | 136 | 137 | 138 | Citation 139 | 140 | 141 | 142 | Paper 143 | 144 | 145 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 
--------------------------------------------------------------------------------