├── .gitignore ├── README.md ├── build-app.sh ├── data-sample ├── README.md └── semopenalex-data-sample.trig ├── embeddings-generation ├── 01_extract_triples.py ├── 02_integer_mapping.py ├── 03_marius_preprocess_import.sh ├── 04_marius_train_complex.sh ├── 04_marius_train_distmult.sh ├── 04_marius_train_gat.sh ├── 04_marius_train_gnn.sh ├── 04_marius_train_transe.sh ├── 05_marius_export_distmult_emb.sh ├── README.md ├── marius_code_modifications │ └── custom.py └── model_configs │ ├── complex_100dim_adam_500neg_samples.yaml │ ├── distmult_100dim_adam_500neg_samples.yaml │ ├── gat_100dim_adam_500neg_samples.yaml │ ├── gnn_100dim_adam_500neg_samples.yaml │ └── transe_100dim_adam_500neg_samples.yaml ├── graphdb-preload ├── .gitignore ├── docker-compose.yml ├── graphdb-home │ └── data │ │ └── repositories │ │ └── metaphactory │ │ └── config.ttl └── graphdb-license │ └── graphdb.license ├── linked-dataset-description ├── README.md └── semopenalex-description-void.ttl ├── ontologies ├── README.md └── semopenalex-ontology.ttl ├── requirements.txt ├── semopenalex-app ├── assets │ ├── font │ │ ├── Jost-Italic-VariableFont_wght.ttf │ │ ├── Jost-VariableFont_wght.ttf │ │ ├── OFL.txt │ │ ├── README.txt │ │ └── static │ │ │ ├── Jost-Black.ttf │ │ │ ├── Jost-BlackItalic.ttf │ │ │ ├── Jost-Bold.eot │ │ │ ├── Jost-Bold.ttf │ │ │ ├── Jost-Bold.woff │ │ │ ├── Jost-Bold.woff2 │ │ │ ├── Jost-BoldItalic.ttf │ │ │ ├── Jost-ExtraBold.ttf │ │ │ ├── Jost-ExtraBoldItalic.ttf │ │ │ ├── Jost-ExtraLight.ttf │ │ │ ├── Jost-ExtraLightItalic.ttf │ │ │ ├── Jost-Italic.eot │ │ │ ├── Jost-Italic.ttf │ │ │ ├── Jost-Italic.woff │ │ │ ├── Jost-Italic.woff2 │ │ │ ├── Jost-Light.ttf │ │ │ ├── Jost-LightItalic.ttf │ │ │ ├── Jost-Medium.ttf │ │ │ ├── Jost-MediumItalic.ttf │ │ │ ├── Jost-Regular.eot │ │ │ ├── Jost-Regular.ttf │ │ │ ├── Jost-Regular.woff │ │ │ ├── Jost-Regular.woff2 │ │ │ ├── Jost-SemiBold.ttf │ │ │ ├── Jost-SemiBoldItalic.ttf │ │ │ ├── Jost-Thin.ttf │ │ │ └── Jost-ThinItalic.ttf │ ├── images │ │ ├── categorization.png │ │ ├── category.png │ │ ├── essay.png │ │ ├── folder.png │ │ ├── funding.png │ │ ├── goal.png │ │ ├── intro-to-graphs.jpeg │ │ ├── lightbulb.png │ │ ├── location.png │ │ ├── metaphactory-logo.svg │ │ ├── no-photos.png │ │ ├── office-building.png │ │ ├── readme.md │ │ ├── science-research.png │ │ ├── search.png │ │ ├── semopenalex-icon.svg │ │ ├── stack-of-books.png │ │ ├── topic.png │ │ ├── university.png │ │ └── writer.png │ ├── jost.css │ └── style.css ├── config │ ├── environment.prop │ ├── global.prop │ ├── namespaces.prop │ ├── page-layout │ │ ├── footer.hbs │ │ ├── header.hbs │ │ └── html-header-resources.hbs │ ├── proxy.prop │ ├── roles │ │ ├── role-export-diagram.prop │ │ └── role-sparql-editor-view.prop │ └── ui.prop ├── data │ └── templates │ │ ├── Template%3Ahttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23Concept.html │ │ ├── Template%3Ahttps%3A%2F%2Fsemopenalex.org%2Fontology%2FAuthor.html │ │ ├── Template%3Ahttps%3A%2F%2Fsemopenalex.org%2Fontology%2FFunder.html │ │ ├── Template%3Ahttps%3A%2F%2Fsemopenalex.org%2Fontology%2FInstitution.html │ │ ├── Template%3Ahttps%3A%2F%2Fsemopenalex.org%2Fontology%2FPublisher.html │ │ ├── Template%3Ahttps%3A%2F%2Fsemopenalex.org%2Fontology%2FSource.html │ │ ├── Template%3Ahttps%3A%2F%2Fsemopenalex.org%2Fontology%2FWork.html │ │ ├── http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FAbout.html │ │ ├── http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FConcept.html │ │ ├── 
http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FDefaultConceptTemplate.html │ │ ├── http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FKeywordConcept.html │ │ ├── http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FMeshConcept.html │ │ ├── http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FSDGConcept.html │ │ ├── http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FTopicConcept.html │ │ ├── http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FUniversalSearch.html │ │ ├── http%3A%2F%2Fwww.metaphacts.com%2Fontologies%2Fplatform%23SearchResultsCustomization.html │ │ ├── http%3A%2F%2Fwww.metaphacts.com%2Fresource%2Fassets%2FDatasets.html │ │ └── http%3A%2F%2Fwww.metaphacts.com%2Fresource%2Fheader%2FResource.html ├── ldp │ └── assets │ │ ├── http%3A%2F%2Fwww.metaphacts.com%2Fontologies%2Fplatform%23ontodiaDiagramContainer.trig │ │ └── https%3A%2F%2Fsemopenalex.org%2FontodiaDiagramContainer%2Fdefault_diagram.trig └── plugin.properties ├── sparql-queries ├── README.md ├── figure-3_ml-papers-kit-2000-to-2021.py ├── figure-4_count-institution-country.txt └── table-4_count-institution-type.txt ├── transformSemOpenAlex.sh └── transformation-scripts ├── README.md ├── semopenalex-authors.py ├── semopenalex-concepts.py ├── semopenalex-dataset.py ├── semopenalex-domains.py ├── semopenalex-fields.py ├── semopenalex-funders.py ├── semopenalex-institutions.py ├── semopenalex-keywords.py ├── semopenalex-publishers.py ├── semopenalex-sources.py ├── semopenalex-subfields.py ├── semopenalex-topics.py └── semopenalex-works.py /.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | semopenalex-app/config/repositories/*
3 |
4 | *.zip
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SemOpenAlex
2 | [SemOpenAlex](https://semopenalex.org) is a dataset about scientific publications presented in the form of a knowledge graph.
3 |
4 | The underlying [SemOpenAlex](https://semopenalex.org) dataset is based on [OpenAlex](https://docs.openalex.org). The original dataset snapshots of [OpenAlex](https://docs.openalex.org/download-snapshot) are updated about once per month. With the scripts provided in this repository, the SemOpenAlex
5 | dataset can be re-generated based on the snapshot.
6 |
7 | In the sections below, we describe the detailed steps to create and load the SemOpenAlex dataset.
8 |
9 | ## SemOpenAlex Dataset
10 |
11 | To generate the SemOpenAlex dataset from the [OpenAlex](https://openalex.s3.amazonaws.com/browse.html) S3 bucket,
12 | we use the following Python scripts for each entity. Each individual script downloads the latest snapshot and
13 | produces an RDF document in the TriG format.
14 |
15 | 1. [semopenalex-authors.py](./transformation-scripts/semopenalex-authors.py)
16 | 2. [semopenalex-concepts.py](./transformation-scripts/semopenalex-concepts.py)
17 | 3. [semopenalex-institutions.py](./transformation-scripts/semopenalex-institutions.py)
18 | 4. [semopenalex-publishers.py](./transformation-scripts/semopenalex-publishers.py)
19 | 5. [semopenalex-sources.py](./transformation-scripts/semopenalex-sources.py)
20 | 6. [semopenalex-works.py](./transformation-scripts/semopenalex-works.py)
21 | 7. [semopenalex-funders.py](./transformation-scripts/semopenalex-funders.py)
22 |
23 | Note that the [semopenalex-dataset.py](./transformation-scripts/semopenalex-dataset.py) script is used to capture metadata about the SemOpenAlex dataset (e.g. when it was loaded).
24 |
25 | ### Prerequisites
26 |
27 | To be able to run the above Python scripts, we need:
28 | - Python 3.7 (or later)
29 | - Python's package management tool `pip`
30 |
31 | Linux Server/OS:
32 | - disk storage space of at least 4TB
33 | - 16 vCPUs
34 | - RAM of at least 256GB
35 |
36 | GraphDB:
37 | - 10.0.0 (or later)
38 | - Docker installed (version >= 17.x, check with `docker --version`)
39 | - docker-compose installed (version >= 1.14, check with `docker-compose --version`)
40 |
41 | ### Python libraries installation
42 |
43 | The required Python libraries are defined in the `requirements.txt` file. The following command can be used to install all dependencies.
44 |
45 | ```
46 | $ pip install -r requirements.txt
47 | ```
48 |
49 | ### Adjust the location of the OpenAlex data dump root directory
50 | The main script requires
51 | the `data_dump_input_root_dir` location to be defined. You can adjust the following directory inside all the Python scripts as needed.
52 |
53 | The default location is `/opt/openalex-snapshot`. Note that the folder must be mounted on disk storage with at least 5TB.
54 |
55 | ```
56 | data_dump_input_root_dir = '/opt/openalex-snapshot'
57 | ```
58 |
59 | ### Tuning Python scripts for multiprocessing
60 | Depending on the resources available on your Linux server, you can fine-tune multiprocessing for the following Python scripts:
61 |
62 | 1. [semopenalex-authors.py](./transformation-scripts/semopenalex-authors.py)
63 | 2. [semopenalex-works.py](./transformation-scripts/semopenalex-works.py)
64 |
65 | Currently, `CPU_THREADS` defaults to 16 and `maxtasksperchild` defaults to 5.
66 |
67 |
68 | ### Tuning GraphDB
69 | In order to load the SemOpenAlex RDF dataset (i.e. billions of statements) into GraphDB,
70 | we use the [ImportRDF](https://graphdb.ontotext.com/documentation/10.0/loading-data-using-importrdf.html) tool,
71 | specifically [preload](https://graphdb.ontotext.com/documentation/10.0/loading-data-using-importrdf.html#preload-command-line-options) with Docker.
72 | To achieve optimal performance when loading such large amounts of RDF data, we recommend using GraphDB SE/EE features (for example, multi-threading).
73 |
74 | Note that GraphDB can be started without a license pre-configured. As of GraphDB 10, the database will operate in Free Mode.
75 | To activate GraphDB SE/EE features, a valid license can be copied to the following place:
76 |
77 | ```
78 | ./graphdb-preload/graphdb-license/graphdb.license
79 | ```
80 |
81 | You may also need to tune the memory reservations and memory limits for Docker and
82 | the min and max values of the GraphDB Java heap according to the available resources; see [docker-compose.yml](./graphdb-preload/docker-compose.yml).
83 |
84 |
85 | ### Executing the main script
86 |
87 | Before executing the following single main script (which runs all Python scripts as well as the data ingestion into GraphDB), there are a few things to consider.
88 |
89 | First, the data transformation step (from OpenAlex to the SemOpenAlex RDF dataset)
90 | typically takes at least 3-4 days in our system environment (16 vCPUs, 256 GB RAM).
91 |
92 | Second, the data ingestion into GraphDB typically takes up to 2-3 days.
93 | The overall time may vary and can be shorter with more computational power.
94 |
95 | Third, we compress the `graphdb-home/` folder.
96 | This allows us to transfer the folder to another dedicated server that hosts the production database.
97 | You can skip or disable this step if you intend to run GraphDB on the same server.
98 |
99 | Finally, to avoid the main script being accidentally terminated,
100 | we recommend running it in the background using the Linux `screen` command or a similar tool.
101 |
102 | ```
103 | $ chmod +x transformSemOpenAlex.sh
104 | $ ./transformSemOpenAlex.sh
105 | ```
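
As a quick sanity check before ingestion, a generated TriG file can be parsed with `rdflib` (already part of `requirements.txt`). A minimal sketch, using the small sample shipped in this repository; for the full multi-GB dumps, checking only a slice of the data is advisable:

```
# Sanity-check a TriG file by parsing it with rdflib.
# The sample file below ships with this repository; point the path at any
# file produced by the transformation scripts to check it the same way.
from rdflib import ConjunctiveGraph

g = ConjunctiveGraph()
g.parse("data-sample/semopenalex-data-sample.trig", format="trig")  # raises on syntax errors
print(f"parsed {len(g)} triples")
```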
--------------------------------------------------------------------------------
/build-app.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | FOLDER=$(dirname "$0")
4 |
5 | cd "$FOLDER"
6 |
7 | echo "building app in folder $(pwd)"
8 | echo "deleting .DS_Store files"
9 | find . -name .DS_Store -exec rm {} \;
10 |
11 | echo "building semopenalex main app"
12 | rm -f semopenalex-app.zip
13 | zip -r semopenalex-app.zip semopenalex-app
14 |
15 | echo "App file:"
16 | find . -name "*-app.zip"
--------------------------------------------------------------------------------
/data-sample/README.md:
--------------------------------------------------------------------------------
1 | ## SemOpenAlex Data Sample
2 |
3 | A small data sample from SemOpenAlex which contains information about the publication [Linked Data - The Story So Far](https://semopenalex.org/work/W2015191210) and its authors [Tim Berners-Lee](https://semopenalex.org/author/A2308609296), [Christian Bizer](https://semopenalex.org/author/A247353998) and [Tom Heath](https://semopenalex.org/author/A2558543881).
--------------------------------------------------------------------------------
/embeddings-generation/01_extract_triples.py:
--------------------------------------------------------------------------------
1 | # this file takes raw semantic triple files and extracts specific triples based on the relations
2 |
3 | import sys
4 |
5 | pred_list = [
6 | "",
7 | "",
8 | "",
9 | "",
10 | "",
11 | "",
12 | "",
13 | "",
14 | "",
15 | ""
16 | ]
17 |
18 | lvl_0_concepts = set(['17744445', '138885662', '162324750', '144133560', '15744967', '33923547', '71924100', '86803240',
19 | '41008148', '127313418', '185592680', '142362112', \
20 | '144024400', '127413603', '205649164', '95457728', '192562407', '121332964', '39432304'])
21 |
22 | lvl_1_concepts = set(
23 | ['6303427', '19417346', '26271046', '2524010', '3079626', '46141821', '61696701', '41895202', '47768531',
24 | '126348684', '74909509', '165205528', '107053488', '121864883', '169760540', '126322002', '195094911', '512399662',
25 | '147597530', '164705383', '166957645', '171146098', '19165224', '29595303', '29694066', '33332235', '54355233',
26 | '73484699', '78519656', '112930515', '162118730', '11413529', '12554922', '13736549', '27206212', '29456083',
27 | '41999313', '24326235', '42219234', '43617362', '49774154', '77088390', '61434518', '126838900', '105639569',
28 | '108827166', '112698675', '195244886', '204321447', '2522767166', '149635348', '154945302', '97355855',
29 | '185544564', '187212893', '107826830', '524765639', '138496976', '139719470', '199289684', '505870484',
30 | '556758197', '70410870', '70721500', '78458016', '79403827', '105702510', '105795698', '120314980', '120665830',
31 | '177322064', '177713679', '548259974', '49204034', '114793014', '147789679', '167562979', '175444787',
'178790620',
32 | '188147891', '2989005', '9390403', '91586092', '115903868', '134018914', '194828623', '199639397', '5900021',
33 | '21547014', '22212356', '31903555', '33070731', '74916050', '94375191', '119767625', '133731056', '144237770',
34 | '170154142', '8058405', '24667770', '46312422', '49040817', '87717796', '88463610', '111472728', '95444343',
35 | '145236788', '180747234', '13280743', '18903297', '24890656', '62649853', '118552586', '126255220', '131872663',
36 | '153294291', '87355193', '144027150', '526734887', '556039675', '6557445', '15708023', '66938386', '80444323',
37 | '90856448', '116915560', '118524514', '121684516', '136229726', '175605778', '528095902', '542102704', '1276947',
38 | '2549261', '36289849', '45355965', '75630572', '138921699', '147176958', '539667460', '1862650', '30475298',
39 | '50522688', '54286561', '73283319', '110354214', '133425853', '159110408', '159467904', '190253527', '199539241',
40 | '8010536', '77595967', '136264566', '153349607', '159390177', '161191863', '162853370', '165556158', '179104552',
41 | '188027245', '97137747', '100970517', '118487528', '148383697', '151730666', '40700', '4249254', '16674752',
42 | '21951064', '44154836', '48824518', '76155785', '111919701', '118615104', '136764020', '145420912', '150903083',
43 | '173608175', '178550888', '184779094', '187736073', '201995342', '548081761', '10138342', '58640448', '459310',
44 | '74363100', '107993555', '117671659', '16005928', '23123220', '38652104', '55493867', '56739046', '57879066',
45 | '202444582', '502942594', '98274493', '106159729', '140793950', '11171543', '13965031', '18547055', '31972630',
46 | '59822182', '62520636', '99508421', '124101348', '149782125', '159985019', '549774020', '16685009', '28490314',
47 | '37621935', '42475967', '53553401', '90924648', '121955636', '124952713', '171250308', '199104240', '200601418',
48 | '37914503', '71240020', '74650414', '113775141', '199360897', '3116431', '42972112', '44870925', '89423630',
49 | '100001284', '107457646', '155647269', '191897082', '509550671', '19527891', '31258907', '34447519', '39549134',
50 | '42407357', '55587333', '78762247', '118084267', '141071460', '149923435', '178802073', '199343813', '119857082',
51 | '42360764', '60644358', '77805123', '99454951', '545542383', '21880701', '26873012', '75473681', '107038049',
52 | '111368507', '126894567', '142724271', '155202549', '186060115', '187320778', '203014093', '28826006', '95124753',
53 | '114614502', '119599485', '134306372', '134560507', '143998085', '159047783', '1965285', '17409809', '52119013',
54 | '54750564', '91375879', '107872376', '109214941', '146978453', '153911025', '183696295'])
55 |
56 | lvl_0_1_concepts = lvl_1_concepts.union(lvl_0_concepts)
57 | ent_types = ['Work', 'Author', 'Venue', 'Institution']
58 | root_path = ""
59 |
60 |
61 | def extract_triples_from_file(fn):
# NOTE: the "_extracted" output suffix is an assumption; the output path must
# differ from the input path, otherwise opening the output for writing
# truncates the input file before it is read
62 |     with open(str(root_path + fn + "_extracted"),
63 |               "w", encoding="utf-8") as g:
64 |         with open(
65 |                 str(root_path + fn),
66 |                 "r", encoding="utf-8") as f:
67 |
68 |             i = 0
69 |             for line in f:
70 |                 line = line.replace("|", "").replace("> ",
71 |                                                      ">|")  # del pipes first, then insert pipes as separators between objects
72 |                 pred = line.split("|")[1]
73 |
74 |                 if pred in pred_list:
75 |
76 |                     if pred == "":
77 |                         # remove author substring from subject of "authorposition" class
78 |                         #
79 |                         # to maintain relation work -> author directly (no intermediary authorposition object)
80 |
81 |                         line = (line[:line.index("A")] + line[line.index(">"):])
82 |                         line = line.replace("authorposition", "work")
"work") 83 | g.write(line.replace("|", " ")) 84 | 85 | elif pred == "": 86 | # use 0-level concepts only 87 | # 88 | # length check excludes conceptscore subject relation 89 | if len(line.split("|")[0]) in range(40, 49) and "work" in line.split("|")[0]: 90 | if line.split("|")[2].replace("", 91 | "") in lvl_0_1_concepts: 92 | g.write(line.replace("|", " ")) 93 | 94 | elif pred == "": 95 | # country code relation for institutions. Remove intermediary geo object 96 | 97 | line = line.replace("https://semopenalex.org/geo/", "https://semopenalex.org/institution/") 98 | g.write(line.replace("|", " ")) 99 | 100 | elif pred == "": 101 | # extract the type relation for works, authors, venues and institutions 102 | if line.split("|")[2].replace("", 103 | "") in ent_types: 104 | g.write(line.replace("|", " ")) 105 | 106 | elif pred == "": 107 | # shorten the extracted work type, remove crossref prefix 108 | line = line.replace("https://api.crossref.org/types/", "") 109 | g.write(line.replace("|", " ")) 110 | 111 | elif pred == "": 112 | # remove auxiliary hostvenue class, build link from work to venue directly 113 | # object in triple (the hostvenue) is modified such that it resembles the actual venue URI 114 | obj = line.split("|")[2] 115 | obj_no_hops = (obj[:obj.index("W")] + obj[obj.index("V"):]).replace("hostvenue", "venue") 116 | 117 | line = line.replace(obj, obj_no_hops) 118 | g.write(line.replace("|", " ")) 119 | 120 | else: 121 | g.write(line.replace("|", " ")) 122 | 123 | if i % 1000000000 == 0: 124 | print(f"{i / 1000000}mio. lines processed..") 125 | i += 1 126 | 127 | return True 128 | 129 | 130 | if __name__ == '__main__': 131 | if len(sys.argv) != 2: 132 | print(('usage: python3 extract_triples_on_lsdf.py ')) 133 | sys.exit() 134 | text_file_to_process = sys.argv[1] 135 | ret = extract_triples_from_file(text_file_to_process) 136 | if not ret: 137 | print("### Done ###") 138 | sys.exit() 139 | -------------------------------------------------------------------------------- /embeddings-generation/02_integer_mapping.py: -------------------------------------------------------------------------------- 1 | # import numpy as np 2 | # import random 3 | # import pandas as pd 4 | import csv 5 | import sys 6 | 7 | delim_char = "\t" 8 | root_path = "" 9 | 10 | 11 | def convert_triple_strings_to_int(fn): 12 | line_count = 0 13 | entity_count = 0 14 | relations_count = 0 15 | entity_dict = {} 16 | relations_dict = {} 17 | fn_new = fn + '_integer_mapped' 18 | 19 | # read triple file and add all node or relation identifiers to a dictionary 20 | with open(str(root_path + fn), "r", encoding='utf-8') as f: 21 | print("Creating dictionaries.. 
") 22 | for line in f: 23 | line = line.strip("\n") 24 | if len(line.split(delim_char)) > 1 and line_count != 0: 25 | sub = line.split(delim_char)[0] 26 | pred = line.split(delim_char)[1] 27 | obj = line.split(delim_char)[2] 28 | if not sub in entity_dict: 29 | entity_dict[sub] = entity_count 30 | entity_count += 1 31 | if not pred in relations_dict: 32 | relations_dict[pred] = relations_count 33 | relations_count += 1 34 | if not obj in entity_dict: 35 | entity_dict[obj] = entity_count 36 | entity_count += 1 37 | 38 | if line_count % 25000000 == 0: 39 | print("Reading line", (line_count / 1000000), "mio.") 40 | line_count += 1 41 | 42 | line_count = 0 43 | 44 | # write dictionaries (they describe the mapping of node and relation identifiers to integers used later in training) 45 | with open(str(root_path + fn_new + "_entities.dict"), "w") as f: 46 | for item in entity_dict: 47 | f.write(str(entity_dict[item]) + "\t" + str(item) + "\n") 48 | with open(str(root_path + fn_new + "_relations.dict"), "w") as f: 49 | for item in relations_dict: 50 | f.write(str(relations_dict[item]) + "\t" + str(item) + "\n") 51 | with open(str(root_path + fn_new + "counts.txt"), "w") as f: 52 | s = str('entity count: ' + str(entity_count) + ' - relation count: ' + str(relations_count)) 53 | f.write(s) 54 | 55 | # convert the source edge list (of mostly strings) to an all-integer mapped edge list in .csv format using the previously created mapping dictionaries 56 | with open(str(root_path + fn), "r", encoding='utf-8') as f: 57 | 58 | with open(str(root_path + fn_new + ".csv"), "w", encoding='utf-8', newline='') as m: 59 | 60 | print("Mapping...") 61 | writer = csv.writer(m) 62 | for line in f: 63 | line = line.strip("\n") 64 | if len(line.split(delim_char)) > 1 and line_count != 0: 65 | sub = line.split(delim_char)[0] 66 | pred = line.split(delim_char)[1] 67 | obj = line.split(delim_char)[2] 68 | writer.writerow((entity_dict[sub], relations_dict[pred], entity_dict[obj])) 69 | 70 | if line_count % 25000000 == 0: 71 | print("Mapping line", (line_count / 1000000), "mio.") 72 | line_count += 1 73 | 74 | print("Done with file: ", str(fn)) 75 | print('entity count', entity_count) 76 | print('relation count', relations_count) 77 | return True 78 | 79 | 80 | if __name__ == '__main__': 81 | if len(sys.argv) != 2: 82 | print(('usage: python3 integer_mapping_lsdf_.py ')) 83 | sys.exit() 84 | text_file_to_process = sys.argv[1] # "all_objects.tsv" 85 | ret = convert_triple_strings_to_int(text_file_to_process) 86 | if not ret: 87 | print("### Done ###") 88 | sys.exit() 89 | -------------------------------------------------------------------------------- /embeddings-generation/03_marius_preprocess_import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --time=4:00:00 4 | #SBATCH --mem=376000 5 | #SBATCH --job-name=marius_preprocess_all 6 | #SBATCH --output=log_marius_preprocess_all.txt 7 | #SBATCH --container-name nv_marius_cuda_cudnn_pyxis_container 8 | #SBATCH --container-mounts=/home:/mount_home 9 | #SBATCH --container-writable 10 | #SBATCH --container-remap-root 11 | echo 'Start Marius import of data to ready graph for embeddings training' 12 | marius_preprocess --output_dir /mount_home/marius_imported/ --edges /mount_home/raw_data/extracted_triples.csv -d ',' --dataset_split 0.998 0.001 0.001 --columns 0 1 2 --num_partitions 1 --no_remap_ids --overwrite 13 | echo 'Marius import worked - all preprocessed' 
-------------------------------------------------------------------------------- /embeddings-generation/04_marius_train_complex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --time=15:00:00 4 | #SBATCH --mem=752000 5 | #SBATCH --job-name=marius_train_complex 6 | #SBATCH --output=sh/_log_mtrain_complex_w01_gpu_8_100dim_v10_adam_500negs.txt 7 | #SBATCH --container-name nv_marius_gpu8_cuda_cudnn_pyxis_container 8 | #SBATCH --container-mounts=/home:/mount_home 9 | #SBATCH --container-writable 10 | #SBATCH --container-remap-root 11 | #SBATCH --gres=gpu:1 12 | echo 'Start training with ComplEx approach' 13 | marius_train /marius_configs/complex_100dim_adam_500neg_samples.yaml 14 | echo 'done training epoch' 15 | echo 'start eval' 16 | marius_eval /marius_configs/complex_100dim_adam_500neg_samples.yaml 17 | echo 'done eval' 18 | echo 'done with training and evaluation for one epoch' -------------------------------------------------------------------------------- /embeddings-generation/04_marius_train_distmult.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --time=15:00:00 4 | #SBATCH --mem=752000 5 | #SBATCH --job-name=marius_train_distmult 6 | #SBATCH --output=log_mtrain_distmult.txt 7 | #SBATCH --container-name nv_marius_gpu8_cuda_cudnn_pyxis_container 8 | #SBATCH --container-mounts=/home:/mount_home 9 | #SBATCH --container-writable 10 | #SBATCH --container-remap-root 11 | #SBATCH --gres=gpu:1 12 | echo 'Start training with DistMult approach' 13 | marius_train /model_configs/distmult_100dim_adam_500neg_samples.yaml 14 | echo 'done training epoch' 15 | echo 'start eval' 16 | marius_eval /model_configs/distmult_100dim_adam_500neg_samples.yaml 17 | echo 'done eval' 18 | echo 'done with training and evaluation for one epoch' 19 | -------------------------------------------------------------------------------- /embeddings-generation/04_marius_train_gat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --time=30:00:00 4 | #SBATCH --mem=752000 5 | #SBATCH --job-name=marius_train_gat 6 | #SBATCH --output=log_mtrain_gat.txt 7 | #SBATCH --container-name nv_marius_gpu8_cuda_cudnn_pyxis_container 8 | #SBATCH --container-mounts=/home:/mount_home 9 | #SBATCH --container-writable 10 | #SBATCH --container-remap-root 11 | #SBATCH --gres=gpu:1 12 | echo 'Start training with graph attention network approach' 13 | marius_train /model_configs/gat_100dim_adam_500neg_samples.yaml 14 | echo 'done training epoch' 15 | echo 'start eval' 16 | marius_eval /model_configs/gat_100dim_adam_500neg_samples.yaml 17 | echo 'done eval' 18 | echo 'done with training and evaluation for one epoch' -------------------------------------------------------------------------------- /embeddings-generation/04_marius_train_gnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --time=30:00:00 4 | #SBATCH --mem=752000 5 | #SBATCH --job-name=marius_train_gnn 6 | #SBATCH --output=log_mtrain_gnn.txt 7 | #SBATCH --container-name nv_marius_gpu8_cuda_cudnn_pyxis_container 8 | #SBATCH --container-mounts=/home:/mount_home 9 | #SBATCH --container-writable 10 | #SBATCH --container-remap-root 11 | #SBATCH --gres=gpu:1 12 | echo 'Start training with GNN setup' 13 | marius_train 
/model_configs/gnn_100dim_adam_500neg_samples.yaml 14 | echo 'done training epoch' 15 | echo 'start eval' 16 | marius_eval /model_configs/gnn_100dim_adam_500neg_samples.yaml 17 | echo 'done eval' 18 | echo 'done with training and evaluation for one epoch' -------------------------------------------------------------------------------- /embeddings-generation/04_marius_train_transe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --time=15:00:00 4 | #SBATCH --mem=752000 5 | #SBATCH --job-name=marius_train_transe 6 | #SBATCH --output=log_mtrain_transe.txt 7 | #SBATCH --container-name nv_marius_gpu8_cuda_cudnn_pyxis_container 8 | #SBATCH --container-mounts=/home:/mount_home 9 | #SBATCH --container-writable 10 | #SBATCH --container-remap-root 11 | #SBATCH --gres=gpu:1 12 | echo 'Start training with TransE approach' 13 | marius_train /model_configs/transe_100dim_adam_500neg_samples.yaml 14 | echo 'done training epoch' 15 | echo 'start eval' 16 | marius_eval /model_configs/transe_100dim_adam_500neg_samples.yaml 17 | echo 'done eval' 18 | echo 'done with training and evaluation for one epoch' -------------------------------------------------------------------------------- /embeddings-generation/05_marius_export_distmult_emb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --time=48:00:00 4 | #SBATCH --mem=376000 5 | #SBATCH --job-name=emb_export_distmult 6 | #SBATCH --output=log_emb_export_distmult.txt 7 | #SBATCH --container-name nv_marius_gpu8_cuda_cudnn_pyxis_container 8 | #SBATCH --container-mounts=/home:/mount_home 9 | #SBATCH --container-writable 10 | #SBATCH --container-remap-root 11 | #SBATCH --gres=gpu:1 12 | echo 'start export' 13 | marius_postprocess --model_dir /mount_home/models/distmult --format parquet --output_dir /mount_home/parquet_export 14 | echo 'Embedding export successful!' 15 | echo 'start gzip of embeddings' 16 | gzip -k -v --fast /mount_home/parquet_export/embeddings.parquet 17 | echo 'done zipping' 18 | -------------------------------------------------------------------------------- /embeddings-generation/README.md: -------------------------------------------------------------------------------- 1 | ## Graph Entity Embeddings for SemOpenAlex 2 | 3 | ### Pre-processing, training and export 4 | 5 | *01:* Extracts triples from full RDF dump of SemOpenAlex. Mostly, relevant entity <> entity relations are extracted. Also auxiliary classes are cut short, i.e. intermediate hops are eliminated. 6 | 7 | *02:* Map all URIs to integers: enumerate entities and relation identifiers separately, write this conversion to a dict for later re-substitution. 8 | 9 | *03:* Import the edge list in integer form with Marius to ready for training, produces .bin format files. In addition, the modified source code file of the employed Marius system is given to reproduce the altered Marius import sequence in the directory *marius_code_modifications*. For Marius, see [Marius GitHub](https://github.com/marius-team/marius). 10 | 11 | *04:* Embeddings training using five different approaches. Carry out evaluation after each epoch. Re-run the bash script for each epoch subsequently. The included `.sh` shell scripts trigger one epoch of embedding training using the `.yaml` model configuration files in the directory *model_configs*. 12 | 13 | *05:* Export the embedding vectors for subsequent use using `marius_postprocess`. 
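
To relate the exported vectors back to SemOpenAlex URIs, the entity dictionary written in step *02* can be joined with the parquet export from step *05*. A minimal sketch, assuming the export contains columns named `id` and `embedding` (the actual schema may differ between Marius versions) and using illustrative file names:

```
# Join exported embeddings back to entity URIs via the step-02 mapping.
# Column and file names are assumptions; adjust them to your actual outputs.
import pandas as pd

entities = pd.read_csv("extracted_triples_integer_mapped_entities.dict",
                       sep="\t", names=["id", "uri"])
embeddings = pd.read_parquet("parquet_export/embeddings.parquet")  # requires pyarrow

print(embeddings.merge(entities, on="id").head())
```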
14 | 15 | ### Evaluation hyperparameters 16 | 17 | We evaluated the generated embeddings using the following set of hyperparameters in a link prediction setting. 18 | 19 | | Parameter | Value | 20 | |---------|-------:| 21 | | Batch size | 2,000 | 22 | | Negative sampling size | 500 | 23 | | Filtered evaluation | False | 24 | | Pipeline sync | True | 25 | 26 | Additional information: 27 | Please refer to the 2021 Marius paper for more details on the pipelining mechanism: [Mohoney et al., 2021](https://www.usenix.org/system/files/osdi21-mohoney.pdf). 28 | 29 | 30 | ### Evaluation results 31 | 32 | After training for 3 iterations on the entire training graph, evaluation via the link prediction task yielded the following evaluation scores. The best values for the metrics mean rank (MR), mean reciprocal rank (MRR), and Hits@N are marked bold. 33 | 34 | | Metric | TransE | DistMult | ComplEx | GraphSAGE | Graph Attention Network | 35 | |---------|-------:|---------:|--------:|----------:|--------:| 36 | | MR | 43.633 | 28.268 | 28.290 | **26.053** | 35.500 | 37 | | MRR | 0.372 | **0.695**| 0.693 | 0.688 | 0.657 | 38 | | Hits@1 | 0.309 | **0.655**| 0.651 | 0.642 | 0.604 | 39 | | Hits@3 | 0.375 | **0.714**| 0.713 | 0.713 | 0.689 | 40 | | Hits@10 | 0.499 | **0.764**| 0.763 | 0.762 | 0.746 | 41 | 42 | 43 | ### Technical details 44 | 45 | All processes regarding Marius were executed in a container environment for HPC settings using the Enroot framework for Docker containers. 46 | The container image is `nvidia+cuda+11.2.2-cudnn8-devel-ubuntu18.04.sqsh` from [NVidia](https://catalog.ngc.nvidia.com/containers). 47 | In our setup we used a system running RHEL 8.4, using Python 3.7, Marius 0.0.2, PyTorch 1.9.1 and CUDA 11.2.2. for embeddings generation. 48 | -------------------------------------------------------------------------------- /embeddings-generation/marius_code_modifications/custom.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from marius.tools.preprocess.converters.spark_converter import SparkEdgeListConverter 4 | from marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter 5 | from marius.tools.preprocess.dataset import LinkPredictionDataset 6 | 7 | 8 | class CustomLinkPredictionDataset(LinkPredictionDataset): 9 | def __init__( 10 | self, output_directory: Path, files: list, delim: str = "\t", dataset_name: str = "custom", spark: bool = False 11 | ): 12 | super().__init__(output_directory, spark) 13 | 14 | self.dataset_name = dataset_name 15 | self.output_directory = output_directory 16 | 17 | if len(files) == 1: 18 | self.train_edges_file = files[0] 19 | self.valid_edges_file = None 20 | self.test_edges_file = None 21 | 22 | if len(files) == 3: 23 | self.train_edges_file = files[0] 24 | self.valid_edges_file = files[1] 25 | self.test_edges_file = files[2] 26 | 27 | self.delim = delim 28 | self.spark = spark 29 | 30 | def download(self, overwrite=False): 31 | pass 32 | 33 | def preprocess( 34 | self, 35 | num_partitions=1, 36 | remap_ids=False, 37 | splits=[0.9, 0.05, 0.05], 38 | partitioned_eval=False, 39 | sequential_train_nodes=False, 40 | columns=[0, 1, 2], 41 | ): 42 | converter = SparkEdgeListConverter if self.spark else TorchEdgeListConverter 43 | converter = converter( 44 | output_dir=self.output_directory, 45 | train_edges=self.train_edges_file, 46 | valid_edges=self.valid_edges_file, 47 | test_edges=self.test_edges_file, 48 | delim=self.delim, 49 | format="csv", # hard-coded !! 
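            # the hard-coded values here (format, num_nodes, num_rels) are specific to the
            # SemOpenAlex edge list: num_rels corresponds to the 10 predicates kept by
            # 01_extract_triples.py, and num_nodes to the entity count from the integer
            # mapping step; adjust them when converting a different graph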
50 | columns=columns, 51 | num_partitions=num_partitions, 52 | splits=splits, 53 | remap_ids=remap_ids, 54 | partitioned_evaluation=partitioned_eval, 55 | num_nodes = 502634946, #! 56 | num_rels = 10 # ! 57 | ) 58 | 59 | converter.convert() 60 | -------------------------------------------------------------------------------- /embeddings-generation/model_configs/complex_100dim_adam_500neg_samples.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | random_seed: 456356765455 3 | learning_task: LINK_PREDICTION 4 | encoder: 5 | layers: 6 | - - type: EMBEDDING 7 | output_dim: 100 8 | decoder: 9 | type: COMPLEX 10 | options: 11 | input_dim: 50 12 | loss: 13 | type: SOFTMAX_CE 14 | options: 15 | reduction: SUM 16 | dense_optimizer: 17 | type: ADAM 18 | options: 19 | learning_rate: 0.1 20 | sparse_optimizer: 21 | type: ADAGRAD 22 | options: 23 | learning_rate: 0.1 24 | storage: 25 | device_type: cuda 26 | dataset: 27 | dataset_dir: /mount_home/marius_imported/ 28 | edges: 29 | type: FLAT_FILE 30 | options: 31 | dtype: int 32 | embeddings: 33 | type: HOST_MEMORY 34 | options: 35 | dtype: float 36 | model_dir: /mount_home/models/complex/ 37 | training: 38 | batch_size: 16000 39 | negative_sampling: 40 | num_chunks: 10 41 | negatives_per_positive: 500 42 | degree_fraction: 0.0 43 | filtered: false 44 | num_epochs: 1 45 | pipeline: 46 | sync: false 47 | staleness_bound: 16 48 | batch_loader_threads: 8 49 | gradient_update_threads: 8 50 | epochs_per_shuffle: 1 51 | logs_per_epoch: 20 52 | save_model: true 53 | evaluation: 54 | batch_size: 2000 55 | negative_sampling: 56 | num_chunks: 2 57 | negatives_per_positive: 500 58 | degree_fraction: 0.0 59 | filtered: false 60 | pipeline: 61 | sync: true 62 | epochs_per_eval: 1 63 | -------------------------------------------------------------------------------- /embeddings-generation/model_configs/distmult_100dim_adam_500neg_samples.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | random_seed: 456356765455 3 | learning_task: LINK_PREDICTION 4 | encoder: 5 | layers: 6 | - - type: EMBEDDING 7 | output_dim: 100 8 | decoder: 9 | type: DISTMULT 10 | options: 11 | input_dim: 50 12 | loss: 13 | type: SOFTMAX_CE 14 | options: 15 | reduction: SUM 16 | dense_optimizer: 17 | type: ADAM 18 | options: 19 | learning_rate: 0.1 20 | sparse_optimizer: 21 | type: ADAGRAD 22 | options: 23 | learning_rate: 0.1 24 | storage: 25 | device_type: cuda 26 | dataset: 27 | dataset_dir: /mount_home/marius_imported/ 28 | edges: 29 | type: FLAT_FILE 30 | options: 31 | dtype: int 32 | embeddings: 33 | type: HOST_MEMORY 34 | options: 35 | dtype: float 36 | model_dir: /mount_home/models/distmult/ 37 | training: 38 | batch_size: 16000 39 | negative_sampling: 40 | num_chunks: 10 41 | negatives_per_positive: 500 42 | degree_fraction: 0.0 43 | filtered: false 44 | num_epochs: 1 45 | pipeline: 46 | sync: false 47 | staleness_bound: 16 48 | batch_loader_threads: 8 49 | gradient_update_threads: 8 50 | epochs_per_shuffle: 1 51 | logs_per_epoch: 20 52 | save_model: true 53 | evaluation: 54 | batch_size: 2000 55 | negative_sampling: 56 | num_chunks: 2 57 | negatives_per_positive: 500 58 | degree_fraction: 0.0 59 | filtered: false 60 | pipeline: 61 | sync: true 62 | epochs_per_eval: 1 63 | -------------------------------------------------------------------------------- /embeddings-generation/model_configs/gat_100dim_adam_500neg_samples.yaml: 
-------------------------------------------------------------------------------- 1 | model: 2 | random_seed: 456356765455 3 | learning_task: LINK_PREDICTION 4 | encoder: 5 | train_neighbor_sampling: 6 | - type: UNIFORM 7 | options: 8 | max_neighbors: 10 9 | layers: 10 | - - type: EMBEDDING 11 | output_dim: 100 12 | bias: true 13 | - - type: GNN 14 | input_dim: 100 15 | output_dim: 50 16 | options: 17 | type: GAT 18 | bias: true 19 | decoder: 20 | type: DISTMULT 21 | options: 22 | input_dim: 50 23 | loss: 24 | type: SOFTMAX_CE 25 | options: 26 | reduction: SUM 27 | dense_optimizer: 28 | type: ADAM 29 | options: 30 | learning_rate: 0.1 31 | sparse_optimizer: 32 | type: ADAGRAD 33 | options: 34 | learning_rate: 0.1 35 | storage: 36 | device_type: cuda 37 | dataset: 38 | dataset_dir: /mount_home/marius_imported/ 39 | edges: 40 | type: FLAT_FILE 41 | options: 42 | dtype: int 43 | embeddings: 44 | type: HOST_MEMORY 45 | options: 46 | dtype: float 47 | model_dir: /mount_home/models/gat/ 48 | training: 49 | batch_size: 16000 50 | negative_sampling: 51 | num_chunks: 10 52 | negatives_per_positive: 500 53 | degree_fraction: 0.0 54 | filtered: false 55 | num_epochs: 1 56 | pipeline: 57 | sync: false 58 | staleness_bound: 16 59 | batch_loader_threads: 8 60 | gradient_update_threads: 8 61 | epochs_per_shuffle: 1 62 | logs_per_epoch: 20 63 | save_model: true 64 | evaluation: 65 | batch_size: 2000 66 | negative_sampling: 67 | num_chunks: 2 68 | negatives_per_positive: 500 69 | degree_fraction: 0.0 70 | filtered: false 71 | pipeline: 72 | sync: true 73 | epochs_per_eval: 1 -------------------------------------------------------------------------------- /embeddings-generation/model_configs/gnn_100dim_adam_500neg_samples.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | random_seed: 456356765455 3 | learning_task: LINK_PREDICTION 4 | encoder: 5 | train_neighbor_sampling: 6 | - type: UNIFORM 7 | options: 8 | max_neighbors: 10 9 | layers: 10 | - - type: EMBEDDING 11 | output_dim: 100 12 | bias: true 13 | - - type: GNN 14 | input_dim: 100 15 | output_dim: 50 16 | options: 17 | type: GRAPH_SAGE 18 | bias: true 19 | decoder: 20 | type: DISTMULT 21 | options: 22 | input_dim: 50 23 | loss: 24 | type: SOFTMAX_CE 25 | options: 26 | reduction: SUM 27 | dense_optimizer: 28 | type: ADAM 29 | options: 30 | learning_rate: 0.1 31 | sparse_optimizer: 32 | type: ADAGRAD 33 | options: 34 | learning_rate: 0.1 35 | storage: 36 | device_type: cuda 37 | dataset: 38 | dataset_dir: /mount_home/marius_imported/ 39 | edges: 40 | type: FLAT_FILE 41 | options: 42 | dtype: int 43 | embeddings: 44 | type: HOST_MEMORY 45 | options: 46 | dtype: float 47 | model_dir: /mount_home/models/gnn/ 48 | training: 49 | batch_size: 16000 50 | negative_sampling: 51 | num_chunks: 10 52 | negatives_per_positive: 500 53 | degree_fraction: 0.0 54 | filtered: false 55 | num_epochs: 1 56 | pipeline: 57 | sync: false 58 | staleness_bound: 16 59 | batch_loader_threads: 8 60 | gradient_update_threads: 8 61 | epochs_per_shuffle: 1 62 | logs_per_epoch: 20 63 | save_model: true 64 | evaluation: 65 | batch_size: 2000 66 | negative_sampling: 67 | num_chunks: 2 68 | negatives_per_positive: 500 69 | degree_fraction: 0.0 70 | filtered: false 71 | pipeline: 72 | sync: true 73 | epochs_per_eval: 1 -------------------------------------------------------------------------------- /embeddings-generation/model_configs/transe_100dim_adam_500neg_samples.yaml: 
-------------------------------------------------------------------------------- 1 | model: 2 | random_seed: 456356765455 3 | learning_task: LINK_PREDICTION 4 | encoder: 5 | layers: 6 | - - type: EMBEDDING 7 | output_dim: 100 8 | decoder: 9 | type: TRANSE 10 | options: 11 | input_dim: 50 12 | loss: 13 | type: SOFTMAX_CE 14 | options: 15 | reduction: SUM 16 | dense_optimizer: 17 | type: ADAM 18 | options: 19 | learning_rate: 0.1 20 | sparse_optimizer: 21 | type: ADAGRAD 22 | options: 23 | learning_rate: 0.1 24 | storage: 25 | device_type: cuda 26 | dataset: 27 | dataset_dir: /mount_home/marius_imported/ 28 | edges: 29 | type: FLAT_FILE 30 | options: 31 | dtype: int 32 | embeddings: 33 | type: HOST_MEMORY 34 | options: 35 | dtype: float 36 | model_dir: /mount_home/models/transe/ 37 | training: 38 | batch_size: 16000 39 | negative_sampling: 40 | num_chunks: 10 41 | negatives_per_positive: 500 42 | degree_fraction: 0.0 43 | filtered: false 44 | num_epochs: 1 45 | pipeline: 46 | sync: false 47 | staleness_bound: 16 48 | batch_loader_threads: 8 49 | gradient_update_threads: 8 50 | epochs_per_shuffle: 1 51 | logs_per_epoch: 20 52 | save_model: true 53 | evaluation: 54 | batch_size: 2000 55 | negative_sampling: 56 | num_chunks: 2 57 | negatives_per_positive: 500 58 | degree_fraction: 0.0 59 | filtered: false 60 | pipeline: 61 | sync: true 62 | epochs_per_eval: 1 63 | -------------------------------------------------------------------------------- /graphdb-preload/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | #GraphDB 3 | graphdb-license/graphdb.license 4 | graphdb-home/conf/ 5 | graphdb-home/work/ 6 | graphdb-home/data/repositories/metaphactory/storage/ 7 | graphdb-import/ 8 | -------------------------------------------------------------------------------- /graphdb-preload/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | graphdb: 4 | container_name: graphdb-preload 5 | image: ontotext/graphdb:10.3.2 6 | mem_limit: 250g 7 | mem_reservation: 250g 8 | 9 | entrypoint: [ "/opt/graphdb/dist/bin/importrdf", "preload", "--chunk", "20m", "--force", "--recursive", "--parsing-tasks", "24", "--recovery-point-interval", "3600", "--id", "metaphactory", "/opt/graphdb/graphdb-import/" ] 10 | 11 | environment: 12 | GDB_JAVA_OPTS: >- 13 | -Xms100g 14 | -Xmx100g 15 | -Dgraphdb.home=/opt/graphdb/home 16 | -Dgraphdb.license.file=/etc/graphdb-license 17 | -Dgraphdb.workbench.importDirectory=/opt/graphdb/graphdb-import 18 | volumes: 19 | - ./graphdb-home:/opt/graphdb/home 20 | - ./graphdb-import:/opt/graphdb/graphdb-import 21 | - ./graphdb-license/graphdb.license:/etc/graphdb-license 22 | -------------------------------------------------------------------------------- /graphdb-preload/graphdb-home/data/repositories/metaphactory/config.ttl: -------------------------------------------------------------------------------- 1 | @prefix rdfs: . 2 | @prefix rep: . 3 | @prefix sail: . 4 | @prefix xsd: . 
5 | 6 | <#metaphactory> a rep:Repository; 7 | rep:repositoryID "metaphactory"; 8 | rep:repositoryImpl [ 9 | rep:repositoryType "graphdb:SailRepository"; 10 | [ 11 | "http://example.org/owlim#"; 12 | "false"; 13 | ""; 14 | "true"; 15 | "true"; 16 | "true"; 17 | "true"; 18 | "40"; 19 | "10000000"; 20 | ""; 21 | "true"; 22 | "0"; 23 | "0"; 24 | "false"; 25 | "file-repository"; 26 | "empty"; 27 | "storage"; 28 | 29 | "false"; 30 | sail:sailType "graphdb:Sail" 31 | ] 32 | ]; 33 | rdfs:label "metaphactory primary repo" . 34 | -------------------------------------------------------------------------------- /graphdb-preload/graphdb-license/graphdb.license: -------------------------------------------------------------------------------- 1 | ** valid license here ** 2 | -------------------------------------------------------------------------------- /linked-dataset-description/README.md: -------------------------------------------------------------------------------- 1 | ## SemOpenAlex Dataset Description with the VoID Vocabulary 2 | 3 | See: `semopenalex-description-void.ttl` 4 | 5 | [What is VoID?](https://www.w3.org/TR/void/) 6 | -------------------------------------------------------------------------------- /linked-dataset-description/semopenalex-description-void.ttl: -------------------------------------------------------------------------------- 1 | @prefix rdf: . 2 | @prefix rdfs: . 3 | @prefix foaf: . 4 | @prefix dcterms: . 5 | @prefix void: . 6 | @prefix xsd: . 7 | @prefix owl: . 8 | @prefix : <#> . 9 | 10 | :SemOpenAlex 11 | rdf:type void:Dataset ; 12 | foaf:homepage ; 13 | dcterms:title "SemOpenAlex" ; 14 | void:sparqlEndpoint ; 15 | dcterms:contributor ; 16 | dcterms:contributor ; 17 | dcterms:source ; 18 | dcterms:modified "2023-04-24"^^xsd:date ; 19 | dcterms:publisher :Michael_Faerber ; 20 | dcterms:publisher :Johan_Krause ; 21 | dcterms:publisher :David_Lamprecht ; 22 | dcterms:publisher :Linn_Aung ; 23 | dcterms:publisher :Peter_Haase ; 24 | dcterms:license ; 25 | dcterms:subject ; 26 | dcterms:subject ; 27 | dcterms:subject ; 28 | void:feature ; 29 | void:triples 26401183867 ; 30 | void:vocabulary ; 31 | void:vocabulary ; 32 | void:vocabulary ; 33 | void:vocabulary ; 34 | void:vocabulary ; 35 | void:vocabulary ; 36 | void:vocabulary ; 37 | void:vocabulary ; 38 | void:vocabulary ; 39 | void:vocabulary ; 40 | void:vocabulary ; 41 | void:vocabulary ; 42 | void:vocabulary ; 43 | void:vocabulary ; 44 | void:vocabulary ; 45 | void:vocabulary ; 46 | void:vocabulary ; 47 | void:linkPredicate owl:sameAs ; 48 | void:linkPredicate rdfs:seeAlso . 49 | 50 | 51 | :Michael_Faerber a 52 | foaf:Person ; 53 | rdfs:label "Michael Färber" ; 54 | foaf:homepage ; 55 | foaf:mbox . 56 | 57 | :Johan_Krause a 58 | foaf:Person ; 59 | rdfs:label "Johan Krause" ; 60 | foaf:mbox . 61 | 62 | :David_Lamprecht a 63 | foaf:Person ; 64 | rdfs:label "David Lamprecht" ; 65 | foaf:mbox . 66 | 67 | :Linn_Aung a 68 | foaf:Person ; 69 | rdfs:label "Linn Aung" ; 70 | foaf:mbox . 71 | 72 | :Peter_Haase a 73 | foaf:Person ; 74 | rdfs:label "Peter Haase" ; 75 | foaf:mbox . 76 | 77 | -------------------------------------------------------------------------------- /ontologies/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Note 3 | 1. [semopenalex-ontology.ttl](./semopenalex-ontology.ttl) is the ontology of SemOpenAlex. 
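
For a quick look at the classes the ontology defines, it can be loaded with `rdflib` (listed in the repository's `requirements.txt`). A minimal sketch, assuming classes are declared as `owl:Class` with an `rdfs:label`:

```
# List the classes declared in the SemOpenAlex ontology.
from rdflib import Graph
from rdflib.namespace import OWL, RDF, RDFS

g = Graph()
g.parse("semopenalex-ontology.ttl", format="turtle")

for cls in g.subjects(RDF.type, OWL.Class):
    for label in g.objects(cls, RDFS.label):
        print(cls, label)
```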
4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | rdflib 2 | boto3 3 | -------------------------------------------------------------------------------- /semopenalex-app/assets/font/Jost-Italic-VariableFont_wght.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/Jost-Italic-VariableFont_wght.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/Jost-VariableFont_wght.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/Jost-VariableFont_wght.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/OFL.txt: -------------------------------------------------------------------------------- 1 | Copyright 2020 The Jost Project Authors (https://github.com/indestructible-type) 2 | 3 | This Font Software is licensed under the SIL Open Font License, Version 1.1. 4 | This license is copied below, and is also available with a FAQ at: 5 | http://scripts.sil.org/OFL 6 | 7 | 8 | ----------------------------------------------------------- 9 | SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 10 | ----------------------------------------------------------- 11 | 12 | PREAMBLE 13 | The goals of the Open Font License (OFL) are to stimulate worldwide 14 | development of collaborative font projects, to support the font creation 15 | efforts of academic and linguistic communities, and to provide a free and 16 | open framework in which fonts may be shared and improved in partnership 17 | with others. 18 | 19 | The OFL allows the licensed fonts to be used, studied, modified and 20 | redistributed freely as long as they are not sold by themselves. The 21 | fonts, including any derivative works, can be bundled, embedded, 22 | redistributed and/or sold with any software provided that any reserved 23 | names are not used by derivative works. The fonts and derivatives, 24 | however, cannot be released under any other type of license. The 25 | requirement for fonts to remain under this license does not apply 26 | to any document created using the fonts or their derivatives. 27 | 28 | DEFINITIONS 29 | "Font Software" refers to the set of files released by the Copyright 30 | Holder(s) under this license and clearly marked as such. This may 31 | include source files, build scripts and documentation. 32 | 33 | "Reserved Font Name" refers to any names specified as such after the 34 | copyright statement(s). 35 | 36 | "Original Version" refers to the collection of Font Software components as 37 | distributed by the Copyright Holder(s). 38 | 39 | "Modified Version" refers to any derivative made by adding to, deleting, 40 | or substituting -- in part or in whole -- any of the components of the 41 | Original Version, by changing formats or by porting the Font Software to a 42 | new environment. 43 | 44 | "Author" refers to any designer, engineer, programmer, technical 45 | writer or other person who contributed to the Font Software. 
46 | 47 | PERMISSION & CONDITIONS 48 | Permission is hereby granted, free of charge, to any person obtaining 49 | a copy of the Font Software, to use, study, copy, merge, embed, modify, 50 | redistribute, and sell modified and unmodified copies of the Font 51 | Software, subject to the following conditions: 52 | 53 | 1) Neither the Font Software nor any of its individual components, 54 | in Original or Modified Versions, may be sold by itself. 55 | 56 | 2) Original or Modified Versions of the Font Software may be bundled, 57 | redistributed and/or sold with any software, provided that each copy 58 | contains the above copyright notice and this license. These can be 59 | included either as stand-alone text files, human-readable headers or 60 | in the appropriate machine-readable metadata fields within text or 61 | binary files as long as those fields can be easily viewed by the user. 62 | 63 | 3) No Modified Version of the Font Software may use the Reserved Font 64 | Name(s) unless explicit written permission is granted by the corresponding 65 | Copyright Holder. This restriction only applies to the primary font name as 66 | presented to the users. 67 | 68 | 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font 69 | Software shall not be used to promote, endorse or advertise any 70 | Modified Version, except to acknowledge the contribution(s) of the 71 | Copyright Holder(s) and the Author(s) or with their explicit written 72 | permission. 73 | 74 | 5) The Font Software, modified or unmodified, in part or in whole, 75 | must be distributed entirely under this license, and must not be 76 | distributed under any other license. The requirement for fonts to 77 | remain under this license does not apply to any document created 78 | using the Font Software. 79 | 80 | TERMINATION 81 | This license becomes null and void if any of the above conditions are 82 | not met. 83 | 84 | DISCLAIMER 85 | THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 86 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF 87 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 88 | OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE 89 | COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 90 | INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL 91 | DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 92 | FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM 93 | OTHER DEALINGS IN THE FONT SOFTWARE. 94 | -------------------------------------------------------------------------------- /semopenalex-app/assets/font/README.txt: -------------------------------------------------------------------------------- 1 | Jost Variable Font 2 | ================== 3 | 4 | This download contains Jost as both variable fonts and static fonts. 5 | 6 | Jost is a variable font with this axis: 7 | wght 8 | 9 | This means all the styles are contained in these files: 10 | Jost-VariableFont_wght.ttf 11 | Jost-Italic-VariableFont_wght.ttf 12 | 13 | If your app fully supports variable fonts, you can now pick intermediate styles 14 | that aren’t available as static fonts. 
Not all apps support variable fonts, and 15 | in those cases you can use the static font files for Jost: 16 | static/Jost-Thin.ttf 17 | static/Jost-ExtraLight.ttf 18 | static/Jost-Light.ttf 19 | static/Jost-Regular.ttf 20 | static/Jost-Medium.ttf 21 | static/Jost-SemiBold.ttf 22 | static/Jost-Bold.ttf 23 | static/Jost-ExtraBold.ttf 24 | static/Jost-Black.ttf 25 | static/Jost-ThinItalic.ttf 26 | static/Jost-ExtraLightItalic.ttf 27 | static/Jost-LightItalic.ttf 28 | static/Jost-Italic.ttf 29 | static/Jost-MediumItalic.ttf 30 | static/Jost-SemiBoldItalic.ttf 31 | static/Jost-BoldItalic.ttf 32 | static/Jost-ExtraBoldItalic.ttf 33 | static/Jost-BlackItalic.ttf 34 | 35 | Get started 36 | ----------- 37 | 38 | 1. Install the font files you want to use 39 | 40 | 2. Use your app's font picker to view the font family and all the 41 | available styles 42 | 43 | Learn more about variable fonts 44 | ------------------------------- 45 | 46 | https://developers.google.com/web/fundamentals/design-and-ux/typography/variable-fonts 47 | https://variablefonts.typenetwork.com 48 | https://medium.com/variable-fonts 49 | 50 | In desktop apps 51 | 52 | https://theblog.adobe.com/can-variable-fonts-illustrator-cc 53 | https://helpx.adobe.com/nz/photoshop/using/fonts.html#variable_fonts 54 | 55 | Online 56 | 57 | https://developers.google.com/fonts/docs/getting_started 58 | https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Fonts/Variable_Fonts_Guide 59 | https://developer.microsoft.com/en-us/microsoft-edge/testdrive/demos/variable-fonts 60 | 61 | Installing fonts 62 | 63 | MacOS: https://support.apple.com/en-us/HT201749 64 | Linux: https://www.google.com/search?q=how+to+install+a+font+on+gnu%2Blinux 65 | Windows: https://support.microsoft.com/en-us/help/314960/how-to-install-or-remove-a-font-in-windows 66 | 67 | Android Apps 68 | 69 | https://developers.google.com/fonts/docs/android 70 | https://developer.android.com/guide/topics/ui/look-and-feel/downloadable-fonts 71 | 72 | License 73 | ------- 74 | Please read the full license text (OFL.txt) to understand the permissions, 75 | restrictions and requirements for usage, redistribution, and modification. 76 | 77 | You can use them in your products & projects – print or digital, 78 | commercial or otherwise. 79 | 80 | This isn't legal advice, please consider consulting a lawyer and see the full 81 | license for all details. 
82 | -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Black.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Black.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-BlackItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-BlackItalic.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Bold.eot -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Bold.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Bold.woff -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Bold.woff2 -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-BoldItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-BoldItalic.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-ExtraBold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-ExtraBold.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-ExtraBoldItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-ExtraBoldItalic.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-ExtraLight.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-ExtraLight.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-ExtraLightItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-ExtraLightItalic.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Italic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Italic.eot -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Italic.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Italic.woff -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Italic.woff2 -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Light.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-LightItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-LightItalic.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Medium.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Medium.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-MediumItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-MediumItalic.ttf 
-------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Regular.eot -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Regular.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Regular.woff -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Regular.woff2 -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-SemiBold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-SemiBold.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-SemiBoldItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-SemiBoldItalic.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-Thin.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-Thin.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/font/static/Jost-ThinItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/font/static/Jost-ThinItalic.ttf -------------------------------------------------------------------------------- /semopenalex-app/assets/images/categorization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/categorization.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/category.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/category.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/essay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/essay.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/folder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/folder.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/funding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/funding.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/goal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/goal.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/intro-to-graphs.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/intro-to-graphs.jpeg -------------------------------------------------------------------------------- /semopenalex-app/assets/images/lightbulb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/lightbulb.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/location.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/location.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/metaphactory-logo.svg: -------------------------------------------------------------------------------- [metaphactory-logo.svg: inline SVG markup was lost during text extraction; the file is the metaphactory logo (image/svg+xml), whose only surviving text content is the word "metaphactory"] -------------------------------------------------------------------------------- /semopenalex-app/assets/images/no-photos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/no-photos.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/office-building.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/office-building.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/readme.md: -------------------------------------------------------------------------------- 1 | ## Icon Attribution 2 | - Writer icons created by Freepik - Flaticon 3 | - Paper icons created by Freepik - Flaticon 4 | - University icons created by Freepik - Flaticon 5 | - Idea icons created by Freepik - Flaticon 6 | - Books icons created by popo2021 - Flaticon 7 | - Building icons created by juicy_fish - Flaticon 8 | - Empty icons created by Icon.doit - Flaticon 9 | - Funding icons created by Dewi Sari - Flaticon 10 | - Science research icons created by Flat Icons - Flaticon 11 | - Goal icons created by Freepik - Flaticon 12 | - Keyword icons created by Dreamstale - Flaticon 13 | - Trending topic icons created by lutfix - Flaticon -------------------------------------------------------------------------------- /semopenalex-app/assets/images/science-research.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/science-research.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/search.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/semopenalex-icon.svg: -------------------------------------------------------------------------------- [semopenalex-icon.svg: inline SVG markup was lost during text extraction; the file is the SemOpenAlex icon] -------------------------------------------------------------------------------- /semopenalex-app/assets/images/stack-of-books.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/stack-of-books.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/topic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/topic.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/university.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/university.png -------------------------------------------------------------------------------- /semopenalex-app/assets/images/writer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metaphacts/semopenalex/30472a291c99b3473f4251e951141406490b5b73/semopenalex-app/assets/images/writer.png -------------------------------------------------------------------------------- /semopenalex-app/assets/jost.css: -------------------------------------------------------------------------------- 1 | /* Jost regular
*/ 2 | 3 | @font-face { 4 | font-family: "Jost"; 5 | src: url("./font/static/Jost-Regular.eot"); 6 | src: url("./font/static/Jost-Regular.woff2") format('woff2'), 7 | url("./font/static/Jost-Regular.woff") format('woff'), 8 | url("./font/static/Jost-Regular.ttf") format('truetype'); 9 | font-weight: 400; 10 | font-style: normal; 11 | } 12 | 13 | /* Jost italic */ 14 | 15 | @font-face { 16 | font-family: "Jost"; 17 | src: url('./font/static/Jost-Italic.eot'); 18 | src: url('./font/static/Jost-Italic.woff2') format('woff2'), 19 | url('./font/static/Jost-Italic.woff') format('woff'), 20 | url('./font/static/Jost-Italic.ttf') format('truetype'); 21 | font-weight: 400; 22 | font-style: italic; 23 | } 24 | 25 | /* Jost bold */ 26 | 27 | @font-face { 28 | font-family: "Jost"; 29 | src: url('./font/static/Jost-Bold.eot'); 30 | src: url('./font/static/Jost-Bold.woff2') format('woff2'), 31 | url('./font/static/Jost-Bold.woff') format('woff'), 32 | url('./font/static/Jost-Bold.ttf') format('truetype'); 33 | font-weight: 700; 34 | font-style: normal; 35 | } 36 | -------------------------------------------------------------------------------- /semopenalex-app/assets/style.css: -------------------------------------------------------------------------------- 1 | html { 2 | font-family: Jost, "Inter", "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | } 4 | 5 | body { 6 | font-family: "Jost"; 7 | } 8 | 9 | :root { 10 | --soa-color-brown-darker: #827655; 11 | --soa-color-brown-dark: #E9DFD1; 12 | --soa-color-brown: #F3F1EE; 13 | --soa-color-blue: #205BAC; 14 | --soa-color-green: #1B9D47; 15 | --soa-color-grey: #F3F1EE; 16 | } 17 | 18 | .soa a { 19 | color: var(--soa-color-blue); 20 | } 21 | 22 | .soa .header { 23 | background-color: initial; 24 | border-bottom: 1px solid var(--soa-color-grey); 25 | } 26 | 27 | .soa .header__title-main { 28 | font-size: 4.6rem; 29 | font-weight: 700; 30 | line-height: 4.6rem; 31 | color: var(--soa-color-brown-darker); 32 | } 33 | 34 | .soa .body { 35 | padding: 2rem 0; 36 | display: flex; 37 | flex-direction: column; 38 | gap: 1.5rem; 39 | justify-content: flex-start; 40 | } 41 | 42 | /* chart */ 43 | .soa .highcharts-background { 44 | fill: transparent; 45 | } 46 | 47 | .soa .highcharts-color-0 { 48 | fill: var(--soa-color-brown-dark); 49 | stroke: var(--soa-color-brown-dark); 50 | } 51 | 52 | .soa .highcharts-color-1 { 53 | fill: var(--soa-color-blue); 54 | stroke: var(--soa-color-blue); 55 | } 56 | 57 | .soa .highcharts-column-series rect.highcharts-point { 58 | stroke: none; 59 | } 60 | 61 | .soa .highcharts-axis-title { 62 | display: none; 63 | } 64 | 65 | .semantic-chart { 66 | flex: 1 1 0; 67 | } 68 | 69 | .soa .semantic-chart .highcharts-root { 70 | max-width: 100%; 71 | width: 100%; 72 | } 73 | 74 | /* chart */ 75 | 76 | .soa .card { 77 | background: var(--soa-color-brown); 78 | border-radius: 0.5rem; 79 | padding: 1.7rem; 80 | font-size: 1.5rem; 81 | color: var(--soa-color-blue); 82 | border: none; 83 | } 84 | 85 | .soa .card__title { 86 | font-size: 1.2rem; 87 | color: var(--soa-color-brown-darker); 88 | font-weight: 700; 89 | padding-bottom: 1rem; 90 | } 91 | 92 | .soa .card select { 93 | width: 113px; 94 | padding: 3px 6px; 95 | align-self: flex-end; 96 | font-size: 0.8rem; 97 | position: absolute; 98 | top: 10px; 99 | right: 10px; 100 | } 101 | 102 | .soa .chip { 103 | display: flex; 104 | border: 1px solid var(--soa-color-grey); 105 | color: var(--soa-color-blue); 106 | font-size: 1.6rem; 107 | align-items: center; 108 | border-left: 1px solid 
var(--soa-color-grey); 109 | overflow: hidden; 110 | max-width: 320px; 111 | } 112 | 113 | .soa .chip__name { 114 | padding: 0rem 1rem; 115 | border-right: 1px solid var(--soa-color-grey); 116 | white-space: nowrap; 117 | overflow: hidden; 118 | text-overflow: ellipsis; 119 | } 120 | 121 | .soa .chip__icon { 122 | background-image: url(data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMTYiIGhlaWdodD0iMTYiIHZpZXdCb3g9IjAgMCAxNiAxNiIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTcuMjUgNC4yNUg4Ljc1VjUuNzVINy4yNVY0LjI1Wk03LjI1IDcuMjVIOC43NVYxMS43NUg3LjI1VjcuMjVaTTggMC41QzMuODYgMC41IDAuNSAzLjg2IDAuNSA4QzAuNSAxMi4xNCAzLjg2IDE1LjUgOCAxNS41QzEyLjE0IDE1LjUgMTUuNSAxMi4xNCAxNS41IDhDMTUuNSAzLjg2IDEyLjE0IDAuNSA4IDAuNVpNOCAxNEM0LjY5MjUgMTQgMiAxMS4zMDc1IDIgOEMyIDQuNjkyNSA0LjY5MjUgMiA4IDJDMTEuMzA3NSAyIDE0IDQuNjkyNSAxNCA4QzE0IDExLjMwNzUgMTEuMzA3NSAxNCA4IDE0WiIgZmlsbD0iIzIwNUJBQyIvPgo8L3N2Zz4K); 123 | background-repeat: no-repeat no-repeat; 124 | background-position: center center; 125 | background-size: contain; 126 | width: 15px; 127 | height: 15px; 128 | margin: 1rem; 129 | flex-shrink: 0; 130 | } 131 | 132 | /* publications */ 133 | .card.publications { 134 | padding: 2rem 10rem; 135 | height: auto; 136 | } 137 | 138 | .card.publications__body { 139 | display: grid; 140 | grid-template-columns: 1fr 1fr; 141 | gap: 1rem; 142 | } 143 | 144 | .card.publications .buttons { 145 | align-self: flex-end; 146 | display: flex; 147 | gap: 1rem; 148 | } 149 | 150 | .card.publications .buttons button { 151 | font-size: 1rem; 152 | padding: 2px 1rem; 153 | border: 1px solid var(--soa-color-grey); 154 | background-color: transparent; 155 | color: var(--soa-color-blue); 156 | } 157 | 158 | .card.publications .buttons button.selected { 159 | background-color: #FFFFFF; 160 | color: #000000; 161 | } 162 | 163 | .card.publications .publication { 164 | background-color: #FFFFFF; 165 | border-radius: 0.3rem; 166 | padding: 1rem; 167 | overflow: hidden; 168 | width: calc(50% - 0.5rem); 169 | box-sizing: border-box; 170 | display: flex; 171 | flex-direction: column; 172 | justify-content: space-between; 173 | min-height: 164px; 174 | line-height: 1.8rem; 175 | } 176 | 177 | .card.publications .publications__header { 178 | display: flex; 179 | justify-content: space-between; 180 | align-self: center; 181 | width: 100%; 182 | padding-bottom: 0.6rem; 183 | } 184 | 185 | .card.publications .publications__header .title { 186 | padding-bottom: 0; 187 | font-size: 1.2rem; 188 | } 189 | 190 | .card.publications .publication__title { 191 | font-size: 1.3rem; 192 | } 193 | 194 | .card.publications .publication__author, .publication__info { 195 | font-size: 1rem; 196 | color: #000000; 197 | } 198 | 199 | .card.publications .publication__info { 200 | color: var(--soa-color-brown-darker); 201 | } 202 | 203 | .card.publications .publication__bottom { 204 | display: flex; 205 | justify-content: space-between; 206 | font-size: 1rem; 207 | color: var(--soa-color-brown-darker); 208 | font-weight: 700; 209 | } 210 | 211 | .card.publications .publication__year { 212 | color: #000000; 213 | } 214 | 215 | .card.publications .publication__citation { 216 | display: flex; 217 | align-items: center; 218 | gap: 3.75px; 219 | } 220 | 221 | .card.publications .publication__citation-icon { 222 | background-image: 
url(data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMTIiIGhlaWdodD0iOCIgdmlld0JveD0iMCAwIDEyIDgiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+CjxwYXRoIGQ9Ik0xLjUgNy43NUgzLjc1TDUuMjUgNC43NVYwLjI1SDAuNzVWNC43NUgzTDEuNSA3Ljc1Wk03LjUgNy43NUg5Ljc1TDExLjI1IDQuNzVWMC4yNUg2Ljc1VjQuNzVIOUw3LjUgNy43NVoiIGZpbGw9IiM4Mjc2NTUiLz4KPC9zdmc+Cg==); 223 | background-repeat: no-repeat no-repeat; 224 | background-position: center center; 225 | background-size: contain; 226 | width: 11px; 227 | height: 11px; 228 | } 229 | 230 | .card.publications .publication__citation-value { 231 | font-size: 1rem; 232 | color: var(--soa-color-brown-darker); 233 | font-weight: 700; 234 | } 235 | 236 | .card.publications .publications__foot { 237 | display: flex; 238 | justify-content: center; 239 | } 240 | 241 | .card.publications .publications__foot-button { 242 | color: var(--soa-color-blue); 243 | font-weight: 700; 244 | font-size: 1rem; 245 | background: transparent; 246 | border: none; 247 | margin: 3rem 0 1rem 0; 248 | } 249 | 250 | .card.publications .publications__foot-button--disabled { 251 | display: none; 252 | } 253 | 254 | .card.publications .DataTable--table-grid-layout { 255 | gap: 1rem; 256 | align-items: stretch; 257 | } 258 | 259 | .card.publications .DataTable--footer { 260 | margin-top: 1rem; 261 | } 262 | -------------------------------------------------------------------------------- /semopenalex-app/config/environment.prop: -------------------------------------------------------------------------------- 1 | sparqlMaxExecutionTime = 120 2 | sparqlHttpReadTimeout = 120 3 | pathsToRewrite = /author/,/concept/,/countsbyyear/,/geo/,/hostvenue/,/conceptscore/,/openaccess/,/work/,/source/,/publisher/,/institution/,/authorship/,/topic/,/keyword/,/domain/,/field/,/subfield/,/countsByYear/,/articleProcessingChargeList/,/funder/,/location/,/articleProcessingChargePaid/,/ontology/ 4 | platformBaseIri = https://semopenalex.org 5 | -------------------------------------------------------------------------------- /semopenalex-app/config/global.prop: -------------------------------------------------------------------------------- 1 | homePage = semopenalex:UniversalSearch 2 | -------------------------------------------------------------------------------- /semopenalex-app/config/namespaces.prop: -------------------------------------------------------------------------------- 1 | semopenalex = http://www.metaphactory.semopenalex.com/app/ 2 | dct = http://purl.org/dc/terms/ 3 | foaf = http://xmlns.com/foaf/0.1/ 4 | fabio = http://purl.org/spar/fabio/ 5 | soa = https://semopenalex.org/ontology/ 6 | wgs = http://www.w3.org/2003/01/geo/wgs84_pos# 7 | dbpedia-owl = https://dbpedia.org/ontology/ 8 | dbprop = https://dbpedia.org/property/ 9 | jsfn = http://www.ontotext.com/js# 10 | -------------------------------------------------------------------------------- /semopenalex-app/config/page-layout/footer.hbs: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | Powered by 5 | 6 | metaphactory 7 | 8 | & Ontotext 9 | 10 | Ontotext GraphDB 11 | 12 |
13 |
14 |
15 | -------------------------------------------------------------------------------- /semopenalex-app/config/page-layout/header.hbs: -------------------------------------------------------------------------------- 1 | 2 | 8 | {{#if systemNotificationMessage}} 9 | 13 | {{/if}} 14 | 15 | 16 | 17 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /semopenalex-app/config/page-layout/html-header-resources.hbs: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /semopenalex-app/config/proxy.prop: -------------------------------------------------------------------------------- 1 | config.proxy.wikidata-images.targetUri=https://commons.wikimedia.org/w/thumb.php 2 | config.proxy.wikidata-images.preserveCookies=true 3 | config.proxy.rcsb-pdb.targetUri=https://files.rcsb.org/view/ 4 | config.proxy.rcsb-pdb.preserveCookies=true 5 | -------------------------------------------------------------------------------- /semopenalex-app/config/roles/role-export-diagram.prop: -------------------------------------------------------------------------------- 1 | #Definition of role export-diagram 2 | #Mon Nov 25 08:35:34 UTC 2024 3 | permissions=api\:ldp\:container\:*\:export 4 | -------------------------------------------------------------------------------- /semopenalex-app/config/roles/role-sparql-editor-view.prop: -------------------------------------------------------------------------------- 1 | #Definition of role sparql-editor-view 2 | #Mon Jun 17 10:04:58 UTC 2024 3 | permissions=ui\:page\:sparql-editor 4 | -------------------------------------------------------------------------------- /semopenalex-app/config/ui.prop: -------------------------------------------------------------------------------- 1 | preferredLabels = rdfs:label,skos:prefLabel,, 2 | preferredThumbnails = { \n ?subject ?1 .\n BIND (REPLACE (STR(?1)\\, "%%28"\\, "("\\, "i") as ?2) \n BIND (REPLACE (STR(?2)\\, "%%29"\\, ")"\\, "i") as ?tmp)\n BIND (jsfn:deCodeUrl(?tmp) as ?value)\n \n} 3 | preferredDescriptions = ,,,,/,/, 4 | -------------------------------------------------------------------------------- /semopenalex-app/data/templates/Template%3Ahttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23Concept.html: -------------------------------------------------------------------------------- 1 | 55 | 56 | {{!-- use custom templates for concepts from different vocabularies --}} 57 | 58 | {{#if (string-startsWith (page-resource) "https://semopenalex.org/concept/")}} 59 | {{>semopenalex:Concept}} 60 | {{/if}} 61 | 62 | {{#if (string-startsWith (page-resource) "https://semopenalex.org/topic/")}} 63 | {{>semopenalex:TopicConcept}} 64 | {{/if}} 65 | 66 | {{#if (string-startsWith (page-resource) "https://semopenalex.org/keyword/")}} 67 | {{>semopenalex:KeywordConcept}} 68 | {{/if}} 69 | 70 | {{#if (string-startsWith (page-resource) "http://metadata.un.org/sdg/")}} 71 | {{>semopenalex:SDGConcept}} 72 | {{/if}} 73 | 74 | {{#if (string-startsWith (page-resource) "http://id.nlm.nih.gov/mesh/")}} 75 | {{>semopenalex:MeshConcept}} 76 | {{/if}} 77 | 78 | {{!-- for Domain, Field and Subfield, redirect to default concept template --}} 79 | {{#if 80 | (cond-or 81 | (string-startsWith (page-resource) "https://semopenalex.org/domain/") 82 | (string-startsWith (page-resource) "https://semopenalex.org/field/") 83 | (string-startsWith (page-resource) "https://semopenalex.org/subfield/") 84 | ) 
85 | }} 86 | {{>semopenalex:DefaultConceptTemplate}} 87 | {{/if}} 88 | 89 | -------------------------------------------------------------------------------- /semopenalex-app/data/templates/Template%3Ahttps%3A%2F%2Fsemopenalex.org%2Fontology%2FFunder.html: -------------------------------------------------------------------------------- 1 | 66 | 67 | 68 |
69 |
70 |
71 |
72 |
Publications
73 | 75 | 80 | 81 |
82 |
83 |
Citations
84 | 88 | 93 | 94 |
95 |
96 | 117 | 118 |
119 |
120 |
121 |
Grants
122 | 125 | 134 | 135 | 138 | 146 | 147 | 148 | 153 | 165 | 166 | 167 |
168 |
169 | 170 | 171 | 172 |
173 |
174 | 175 | 176 |
177 | 184 | 191 | 192 |
193 |
194 | 195 | 196 |
197 | 198 | 279 | 280 |
281 |
282 | 283 |
284 |
285 | 286 | 289 | 296 | 297 |
298 |
299 | 300 | 301 | {{> ::Cards}} 302 | {{> ::Concepts}} 303 | {{> ::List tab="Publications"}} 304 | 305 | 306 | 307 |
308 |
-------------------------------------------------------------------------------- /semopenalex-app/data/templates/Template%3Ahttps%3A%2F%2Fsemopenalex.org%2Fontology%2FSource.html: -------------------------------------------------------------------------------- 1 | 49 |
50 |
51 | 52 |
53 | 64 | 77 | 78 |
79 |
80 |
81 |
82 |
83 |
84 |
Publications
85 |
86 | 90 | {{publications.value}} 91 | 92 |
93 |
Citations
94 |
95 | 99 | {{citations.value}} 100 | 101 |
102 |
103 | 124 | 125 |
126 |
127 | 142 | 162 | 163 |
164 |
165 |
166 | 175 | 182 | 183 |
184 |
185 | 186 | 257 | 258 |
259 |
260 |
-------------------------------------------------------------------------------- /semopenalex-app/data/templates/http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FConcept.html: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 |
5 |
6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 |

14 | 21 | 33 | 34 | 41 | 56 | 57 |
58 |
59 | 60 |
61 |
62 |
63 |
64 |
65 |
Publications
66 | 70 | 75 | 76 | 77 |
Citation
78 | 83 | 88 | 89 |
90 |
91 | 126 | 127 | 128 |
129 |
130 |
131 | 132 |
133 | 134 | 153 |
154 | 155 | 244 | 245 |
246 |
-------------------------------------------------------------------------------- /semopenalex-app/data/templates/http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FDefaultConceptTemplate.html: -------------------------------------------------------------------------------- 1 | 10 | 11 | [[#*inline "skosInfo"]] 12 | 13 | 14 | 15 | 16 | 17 | [[description]] 18 | 19 | 20 | [[/inline]] 21 | 22 |
23 | 24 | 27 | 28 | 29 | 30 | 31 | [[> "http://www.metaphacts.com/resource/SkosResourcePropertiesInclude" ]] 32 | 33 | 34 |
35 | 72 | 79 | 80 |
81 |
82 | 83 |
84 |
85 | 86 |
87 | [[! --

Hierarchical Relations

--]] 88 |
89 |
90 |

91 | Broader Concepts 92 | [[> skosInfo title="skos:broader" description="Concepts that are broader (more generic) than this one."]] 93 |

94 | 96 |
97 |

98 | Narrower Concepts 99 | [[> skosInfo title="skos:narrower" description="Concepts that are narrower (more specific) than this one."]] 100 |

101 | 103 |
104 |
105 | [[! --

Associative Relations

--]] 106 |

107 | Related Concepts 108 | [[> skosInfo title="skos:related" description="Relation is purely associative. Can mean everything."]] 109 |

110 | 112 |
113 |
114 | 115 |
116 | [[! -- 117 |

Equivalent Matches

118 |
119 | --]] 120 |
121 |
122 |

123 | Exact Matches 124 | [[> skosInfo title="skos:exactMatch" description="High degree of confidence that concepts are equivalent. Platform can use concepts interchangeably for query expansion."]] 125 |

126 | 128 |
129 |
130 |

131 | Close Matches 132 | [[> skosInfo title="skos:closeMatch" description="Certain degree of confidence that concepts are close to be equivalent. Platform may use concepts interchangeably for query expansion."]] 133 |

134 | 136 |
137 | [[! -- 138 |

Hierarchical Matches

139 |
140 | --]] 141 |
142 |
143 |

144 | Broad Matches 145 | [[> skosInfo title="skos:broadMatch" description="Linked concepts are broader matches (more generic)."]] 146 |

147 | 149 |
150 |
151 |

Narrow Matches 152 | [[> skosInfo title="skos:narrowMatch" description="Linked concepts are narrower matches (more specific)."]] 153 |

154 | 156 |
157 | 158 |
159 | 160 | [[! -- 161 |

Associative Matches

162 |
163 | --]] 164 |

165 | Related Matches 166 | [[> skosInfo title="skos:relatedMatch" description="Relation is purely associative. Can mean everything."]] 167 |

168 | 170 |
171 |
172 | 173 | 174 | {{> Assets:OntodiaDiagramsTable resource=(page-resource)}} 175 | 176 | 177 |
178 | 179 |
180 | 181 | 182 | 183 | 194 | 195 | -------------------------------------------------------------------------------- /semopenalex-app/data/templates/http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FMeshConcept.html: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 |
5 |
6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 |

14 | 21 | 33 | 34 | 41 | 56 | 57 |
58 |
59 | 60 |
61 |
62 |
63 |
64 |
65 |
Publications
66 | 71 | 76 | 77 | 78 |
Citation
79 | 86 | 91 | 92 |
93 |
94 | 142 | 143 | 144 |
145 |
146 |
147 | 148 |
149 | 150 | 169 |
170 | 171 | 262 | 263 |
264 |
-------------------------------------------------------------------------------- /semopenalex-app/data/templates/http%3A%2F%2Fwww.metaphactory.semopenalex.com%2Fapp%2FTopicConcept.html: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 |
5 |
6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 |

14 | 21 | 33 | 34 | 41 | 56 | 57 |
58 |
59 | 60 |
61 |
62 |
63 |
64 |
65 |
Publications
66 | 70 | 75 | 76 | 77 |
Citation
78 | 83 | 88 | 89 |
90 |
91 | 139 | 140 | 141 |
142 |
143 |
144 | 145 |
146 | 147 | 166 | 167 | 186 | 187 |
188 | 189 | 279 | 280 |
281 |
-------------------------------------------------------------------------------- /semopenalex-app/data/templates/http%3A%2F%2Fwww.metaphacts.com%2Fontologies%2Fplatform%23SearchResultsCustomization.html: -------------------------------------------------------------------------------- 1 | 2 | 10 | 22 | 33 | 34 | [[!-- 35 | Provide additional information for different types with the semantic-search-query-universal-entity-info 36 | component: 37 | --]] 38 | 61 | 65 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | [[!-- 94 | Uncomment and modify this template to customize result cards: 95 | --]] 96 | [[!-- 97 | 98 |
99 |
100 |
101 | {{#if (cond-eq showKnowledgePanel false)}} 102 | 103 | 104 | 105 | {{else}} 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 |
114 | 115 |
116 |
117 |
118 | 119 | link 120 | 121 | {{/if}} 122 |
123 |
124 |
125 | {{> Platform:SearchResultsFragments::addToSearchButton subject=subject.value}} 126 |
127 |
128 |
129 | --]] 130 | 131 | [[!-- 132 | Uncomment and modify this template to customize domain cards: 133 | --]] 134 | [[!-- 135 | 136 | 141 | 142 | --]] 143 | 144 | 145 | 146 | 152 | 153 | 161 | 162 | -------------------------------------------------------------------------------- /semopenalex-app/data/templates/http%3A%2F%2Fwww.metaphacts.com%2Fresource%2Fheader%2FResource.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | [[#if thumbnail]] 4 | 5 | [[/if]] 6 |
7 | 8 |
9 |
10 |

11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |

20 |
21 |
22 |
23 |
24 | IRI: {{page-resource}} 25 | 26 | 27 |
28 | 29 | 42 | 43 |
44 | 45 | 59 | 60 |
61 |
62 | 63 | 64 | [[> "http://www.metaphacts.com/resource/KnowledgeGraphBarViewControls" ]] 65 |
66 | -------------------------------------------------------------------------------- /semopenalex-app/ldp/assets/http%3A%2F%2Fwww.metaphacts.com%2Fontologies%2Fplatform%23ontodiaDiagramContainer.trig: -------------------------------------------------------------------------------- 1 | 2 | { 3 | a , 4 | , ; 5 | "Ontodia Diagram Container"; 6 | ; 7 | "2023-05-01T18:28:33.913Z"^^ . 8 | 9 | 10 | . 11 | } 12 | -------------------------------------------------------------------------------- /semopenalex-app/plugin.properties: -------------------------------------------------------------------------------- 1 | plugin.id=semopenalex-app 2 | plugin.provider=metaphacts 3 | plugin.version=5.0.0 -------------------------------------------------------------------------------- /sparql-queries/README.md: -------------------------------------------------------------------------------- 1 | ## SPARQL Queries for Section 3.4 2 | 3 | SPARQL queries over the [SemOpenAlex SPARQL Endpoint](https://semopenalex.org/sparql) to generate the statistics from Section 3.4. 4 | 5 | `figure-3_ml-papers-kit-2000-to-2021.py`: Sends several SPARQL queries to the SPARQL endpoint to get the number of publications 6 | published in [machine learning](https://semopenalex.org/concept/C119857082) by researchers from 7 | [Karlsruhe Institute of Technology](https://semopenalex.org/institution/I102335020) from 2000 to 2021. 8 | The paper additionally shows the corresponding numbers of publications in [natural language processing](https://semopenalex.org/concept/C204321447). 9 | 10 | `figure-4_count-institution-country.txt`: Counts the number of institutions in the United States (US). 11 | The paper shows the institution counts for `US`, `GB`, `DE`, `CN`, `JP`, `FR`, `IN`, and `CA`. 12 | 13 | `table-4_count-institution-type.txt`: Counts how many institutions are of the `education` type. 14 | The possible institution types are `company`, `education`, `nonprofit`, `healthcare`, `facility`, `other`, `government`, `archive`, and `unknown`. 15 | -------------------------------------------------------------------------------- /sparql-queries/figure-3_ml-papers-kit-2000-to-2021.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Send several SPARQL queries to the SemOpenAlex SPARQL endpoint to get the number of publications 3 | published in machine learning (<https://semopenalex.org/concept/C119857082>) by researchers from 4 | Karlsruhe Institute of Technology (<https://semopenalex.org/institution/I102335020>) from 2000 to 2021 5 | ''' 6 | 7 | import requests 8 | 9 | query_template = """ 10 | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 11 | PREFIX dcterms: <http://purl.org/dc/terms/> 12 | PREFIX org: <http://www.w3.org/ns/org#> 13 | PREFIX fabio: <http://purl.org/spar/fabio/> 14 | PREFIX soa: <https://semopenalex.org/ontology/> 15 | PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 16 | 17 | SELECT (COUNT (?paper) as ?paperCount) 18 | WHERE { 19 | ?paper a <https://semopenalex.org/ontology/Work> . 20 | ?paper dcterms:creator ?author . 21 | ?author org:memberOf <https://semopenalex.org/institution/I102335020> . 22 | ?paper fabio:hasPublicationYear "%d"^^xsd:integer . 23 | ?paper soa:hasConcept <https://semopenalex.org/concept/C119857082> .
24 | } 25 | """ 26 | 27 | # SemOpenAlex SPARQL endpoint 28 | endpoint_url = "https://semopenalex.org/sparql" 29 | 30 | # Loop through the years 2000 to 2021 31 | for year in range(2000, 2022): 32 | 33 | # Put the current year in the SPARQL query 34 | query = query_template % year 35 | 36 | response = requests.post(endpoint_url, data={"query": query}) 37 | 38 | # Display the result on the terminal 39 | print("Year %d: %s" % (year, response.text)) 40 | -------------------------------------------------------------------------------- /sparql-queries/figure-4_count-institution-country.txt: -------------------------------------------------------------------------------- 1 | PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 2 | PREFIX soa: <https://semopenalex.org/ontology/> 3 | 4 | # Note: soa:Institution and soa:countryCode are assumed here from the SemOpenAlex ontology 5 | SELECT (COUNT(?institution) as ?institutionCount) 6 | WHERE { 7 | ?institution a soa:Institution . 8 | ?institution soa:countryCode "US"^^xsd:string . 9 | } 10 | -------------------------------------------------------------------------------- /sparql-queries/table-4_count-institution-type.txt: -------------------------------------------------------------------------------- 1 | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 2 | PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 3 | PREFIX soa: <https://semopenalex.org/ontology/> 4 | 5 | SELECT (COUNT(?institution) as ?institutionCount) WHERE { 6 | 7 | ?institution rdf:type soa:Institution . 8 | ?institution soa:rorType "education"^^xsd:string . 9 | 10 | } 11 | -------------------------------------------------------------------------------- /transformSemOpenAlex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this script runs the following Python scripts sequentially 4 | 5 | echo "SemOpenAlex transformation script started at: " $(date -u) 6 | 7 | # Data transformation from OpenAlex to RDF data dump 8 | # Running a series of Python scripts one after another 9 | python3 ./transformation-scripts/semopenalex-topics.py; 10 | python3 ./transformation-scripts/semopenalex-subfields.py; 11 | python3 ./transformation-scripts/semopenalex-fields.py; 12 | python3 ./transformation-scripts/semopenalex-domains.py; 13 | python3 ./transformation-scripts/semopenalex-keywords.py; 14 | python3 ./transformation-scripts/semopenalex-concepts.py; 15 | python3 ./transformation-scripts/semopenalex-funders.py; 16 | python3 ./transformation-scripts/semopenalex-institutions.py; 17 | python3 ./transformation-scripts/semopenalex-publishers.py; 18 | python3 ./transformation-scripts/semopenalex-sources.py; 19 | python3 ./transformation-scripts/semopenalex-authors.py; 20 | python3 ./transformation-scripts/semopenalex-works.py; 21 | python3 ./transformation-scripts/semopenalex-dataset.py; 22 | 23 | # make sure to clean the graphdb storage folder before ingesting data 24 | rm -rf ./graphdb-preload/graphdb-home/data/repositories/metaphactory/storage 25 | 26 | # load the RDF data dump .gzip files into graphdb using the preload tool 27 | docker-compose -f ./graphdb-preload/docker-compose.yml up; 28 | 29 | # tar and gzip the graphdb-home/ folder to transfer it to a dedicated server 30 | echo "Started to tar.gz graphdb-home/ folder at: " $(date -u) 31 | tar -czvf graphdb-home.tar.gz ./graphdb-preload/graphdb-home/ 32 | echo "SemOpenAlex transformation script ended at: " $(date -u) 33 | -------------------------------------------------------------------------------- /transformation-scripts/semopenalex-dataset.py: -------------------------------------------------------------------------------- 1 | from rdflib import Graph, URIRef, Literal, Dataset 2 | from rdflib.namespace import RDFS, XSD, SKOS, RDF 3 | from rdflib import Namespace 4 | from datetime import datetime 5 | from datetime import date 6 | from pathlib import Path
import time 8 | import os 9 | 10 | start_time = time.ctime() 11 | print('Start of .trig graph serialization for the SemOpenAlex dataset at: ' + start_time) 12 | 13 | today = date.today() 14 | 15 | absolute_path = os.path.dirname(__file__) 16 | trig_output_dir_path = os.path.join(absolute_path, '../graphdb-preload/graphdb-import/') 17 | trig_output_file_path = f'{trig_output_dir_path}semopenalex-dataset-{today}.trig' 18 | 19 | DCAT = Namespace("http://www.w3.org/ns/dcat#") 20 | DCTERMS = Namespace("http://purl.org/dc/terms/") 21 | FOAF = Namespace("http://xmlns.com/foaf/0.1/") 22 | 23 | # semopenalex dataset context 24 | context = URIRef("https://semopenalex.org/dataset/context") 25 | 26 | # semopenalex dataset subject IRI 27 | dataset = URIRef("https://w3id.org/SemOpenAlex") 28 | 29 | # namedGraph predicate 30 | namedGraph = URIRef("http://www.w3.org/ns/sparql-service-description#namedGraph") 31 | 32 | # dataset issued time 33 | data_issued_time = datetime.now() 34 | 35 | # list of semopenalex predefined named graphs 36 | concepts_context = URIRef("https://semopenalex.org/concepts/context") 37 | institutions_context = URIRef("https://semopenalex.org/institutions/context") 38 | sources_context = URIRef("https://semopenalex.org/sources/context") 39 | authors_context = URIRef("https://semopenalex.org/authors/context") 40 | works_context = URIRef("https://semopenalex.org/works/context") 41 | publishers_context = URIRef("https://semopenalex.org/publishers/context") 42 | funders_context = URIRef("https://semopenalex.org/funders/context") 43 | topics_context = URIRef("https://semopenalex.org/topics/context") 44 | keywords_context = URIRef("https://semopenalex.org/keywords/context") 45 | 46 | 47 | # create an empty graph for the dataset description 48 | g = Graph(identifier=context) 49 | 50 | g.add((dataset, RDF.type, DCAT.Dataset)) 51 | g.add((dataset, DCTERMS.title, Literal("SemOpenAlex", datatype = XSD.string))) 52 | g.add((dataset, DCTERMS.description, Literal("SemOpenAlex is a scientific publications dataset in the form of a knowledge graph.
It is also offered as a basis for knowledge graph augmentation, together with possible use cases that can enable AI-driven decision making.", datatype = XSD.string))) 53 | g.add((dataset, DCTERMS.issued, Literal(data_issued_time, datatype = XSD.dateTime))) 54 | g.add((dataset, DCTERMS.license, Literal("https://creativecommons.org/publicdomain/zero/1.0/legalcode", datatype = XSD.anyURI))) 55 | 56 | # version and keywords 57 | g.add((dataset, DCAT.version, Literal("6.0.0", datatype = XSD.string))) 58 | g.add((dataset, DCAT.keyword, Literal("concepts", datatype = XSD.string))) 59 | g.add((dataset, DCAT.keyword, Literal("topics", datatype = XSD.string))) 60 | g.add((dataset, DCAT.keyword, Literal("keywords", datatype = XSD.string))) 61 | g.add((dataset, DCAT.keyword, Literal("institutions", datatype = XSD.string))) 62 | g.add((dataset, DCAT.keyword, Literal("sources", datatype = XSD.string))) 63 | g.add((dataset, DCAT.keyword, Literal("authors", datatype = XSD.string))) 64 | g.add((dataset, DCAT.keyword, Literal("publishers", datatype = XSD.string))) 65 | g.add((dataset, DCAT.keyword, Literal("works", datatype = XSD.string))) 66 | g.add((dataset, DCAT.keyword, Literal("publications", datatype = XSD.string))) 67 | g.add((dataset, DCAT.keyword, Literal("funders", datatype = XSD.string))) 68 | g.add((dataset, DCAT.keyword, Literal("geo", datatype = XSD.string))) 69 | 70 | # linked named graphs 71 | g.add((dataset, namedGraph, concepts_context)) 72 | g.add((dataset, namedGraph, institutions_context)) 73 | g.add((dataset, namedGraph, sources_context)) 74 | g.add((dataset, namedGraph, authors_context)) 75 | g.add((dataset, namedGraph, works_context)) 76 | g.add((dataset, namedGraph, publishers_context)) 77 | g.add((dataset, namedGraph, funders_context)) 78 | g.add((dataset, namedGraph, topics_context)) 79 | g.add((dataset, namedGraph, keywords_context)) 80 | 81 | # creators 82 | metaphacts = URIRef("http://www.wikidata.org/entity/Q22132500") 83 | g.add((dataset, DCTERMS.creator, metaphacts)) 84 | g.add((metaphacts, RDF.type, FOAF.Organization)) 85 | g.add((metaphacts, FOAF.name, Literal("metaphacts GmbH", datatype = XSD.string))) 86 | aifb = URIRef("http://dbpedia.org/resource/Karlsruhe_Institute_of_Technology") 87 | g.add((dataset, DCTERMS.creator, aifb)) 88 | g.add((aifb, RDF.type, FOAF.Organization)) 89 | g.add((aifb, FOAF.name, Literal("Institute AIFB, Karlsruhe Institute of Technology (KIT)", datatype = XSD.string))) 90 | tu_dresden = URIRef("http://www.wikidata.org/entity/Q158158") 91 | g.add((dataset, DCTERMS.creator, tu_dresden)) 92 | g.add((tu_dresden, RDF.type, FOAF.Organization)) 93 | g.add((tu_dresden, FOAF.name, Literal("ScaDS.AI & TU Dresden", datatype = XSD.string))) 94 | 95 | 96 | # distributions 97 | format = URIRef("http://purl.org/dc/terms/format") 98 | dist_v1 = URIRef("http://datasets.semopenalex.org/v1/semopenalex-distribution") 99 | g.add((dataset, DCAT.distribution, dist_v1)) 100 | g.add((dist_v1, RDF.type, DCAT.Distribution)) 101 | g.add((dist_v1, DCTERMS.issued, Literal("2022-05-12", datatype = XSD.date))) 102 | g.add((dist_v1, DCTERMS.title, Literal("SemOpenAlex RDF dump", datatype = XSD.string))) 103 | g.add((dist_v1, format, Literal("N-Triples", datatype = XSD.string))) 104 | g.add((dist_v1, DCAT.mediaType, Literal("text/plain", datatype = XSD.string))) 105 | g.add((dist_v1, DCAT.accessURL, Literal("https://semopenalex.s3.amazonaws.com/browse.html", datatype = XSD.anyURI))) 106 | 107 | dist_v2 = URIRef("http://datasets.semopenalex.org/v2/semopenalex-distribution") 108 |
g.add((dataset, DCAT.distribution, dist_v2)) 109 | g.add((dist_v2, RDF.type, DCAT.Distribution)) 110 | g.add((dist_v2, DCTERMS.issued, Literal("2022-11-21", datatype = XSD.date))) 111 | g.add((dist_v2, DCTERMS.title, Literal("SemOpenAlex RDF dump", datatype = XSD.string))) 112 | g.add((dist_v2, format, Literal("TriG", datatype = XSD.string))) 113 | g.add((dist_v2, DCAT.mediaType, Literal("application/x-trig", datatype = XSD.string))) 114 | g.add((dist_v2, DCAT.accessURL, Literal("https://semopenalex.s3.amazonaws.com/browse.html", datatype = XSD.anyURI))) 115 | 116 | dist_v3 = URIRef("http://datasets.semopenalex.org/v3/semopenalex-distribution") 117 | g.add((dataset, DCAT.distribution, dist_v3)) 118 | g.add((dist_v3, RDF.type, DCAT.Distribution)) 119 | g.add((dist_v3, DCTERMS.issued, Literal("2023-04-24", datatype = XSD.date))) 120 | g.add((dist_v3, DCTERMS.title, Literal("SemOpenAlex RDF dump", datatype = XSD.string))) 121 | g.add((dist_v3, format, Literal("TriG", datatype = XSD.string))) 122 | g.add((dist_v3, DCAT.mediaType, Literal("application/x-trig", datatype = XSD.string))) 123 | g.add((dist_v3, DCAT.accessURL, Literal("https://semopenalex.s3.amazonaws.com/browse.html", datatype = XSD.anyURI))) 124 | 125 | dist_v4 = URIRef("http://datasets.semopenalex.org/v4/semopenalex-distribution") 126 | g.add((dataset, DCAT.distribution, dist_v4)) 127 | g.add((dist_v4, RDF.type, DCAT.Distribution)) 128 | g.add((dist_v4, DCTERMS.issued, Literal("2023-10-28", datatype = XSD.date))) 129 | g.add((dist_v4, DCTERMS.title, Literal("SemOpenAlex RDF dump", datatype = XSD.string))) 130 | g.add((dist_v4, format, Literal("TriG", datatype = XSD.string))) 131 | g.add((dist_v4, DCAT.mediaType, Literal("application/x-trig", datatype = XSD.string))) 132 | g.add((dist_v4, DCAT.accessURL, Literal("https://semopenalex.s3.amazonaws.com/browse.html", datatype = XSD.anyURI))) 133 | 134 | dist_v5 = URIRef("http://datasets.semopenalex.org/v5/semopenalex-distribution") 135 | g.add((dataset, DCAT.distribution, dist_v5)) 136 | g.add((dist_v5, RDF.type, DCAT.Distribution)) 137 | g.add((dist_v5, DCTERMS.issued, Literal("2024-06-09", datatype = XSD.date))) 138 | g.add((dist_v5, DCTERMS.title, Literal("SemOpenAlex RDF dump", datatype = XSD.string))) 139 | g.add((dist_v5, format, Literal("TriG", datatype = XSD.string))) 140 | g.add((dist_v5, DCAT.mediaType, Literal("application/x-trig", datatype = XSD.string))) 141 | g.add((dist_v5, DCAT.accessURL, Literal("https://semopenalex.s3.amazonaws.com/browse.html", datatype = XSD.anyURI))) 142 | 143 | dist_v6 = URIRef("http://datasets.semopenalex.org/v6/semopenalex-distribution") 144 | g.add((dataset, DCAT.distribution, dist_v6)) 145 | g.add((dist_v6, RDF.type, DCAT.Distribution)) 146 | g.add((dist_v6, DCTERMS.issued, Literal(today, datatype = XSD.date))) 147 | g.add((dist_v6, DCTERMS.title, Literal("SemOpenAlex RDF dump", datatype = XSD.string))) 148 | g.add((dist_v6, format, Literal("TriG", datatype = XSD.string))) 149 | g.add((dist_v6, DCAT.mediaType, Literal("application/x-trig", datatype = XSD.string))) 150 | g.add((dist_v6, DCAT.accessURL, Literal("https://semopenalex.s3.amazonaws.com/browse.html", datatype = XSD.anyURI))) 151 | 152 | with open(trig_output_file_path, "w", encoding="utf-8") as dataset_file: 153 | dataset_file.write(g.serialize(format='trig')) 154 | 155 | # the file is closed automatically when the with-block exits 156 | 157 | print("Done with .trig parsing and graph serialization for SemOpenAlex dataset.") 158 | --------------------------------------------------------------------------------
/transformation-scripts/semopenalex-keywords.py: -------------------------------------------------------------------------------- 1 | # We use the topics files to create the keywords files because the keywords are not provided as separate files in the data snapshot 2 | # Each topic entry has a list of at most 10 keywords 3 | 4 | from rdflib import Graph 5 | from rdflib import URIRef, BNode, Literal 6 | from rdflib.namespace import DCTERMS, RDF, RDFS, XSD, OWL, FOAF, SKOS 7 | from rdflib import term 8 | import json 9 | import os 10 | import glob 11 | import gzip 12 | import re 13 | import time 14 | import boto3 15 | from datetime import date 16 | from pathlib import Path 17 | from botocore import UNSIGNED 18 | from botocore.config import Config 19 | 20 | def get_file_folders(s3_client, bucket_name, prefix=""): 21 | file_names = [] 22 | folders = [] 23 | 24 | default_kwargs = { 25 | "Bucket": bucket_name, 26 | "Prefix": prefix 27 | } 28 | next_token = "" 29 | 30 | while next_token is not None: 31 | updated_kwargs = default_kwargs.copy() 32 | if next_token != "": 33 | updated_kwargs["ContinuationToken"] = next_token 34 | 35 | response = s3_client.list_objects_v2(**updated_kwargs) 36 | contents = response.get("Contents") 37 | 38 | for result in contents: 39 | key = result.get("Key") 40 | if key[-1] == "/": 41 | folders.append(key) 42 | else: 43 | file_names.append(key) 44 | 45 | next_token = response.get("NextContinuationToken") 46 | 47 | return file_names, folders 48 | 49 | def download_files(s3_client, bucket_name, local_path, file_names, folders): 50 | 51 | local_path = Path(local_path) 52 | 53 | for folder in folders: 54 | folder_path = Path.joinpath(local_path, folder) 55 | folder_path.mkdir(parents=True, exist_ok=True) 56 | 57 | for file_name in file_names: 58 | file_path = Path.joinpath(local_path, file_name) 59 | file_path.parent.mkdir(parents=True, exist_ok=True) 60 | s3_client.download_file( 61 | bucket_name, 62 | file_name, 63 | str(file_path) 64 | ) 65 | 66 | replacements = [ 67 | { 68 | "search" : re.compile(r'"'), 69 | "replace" : '', #" 70 | "comment" : "Unescaped quotation marks" 71 | },{ 72 | "search" : re.compile(r'\\'), 73 | "replace" : '', #\ 74 | "comment" : "Unescaped backslash" 75 | },{ 76 | "search" : re.compile(r'\n'), 77 | "replace" : '', 78 | "comment" : "Newline string" 79 | },{ 80 | "search" : re.compile(r'\b'), 81 | "replace" : '', 82 | "comment" : "Backspace string" 83 | },{ 84 | "search" : re.compile(r'\t'), 85 | "replace" : '', 86 | "comment" : "Tab string" 87 | },{ 88 | "search" : re.compile(r'\r'), 89 | "replace" : '', 90 | "comment" : "Carriage return string" 91 | },{ 92 | "search" : re.compile(r'\f'), 93 | "replace" : '', 94 | "comment" : "Form feed string" 95 | } 96 | ] 97 | replacements_url = [ 98 | { 99 | "search" : re.compile(r'"'), 100 | "replace" : '%22', 101 | "comment" : "Unescaped quotation mark in URI" 102 | },{ 103 | "search" : re.compile(r'\\'), 104 | "replace" : '%5c', 105 | "comment" : "Unescaped backslash in URI" 106 | },{ 107 | "search" : re.compile(r'\n'), 108 | "replace" : '', 109 | "comment" : "Newline string" 110 | },{ 111 | "search" : re.compile(r'\r'), 112 | "replace" : '', 113 | "comment" : "Carriage return string" 114 | },{ 115 | "search" : re.compile(r'\t'), 116 | "replace" : '', 117 | "comment" : "Tab string" 118 | }, { 119 | "search": re.compile(r'/'), 120 | "replace": '', 121 | "comment": "Slash in URI" 122 | }, { 123 | "search": re.compile(r'\$'), 124 | "replace": '', 125 | "comment": "Dollar sign in URI" 126 | }, { 127 | "search":
re.compile(r'_'), 128 | "replace": '', 129 | "comment": "Underscore in URI" 130 | }, { 131 | "search": re.compile(r'{'), 132 | "replace": '', 133 | "comment": "Opening curly brace in URI" 134 | }, { 135 | "search": re.compile(r'}'), 136 | "replace": '', 137 | "comment": "Closing curly brace in URI" 138 | } 139 | ] 140 | 141 | def clean(nameStr): 142 | cleaned_str = nameStr 143 | for r in replacements: 144 | if re.search(r["search"], nameStr): 145 | cleaned_str = re.sub(r["search"], r["replace"], cleaned_str) 146 | return cleaned_str 147 | def clean_url(nameStr): 148 | cleaned_str = nameStr 149 | for r in replacements_url: 150 | if re.search(r["search"], nameStr): 151 | cleaned_str = re.sub(r["search"], r["replace"], cleaned_str) 152 | return cleaned_str 153 | 154 | def clean_date(dateStr): 155 | return dateStr.split("T")[0] 156 | 157 | # method to validate the keyword URIs. Only valid URIs are returned, otherwise None. 158 | def validate_keyword_uri(keyword_uri): 159 | if keyword_uri.startswith('https://semopenalex.org/keyword/'): 160 | suffix = keyword_uri[len('https://semopenalex.org/keyword/'):] 161 | suffix = suffix.replace(',', '-') 162 | if re.match(r'^[a-zA-Z0-9\-]+$', suffix): 163 | return 'https://semopenalex.org/keyword/' + suffix 164 | else: 165 | return None 166 | else: 167 | return None 168 | 169 | # method to transform the display_name of a keyword into the keyword URI 170 | # Artificial Intelligence -> https://semopenalex.org/keyword/artificial-intelligence 171 | # Clinical Decision Support -> https://semopenalex.org/keyword/clinical-decision-support 172 | def transform_keyword_to_uri(keyword): 173 | keyword = keyword.lower() 174 | keyword = keyword.replace(" ", "-") 175 | keyword = clean_url(keyword) 176 | keyword = "https://semopenalex.org/keyword/" + keyword 177 | return keyword 178 | 179 | 180 | # info for namespaces used in SOA 181 | soa_namespace_class = "https://semopenalex.org/ontology/" 182 | 183 | # SOA classes used in this file 184 | soa_class_keyword = URIRef(soa_namespace_class + "Keyword") 185 | 186 | # SOA predicates 187 | 188 | # keywords entity context 189 | context = URIRef("https://semopenalex.org/keywords/context") 190 | 191 | # keyword scheme URI 192 | keyword_scheme_uri = URIRef("https://semopenalex.org/keywords/") 193 | 194 | i = 0 195 | error_count = 0 196 | keywords_graph = Graph(identifier=context) 197 | 198 | today = date.today() 199 | 200 | # Entity type of the snapshot files to be processed 201 | ########## 202 | ENTITY_TYPE_INPUT = 'topics' 203 | ########## 204 | 205 | ########## 206 | ENTITY_TYPE_OUTPUT = 'keywords' 207 | ########## 208 | 209 | data_dump_input_root_dir = '/opt/openalex-snapshot' 210 | 211 | absolute_path = os.path.dirname(__file__) 212 | trig_output_dir_path = os.path.join(absolute_path, '../graphdb-preload/graphdb-import/') 213 | trig_output_file_path = f'{trig_output_dir_path}{ENTITY_TYPE_OUTPUT}-semopenalex-{today}.trig' 214 | 215 | data_dump_start_time = time.ctime() 216 | print('topics entity files download started at: ' + data_dump_start_time) 217 | # Download the topics entity snapshot 218 | client = boto3.client("s3", config=Config(signature_version=UNSIGNED)) 219 | file_names, folders = get_file_folders(client, "openalex", "data/topics/") 220 | download_files(client, "openalex", data_dump_input_root_dir, file_names, folders) 221 | print('topics entity files finished downloading.') 222 | 223 | start_time = time.ctime() 224 | print('topics entity transformation started at: ' + start_time) 225 | 226 | with open(trig_output_file_path,
"w", encoding="utf-8") as g: 227 | 228 | #Path where the OpenAlex data for the current entity type is located 229 | data_dump_input_entity_dir = f'{data_dump_input_root_dir}/data/{ENTITY_TYPE_INPUT}/*' 230 | 231 | 232 | # initialize and add domain scheme URI 233 | keywords_graph.add((keyword_scheme_uri, RDF.type, SKOS.ConceptScheme)) 234 | keywords_graph.add((keyword_scheme_uri, SKOS.prefLabel, Literal("SemOpenAlex Keywords", datatype = XSD.string))) 235 | keywords_graph.add((keyword_scheme_uri, URIRef("http://purl.org/dc/terms/description"), Literal("SemOpenAlex keywords are based on SemOpenAlex topics. They are specific words or phrases used to capture the essential topics or ideas of an entity.", datatype = XSD.string))) 236 | 237 | 238 | for filename in glob.glob(os.path.join(data_dump_input_entity_dir, '*.gz')): 239 | with gzip.open(filename, 'r') as f: 240 | for line in f: 241 | try: 242 | json_data = json.loads(line.decode('utf-8')) 243 | 244 | # keyword-URI 245 | # keywords 246 | topic_keywords = json_data['keywords'] 247 | if not topic_keywords is None: 248 | for keyword in topic_keywords: 249 | keyword_uri = transform_keyword_to_uri(keyword) 250 | keyword_uri = validate_keyword_uri(keyword_uri) 251 | 252 | if not keyword_uri is None: 253 | if term._is_valid_uri(keyword_uri): 254 | keywords_graph.add((URIRef(keyword_uri), RDF.type, soa_class_keyword)) 255 | keywords_graph.add((URIRef(keyword_uri), RDF.type, SKOS.Concept)) 256 | keywords_graph.add((URIRef(keyword_uri), SKOS.inScheme, keyword_scheme_uri)) 257 | keywords_graph.add((keyword_scheme_uri, SKOS.hasTopConcept, URIRef(keyword_uri))) 258 | keywords_graph.add((URIRef(keyword_uri), SKOS.prefLabel, Literal(keyword, datatype=XSD.string))) 259 | 260 | i += 1 261 | if i % 100 == 0: 262 | print('Processed topics entity {} lines for the keyword files generation'.format(i)) 263 | 264 | except Exception as e: 265 | print(str((e)) + ' Error in topics entity line for the keyword files generation' + str(i + 1 + error_count)) 266 | error_count += 1 267 | pass 268 | 269 | # Write the graph to the .trig file 270 | g.write(keywords_graph.serialize(format='trig')) 271 | 272 | f.close() 273 | g.close() 274 | 275 | print("Done with .trig parsing and graph serialization..") 276 | print("Start zipping the .trig file.. 
") 277 | 278 | # gzip file directly with command 279 | # -v for live output, --fast for faster compression with about 90% size reduction, -k for keeping the original .trig file 280 | os.system(f'gzip --fast -v {trig_output_file_path}') 281 | 282 | end_time = time.ctime() 283 | with open(f"{trig_output_dir_path}{ENTITY_TYPE_OUTPUT}-transformation-summary.txt", "w") as z: 284 | z.write('Start Time: {} .\n'.format(start_time)) 285 | z.write('Items (lines) processed: {} .\n'.format(i)) 286 | z.write('Errors encountered: {} .\n'.format(error_count)) 287 | z.write('End Time: {} .\n'.format(end_time)) 288 | z.close() 289 | 290 | 291 | print("Done") 292 | print('Processed {} lines in total'.format(i)) 293 | print('Error count: '+str(error_count)) 294 | print("#############################") 295 | -------------------------------------------------------------------------------- /transformation-scripts/semopenalex-subfields.py: -------------------------------------------------------------------------------- 1 | from rdflib import Graph 2 | from rdflib import URIRef, BNode, Literal 3 | from rdflib.namespace import DCTERMS, RDF, RDFS, XSD, OWL, FOAF, SKOS 4 | import json 5 | import os 6 | import glob 7 | import gzip 8 | import re 9 | import time 10 | import boto3 11 | from datetime import date 12 | from pathlib import Path 13 | from botocore import UNSIGNED 14 | from botocore.config import Config 15 | 16 | def get_file_folders(s3_client, bucket_name, prefix=""): 17 | file_names = [] 18 | folders = [] 19 | 20 | default_kwargs = { 21 | "Bucket": bucket_name, 22 | "Prefix": prefix 23 | } 24 | next_token = "" 25 | 26 | while next_token is not None: 27 | updated_kwargs = default_kwargs.copy() 28 | if next_token != "": 29 | updated_kwargs["ContinuationToken"] = next_token 30 | 31 | response = s3_client.list_objects_v2(**default_kwargs) 32 | contents = response.get("Contents") 33 | 34 | for result in contents: 35 | key = result.get("Key") 36 | if key[-1] == "/": 37 | folders.append(key) 38 | else: 39 | file_names.append(key) 40 | 41 | next_token = response.get("NextContinuationToken") 42 | 43 | return file_names, folders 44 | 45 | def download_files(s3_client, bucket_name, local_path, file_names, folders): 46 | 47 | local_path = Path(local_path) 48 | 49 | for folder in folders: 50 | folder_path = Path.joinpath(local_path, folder) 51 | folder_path.mkdir(parents=True, exist_ok=True) 52 | 53 | for file_name in file_names: 54 | file_path = Path.joinpath(local_path, file_name) 55 | file_path.parent.mkdir(parents=True, exist_ok=True) 56 | s3_client.download_file( 57 | bucket_name, 58 | file_name, 59 | str(file_path) 60 | ) 61 | 62 | replacements = [ 63 | { 64 | "search" : re.compile(r'"'), 65 | "replace" : '', #" 66 | "comment" : "Unescaped quotation marks" 67 | },{ 68 | "search" : re.compile(r'\\'), 69 | "replace" : '', #\ 70 | "comment" : "Unescaped backslash" 71 | },{ 72 | "search" : re.compile(r'\n'), 73 | "replace" : '', 74 | "comment" : "Newline string" 75 | },{ 76 | "search" : re.compile(r'\b'), 77 | "replace" : '', 78 | "comment" : "Newline string" 79 | },{ 80 | "search" : re.compile(r'\t'), 81 | "replace" : '', 82 | "comment" : "Newline string" 83 | },{ 84 | "search" : re.compile(r'\r'), 85 | "replace" : '', 86 | "comment" : "Newline string" 87 | },{ 88 | "search" : re.compile(r'\f'), 89 | "replace" : '', 90 | "comment" : "Newline string" 91 | } 92 | ] 93 | replacements_url = [ 94 | { 95 | "search" : re.compile(r'"'), 96 | "replace" : '%22', 97 | "comment" : "Unescaped quotation mark in URI" 98 | },{ 99 | 
"search" : re.compile(r'\\'), 100 | "replace" : '%5c', 101 | "comment" : "Unescaped backslash in URI" 102 | },{ 103 | "search" : re.compile(r'\n'), 104 | "replace" : '', 105 | "comment" : "Newline string" 106 | },{ 107 | "search" : re.compile(r'\r'), 108 | "replace" : '', 109 | "comment" : "Newline string" 110 | },{ 111 | "search" : re.compile(r'\t'), 112 | "replace" : '', 113 | "comment" : "Newline string" 114 | }, 115 | ] 116 | 117 | def clean(nameStr): 118 | cleaned_str = nameStr 119 | for r in replacements: 120 | if re.search(r["search"], nameStr): 121 | cleaned_str = re.sub(r["search"], r["replace"], cleaned_str) 122 | return cleaned_str 123 | def clean_url(nameStr): 124 | cleaned_str = nameStr 125 | for r in replacements_url: 126 | if re.search(r["search"], nameStr): 127 | cleaned_str = re.sub(r["search"], r["replace"], cleaned_str) 128 | return cleaned_str 129 | 130 | # e.g 2024-05-27T05:10:45.699856 to 2024-05-27 131 | def clean_date(dateStr): 132 | return dateStr.split("T")[0] 133 | 134 | # info for namespaces used in SOA 135 | soa_namespace_class = "https://semopenalex.org/ontology/" 136 | soa_namespace_countsbyyear = "https://semopenalex.org/countsByYear/" 137 | soa_namespace_subfields = "https://semopenalex.org/subfield/" 138 | soa_namespace_fields = "https://semopenalex.org/field/" 139 | soa_namespace_domains = "https://semopenalex.org/domain/" 140 | soa_namespace_publishers = "https://semopenalex.org/publisher/" 141 | soa_namespace_institutions = "https://semopenalex.org/institution/" 142 | 143 | # SOA classes used in this file 144 | soa_class_topic = URIRef(soa_namespace_class + "Topic") 145 | soa_class_subfield = URIRef(soa_namespace_class + "Subfield") 146 | soa_class_counts_by_year = URIRef(soa_namespace_class + "CountsByYear") 147 | 148 | # SOA predicates 149 | works_count_predicate = URIRef("https://semopenalex.org/ontology/worksCount") 150 | cited_by_count_predicate = URIRef("https://semopenalex.org/ontology/citedByCount") 151 | 152 | # topics entity context 153 | context = URIRef("https://semopenalex.org/topics/context") 154 | 155 | #topic scheme URI 156 | topic_scheme_uri = URIRef("https://semopenalex.org/topics") 157 | 158 | i = 0 159 | error_count = 0 160 | subfields_graph = Graph(identifier=context) 161 | 162 | today = date.today() 163 | 164 | ########## 165 | ENTITY_TYPE = 'subfields' 166 | ########## 167 | 168 | data_dump_input_root_dir = '/opt/openalex-snapshot' 169 | 170 | absolute_path = os.path.dirname(__file__) 171 | trig_output_dir_path = os.path.join(absolute_path, '../graphdb-preload/graphdb-import/') 172 | trig_output_file_path = f'{trig_output_dir_path}{ENTITY_TYPE}-semopenalex-{today}.trig' 173 | 174 | data_dump_start_time = time.ctime() 175 | print('subfields entity files started to download at: '+ data_dump_start_time) 176 | # Copy institutions entity snapshot 177 | client = boto3.client("s3", config=Config(signature_version=UNSIGNED)) 178 | file_names, folders = get_file_folders(client, "openalex", "data/subfields/") 179 | download_files(client, "openalex", data_dump_input_root_dir, file_names, folders) 180 | print('subfields entity files finished to download.') 181 | 182 | start_time = time.ctime() 183 | print('subfields entity started to transform at: '+ start_time) 184 | 185 | with open(trig_output_file_path, "w", encoding="utf-8") as g: 186 | 187 | #Path where the OpenAlex data for the current entity type is located 188 | data_dump_input_entity_dir = f'{data_dump_input_root_dir}/data/{ENTITY_TYPE}/*' 189 | 190 | for filename in 
today = date.today()

##########
ENTITY_TYPE = 'subfields'
##########

data_dump_input_root_dir = '/opt/openalex-snapshot'

absolute_path = os.path.dirname(__file__)
trig_output_dir_path = os.path.join(absolute_path, '../graphdb-preload/graphdb-import/')
trig_output_file_path = f'{trig_output_dir_path}{ENTITY_TYPE}-semopenalex-{today}.trig'

data_dump_start_time = time.ctime()
print('subfields entity files started downloading at: ' + data_dump_start_time)
# Copy the subfields entity snapshot
client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
file_names, folders = get_file_folders(client, "openalex", "data/subfields/")
download_files(client, "openalex", data_dump_input_root_dir, file_names, folders)
print('subfields entity files finished downloading.')

start_time = time.ctime()
print('subfields entity started transforming at: ' + start_time)

with open(trig_output_file_path, "w", encoding="utf-8") as g:

    # Path where the OpenAlex data for the current entity type is located
    data_dump_input_entity_dir = f'{data_dump_input_root_dir}/data/{ENTITY_TYPE}/*'

    for filename in glob.glob(os.path.join(data_dump_input_entity_dir, '*.gz')):
        with gzip.open(filename, 'r') as f:
            for line in f:
                try:
                    json_data = json.loads(line.decode('utf-8'))

                    # subfield-ID
                    subfield_id = json_data['id'].replace("https://openalex.org/subfields/", "")
                    subfield_uri = URIRef(soa_namespace_subfields + subfield_id)
                    subfields_graph.add((subfield_uri, RDF.type, soa_class_subfield))
                    subfields_graph.add((subfield_uri, RDF.type, SKOS.Concept))

                    # topic scheme
                    subfields_graph.add((subfield_uri, SKOS.inScheme, topic_scheme_uri))

                    # display_name
                    subfield_display_name = json_data['display_name']
                    if subfield_display_name is not None:
                        subfield_display_name = clean(subfield_display_name)
                        subfields_graph.add((subfield_uri, SKOS.prefLabel, Literal(subfield_display_name, datatype=XSD.string)))

                    # description
                    subfield_description = json_data['description']
                    if subfield_description is not None:
                        subfield_description = clean(subfield_description)
                        subfields_graph.add((subfield_uri, SKOS.note, Literal(subfield_description, datatype=XSD.string)))

                    # field (the broader concept of the subfield)
                    subfield_field = json_data['field']['id']
                    if subfield_field is not None:
                        subfield_field_id = subfield_field.replace("https://openalex.org/fields/", "")
                        subfield_field_uri = URIRef(soa_namespace_fields + subfield_field_id)
                        subfields_graph.add((subfield_uri, SKOS.broader, subfield_field_uri))

                    # domain (not transformed in this script)

                    # topics (not transformed in this script)

                    # ids (wikidata, wikipedia)
                    subfield_wikidata = json_data.get('ids').get('wikidata')
                    if subfield_wikidata is not None:
                        subfield_wikidata = clean_url(subfield_wikidata)
                        subfields_graph.add((subfield_uri, OWL.sameAs, URIRef(subfield_wikidata)))

                    subfield_wikipedia = json_data.get('ids').get('wikipedia')
                    if subfield_wikipedia is not None:
                        subfield_wikipedia = clean_url(subfield_wikipedia)
                        subfields_graph.add((subfield_uri, RDFS.seeAlso, URIRef(subfield_wikipedia)))

                    # updated_date
                    subfield_updated_date = json_data['updated_date']
                    if subfield_updated_date is not None:
                        subfield_updated_date = clean_date(subfield_updated_date)
                        subfields_graph.add((subfield_uri, DCTERMS.modified, Literal(subfield_updated_date, datatype=XSD.date)))

                    # created_date
                    subfield_created_date = json_data['created_date']
                    if subfield_created_date is not None:
                        subfields_graph.add((subfield_uri, DCTERMS.created, Literal(subfield_created_date, datatype=XSD.date)))

                    # works_count
                    subfield_works_count = json_data['works_count']
                    if subfield_works_count is not None:
                        subfields_graph.add((subfield_uri, works_count_predicate, Literal(subfield_works_count, datatype=XSD.integer)))

                    # cited_by_count
                    subfield_cited_by_count = json_data['cited_by_count']
                    if subfield_cited_by_count is not None:
                        subfields_graph.add((subfield_uri, cited_by_count_predicate, Literal(subfield_cited_by_count, datatype=XSD.integer)))

                    # siblings
                    subfield_siblings = json_data['siblings']
                    if subfield_siblings is not None:
                        for sibling in subfield_siblings:
                            sibling_id = sibling['id'].replace("https://openalex.org/subfields/", "")
                            sibling_uri = URIRef(soa_namespace_subfields + sibling_id)
                            subfields_graph.add((subfield_uri, SKOS.related, sibling_uri))

                    i += 1
                    if i % 10 == 0:
                        print('Processed subfields entity {} lines'.format(i))

                except Exception as e:
                    print(str(e) + ' Error in subfields entity line ' + str(i + 1 + error_count))
                    error_count += 1
                    pass

    # Write the graph to the .trig file
    g.write(subfields_graph.serialize(format='trig'))

# (the file handles f and g are closed automatically by the with-statements above)
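# For orientation, one subfield is expected to serialize to a TriG fragment
# roughly like the following (hand-written illustration; the IDs and the label
# are assumptions, not actual output):
#   <https://semopenalex.org/subfield/1702>
#       a <https://semopenalex.org/ontology/Subfield>, skos:Concept ;
#       skos:prefLabel "Artificial Intelligence"^^xsd:string ;
#       skos:broader <https://semopenalex.org/field/17> ;
#       skos:inScheme <https://semopenalex.org/topics> .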
print("Done with .trig parsing and graph serialization..")
print("Start zipping the .trig file..")

# gzip the file directly via the shell command:
# -v for live output, --fast for faster compression with about 90% size
# reduction; add -k if the original .trig file should be kept as well
os.system(f'gzip --fast -v {trig_output_file_path}')

end_time = time.ctime()
with open(f"{trig_output_dir_path}{ENTITY_TYPE}-transformation-summary.txt", "w") as z:
    z.write('Start Time: {} .\n'.format(start_time))
    z.write('Items (lines) processed: {} .\n'.format(i))
    z.write('Errors encountered: {} .\n'.format(error_count))
    z.write('End Time: {} .\n'.format(end_time))

print("Done")
print('Processed {} lines in total'.format(i))
print('Error count: ' + str(error_count))
print("#############################")
--------------------------------------------------------------------------------