├── .gitignore
├── BUILD.org
├── README.md
├── alzkb
│   ├── __init__.py
│   ├── build.py
│   ├── data
│   │   ├── alzkb.rdf
│   │   └── alzkb_v2.rdf
│   ├── populate_edge_weights.py
│   ├── populate_ontology.py
│   └── rdf_to_memgraph_csv.py
├── img
│   └── build-abstract.png
├── scripts
│   ├── alzkb_parse_disgenet.py
│   ├── alzkb_parse_dorothea.py
│   ├── alzkb_parse_drugbank.py
│   ├── alzkb_parse_ncbigene.py
│   └── dorothea.R
└── setup.py

/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python,macos,emacs,linux 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,emacs,linux 3 | 4 | ### Emacs ### 5 | # -*- mode: gitignore; -*- 6 | *~ 7 | \#*\# 8 | /.emacs.desktop 9 | /.emacs.desktop.lock 10 | *.elc 11 | auto-save-list 12 | tramp 13 | .\#* 14 | 15 | # Org-mode 16 | .org-id-locations 17 | *_archive 18 | 19 | # flymake-mode 20 | *_flymake.* 21 | 22 | # eshell files 23 | /eshell/history 24 | /eshell/lastdir 25 | 26 | # elpa packages 27 | /elpa/ 28 | 29 | # reftex files 30 | *.rel 31 | 32 | # AUCTeX auto folder 33 | /auto/ 34 | 35 | # cask packages 36 | .cask/ 37 | dist/ 38 | 39 | # Flycheck 40 | flycheck_*.el 41 | 42 | # server auth directory 43 | /server/ 44 | 45 | # projectiles files 46 | .projectile 47 | 48 | # directory configuration 49 | .dir-locals.el 50 | 51 | # network security 52 | /network-security.data 53 | 54 | 55 | ### Linux ### 56 | 57 | # temporary files which can be created if a process still has a handle open of a deleted file 58 | .fuse_hidden* 59 | 60 | # KDE directory preferences 61 | .directory 62 | 63 | # Linux trash folder which might appear on any partition or disk 64 | .Trash-* 65 | 66 | # .nfs files are created when an open file is removed but is still being accessed 67 | .nfs* 68 | 69 | ### macOS ### 70 | # General 71 | .DS_Store 72 | .AppleDouble 73 | .LSOverride 74 | 75 | # Icon must end with two \r 76 | Icon 77 | 78 | 79 | # Thumbnails 80 | ._* 81 | 82 | # Files that might appear in the root of a volume 83 | .DocumentRevisions-V100 84 | .fseventsd 85 | .Spotlight-V100 86 | .TemporaryItems 87 | .Trashes 88 | .VolumeIcon.icns 89 | .com.apple.timemachine.donotpresent 90 | 91 | # Directories potentially created on remote AFP share 92 | .AppleDB 93 | .AppleDesktop 94 | Network Trash Folder 95 | Temporary Items 96 | .apdisk 97 | 98 | ### Python ### 99 | # Byte-compiled / optimized / DLL files 100 | __pycache__/ 101 | *.py[cod] 102 | *$py.class 103 | 104 | # C extensions 105 | *.so 106 | 107 | # Distribution / packaging 108 | .Python 109 | build/ 110 | develop-eggs/ 111 | downloads/ 112 | eggs/ 113 | .eggs/ 114 | lib/ 115 | lib64/ 116 | parts/ 117 | sdist/ 118 | var/ 119 | wheels/ 120 | share/python-wheels/ 121 | *.egg-info/ 122 | .installed.cfg 123 | *.egg 124 | MANIFEST 125 | 126 | # PyInstaller 127 | # Usually these files are written by a python script from a template 128 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
129 | *.manifest 130 | *.spec 131 | 132 | # Installer logs 133 | pip-log.txt 134 | pip-delete-this-directory.txt 135 | 136 | # Unit test / coverage reports 137 | htmlcov/ 138 | .tox/ 139 | .nox/ 140 | .coverage 141 | .coverage.* 142 | .cache 143 | nosetests.xml 144 | coverage.xml 145 | *.cover 146 | *.py,cover 147 | .hypothesis/ 148 | .pytest_cache/ 149 | cover/ 150 | 151 | # Translations 152 | *.mo 153 | *.pot 154 | 155 | # Django stuff: 156 | *.log 157 | local_settings.py 158 | db.sqlite3 159 | db.sqlite3-journal 160 | 161 | # Flask stuff: 162 | instance/ 163 | .webassets-cache 164 | 165 | # Scrapy stuff: 166 | .scrapy 167 | 168 | # Sphinx documentation 169 | docs/_build/ 170 | 171 | # PyBuilder 172 | .pybuilder/ 173 | target/ 174 | 175 | # Jupyter Notebook 176 | .ipynb_checkpoints 177 | 178 | # IPython 179 | profile_default/ 180 | ipython_config.py 181 | 182 | # pyenv 183 | # For a library or package, you might want to ignore these files since the code is 184 | # intended to run in multiple environments; otherwise, check them in: 185 | # .python-version 186 | 187 | # pipenv 188 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 189 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 190 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 191 | # install all needed dependencies. 192 | #Pipfile.lock 193 | 194 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 195 | __pypackages__/ 196 | 197 | # Celery stuff 198 | celerybeat-schedule 199 | celerybeat.pid 200 | 201 | # SageMath parsed files 202 | *.sage.py 203 | 204 | # Environments 205 | .env 206 | .venv 207 | env/ 208 | venv/ 209 | ENV/ 210 | env.bak/ 211 | venv.bak/ 212 | 213 | # Spyder project settings 214 | .spyderproject 215 | .spyproject 216 | 217 | # Rope project settings 218 | .ropeproject 219 | 220 | # mkdocs documentation 221 | /site 222 | 223 | # mypy 224 | .mypy_cache/ 225 | .dmypy.json 226 | dmypy.json 227 | 228 | # Pyre type checker 229 | .pyre/ 230 | 231 | # pytype static type analyzer 232 | .pytype/ 233 | 234 | # Cython debug symbols 235 | cython_debug/ 236 | 237 | # End of https://www.toptal.com/developers/gitignore/api/python,macos,emacs,linux 238 | 239 | # Ignore files from the alzkb-site 240 | node_modules 241 | default.conf 242 | *.dump 243 | *.key 244 | *.pem 245 | *.crt 246 | -------------------------------------------------------------------------------- /BUILD.org: -------------------------------------------------------------------------------- 1 | #+TITLE: Building AlzKB (from scratch) 2 | #+AUTHOR: Joseph D. Romano 3 | #+EMAIL: joseph.romano@pennmedicine.upenn.edu 4 | #+LANGUAGE: en 5 | #+OPTIONS: toc:nil author 6 | 7 | * Overview 8 | This guide will teach you the complete process of building the 9 | Alzheimer's Knowledge Base (AlzKB). It's not a concise process, but it 10 | is extensible to other applications of knowledge engineering. We use 11 | the same process for our other knowledge bases (such as [[https://comptox.ai][ComptoxAI]]), so 12 | this guide can also be used to teach you how to build your own. 13 | 14 | The following diagram gives an overview of the build process: 15 | 16 | #+CAPTION: Summary of how to build AlzKB 17 | [[./img/build-abstract.png]] 18 | 19 | 1. First, you use domain knowledge to create the ontology 20 | 2. Then, you collect the data sources and use them to populate the 21 | ontology 22 | 3. 
Finally, you convert the ontology into a graph database
23 |
24 | * 1.: Creating the AlzKB Ontology
25 | _Important note_: Most users don't need to follow these steps, since
26 | this has already been done! Unless you want to extend AlzKB or make major
27 | modifications to its node/edge types, you should skip to the [[Obtaining the third-party data sources][next
28 | section]]. If you DO want to do those things, then keep reading.
29 |
30 | AlzKB uses an OWL 2 ontology to act something like a 'template' for
31 | the nodes and relationships in the final knowledge graph. While the
32 | actual nodes and relationships are added automatically according to
33 | the 'rules' defined in the ontology, the ontology itself is
34 | constructed manually, using domain knowledge about AD. We do this
35 | using the Protégé ontology editor. If you don't already have it,
36 | download and install [[https://protege.stanford.edu/software.php][Protégé Desktop]] on your computer.
37 |
38 | * 2.: Obtaining the third-party data sources
39 | The next step is to collect the source data files that will eventually
40 | become the nodes, relationships, and properties in AlzKB's knowledge
41 | graph. Since databases are distributed in a variety of formats and
42 | modalities, you will have to work with a mix of plain-text "flat"
43 | files as well as relational (SQL) databases. All of the SQL databases
44 | parsed to build AlzKB are distributed for MySQL (as opposed to some
45 | other flavor of SQL).
46 |
47 | ** Flat file data sources
48 |
49 | |-----------+----------------+-----------------------------------+---------------------------+--------------------|
50 | | Source    | Directory name | Entity type(s)                    | URL                       | Extra instructions |
51 | |-----------+----------------+-----------------------------------+---------------------------+--------------------|
52 | | Hetionet  | =hetionet=     | Many - see =populate_ontology.py= | [[https://github.com/hetio/hetionet/tree/master/hetnet/tsv][GitHub]] | [[https://het.io][Hetionet]] |
53 | | NCBI Gene | =ncbigene=     | Genes                             | [[https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz][Homo_sapiens.gene_info.gz]] | [[https://www.ncbi.nlm.nih.gov/gene/][NCBI Gene]] |
54 | | Drugbank  | =drugbank=     | Drugs / drug candidates           | [[https://go.drugbank.com/releases/latest#open-data][DrugBank website]] | [[https://go.drugbank.com][Drugbank]] |
55 | | DisGeNET  | =disgenet=     | Diseases and disease-gene edges   | [[https://www.disgenet.org/][DisGeNET]] | [[https://disgenet.com][DisGeNET]] |
56 |
57 |
58 | *** Hetionet
59 | Download the =hetionet-v1.0-edges.sif.gz= (extract it using =gunzip=)
60 | and =hetionet-v1.0-nodes.tsv= files from the Hetionet GitHub
61 | repository. Both of them are, essentially, TSV files, even though one
62 | has the =.sif= extension.
63 |
64 | Hetionet is, itself, a knowledge base, and contains many of the core
65 | biological entities used in AlzKB. Accordingly, it contains data
66 | derived from many other third-party sources.
67 |
68 | *** NCBI Gene
69 | Download the =Homo_sapiens.gene_info.gz= file from the NCBI FTP page
70 | and extract it (e.g., using =gunzip=).
71 |
72 | Create a =CUSTOM= subdirectory inside the =ncbigene= directory. Inside
73 | of that subdirectory, place the following two files:
74 | - [[https://github.com/EpistasisLab/AlzKB/blob/a9db2602e3e7960ec09749b99944fbf675323497/scripts/alzkb_parse_ncbigene.py][alzkb_parse_ncbigene.py]]
75 | - [[https://bgee.org/ftp/bgee_v15_0/download/calls/expr_calls/Homo_sapiens_expr_advanced.tsv.gz][Homo_sapiens_expr_advanced.tsv]] (from the Bgee database)
76 | Then, run =alzkb_parse_ncbigene.py= (no external Python packages
77 | should be needed). You'll notice that it creates two output files
78 | that are used while populating the ontology.
79 |
80 | *** Drugbank
81 | To download the academic DrugBank datasets, you first need to create a free DrugBank account and verify your email address. After verification, DrugBank may request more information about your account, such as a description of how you plan to use DrugBank, a description of your organization, who is sponsoring the research, and what the end goal of the research is. In our experience, account approval can take anywhere from several business days to a few weeks.
82 |
83 | After your access has been approved, navigate to the Academic Download page on the DrugBank website (linked
84 | above) by selecting the "Download" tab and "Academic Download". Select the "External Links" tab. In the table titled "External
85 | Drug Links", click the "Download" button on the row labeled
86 | "All". This will download a zip file. Extract the contents of that zip
87 | file, and make sure the extracted file is named =drug_links.csv= (some versions use a
88 | space instead of an underscore in the filename).
89 |
90 | *** DisGeNET
91 | Although DisGeNET is available under a Creative Commons license, the
92 | database requires users to create a free account to download the
93 | tab-delimited data files. Therefore, you should create a user account
94 | and log in. Then, navigate to the Downloads page on the DisGeNET
95 | website. Now, download the two necessary files by clicking on the
96 | corresponding links:
97 | - "UMLS CUI to several disease vocabularies" (under the "UMLS CUI to
98 | several disease vocabularies" section heading - the resulting file
99 | name will be =disease_mappings.tsv.gz=)
100 | - "UMLS CUI to top disease classes" (the resulting file will be named
101 | =disease_mappings_to_attributes.tar.gz=)
102 | Next, download =curated_disease_gene_associations.tsv.gz= directly by
103 | copying the following URL into your web browser:
104 | https://www.disgenet.org/static/disgenet_ap1/files/downloads/curated_gene_disease_associations.tsv.gz
105 |
106 | All three files are gzipped, so extract them into the =disgenet/=
107 | directory using your favorite method (e.g., gunzip from the command
108 | line, 7zip from within Windows, etc.).
109 |
110 | Now that you have the three necessary data files, you should run the
111 | AlzKB script we wrote to filter for rows in those files corresponding
112 | to Alzheimer's Disease, named =alzkb_parse_disgenet.py=. This script
113 | is in the =scripts/= directory of the AlzKB repository, so either find
114 | it on your local filesystem if you already have a copy of the
115 | repository, or find it on the AlzKB GitHub repository in your web
116 | browser.
117 |
118 | You can then run the Python script from within the =disgenet/=
119 | directory, which should deposit two filtered data files in the
120 | =disgenet/CUSTOM/= subdirectory. These will be automatically detected
121 | and used when you run the ontology population script, along with the
122 | unmodified =curated_disease_gene_associations.tsv= file.
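
To give a concrete sense of what this filtering step does, here is a minimal sketch in the spirit of =alzkb_parse_disgenet.py= (this is not the actual script; the column name and filenames are assumptions based on the files described above, so adjust them to match your download):
#+begin_src python
# Minimal illustration of the Alzheimer's filtering step -- the authoritative
# logic lives in scripts/alzkb_parse_disgenet.py.
import os
import pandas as pd

mappings = pd.read_csv("disease_mappings_to_attributes.tsv", sep="\t")

# Keep only rows whose disease name mentions Alzheimer (case-insensitive).
alz = mappings[mappings["name"].str.contains("alzheimer", case=False, na=False)]

os.makedirs("CUSTOM", exist_ok=True)
alz.to_csv("CUSTOM/disease_mappings_to_attributes_alzheimer.tsv", sep="\t", index=False)
#+end_src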
123 |
124 | Then, create a directory that will hold all of the raw data files. It can be =D:\data\= or something else you prefer. Within that directory, there will be one folder for each third-party database, and in those folders you'll put the individual CSV/TSV/TXT files.
125 |
126 | ** SQL data sources
127 | If you don't already have MySQL installed, install it. We recommend
128 | using either a package manager (if one is available on your OS), or
129 | installing MySQL Community Server from the mysql.com website (e.g., by
130 | visiting https://dev.mysql.com/downloads/mysql/). Make sure it's
131 | running and you have the ability to create and modify new databases.
132 |
133 | *** AOP-DB
134 | The Adverse Outcome Pathway Database (AOP-DB) is the only MySQL
135 | database you need to install to build the current version of AlzKB. It
136 | can be downloaded at: https://gaftp.epa.gov/EPADataCommons/ORD/AOP-DB/
137 |
138 | *WARNING:* This is a big download (7.2G while compressed)! Make sure
139 | you have enough disk space before proceeding.
140 |
141 | You'll have to extract two archives - first, unzip the =AOP-DB_v2.zip=
142 | archive, which should contain two =.tar.gz= archives and another =.zip=
143 | archive. Now, extract the =.tar.gz= archive containing =nogi= in its
144 | name (the smaller of the two). Windows doesn't natively support
145 | extracting =.tar.gz= archives, so you'll either have to download another
146 | program that does this (e.g., 7-zip) or extract it in a Unix-based
147 | environment (Linux, macOS, Windows Subsystem for Linux, Cygwin, etc.)
148 | that has the =tar= program available on the command line. Once you've
149 | extracted it, you should have a file named something like
150 | =aopdb_no-orthoscores.sql=.
151 |
152 | Now, create an empty database in MySQL, and name it =aopdb=. Make sure
153 | you have full admin privileges on the database. Then, load the (newly
154 | extracted) =.sql= file into the empty database. I always find this
155 | easiest from the command line, by running a command such as:
156 | #+begin_src bash
157 | $ mysql -u username -p aopdb < aopdb_no-orthoscores.sql
158 | #+end_src
159 | Substitute your username after the =-u= option and enter your password
160 | when prompted. If you prefer to import it from a GUI, you can use a
161 | tool like MySQL Workbench or DataGrip.
162 |
163 | *WARNING:* It can take a while to import, so be ready to take a break
164 | or do something else while you wait.
165 |
166 | * 2.5: Populating the ontology
167 | Now that we have an ontology (currently 'unpopulated', consisting of a
168 | class hierarchy, object property types, data property types, and
169 | possibly annotations), we can populate it with records from the
170 | third-party databases we collected in the previous step. Fortunately,
171 | this is a largely automated process, facilitated by a tool we call
172 | =ista= (/ista/ is the Sindarin word for /knowledge/). With =ista=, you
173 | write a Python script that first tells =ista= where to find the
174 | third-party data sources, and then maps each of those data sources to
175 | one or two node or edge types defined in the ontology (as classes or
176 | object properties, respectively). Here, we'll walk through the
177 | different parts of AlzKB's =ista= build script and discuss what each
178 | component does.
If you are reading this guide to modify or extend
179 | AlzKB, you should be able to use the information in the following few
180 | sections to write your own build script.
181 |
182 | For reference, an up-to-date, complete copy of this build file can be
183 | found in the [[https://github.com/EpistasisLab/AlzKB][AlzKB source repository]] at the location
184 | =alzkb/populate_ontology.py=.
185 |
186 | ** Installing ista
187 | - Keep MySQL Server running
188 | - Install the =mysqlclient= package (e.g., via Anaconda Navigator or pip)
189 | - Clone the ista repository onto your computer (=git clone https://github.com/RomanoLab/ista=)
190 | - =cd ista=
191 | - =pip install .=
192 |
193 | ** Build file top-matter
194 | At the top of the file, we do some imports of necessary Python
195 | packages. First comes =ista=. We don't import the whole package, just
196 | the classes and functions that we actually interact with.
197 | #+begin_src python
198 | from ista import FlatFileDatabaseParser, MySQLDatabaseParser
199 | from ista.util import print_onto_stats
200 | #+end_src
201 | In order to interact with OWL 2 ontology files, we bring in the
202 | =owlready2= library.
203 | #+begin_src python
204 | import owlready2
205 | #+end_src
206 | We put private data for our local MySQL databases (hostname, username,
207 | and password) in a file named =secrets.py=, and then make sure the
208 | file is added to our =.gitignore= file so it isn't checked into
209 | version control. You'll have to create that file yourself, and define
210 | the variables =MYSQL_HOSTNAME=, =MYSQL_USERNAME=, and
211 | =MYSQL_PASSWORD=. Then, in the build script, you'll import the file
212 | containing those variables and wrap them into a configuration dict.
213 | #+begin_src python
214 | import secrets
215 |
216 | mysql_config = {
217 |     'host': secrets.MYSQL_HOSTNAME,
218 |     'user': secrets.MYSQL_USERNAME,
219 |     'passwd': secrets.MYSQL_PASSWORD
220 | }
221 | #+end_src
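For reference, a minimal =secrets.py= might look like the following (the values shown are placeholders -- substitute your own MySQL credentials). Note that a local file named =secrets.py= will shadow Python's standard-library =secrets= module, which is harmless here as long as the build script doesn't need it:
#+begin_src python
# secrets.py -- keep this file out of version control (list it in .gitignore)!
MYSQL_HOSTNAME = "localhost"
MYSQL_USERNAME = "alzkb_user"
MYSQL_PASSWORD = "change-me"
#+end_src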
222 | ** Telling =ista= where to find your data sources
223 | Since we are populating an ontology, we need to load the ontology into
224 | =owlready2=. Make sure to modify this path to fit the location of the
225 | AlzKB ontology file on your system! Future versions of AlzKB will
226 | source the path dynamically. Also note the =file://= prefix, which
227 | tells =owlready2= to look on the local file system rather than load a
228 | web URL. Since this guide was made on a Windows desktop, you'll notice
229 | that we have to use escaped backslashes to specify file paths that the
230 | Python interpreter will parse correctly.
231 | #+begin_src python
232 | onto = owlready2.get_ontology("file://D:\\projects\\ista\\tests\\projects\\alzkb\\alzkb.rdf").load()
233 | #+end_src
234 | We also set the 'base' directory for all of the flat files that =ista=
235 | will be loading. You will have determined this location already (see
236 | [[Obtaining the third-party data sources]]).
237 | #+begin_src python
238 | data_dir = "D:\\data\\"
239 | #+end_src
240 | Now, we can actually register the source databases with =ista='s
241 | parser classes. We use =FlatFileDatabaseParser= for data sources
242 | stored as one or more delimited flat files, and =MySQLDatabaseParser=
243 | for data sources in a MySQL database. For flat file-based sources, the
244 | first argument given to the parser's constructor MUST be the
245 | subdirectory (within =data_dir=) where that source's data files are
246 | contained, and for MySQL sources it MUST be the name of the MySQL
247 | database. Otherwise, =ista= won't know where to find the files. The
248 | second argument is always the ontology object loaded using
249 | =owlready2=, and the third is either the base data directory or the
250 | MySQL config dictionary, both of which were defined above.
251 | #+begin_src python
252 | epa = FlatFileDatabaseParser("epa", onto, data_dir)
253 | ncbigene = FlatFileDatabaseParser("ncbigene", onto, data_dir)
254 | drugbank = FlatFileDatabaseParser("drugbank", onto, data_dir)
255 | hetionet = FlatFileDatabaseParser("hetionet", onto, data_dir)
256 | aopdb = MySQLDatabaseParser("aopdb", onto, mysql_config)
257 | aopwiki = FlatFileDatabaseParser("aopwiki", onto, data_dir)
258 | tox21 = FlatFileDatabaseParser("tox21", onto, data_dir)
259 | disgenet = FlatFileDatabaseParser("disgenet", onto, data_dir)
260 | #+end_src
261 | In the following two sections, we'll go over a few examples of how to
262 | define mappings using these parser objects. We won't replicate every
263 | mapping in this guide for brevity, but you can see all of them in the
264 | full AlzKB build script.
265 | *** Configuration for 'flat file' (e.g., CSV) data sources
266 | #+begin_src python
267 | hetionet.parse_node_type(
268 |     node_type="Symptom",
269 |     source_filename="hetionet-v1.0-nodes.tsv",
270 |     fmt="tsv",
271 |     parse_config={
272 |         "iri_column_name": "name",
273 |         "headers": True,
274 |         "filter_column": "kind",
275 |         "filter_value": "Symptom",
276 |         "data_transforms": {
277 |             "id": lambda x: x.split("::")[-1]
278 |         },
279 |         "data_property_map": {
280 |             "id": onto.xrefMeSH,
281 |             "name": onto.commonName
282 |         }
283 |     },
284 |     merge=False,
285 |     skip=False
286 | )
287 | #+end_src
288 | This block indicates that the third-party database is Hetionet and that the source file is =hetionet-v1.0-nodes.tsv=.
289 |
290 | The file =ista= will look for is therefore =D:\data\hetionet\hetionet-v1.0-nodes.tsv=.
291 |
292 | Some of the configuration blocks have a =CUSTOM/= prefix on the filename. This means that the file was created by us manually and will need to be stored in a =CUSTOM= subdirectory of that database's folder. For example:
293 | #+begin_src python
294 | disgenet.parse_node_type(
295 |     node_type="Disease",
296 |     source_filename="CUSTOM/disease_mappings_to_attributes_alzheimer.tsv",  # Filtered for just Alzheimer disease
297 |     fmt="tsv-pandas",
298 |     parse_config={
299 |         "iri_column_name": "diseaseId",
300 |         "headers": True,
301 |         "data_property_map": {
302 |             "diseaseId": onto.xrefUmlsCUI,
303 |             "name": onto.commonName,
304 |         }
305 |     },
306 |     merge=False,
307 |     skip=False
308 | )
309 | #+end_src
310 | This file will be =D:\data\disgenet\CUSTOM\disease_mappings_to_attributes_alzheimer.tsv=.
311 |
312 | *** Configuration for SQL server data sources
313 | #+begin_src python
314 | aopdb.parse_node_type(
315 |     node_type="Drug",
316 |     source_table="chemical_info",
317 |     parse_config={
318 |         "iri_column_name": "DTX_id",
319 |         "data_property_map": {"ChemicalID": onto.xrefMeSH},
320 |         "merge_column": {
321 |             "source_column_name": "DTX_id",
322 |             "data_property": onto.xrefDTXSID
323 |         }
324 |     },
325 |     merge=True,
326 |     skip=False
327 | )
328 | #+end_src
329 | This block indicates that the third-party database is AOP-DB and that the source table is =chemical_info=.
330 |
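Conceptually, each of these =parse_node_type= calls loops over the source records, applies the filters and transforms from =parse_config=, and creates ontology individuals with their data properties set. A rough hand-written equivalent of the Hetionet =Symptom= mapping above might look like the following (a simplified sketch for intuition only -- it is not how =ista= is actually implemented, and the property-assignment details may differ):
#+begin_src python
# What the Symptom mapping does, approximately, using owlready2 directly.
import csv
import owlready2

onto = owlready2.get_ontology("file://D:\\projects\\ista\\tests\\projects\\alzkb\\alzkb.rdf").load()

with open("D:\\data\\hetionet\\hetionet-v1.0-nodes.tsv", newline="") as f:
    for row in csv.DictReader(f, delimiter="\t"):
        if row["kind"] != "Symptom":                # filter_column / filter_value
            continue
        name = row["name"].replace(" ", "_")        # iri_column_name -> individual name
        ind = onto.Symptom(name)                    # create an individual of class Symptom
        ind.xrefMeSH = [row["id"].split("::")[-1]]  # data_transforms + data_property_map
        ind.commonName = [row["name"]]
#+end_src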
332 | ** Mapping data sources to ontology components
333 | Every flat file or SQL table from a third-party data source can be
334 | mapped to a single node or relationship type. For example, a file
335 | describing diseases can be mapped to the =Disease= node type, where
336 | each line in the file corresponds to a disease to be inserted (or
337 | 'merged'---see below) into the knowledge graph. If the source is being
338 | mapped to a node type (rather than a relationship type), =ista=
339 | additionally can populate one or more /node properties/ from the
340 | feature columns in the source file.
341 |
342 | Each mapping is defined using a method call in the =ista= Python
343 | script.
344 |
345 | ** Running =ista=
346 | Now that you have set the locations of the data sources and the ontology and defined all of the mappings, run =populate_ontology.py=.
347 |
348 | The =alzkb-populated.rdf= file is the output of this step; it will be used to set up the Neo4j graph database.
349 |
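The very end of the build script typically just reports statistics and serializes the populated ontology. A sketch of what those closing lines look like is below; see =alzkb/populate_ontology.py= for the authoritative version (the exact signature of =print_onto_stats= is an assumption here):
#+begin_src python
# After all of the mapping calls have run, print summary statistics and
# write the populated ontology back out as RDF/XML.
print_onto_stats(onto)
onto.save(file="D:\\data\\alzkb-populated.rdf", format="rdfxml")
#+end_src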
350 | * 3.: Converting the ontology into a Neo4j graph database
351 |
352 | ** Installing Neo4j
353 | If you haven't done so already, download Neo4j from the [[https://neo4j.com/download-center/][Neo4j Download
354 | Center]]. Most users should select Neo4j Desktop, but advanced users can
355 | instead opt for Community Server (the instructions for which are well
356 | outside of the scope of this guide).
357 | ** Configuring an empty graph database for AlzKB
358 | You should now create a new graph database that will be populated with
359 | the contents of AlzKB. In Neo4j Desktop, this can be done as follows:
360 | - Create a new project by clicking the "New" button in the upper left,
361 |   then selecting "Create project".
362 | - In the project panel (on the right of the screen), you will see the
363 |   default name "Project" populates automatically. Hover over this
364 |   name and click the edit icon, then change the name to =AlzKB=.
365 | - To the right of the project name, click "Add", and select "Local
366 |   DBMS". Change the Name to =AlzKB DBMS=, specify a password that you will
367 |   remember, and use the Version dropdown to select "4.4.0" (if it is
368 |   not already selected). Click "Create". Wait for the operation to
369 |   finish.
370 | - Install plugins:
371 |   - Click the name of the DBMS ("AlzKB DBMS", if you have followed the
372 |     guide), and in the new panel to the right click the "Plugins" tab.
373 |   - Expand the "APOC" option, click "Install", and wait for the
374 |     operation to complete.
375 |   - Do the same for the "Graph Data Science Library" and "Neosemantics
376 |     (n10s)" plugins.
377 | - Before starting the DBMS, click the ellipsis immediately to the
378 |   right of the "Open" button, and then click "Settings...". Make the
379 |   following changes to the configuration file:
380 |   - Set =dbms.memory.heap.initial_size= to =2048m=.
381 |   - Set =dbms.memory.heap.max_size= to =4G=.
382 |   - Set =dbms.memory.pagecache.size= to =2048m=.
383 |   - Uncomment the line containing
384 |     =dbms.security.procedures.allowlist=apoc.coll.*,apoc.load.*,gds.*=
385 |     to activate it.
386 |   - Append =n10s.*,apoc.cypher.*,apoc.help= to that line, so that it reads
387 |     =dbms.security.procedures.allowlist=apoc.coll.*,apoc.load.*,gds.*,n10s.*,apoc.cypher.*,apoc.help=
388 | - Click the "Apply" button, then "Close".
389 | - Click "Start" to start the graph database.
390 | ** Importing the =ista= RDF output into Neo4j
391 | - Open the Neo4j Browser and run the following Cypher queries to import the RDF data:
392 | #+begin_src cypher
393 | // Clean out any existing nodes
394 | MATCH (n) DETACH DELETE n
395 | #+end_src
396 |
397 | #+begin_src cypher
398 | // Create a uniqueness constraint on resource URIs
399 | CREATE CONSTRAINT n10s_unique_uri FOR (r:Resource) REQUIRE r.uri IS UNIQUE
400 | #+end_src
401 |
402 | #+begin_src cypher
403 | // Create a graph configuration
404 | CALL n10s.graphconfig.init();
405 | CALL n10s.graphconfig.set({applyNeo4jNaming: true, handleVocabUris: 'IGNORE'})
406 | #+end_src
407 |
408 | #+begin_src cypher
409 | // Import the RDF file
410 | CALL n10s.rdf.import.fetch("file://D:\\data\\alzkb-populated.rdf", "RDF/XML")
411 | #+end_src
412 |
413 | - Run the Cypher queries below to clean up the imported nodes:
414 | #+begin_src cypher
415 | MATCH (n:Resource) REMOVE n:Resource;
416 | MATCH (n:NamedIndividual) REMOVE n:NamedIndividual;
417 | MATCH (n:AllDisjointClasses) REMOVE n:AllDisjointClasses;
418 | MATCH (n:AllDisjointProperties) REMOVE n:AllDisjointProperties;
419 | MATCH (n:DatatypeProperty) REMOVE n:DatatypeProperty;
420 | MATCH (n:FunctionalProperty) REMOVE n:FunctionalProperty;
421 | MATCH (n:ObjectProperty) REMOVE n:ObjectProperty;
422 | MATCH (n:AnnotationProperty) REMOVE n:AnnotationProperty;
423 | MATCH (n:SymmetricProperty) REMOVE n:SymmetricProperty;
424 | MATCH (n:_GraphConfig) REMOVE n:_GraphConfig;
425 | MATCH (n:Ontology) REMOVE n:Ontology;
426 | MATCH (n:Restriction) REMOVE n:Restriction;
427 | MATCH (n:Class) REMOVE n:Class;
428 | MATCH (n) WHERE size(labels(n)) = 0 DETACH DELETE n; // Removes nodes without labels
429 | #+end_src
430 |
431 | Now you have built AlzKB from scratch. You can count the nodes and relationships of each type with:
432 | #+begin_src cypher
433 | CALL db.labels() YIELD label
434 | CALL apoc.cypher.run('MATCH (:`'+label+'`) RETURN count(*) as count',{}) YIELD value
435 | RETURN label, value.count ORDER BY label
436 | #+end_src
437 | #+begin_src cypher
438 | CALL db.relationshipTypes() YIELD relationshipType as type
439 | CALL apoc.cypher.run('MATCH ()-[:`'+type+'`]->() RETURN count(*) as count',{}) YIELD value
440 | RETURN type, value.count ORDER BY type
441 | #+end_src
442 |
443 | * 4.: Adding new data resources, nodes, relationships, and properties.
444 |
445 | In version 2.0, we added "TranscriptionFactor" nodes, "TRANSCRIPTIONFACTORINTERACTSWITHGENE" relationships, the node properties "chromosome" and "sourcedatabase", and the relationship properties "correlation", "score", "p_fisher", "z_score", "affinity_nm", "confidence", "sourcedatabase", and "unbiased".
446 |
447 | To achieve this, we added the above entities to the ontology RDF file, now named =alzkb_v2.rdf=, in the =alzkb/data= directory. Then, collect the additional source data files detailed in the table below.
448 | | Source   | Directory name | Entity type(s)                                | URL                   | Extra instructions |
449 | |----------|----------------|-----------------------------------------------|-----------------------|--------------------|
450 | | TRRUST   | =dorothea=     | Transcription factors (TFs) and TF-gene edges | [[https://www.grnpedia.org/trrust/downloadnetwork.php][TRRUST Download]] | [[https://www.grnpedia.org/trrust/][TRRUST]] |
451 | | DoRothEA | =dorothea=     | Transcription factors (TFs) and TF-gene edges | [[https://saezlab.github.io/dorothea/][DoRothEA Installation]] | [[https://bioconductor.org/packages/release/data/experiment/vignettes/dorothea/inst/doc/dorothea.R][DoRothEA RScript]] |
452 |
453 | ** Prepare Source Data
454 | Download =trrust_rawdata.human.tsv= from the TRRUST Download page. Install DoRothEA within R by following the DoRothEA Installation instructions. Place =trrust_rawdata.human.tsv= and =alzkb_parse_dorothea.py= inside the =dorothea/= subdirectory, which should be within your raw data directory (e.g., =D:\data=). Run =alzkb_parse_dorothea.py=. You'll notice that it creates a =tf.tsv= file that is used while populating the ontology.
455 |
456 | ** Replicate Hetionet Resources
457 | Since Hetionet is no longer regularly updated, we replicated its resources using the Rephetio paper and source code to ensure that AlzKB has current data. Follow the steps in the [[https://github.com/EpistasisLab/AlzKB-updates][AlzKB-updates]] GitHub repository to create =hetionet-custom-nodes.tsv= and =hetionet-custom-edges.tsv=. Place these files in the =hetionet/= subdirectory.
458 |
459 | ** Process Data Files
460 | Place the updated =alzkb_parse_ncbigene.py=, =alzkb_parse_drugbank.py=, and =alzkb_parse_disgenet.py= from the =scripts/= directory in their respective raw data subdirectories. Run each script to process the data for the next step.
461 |
462 | ** Populate Ontology
463 | Now that we have the updated ontology and updated data files, run the updated =alzkb/populate_ontology.py= to populate the ontology with the new records. It creates an =alzkb_v2-populated.rdf= file that will be used in the next step.
464 |
465 | * 5.: Converting the ontology into a Memgraph graph database
466 | ** Installing Memgraph
467 | If you haven't done so already, download Memgraph from the [[https://memgraph.com/docs/getting-started/install-memgraph][Install Memgraph]] page. Most users install Memgraph using a pre-prepared =docker-compose.yml= file by executing:
468 | - for Linux and macOS:
469 |   =curl https://install.memgraph.com | sh=
470 | - for Windows:
471 |   =iwr https://windows.memgraph.com | iex=
472 |
473 | More details are in [[https://memgraph.com/docs/getting-started/install-memgraph/docker][Install Memgraph with Docker]].
474 |
475 | ** Generating the CSV File
476 | Before uploading anything to Memgraph, run =alzkb/rdf_to_memgraph_csv.py= on the =alzkb_v2-populated.rdf= file to generate =alzkb-populated.csv=.
477 | If you want to add edge properties to the knowledge graph, then run =populate_edge_weights.py= to create the =alzkb_with_edge_properties.csv= file.
478 |
479 | ** Starting Memgraph with Docker
480 | Follow Step 1 ("Starting Memgraph with Docker") of the instructions in [[https://memgraph.com/docs/data-migration/migrate-from-neo4j#importing-data-into-memgraph][importing-data-into-memgraph]] to copy the =alzkb-populated.csv= or =alzkb_with_edge_properties.csv= file into the container.
481 |
482 | Open Memgraph Lab, which is available at =http://localhost:3000=. Click =Query Execution= in the menu on the left bar. Then, you can type Cypher queries into the =Cypher Editor=.
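
All of the import queries below key off a handful of reserved columns in the generated CSV: =_id= and =_labels= for node rows, and =_start=, =_end=, and =_type= for relationship rows, with the remaining columns holding node and edge properties. Before importing, it can be worth sanity-checking the file (a quick sketch; the column names follow the queries below):
#+begin_src python
# Quick sanity check of the generated CSV before loading it into Memgraph.
import pandas as pd

df = pd.read_csv("alzkb-populated.csv", low_memory=False)

print(df["_labels"].value_counts(dropna=False))  # node rows, grouped by label
print(df["_type"].value_counts(dropna=False))    # relationship rows, grouped by type
#+end_src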
483 |
484 | ** Gaining speed with indexes and analytical storage mode
485 | - To create indexes, run the following Cypher queries:
486 | #+begin_src cypher
487 | CREATE INDEX ON :Drug(nodeID);
488 | CREATE INDEX ON :Gene(nodeID);
489 | CREATE INDEX ON :BiologicalProcess(nodeID);
490 | CREATE INDEX ON :Pathway(nodeID);
491 | CREATE INDEX ON :MolecularFunction(nodeID);
492 | CREATE INDEX ON :CellularComponent(nodeID);
493 | CREATE INDEX ON :Symptom(nodeID);
494 | CREATE INDEX ON :BodyPart(nodeID);
495 | CREATE INDEX ON :DrugClass(nodeID);
496 | CREATE INDEX ON :Disease(nodeID);
497 | CREATE INDEX ON :TranscriptionFactor(nodeID);
498 | #+end_src
499 |
500 | - To check the current storage mode, run:
501 | #+begin_src cypher
502 | SHOW STORAGE INFO;
503 | #+end_src
504 |
505 | - Change the storage mode to analytical before import:
506 | #+begin_src cypher
507 | STORAGE MODE IN_MEMORY_ANALYTICAL;
508 | #+end_src
509 |
510 | ** Importing data into Memgraph
511 | - Drug nodes
512 | #+begin_src cypher
513 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
514 | WITH row WHERE row._labels = ':Drug' AND row.commonName <> ''
515 | CREATE (d:Drug {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase,
516 |     xrefCasRN: row.xrefCasRN, xrefDrugbank: row.xrefDrugbank});
517 |
518 | MATCH (d:Drug)
519 | RETURN count(d);
520 | #+end_src
521 |
522 | - Gene nodes
523 | #+begin_src cypher
524 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
525 | WITH row WHERE row._labels = ':Gene'
526 | CREATE (g:Gene {nodeID: row._id, commonName: row.commonName, geneSymbol: row.geneSymbol, sourceDatabase: row.sourceDatabase,
527 |     typeOfGene: row.typeOfGene, chromosome: row.chromosome, xrefEnsembl: row.xrefEnsembl,
528 |     xrefHGNC: row.xrefHGNC, xrefNcbiGene: toInteger(row.xrefNcbiGene), xrefOMIM: row.xrefOMIM});
529 |
530 | MATCH (g:Gene)
531 | RETURN count(g);
532 | #+end_src
533 |
534 | - BiologicalProcess nodes
535 | #+begin_src cypher
536 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
537 | WITH row WHERE row._labels = ':BiologicalProcess'
538 | CREATE (b:BiologicalProcess {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase,
539 |     xrefGeneOntology: row.xrefGeneOntology});
540 |
541 | MATCH (b:BiologicalProcess)
542 | RETURN count(b)
543 | #+end_src
544 |
545 | - Pathway nodes
546 | #+begin_src cypher
547 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
548 | WITH row WHERE row._labels = ':Pathway'
549 | CREATE (p:Pathway {nodeID: row._id, pathwayId: row.pathwayId, pathwayName: row.pathwayName, sourceDatabase: row.sourceDatabase});
550 |
551 | MATCH (p:Pathway)
552 | RETURN count(p)
553 | #+end_src
554 |
555 | - MolecularFunction nodes
556 | #+begin_src cypher
557 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
558 | WITH row WHERE row._labels = ':MolecularFunction'
559 | CREATE (m:MolecularFunction {nodeID: row._id, commonName: row.commonName, xrefGeneOntology: row.xrefGeneOntology});
560 |
561 | MATCH (m:MolecularFunction)
562 | RETURN count(m)
563 | #+end_src
564 |
565 | - CellularComponent nodes
566 | #+begin_src cypher
567 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
568 | WITH row WHERE row._labels = ':CellularComponent'
569 | CREATE (c:CellularComponent {nodeID: row._id, commonName: row.commonName, xrefGeneOntology: row.xrefGeneOntology});
570 |
571 | MATCH (c:CellularComponent)
572 | RETURN count(c)
573 | #+end_src
574 |
575 | - Symptom nodes
576 | #+begin_src cypher
577 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
578 | WITH row WHERE row._labels = ':Symptom'
579 | CREATE (s:Symptom {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase, xrefMeSH: row.xrefMeSH});
580 |
581 | MATCH (s:Symptom)
582 | RETURN count(s)
583 | #+end_src
584 |
585 | - BodyPart nodes
586 | #+begin_src cypher
587 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
588 | WITH row WHERE row._labels = ':BodyPart'
589 | CREATE (b:BodyPart {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase, xrefUberon: row.xrefUberon});
590 |
591 | MATCH (b:BodyPart)
592 | RETURN count(b)
593 | #+end_src
594 |
595 | - DrugClass nodes
596 | #+begin_src cypher
597 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
598 | WITH row WHERE row._labels = ':DrugClass'
599 | CREATE (d:DrugClass {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase, xrefNciThesaurus: row.xrefNciThesaurus});
600 |
601 | MATCH (d:DrugClass)
602 | RETURN count(d)
603 | #+end_src
604 |
605 | - Disease nodes
606 | #+begin_src cypher
607 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
608 | WITH row WHERE row._labels = ':Disease'
609 | CREATE (d:Disease {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase,
610 |     xrefDiseaseOntology: row.xrefDiseaseOntology, xrefUmlsCUI: row.xrefUmlsCUI});
611 |
612 | MATCH (d:Disease)
613 | RETURN count(d)
614 | #+end_src
615 |
616 | - TranscriptionFactor nodes
617 | #+begin_src cypher
618 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
619 | WITH row WHERE row._labels = ':TranscriptionFactor'
620 | CREATE (t:TranscriptionFactor {nodeID: row._id, sourceDatabase: row.sourceDatabase, TF: row.TF});
621 | MATCH (t:TranscriptionFactor)
622 | RETURN count(t)
623 | #+end_src
624 |
625 | - GENEPARTICIPATESINBIOLOGICALPROCESS relationships
626 | #+begin_src cypher
627 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
628 | WITH row WHERE row._type = 'GENEPARTICIPATESINBIOLOGICALPROCESS'
629 | MATCH (g:Gene {nodeID: row._start}) MATCH (b:BiologicalProcess {nodeID: row._end})
630 | MERGE (g)-[rel:GENEPARTICIPATESINBIOLOGICALPROCESS]->(b)
631 | RETURN count(rel)
632 | #+end_src
633 |
634 | - GENEREGULATESGENE relationships
635 | #+begin_src cypher
636 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
637 | WITH row WHERE row._type = 'GENEREGULATESGENE'
638 | MATCH (g:Gene {nodeID: row._start}) MATCH (g2:Gene {nodeID: row._end})
639 | MERGE (g)-[rel:GENEREGULATESGENE]->(g2)
640 | RETURN count(rel)
641 | #+end_src
642 |
643 | - GENEINPATHWAY relationships
644 | #+begin_src cypher
645 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
646 | WITH row WHERE row._type = 'GENEINPATHWAY'
647 | MATCH (g:Gene {nodeID: row._start}) MATCH (p:Pathway {nodeID: row._end})
648 | MERGE (g)-[rel:GENEINPATHWAY]->(p)
649 | RETURN count(rel)
650 | #+end_src
651 |
652 | - GENEINTERACTSWITHGENE relationships
653 | #+begin_src cypher
654 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
655 | WITH row WHERE row._type = 'GENEINTERACTSWITHGENE'
656 | MATCH (g:Gene {nodeID: row._start}) MATCH (g2:Gene {nodeID: row._end})
657 | MERGE (g)-[rel:GENEINTERACTSWITHGENE]->(g2)
658 | RETURN count(rel)
659 | #+end_src
660 |
661 | - BODYPARTUNDEREXPRESSESGENE relationships
662 | #+begin_src cypher
663 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
664 | WITH row WHERE row._type = 'BODYPARTUNDEREXPRESSESGENE'
665 | MATCH (b:BodyPart {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
666 | MERGE (b)-[rel:BODYPARTUNDEREXPRESSESGENE]->(g)
667 | RETURN count(rel)
668 | #+end_src
669 |
670 | - BODYPARTOVEREXPRESSESGENE relationships
671 | #+begin_src cypher
672 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
673 | WITH row WHERE row._type = 'BODYPARTOVEREXPRESSESGENE'
674 | MATCH (b:BodyPart {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
675 | MERGE (b)-[rel:BODYPARTOVEREXPRESSESGENE]->(g)
676 | RETURN count(rel)
677 | #+end_src
678 |
679 | - GENEHASMOLECULARFUNCTION relationships
680 | #+begin_src cypher
681 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
682 | WITH row WHERE row._type = 'GENEHASMOLECULARFUNCTION'
683 | MATCH (g:Gene {nodeID: row._start}) MATCH (m:MolecularFunction {nodeID: row._end})
684 | MERGE (g)-[rel:GENEHASMOLECULARFUNCTION]->(m)
685 | RETURN count(rel)
686 | #+end_src
687 |
688 | - GENEASSOCIATEDWITHCELLULARCOMPONENT relationships
689 | #+begin_src cypher
690 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
691 | WITH row WHERE row._type = 'GENEASSOCIATEDWITHCELLULARCOMPONENT'
692 | MATCH (g:Gene {nodeID: row._start}) MATCH (c:CellularComponent {nodeID: row._end})
693 | MERGE (g)-[rel:GENEASSOCIATEDWITHCELLULARCOMPONENT]->(c)
694 | RETURN count(rel)
695 | #+end_src
696 |
697 | - GENECOVARIESWITHGENE relationships
698 | #+begin_src cypher
699 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
700 | WITH row WHERE row._type = 'GENECOVARIESWITHGENE'
701 | MATCH (g:Gene {nodeID: row._start}) MATCH (g2:Gene {nodeID: row._end})
702 | MERGE (g)-[rel:GENECOVARIESWITHGENE {sourceDB: row.sourceDB, unbiased: row.unbiased, correlation: row.correlation}]->(g2)
703 | RETURN count(rel)
704 | #+end_src
705 |
706 | - CHEMICALDECREASESEXPRESSION relationships
707 | #+begin_src cypher
708 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
709 | WITH row WHERE row._type = 'CHEMICALDECREASESEXPRESSION'
710 | MATCH (d:Drug {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
711 | MERGE (d)-[rel:CHEMICALDECREASESEXPRESSION {sourceDB: row.sourceDB, unbiased: row.unbiased, z_score: row.z_score}]->(g)
712 | RETURN count(rel)
713 | #+end_src
714 |
715 | - CHEMICALINCREASESEXPRESSION relationships
716 | #+begin_src cypher
717 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
718 | WITH row WHERE row._type = 'CHEMICALINCREASESEXPRESSION'
719 | MATCH (d:Drug {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
720 | MERGE (d)-[rel:CHEMICALINCREASESEXPRESSION {sourceDB: row.sourceDB, unbiased: row.unbiased, z_score: row.z_score}]->(g)
721 | RETURN count(rel)
722 | #+end_src
723 |
724 | - CHEMICALBINDSGENE relationships
725 | #+begin_src cypher
726 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
727 | WITH row WHERE row._type = 'CHEMICALBINDSGENE'
728 | MATCH (d:Drug {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
729 | MERGE (d)-[rel:CHEMICALBINDSGENE {sourceDB: row.sourceDB, unbiased: row.unbiased, affinity_nM: row.affinity_nM}]->(g)
730 | RETURN count(rel)
731 | #+end_src
732 |
733 | - DRUGINCLASS relationships
734 | #+begin_src cypher
735 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
736 | WITH row WHERE row._type = 'DRUGINCLASS'
737 | MATCH (d:Drug {nodeID: row._start}) MATCH (d2:DrugClass {nodeID: row._end})
738 | MERGE (d)-[rel:DRUGINCLASS]->(d2)
739 | RETURN count(rel)
740 | #+end_src
741 |
742 | - GENEASSOCIATESWITHDISEASE relationships
743 | #+begin_src cypher
744 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
745 | WITH row WHERE row._type = 'GENEASSOCIATESWITHDISEASE'
746 | MATCH (g:Gene {nodeID: row._start}) MATCH (d:Disease {nodeID: row._end})
747 | MERGE (g)-[rel:GENEASSOCIATESWITHDISEASE {sourceDB: row.sourceDB, score: row.score}]->(d)
748 | RETURN count(rel)
749 | #+end_src
750 |
751 | - SYMPTOMMANIFESTATIONOFDISEASE relationships
752 | #+begin_src cypher
753 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
754 | WITH row WHERE row._type = 'SYMPTOMMANIFESTATIONOFDISEASE'
755 | MATCH (s:Symptom {nodeID: row._start}) MATCH (d:Disease {nodeID: row._end})
756 | MERGE (s)-[rel:SYMPTOMMANIFESTATIONOFDISEASE {sourceDB: row.sourceDB, unbiased: row.unbiased, p_fisher: row.p_fisher}]->(d)
757 | RETURN count(rel)
758 | #+end_src
759 |
760 | - DISEASELOCALIZESTOANATOMY relationships
761 | #+begin_src cypher
762 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
763 | WITH row WHERE row._type = 'DISEASELOCALIZESTOANATOMY'
764 | MATCH (d:Disease {nodeID: row._start}) MATCH (b:BodyPart {nodeID: row._end})
765 | MERGE (d)-[rel:DISEASELOCALIZESTOANATOMY {sourceDB: row.sourceDB, unbiased: row.unbiased, p_fisher: row.p_fisher}]->(b)
766 | RETURN count(rel)
767 | #+end_src
768 |
769 | - DRUGTREATSDISEASE relationships
770 | #+begin_src cypher
771 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
772 | WITH row WHERE row._type = 'DRUGTREATSDISEASE'
773 | MATCH (d:Drug {nodeID: row._start}) MATCH (d2:Disease {nodeID: row._end})
774 | MERGE (d)-[rel:DRUGTREATSDISEASE]->(d2)
775 | RETURN count(rel)
776 | #+end_src
777 |
778 | - DRUGCAUSESEFFECT relationships
779 | #+begin_src cypher
780 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
781 | WITH row WHERE row._type = 'DRUGCAUSESEFFECT'
782 | MATCH (d:Drug {nodeID: row._start}) MATCH (d2:Disease {nodeID: row._end})
783 | MERGE (d)-[rel:DRUGCAUSESEFFECT]->(d2)
784 | RETURN count(rel)
785 | #+end_src
786 |
787 | - TRANSCRIPTIONFACTORINTERACTSWITHGENE relationships
788 | #+begin_src cypher
789 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
790 | WITH row WHERE row._type = 'TRANSCRIPTIONFACTORINTERACTSWITHGENE'
791 | MATCH (t:TranscriptionFactor {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
792 | MERGE (t)-[rel:TRANSCRIPTIONFACTORINTERACTSWITHGENE {sourceDB: row.sourceDB, confidence: row.confidence}]->(g)
793 | RETURN count(rel)
794 | #+end_src
795 |
796 | ** Switching Back to Transactional Storage Mode
797 | After importing the data, follow these steps to switch back to transactional storage mode:
798 | - Switch to transactional storage mode:
799 | #+begin_src cypher
800 | STORAGE MODE IN_MEMORY_TRANSACTIONAL;
801 | #+end_src
802 |
803 | - Verify the storage mode switch:
804 | #+begin_src cypher
805 | SHOW STORAGE INFO;
806 | #+end_src
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AlzKB (http://alzkb.ai/)
2 |
3 | A knowledge base for AI research in Alzheimer's
Disease, based on graph databases.
4 | ![image](https://github.com/user-attachments/assets/4106ebe7-0d36-4fc6-a360-5174597f6f7b)
5 |
6 | _Please note that DRUGCAUSESEFFECT in AlzKB refers to a drug causing side effects._
7 |
8 | ### Authors
9 |
10 | AlzKB is designed and developed by the following authors (in alphabetical order):
11 |
12 | - Britney Graham, PhD (Cedars-Sinai)
13 | - Yun Hao, MS (UPenn)
14 | - Rachit Kumar (UPenn)
15 | - Xi Li, MD (Cedars-Sinai)
16 | - Nick Matsumoto (Cedars-Sinai)
17 | - Jason H. Moore, PhD, FACMI (Cedars-Sinai)
18 | - Jay Moran, MS (Cedars-Sinai)
19 | - Marylyn Ritchie, PhD (UPenn)
20 | - Joseph D. Romano, PhD (UPenn)
21 | - Li Shen, PhD, FAIMBE (UPenn)
22 | - Van Truong, MS (UPenn)
23 | - Mythreye Venkatesan, MS (Cedars-Sinai)
24 | - Paul Wang, PhD (Cedars-Sinai)
25 |
26 |
27 | ## Deprecation Note
28 | Versions of AlzKB prior to v1.3.0 used Neo4j. Use of Neo4j is now deprecated. Legacy versions of the knowledge graph will continue to be provided on the Releases page to support existing research.
29 |
30 | ## Prerequisites
31 | - Memgraph Lab (Desktop application)
32 |   - Starting with AlzKB v1.3.0, Memgraph is used as the knowledge graph server.
33 |   - Memgraph offers a variety of [installation options](https://memgraph.com/docs/getting-started/install-memgraph).
34 |   - Memgraph Lab is the easiest way to get up and running with AlzKB, but you may use Memgraph Server if your deployment requires it.
35 | - Python (version 3.7 or later)
36 |
37 | ## Installation
38 |
39 | To build a copy of AlzKB's graph database, you can either:
40 | - Download a copy of the latest CYPHERL file and import it into Memgraph
41 | - Build the knowledge base from its original third-party sources and import it into Memgraph
42 |
43 | ### Install from CYPHERL file (easy)
44 | - Visit the [Releases page](https://github.com/EpistasisLab/AlzKB/releases) and find the version of AlzKB you want to install. Unless you have a particular reason to do otherwise, this should probably be the most recent release. Follow the link in the release notes to the corresponding database dump (it will redirect to an external page).
45 | - Using Memgraph Lab, import the downloaded CYPHERL file by navigating to _Import & Export_ and then clicking the _Import Data_ button.
46 |   - For other ways to import the CYPHERL file into a Memgraph server, see [here](https://memgraph.com/docs/data-migration/cypherl)
47 | - In Memgraph Lab, navigate to _Query execution_ to start querying the knowledge graph.
48 |
49 | ### Build from scratch (less easy)
50 |
51 | **For detailed instructions on building AlzKB from scratch, see [here](https://github.com/EpistasisLab/AlzKB/blob/master/BUILD.org)**
52 |
53 | Start by installing the Python package, which includes the necessary scripts:
54 |
55 | ```{bash}
56 | $ git clone https://github.com/EpistasisLab/AlzKB
57 | $ cd AlzKB
58 | $ pip install .
59 | ```
60 |
61 | #### Download the third-party database sources
62 |
63 | First, install MySQL and make sure it is running, as some of the source
64 | databases are only available as MySQL dumps.
65 |
66 | We've created a script that will fetch all of the source files and put them into
67 | the expected directory structure. We will try to keep this script as updated as
68 | possible, but if you encounter any issues we suggest looking at the script and
69 | making sure it points to entities that still exist.
70 |
71 | ```{bash}
72 | $ alzkb bootstrap
73 | ```
74 |
75 | #### Populate the ontology
76 |
77 | We use the external `ista` library to populate the OWL ontology. This should
78 | be pretty much entirely automated:
79 |
80 | ```{bash}
81 | $ alzkb build
82 | ```
83 |
84 | #### Load the ontology contents into Neo4j
85 |
86 | This script will import the OWL 2 ontology contents into an empty Neo4j database
87 | and clean up unnecessary artifacts left over by the OWL 2 standard:
88 |
89 | ```{bash}
90 | $ alzkb install
91 | ```
92 |
93 | After this, check the Neo4j database (which will now be turned on) and make sure
94 | everything looks alright.
95 |
--------------------------------------------------------------------------------
/alzkb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/AlzKB/3ce9515b3172e4356edc83e3ea37cd1a0df3d7ed/alzkb/__init__.py
--------------------------------------------------------------------------------
/alzkb/build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from argparse import ArgumentError
4 | import warnings
5 | import os, sys
6 |
7 | def confirm_y_n():
8 |     while True:
9 |         resp = str(input("Continue with operation? (Y/n): ")).lower().strip()
10 |         if resp.startswith('y'):  # accepts 'y'/'yes'; empty input falls through to the retry prompt
11 |             return True
12 |         if resp.startswith('n'):
13 |             print("Exiting application...")
14 |             sys.exit(0)
15 |         print("Please enter y or n.")
16 |
17 |
18 | def bootstrap():
19 |     """
20 |     Retrieve data files needed to build AlzKB from scratch and organize them
21 |     into the required directory structure.
22 |     """
23 |     pass
24 |
25 | def build():
26 |     """
27 |     Populate the AlzKB ontology using the local copies of the source databases.
28 |     """
29 |     pass
30 |
31 | def install():
32 |     """
33 |     Import the contents of the AlzKB populated ontology into Neo4j.
34 |     """
35 |     pass
36 |
37 | def main():
38 |     args = sys.argv
39 |
40 |     try:
41 |         assert len(args) > 1
42 |     except AssertionError:
43 |         # argparse.ArgumentError takes (argument, message); pass None for the argument
44 |         raise ArgumentError(None, "Error - must provide one of `bootstrap`, `build`, or `install` as an argument to `alzkb`. See the README for more information.")
45 |
46 |     if len(args) > 2:
47 |         warnings.warn("Multiple arguments provided - only the first will be used.")
48 |
49 |     op_arg = args[1].lower()
50 |
51 |     if op_arg == 'bootstrap':
52 |         bootstrap()
53 |     elif op_arg == 'build':
54 |         build()
55 |     elif op_arg == 'install':
56 |         install()
57 |     else:
58 |         raise ArgumentError(None, "Error - must provide one of `bootstrap`, `build`, or `install` as an argument to `alzkb`. See the README for more information.")
--------------------------------------------------------------------------------
/alzkb/data/alzkb.rdf:
--------------------------------------------------------------------------------
[Only the human-readable annotation text of this RDF/XML ontology file survived extraction; the XML markup itself was stripped. The recoverable annotations are listed below.]

Ontology header:
- Language: English
- A note on classes vs. individuals: In this ontology, individuals are modeled as the idealised entities corresponding to examples of a certain class. For example, 'paroxetine' is an individual of the class Chemical, and 'hsdl1' is an individual of the class Gene. Other ontologies may choose to model these instead as subclasses (e.g., Paroxetine is a subclass of Chemical), and individuals are physical realizations of those classes (e.g., a specific molecule of Paroxetine in the real world).
--------------------------------------------------------------------------------
/alzkb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/AlzKB/3ce9515b3172e4356edc83e3ea37cd1a0df3d7ed/alzkb/__init__.py
--------------------------------------------------------------------------------
/alzkb/build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import warnings
4 | import sys
5 |
6 | USAGE_MSG = ("Error - must provide one of `bootstrap`, `build`, or `install` "
7 |              "as an argument to `alzkb`. See the README for more information.")
8 |
9 | def confirm_y_n():
10 |     while True:
11 |         resp = str(input("Continue with operation? (Y/n): ")).lower().strip()
12 |         if resp.startswith('y'):
13 |             return True
14 |         if resp.startswith('n'):
15 |             print("Exiting application...")
16 |             sys.exit(0)
17 |         print("Please enter y or n.")
18 |
19 |
20 | def bootstrap():
21 |     """
22 |     Retrieve data files needed to build AlzKB from scratch and organize them
23 |     into the required directory structure.
24 |     """
25 |     pass
26 |
27 | def build():
28 |     """
29 |     Populate the AlzKB ontology using the local copies of the source databases.
30 |     """
31 |     pass
32 |
33 | def install():
34 |     """
35 |     Import the contents of the AlzKB populated ontology into Neo4j.
36 |     """
37 |     pass
38 |
39 | def main():
40 |     args = sys.argv
41 |
42 |     # Exit with a usage message if no operation was given.
43 |     if len(args) < 2:
44 |         sys.exit(USAGE_MSG)
45 |
46 |     if len(args) > 2:
47 |         warnings.warn("Multiple arguments provided - only the first will be used.")
48 |
49 |     op_arg = args[1].lower()
50 |
51 |     if op_arg == 'bootstrap':
52 |         bootstrap()
53 |     elif op_arg == 'build':
54 |         build()
55 |     elif op_arg == 'install':
56 |         install()
57 |     else:
58 |         sys.exit(USAGE_MSG)
--------------------------------------------------------------------------------
/alzkb/data/alzkb.rdf:
--------------------------------------------------------------------------------
[RDF/XML markup not preserved in this dump; only the ontology's literal annotation values survive. The recoverable annotations are summarized below.]

- Ontology metadata: language English; version 0.1.0a; description "An ontology describing entities relevant to Alzheimer's disease etiology and entities relevant to drug discovery for Alzheimer's disease."
- A note on classes vs. individuals: in this ontology, individuals are modeled as the idealised entities corresponding to examples of a certain class. For example, 'paroxetine' is an individual of the class Chemical, and 'hsdl1' is an individual of the class Gene. Other ontologies may choose to model these as subclasses instead (e.g., Paroxetine is a subclass of Chemical), with individuals as physical realizations of those classes (e.g., a specific molecule of Paroxetine in the real world). Modeling idealised entities as individuals allows stricter control over the logical assumptions applied to all entities of a related type, although for many use cases the alternative approach is appropriate. It is also beneficial when the ontology is used to populate a graph database: each individual corresponds to a node, data properties on individuals correspond to node attributes, and object properties correspond to edges in the graph.
- individualLabelPrefix: "String prefix that precedes the unique label on a named individual of the corresponding class. This is necessary to avoid duplicate labels, which result in invalid RDF/XML." The data property commonName should also be set, preserving punctuation and whitespace and omitting the prefix. Prefixes defined for specific classes include "chem_", "dis_", "gene_", "phen_", and "se_".
- commonName: "A string used to name the entity," giving a more useful node label without the prefixes needed to prevent conflicts when nodes in different classes share a name.
- Isozyme note: "Two proteins are isozymes if they are comprised of different amino acid sequences but catalyze the same enzymatic reaction. Isozymes often have different reaction rates and respond differently in various environmental settings. They may also be regulated through different regulatory mechanisms."
- Boolean chemical flags: "If True, chemical is a known pharmaceutical drug" (effectively, present in DrugBank, though this de facto definition may change); "If True, chemical is present in the Comparative Toxicogenomics Database"; "If True, chemical is considered foreign to the human body."
- MACCS fingerprints are "stored as a string where each character is a bit representing an individual feature."
- Object property note: "The entity that is being altered via the KE."
- Cross-reference data properties cover, among others: Cell Ontology (CL) terms; Chemical Entities of Biological Interest (ChEBI) terms; DSSTox substance identifiers ("the main 'unit of ground truth' for chemicals in ComptoxAI"); Disease Ontology IDs (due to inconsistencies in granularity, some diseases may have multiple DOIDs); Foundational Model of Anatomy (FMA) terms; EPA GSIDs (largely superseded by DSSTOX IDs, but web services and Invitrodb still identify chemicals by GSID); MeSH subject headings (excluding supplemental terms such as the UIs that point to specific compounds); and MeSH "Unique IDs" (different from true subject headings, usually controlled terms for chemical substances). A pathwayId may come "from an unknown (or deprecated) pathway database."
- Pathway note: pathways are taken from AOP-DB's pathway_gene table, restricted to tax ID 9606 (human) but not filtered by source database, so duplication cannot be entirely ruled out; the sourceDatabase property together with the format of the pathwayId usually identifies where a pathway originally came from. Each gene can belong to many pathways, and each pathway can contain many genes.
- Class annotations: a disease is "a medical condition with a deleterious effect"; a drug is "a chemical substance that causes a change in an organism's physiology or psychology when consumed or administered" (considered here primarily as administered to humans), with an intended beneficial effect, "intended to 'treat, cure, prevent, or diagnose a disease or promote well-being'"; a chemical effect is a "detrimental phenotypic effect resulting from exposure to a chemical"; chemical lists are "defined by the US EPA and used in the EPA's Comptox Dashboard web application," scraped from the Dashboard's public-facing API and loosely definable as functional classes of chemicals ranging from a few to tens of thousands of members; a database is "any database that is relevant to computational toxicology" (not necessarily toxicology-specific, e.g., PubChem).
--------------------------------------------------------------------------------
/alzkb/populate_edge_weights.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 |
4 |
5 | path = './data/alzkb_v2-populated.csv'
6 | df = pd.read_csv(path)
7 | df = pd.concat([df, pd.DataFrame(columns=['sourceDB','unbiased','affinity_nM','p_fisher','z_score','correlation','score','confidence'])])
8 |
9 | # hetionet-custom-edges.tsv
10 | data_dir = "./AlzKB_Raw_Data"
11 | hetionet_custom = pd.read_table(os.path.join(data_dir,'hetionet/hetionet-custom-edges.tsv'))
12 |
13 | hetio_custom = {
14 |     'CbG':'CHEMICALBINDSGENE',
15 |     'DrD':'DISEASEASSOCIATESWITHDISEASE', # no results
16 |     'DlA':'DISEASELOCALIZESTOANATOMY',
17 |     'DpS':'SYMPTOMMANIFESTATIONOFDISEASE'
18 | }
19 |
20 |
21 | affinity_nM = hetionet_custom[hetionet_custom['metaedge']=='CbG']
22 |
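# NOTE: Hetionet identifiers look like 'Compound::DB00091' and 'Gene::1017',
# so we keep the token after '::' to recover the DrugBank and NCBI Gene xrefs
# used in the node table.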
affinity_nM['xrefDrugbank'] = affinity_nM['source'].str.split('::').str[-1] 23 | affinity_nM['xrefNcbiGene'] = affinity_nM['target'].str.split('::').str[-1].astype(int) 24 | affinity_nM = affinity_nM.merge(df[['_id','xrefDrugbank']].rename(columns={'_id':'_start'}), on='xrefDrugbank', how='left') 25 | affinity_nM = affinity_nM.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_end'}), on='xrefNcbiGene', how='left') 26 | affinity_nM['_type'] = hetio_custom['CbG'] 27 | merged_df = df.merge(affinity_nM, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left') 28 | for column in ['sourceDB', 'unbiased', 'affinity_nM']: 29 | df[column] = merged_df[column + '_new'].combine_first(df[column]) 30 | df.shape 31 | 32 | 33 | disgenet = pd.read_table('./AlzKB_Raw_Data/disgenet/CUSTOM/disease_mappings_alzheimer.tsv') 34 | disgenet = disgenet[disgenet['vocabulary']=='DO'] 35 | 36 | 37 | p_fisher_DlA = hetionet_custom[hetionet_custom['metaedge']=='DlA'] 38 | 39 | p_fisher_DlA['do_id'] = p_fisher_DlA['source'].str.split('::').str[-1].str.split(':').str[-1] 40 | p_fisher_DlA['xrefUberon'] = p_fisher_DlA['target'].str.split('::').str[-1] 41 | 42 | p_fisher_DlA = p_fisher_DlA.merge(disgenet, left_on='do_id', right_on= 'code') 43 | p_fisher_DlA['_start'] = 'disease_'+p_fisher_DlA['diseaseId'].str.lower() 44 | p_fisher_DlA = p_fisher_DlA.merge(df[['_id','xrefUberon']].rename(columns={'_id':'_end'}), on='xrefUberon', how='left') 45 | p_fisher_DlA['_type'] = hetio_custom['DlA'] 46 | 47 | p_fisher_DpS = hetionet_custom[hetionet_custom['metaedge']=='DpS'] 48 | 49 | p_fisher_DpS['xrefMeSH'] = p_fisher_DpS['target'].str.split('::').str[-1] 50 | p_fisher_DpS['do_id'] = p_fisher_DpS['source'].str.split('::').str[-1].str.split(':').str[-1] 51 | 52 | p_fisher_DpS = p_fisher_DpS.merge(df[['_id','xrefMeSH']].rename(columns={'_id':'_start'}), on='xrefMeSH', how='left') 53 | p_fisher_DpS = p_fisher_DpS.merge(disgenet, left_on='do_id', right_on= 'code') 54 | p_fisher_DpS['_end'] = 'disease_'+p_fisher_DpS['diseaseId'].str.lower() 55 | p_fisher_DpS['_type'] = hetio_custom['DpS'] 56 | 57 | p_fisher = pd.concat([p_fisher_DlA, p_fisher_DpS]) 58 | 59 | merged_df = df.merge(p_fisher, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left') 60 | for column in ['sourceDB', 'unbiased', 'p_fisher']: 61 | df[column] = merged_df[column + '_new'].combine_first(df[column]) 62 | df.shape 63 | 64 | 65 | # hetionet-v1.0-edges.sif 66 | #https://github.com/dhimmel/integrate/blob/master/integrate.ipynb 67 | 68 | import hetio.hetnet 69 | import hetio.readwrite 70 | import hetio.stats 71 | 72 | path = 'https://raw.githubusercontent.com/dhimmel/integrate/master/data/hetnet.json.bz2' 73 | graph = hetio.readwrite.read_graph(path, formatting=None) 74 | 75 | 76 | #https://github.com/hetio/hetnetpy/blob/main/hetnetpy/readwrite.py 77 | import collections 78 | import operator 79 | import pandas as pd 80 | 81 | def write_nodetable(graph): 82 | """Write a tabular encoding of the graph nodes.""" 83 | rows = list() 84 | for node in graph.node_dict.values(): 85 | row = collections.OrderedDict() 86 | row["kind"] = node.metanode.identifier 87 | row["id"] = str(node) 88 | row["name"] = node.name 89 | row["source"] = node.data['source'] 90 | rows.append(row) 91 | rows.sort(key=operator.itemgetter("kind", "id")) 92 | fieldnames = ["id", "name", "kind", "source"] 93 | df_nodes_tsv = pd.DataFrame(rows, columns=fieldnames) 94 | print(df_nodes_tsv.shape) 95 | return df_nodes_tsv 96 | 97 | 98 | def write_edgetable(graph): 99 | """Write a tsv of 
the graph edges.""" 100 | rows = list() 101 | edge_properties=["sourceDB", "unbiased", "affinity_nM", "z_score", "p_fisher", "correlation"] 102 | fieldnames =["source", "metaedge", "target"] 103 | fieldnames = fieldnames+edge_properties 104 | metaedge_to_edges = graph.get_metaedge_to_edges(exclude_inverts=True) 105 | for metaedge, edges in metaedge_to_edges.items(): 106 | for edge in edges: 107 | row = collections.OrderedDict() 108 | row["source"] = edge.source 109 | row["metaedge"] = edge.metaedge.abbrev 110 | row["target"] = edge.target 111 | for pro in edge_properties: 112 | if pro =='sourceDB': 113 | if 'source' in edge.data.keys(): 114 | row[pro]=edge.data['source'] 115 | else: 116 | row[pro]=None 117 | else: 118 | if pro in edge.data.keys(): 119 | row[pro]=edge.data[pro] 120 | else: 121 | row[pro]=None 122 | rows.append(row) 123 | df_edges_tsv = pd.DataFrame(rows, columns=fieldnames) 124 | print(df_edges_tsv.shape) 125 | return df_edges_tsv 126 | 127 | hetionet = write_edgetable(graph) 128 | hetionet['source']=hetionet['source'].astype(str) 129 | hetionet['target']=hetionet['target'].astype(str) 130 | hetionet 131 | 132 | hetio = { 133 | 'CuG':'CHEMICALINCREASESEXPRESSION', 134 | 'CdG':'CHEMICALDECREASESEXPRESSION', 135 | 'GcG':'GENECOVARIESWITHGENE', 136 | 'Gr>G':'GENEREGULATESGENE' 137 | } 138 | 139 | 140 | z_score = hetionet[hetionet['metaedge']=='CuG'] 141 | z_score['xrefDrugbank'] = z_score['source'].str.split('::').str[-1] 142 | z_score['xrefNcbiGene'] = z_score['target'].str.split('::').str[-1].astype(int) 143 | 144 | z_score = z_score.merge(df[['_id','xrefDrugbank']].rename(columns={'_id':'_start'}), on='xrefDrugbank', how='left') 145 | z_score = z_score.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_end'}), on='xrefNcbiGene', how='left') 146 | z_score['_type'] = hetio['CuG'] 147 | 148 | z_score_all = z_score 149 | 150 | z_score = hetionet[hetionet['metaedge']=='CdG'] 151 | z_score['xrefDrugbank'] = z_score['source'].str.split('::').str[-1] 152 | z_score['xrefNcbiGene'] = z_score['target'].str.split('::').str[-1].astype(int) 153 | 154 | z_score = z_score.merge(df[['_id','xrefDrugbank']].rename(columns={'_id':'_start'}), on='xrefDrugbank', how='left') 155 | z_score = z_score.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_end'}), on='xrefNcbiGene', how='left') 156 | z_score['_type'] = hetio['CdG'] 157 | 158 | z_score_all = pd.concat([z_score_all,z_score]) 159 | 160 | merged_df = df.merge(z_score_all, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left') 161 | for column in ['sourceDB', 'unbiased', 'z_score']: 162 | df[column] = merged_df[column + '_new'].combine_first(df[column]) 163 | df.shape 164 | 165 | 166 | correlation = pd.read_table(os.path.join(data_dir,'hetionet/geneCovariesWithGene_correlation.tsv')) 167 | 168 | correlation = correlation.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_start'}), left_on='source_entrez', right_on='xrefNcbiGene', how='left') 169 | correlation = correlation.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_end'}), left_on='target_entrez', right_on='xrefNcbiGene', how='left') 170 | correlation['_type'] = hetio['GcG'] 171 | correlation['sourceDB'] = 'Hetionet - ERC' 172 | correlation['unbiased'] = True 173 | 174 | merged_df = df.merge(correlation, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left') 175 | for column in ['sourceDB', 'unbiased', 'correlation']: 176 | df[column] = merged_df[column + '_new'].combine_first(df[column]) 177 | df.shape 178 | 
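# The merge-then-combine_first pattern above is repeated once per edge
# property. A helper like the following (a sketch - `attach_edge_properties`
# is not part of the original pipeline) captures the idiom: left-join the new
# rows onto the edge table by its (_start, _end, _type) key, then fill in
# values only where the property is currently missing.
def attach_edge_properties(df, new_edges, columns):
    """Left-join `new_edges` onto `df` by edge key and fill `columns`."""
    merged = df.merge(new_edges, on=['_start', '_end', '_type'],
                      suffixes=('', '_new'), how='left')
    for column in columns:
        # Prefer the newly merged value; keep any existing value where the
        # merge produced no match.
        df[column] = merged[column + '_new'].combine_first(df[column])
    return df
# e.g., df = attach_edge_properties(df, correlation, ['sourceDB', 'unbiased', 'correlation'])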
df.loc[~df['correlation'].isna()]
179 |
180 |
181 | #DisGeNET
182 | score = pd.read_table('./AlzKB_Raw_Data/disgenet/curated_gene_disease_associations.tsv')
183 | score['sourceDB'] = 'DisGeNET - ' + score['source']
184 |
185 | score = score.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_start'}), left_on='geneId', right_on='xrefNcbiGene', how='left')
186 | score['_end'] = 'disease_' + score['diseaseId'].str.lower()
187 | score['_type'] = 'GENEASSOCIATESWITHDISEASE'
188 |
189 | merged_df = df.merge(score, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
190 | for column in ['sourceDB', 'score']:
191 |     df[column] = merged_df[column + '_new'].combine_first(df[column])
192 | df.shape
193 |
194 |
195 | #TF
196 | confidence = pd.read_table('./AlzKB_Raw_Data/dorothea/tf.tsv')
197 |
198 | confidence = confidence.merge(df[['_id','TF']].rename(columns={'_id':'_start'}), on='TF', how='left')
199 | confidence = confidence.merge(df[['_id','geneSymbol']].rename(columns={'_id':'_end'}), left_on='Gene', right_on='geneSymbol', how='left')
200 |
201 | confidence['_type'] = 'TRANSCRIPTIONFACTORINTERACTSWITHGENE'
202 |
203 | merged_df = df.merge(confidence, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
204 | for column in ['sourceDB', 'confidence']:
205 |     df[column] = merged_df[column + '_new'].combine_first(df[column])
206 | df.shape
207 |
208 | #save data file
209 | df.to_csv('./data/alzkb_v2.0.0_with_edge_properties.csv')
--------------------------------------------------------------------------------
/alzkb/populate_ontology.py:
--------------------------------------------------------------------------------
1 | from ista import FlatFileDatabaseParser, MySQLDatabaseParser
2 | from ista.util import print_onto_stats
3 |
4 | import owlready2
5 |
6 | import mysecrets
7 |
8 |
9 |
10 | onto = owlready2.get_ontology("./data/alzkb_v2.rdf").load()
11 | data_dir = "./AlzKB_Raw_Data/"
12 |
13 | mysql_config = {
14 |     'host': mysecrets.MYSQL_HOSTNAME,
15 |     'user': mysecrets.MYSQL_USERNAME,
16 |     'passwd': mysecrets.MYSQL_PASSWORD
17 | }
18 |
19 | ncbigene = FlatFileDatabaseParser("ncbigene", onto, data_dir)
20 | drugbank = FlatFileDatabaseParser("drugbank", onto, data_dir)
21 | hetionet = FlatFileDatabaseParser("hetionet", onto, data_dir)
22 | aopdb = MySQLDatabaseParser("aopdb", onto, mysql_config)
23 | disgenet = FlatFileDatabaseParser("disgenet", onto, data_dir)
24 | dorothea = FlatFileDatabaseParser("dorothea", onto, data_dir)
25 |
26 | drugbank.parse_node_type(
27 |     node_type="Drug",  # Switch from "Chemical" in ComptoxAI to "Drug" in AlzKB
28 |     source_filename="CUSTOM/drug_links.tsv",
29 |     fmt="tsv",
30 |     parse_config={
31 |         "iri_column_name": "DrugBank ID",
32 |         "headers": True,
33 |         "data_property_map": {
34 |             "DrugBank ID": onto.xrefDrugbank,
35 |             "CAS Number": onto.xrefCasRN,
36 |             "Name": onto.commonName,
37 |             "data_resource": onto.sourceDatabase,
38 |         },
39 |         "merge_column": {
40 |             "source_column_name": "CAS Number",
41 |             "data_property": onto.xrefCasRN,
42 |         },
43 |     },
44 |     merge=False,
45 |     skip=False
46 | )
47 |
48 | ncbigene.parse_node_type(
49 |     node_type="Gene",
50 |     source_filename="CUSTOM/output.tsv",
51 |     fmt="tsv-pandas",
52 |     parse_config={
53 |         "compound_fields": {
54 |             "dbXrefs": {"delimiter": "|", "field_split_prefix": ":"}
55 |         },
56 |         "iri_column_name": "Symbol",
57 |         "headers": True,
58 |         "data_property_map": {
59 |
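            # Maps columns of the NCBI gene_info-derived TSV to AlzKB ontology
            # data properties.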
"GeneID": onto.xrefNcbiGene, 60 | "Symbol": onto.geneSymbol, 61 | "type_of_gene": onto.typeOfGene, 62 | "Full_name_from_nomenclature_authority": onto.commonName, 63 | "MIM": onto.xrefOMIM, 64 | "HGNC": onto.xrefHGNC, 65 | "Ensembl": onto.xrefEnsembl, 66 | "chromosome": onto.chromosome, 67 | "data_resource": onto.sourceDatabase, 68 | # TODO: Parse Feature_type and other columns 69 | }, 70 | }, 71 | merge=False, 72 | skip=False 73 | ) 74 | 75 | hetionet.parse_node_type( 76 | node_type="DrugClass", 77 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet 78 | fmt="tsv", 79 | parse_config={ 80 | "iri_column_name": "name", 81 | "headers": True, 82 | "filter_column": "kind", 83 | "filter_value": "Pharmacologic Class", 84 | "data_transforms": { 85 | "id": lambda x: x.split("::")[-1] 86 | }, 87 | "data_property_map": { 88 | "id": onto.xrefNciThesaurus, 89 | "name": onto.commonName, 90 | "sourceDB": onto.sourceDatabase, 91 | } 92 | }, 93 | merge=False, 94 | skip=False 95 | ) 96 | 97 | hetionet.parse_node_type( 98 | node_type="Symptom", 99 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet 100 | fmt="tsv", 101 | parse_config={ 102 | "iri_column_name": "name", 103 | "headers": True, 104 | "filter_column": "kind", 105 | "filter_value": "Symptom", 106 | "data_transforms": { 107 | "id": lambda x: x.split("::")[-1] 108 | }, 109 | "data_property_map": { 110 | "id": onto.xrefMeSH, 111 | "name": onto.commonName, 112 | "sourceDB": onto.sourceDatabase, 113 | } 114 | }, 115 | merge=False, 116 | skip=False 117 | ) 118 | hetionet.parse_node_type( # ANATOMY RESOLUTION NEEDS TO BE REFINED! 119 | node_type="BodyPart", 120 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet 121 | fmt="tsv", 122 | parse_config={ 123 | "iri_column_name": "name", 124 | "headers": True, 125 | "filter_column": "kind", 126 | "filter_value": "Anatomy", 127 | "data_transforms": { 128 | "id": lambda x: x.split("::")[-1] 129 | }, 130 | "data_property_map": { 131 | "id": onto.xrefUberon, 132 | "name": onto.commonName, 133 | "sourceDB": onto.sourceDatabase, 134 | } 135 | }, 136 | merge=False, 137 | skip=False 138 | ) 139 | hetionet.parse_node_type( 140 | node_type="BiologicalProcess", 141 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet 142 | fmt="tsv", 143 | parse_config={ 144 | "iri_column_name": "name", 145 | "headers": True, 146 | "filter_column": "kind", 147 | "filter_value": "Biological Process", 148 | "data_transforms": { 149 | "id": lambda x: x.split("::")[-1] 150 | }, 151 | "data_property_map": { 152 | "id": onto.xrefGeneOntology, 153 | "name": onto.commonName, 154 | "sourceDB": onto.sourceDatabase, 155 | } 156 | }, 157 | merge=False, 158 | skip=False 159 | ) 160 | hetionet.parse_node_type( 161 | node_type="MolecularFunction", 162 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet 163 | fmt="tsv", 164 | parse_config={ 165 | "iri_column_name": "name", 166 | "headers": True, 167 | "filter_column": "kind", 168 | "filter_value": "Molecular Function", 169 | "data_transforms": { 170 | "id": lambda x: x.split("::")[-1] 171 | }, 172 | "data_property_map": { 173 | "id": onto.xrefGeneOntology, 174 | "name": onto.commonName, 175 | "source": onto.sourceDatabase, 176 | } 177 | }, 178 | merge=False, 179 | skip=False 180 | ) 181 | hetionet.parse_node_type( 182 | node_type="CellularComponent", 183 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet 184 | fmt="tsv", 185 | parse_config={ 186 | "iri_column_name": "name", 187 | 
"headers": True, 188 | "filter_column": "kind", 189 | "filter_value": "Cellular Component", 190 | "data_transforms": { 191 | "id": lambda x: x.split("::")[-1] 192 | }, 193 | "data_property_map": { 194 | "id": onto.xrefGeneOntology, 195 | "name": onto.commonName, 196 | "source": onto.sourceDatabase, 197 | } 198 | }, 199 | merge=False, 200 | skip=False 201 | ) 202 | 203 | """ 204 | aopdb.parse_node_type( 205 | node_type="Drug", 206 | source_table="chemical_info", 207 | parse_config={ 208 | "iri_column_name": "DTX_id", 209 | "data_property_map": {"ChemicalID": onto.xrefMeSH}, 210 | "merge_column": { 211 | "source_column_name": "DTX_id", 212 | "data_property": onto.xrefDTXSID 213 | } 214 | }, 215 | merge=True, 216 | skip=False 217 | ) 218 | """ 219 | 220 | aopdb.parse_node_type( 221 | node_type="Pathway", 222 | source_table="stressor_info", 223 | parse_config={ 224 | "iri_column_name": "path_name", 225 | "data_property_map": { 226 | "path_id": onto.pathwayId, 227 | #"path_name": onto.commonName, 228 | "path_name": onto.pathwayName, 229 | "ext_source": onto.sourceDatabase, 230 | }, 231 | "custom_sql_query": """SELECT path_name, GROUP_CONCAT(DISTINCT path_id) as path_id, CONCAT('AOPDB - ', GROUP_CONCAT(DISTINCT ext_source)) as ext_source 232 | FROM( 233 | SELECT DISTINCT path_id, TRIM(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(path_name, '', ''), '', ''), '', ''), '', ''), ' - Homo sapiens (human)', '')) as path_name, ext_source 234 | FROM aopdb.pathway_gene 235 | WHERE tax_id = 9606 236 | )data 237 | GROUP BY path_name;""" #clean duplicated pathway 238 | }, 239 | merge=False, 240 | skip=False 241 | ) 242 | 243 | disgenet.parse_node_type( 244 | node_type="Disease", 245 | source_filename="CUSTOM/disease_mappings_to_attributes_alzheimer.tsv", # Filtered for just Alzheimer disease 246 | fmt="tsv-pandas", 247 | parse_config={ 248 | "iri_column_name": "diseaseId", 249 | "headers": True, 250 | "data_property_map": { 251 | "diseaseId": onto.xrefUmlsCUI, 252 | "name": onto.commonName, 253 | "data_source": onto.sourceDatabase, 254 | } 255 | }, 256 | merge=False, 257 | skip=False 258 | ) 259 | disgenet.parse_node_type( 260 | node_type="Disease", 261 | source_filename="CUSTOM/disease_mappings_alzheimer.tsv", # Filtered, as above 262 | fmt="tsv-pandas", 263 | parse_config={ 264 | "iri_column_name": "diseaseId", 265 | "headers": True, 266 | "filter_column": "vocabulary", 267 | "filter_value": "DO", 268 | "merge_column": { 269 | "source_column_name": "diseaseId", 270 | "data_property": onto.xrefUmlsCUI, 271 | "data_source": onto.sourceDatabase, 272 | }, 273 | "data_property_map": { 274 | "code": onto.xrefDiseaseOntology 275 | } 276 | }, 277 | merge=True, 278 | skip=False 279 | ) 280 | 281 | disgenet.parse_relationship_type( 282 | relationship_type=onto.geneAssociatesWithDisease, 283 | source_filename="curated_gene_disease_associations.tsv", 284 | fmt="tsv", 285 | parse_config={ 286 | "subject_node_type": onto.Gene, 287 | "subject_column_name": "geneSymbol", 288 | "subject_match_property": onto.geneSymbol, 289 | "object_node_type": onto.Disease, 290 | "object_column_name": "diseaseId", 291 | "object_match_property": onto.xrefUmlsCUI, 292 | "filter_column": "diseaseType", 293 | "filter_value": "disease", 294 | "headers": True 295 | }, 296 | merge=False, 297 | skip=False 298 | ) 299 | 300 | hetionet.parse_relationship_type( 301 | relationship_type=onto.chemicalIncreasesExpression, 302 | source_filename="hetionet-v1.0-edges.sif", 303 | fmt="tsv", 304 | parse_config={ 305 | "subject_node_type": onto.Drug, 306 | 
"subject_column_name": "source", 307 | "subject_match_property": onto.xrefDrugbank, 308 | "object_node_type": onto.Gene, 309 | "object_column_name": "target", 310 | "object_match_property": onto.xrefNcbiGene, 311 | "filter_column": "metaedge", 312 | "filter_value": "CuG", 313 | "headers": True, 314 | "data_transforms": { 315 | "source": lambda x: x.split("::")[-1], 316 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str? 317 | }, 318 | }, 319 | merge=True, # Merge with aopdb/ctd chemical-gene ixns 320 | skip=False 321 | ) 322 | hetionet.parse_relationship_type( 323 | relationship_type=onto.chemicalDecreasesExpression, 324 | source_filename="hetionet-v1.0-edges.sif", 325 | fmt="tsv", 326 | parse_config={ 327 | "subject_node_type": onto.Drug, 328 | "subject_column_name": "source", 329 | "subject_match_property": onto.xrefDrugbank, 330 | "object_node_type": onto.Gene, 331 | "object_column_name": "target", 332 | "object_match_property": onto.xrefNcbiGene, 333 | "filter_column": "metaedge", 334 | "filter_value": "CdG", 335 | "headers": True, 336 | "data_transforms": { 337 | "source": lambda x: x.split("::")[-1], 338 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str? 339 | }, 340 | }, 341 | merge=True, 342 | skip=False 343 | ), 344 | hetionet.parse_relationship_type( 345 | relationship_type=onto.chemicalBindsGene, 346 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet 347 | fmt="tsv", 348 | parse_config={ 349 | "subject_node_type": onto.Drug, 350 | "subject_column_name": "source", 351 | "subject_match_property": onto.xrefDrugbank, 352 | "object_node_type": onto.Gene, 353 | "object_column_name": "target", 354 | "object_match_property": onto.xrefNcbiGene, 355 | "filter_column": "metaedge", 356 | "filter_value": "CbG", 357 | "headers": True, 358 | "data_transforms": { 359 | "source": lambda x: x.split("::")[-1], 360 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str? 361 | }, 362 | }, 363 | merge=False, 364 | skip=False 365 | ) 366 | hetionet.parse_relationship_type( 367 | relationship_type=onto.geneInteractsWithGene, 368 | source_filename="hetionet-v1.0-edges.sif", 369 | fmt="tsv", 370 | parse_config={ 371 | "subject_node_type": onto.Gene, 372 | "subject_column_name": "source", 373 | "subject_match_property": onto.xrefNcbiGene, 374 | "object_node_type": onto.Gene, 375 | "object_column_name": "target", 376 | "object_match_property": onto.xrefNcbiGene, 377 | "filter_column": "metaedge", 378 | "filter_value": "GiG", 379 | "headers": True, 380 | "data_transforms": { 381 | "source": lambda x: int(x.split("::")[-1]), 382 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str? 
383 | }, 384 | }, 385 | merge=False, 386 | skip=False 387 | ) 388 | hetionet.parse_relationship_type( 389 | relationship_type=onto.drugInClass, 390 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet 391 | fmt="tsv", 392 | parse_config={ 393 | "subject_node_type": onto.Drug, 394 | "subject_column_name": "target", # Note how we reverse the direction of the relationship here 395 | "subject_match_property": onto.xrefDrugbank, 396 | "object_node_type": onto.DrugClass, 397 | "object_column_name": "source", 398 | "object_match_property": onto.xrefNciThesaurus, 399 | "filter_column": "metaedge", 400 | "filter_value": "PCiC", 401 | "headers": True, 402 | "data_transforms": { 403 | "source": lambda x: x.split("::")[-1], 404 | "target": lambda x: x.split("::")[-1] # I foresee this causing problems in the future - should all IDs be cast to str? 405 | }, 406 | }, 407 | merge=False, 408 | skip=False 409 | ) 410 | hetionet.parse_relationship_type( 411 | relationship_type=onto.drugCausesEffect, 412 | source_filename="hetionet-v1.0-edges.sif", 413 | fmt="tsv", 414 | parse_config={ 415 | "subject_node_type": onto.Drug, 416 | "subject_column_name": "source", 417 | "subject_match_property": onto.xrefDrugbank, 418 | "object_node_type": onto.ChemicalEffect, 419 | "object_column_name": "target", 420 | "object_match_property": onto.xrefUmlsCUI, 421 | "filter_column": "metaedge", 422 | "filter_value": "CcSE", 423 | "headers": True, 424 | "data_transforms": { 425 | "source": lambda x: x.split("::")[-1], 426 | "target": lambda x: x.split("::")[-1] 427 | }, 428 | }, 429 | merge=False, 430 | skip=False 431 | ) 432 | hetionet.parse_relationship_type( 433 | relationship_type=onto.symptomManifestationOfDisease, 434 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet 435 | fmt="tsv", 436 | parse_config={ 437 | "subject_node_type": onto.Symptom, 438 | "subject_column_name": "target", # Flip target and source 439 | "subject_match_property": onto.xrefMeSH, 440 | "object_node_type": onto.Disease, 441 | "object_column_name": "source", 442 | "object_match_property": onto.xrefDiseaseOntology, 443 | "filter_column": "metaedge", 444 | "filter_value": "DpS", 445 | "headers": True, 446 | "data_transforms": { 447 | "source": lambda x: x.split("DOID:")[-1], # Note: Because hetionet prefixes DOIDs with 'DOID:' 448 | "target": lambda x: x.split("::")[-1] 449 | }, 450 | }, 451 | merge=False, 452 | skip=False 453 | ) 454 | hetionet.parse_relationship_type( 455 | relationship_type=onto.drugTreatsDisease, 456 | source_filename="hetionet-v1.0-edges.sif", 457 | fmt="tsv", 458 | parse_config={ 459 | "subject_node_type": onto.Drug, 460 | "subject_column_name": "source", 461 | "subject_match_property": onto.xrefDrugbank, 462 | "object_node_type": onto.Disease, 463 | "object_column_name": "target", 464 | "object_match_property": onto.xrefDiseaseOntology, 465 | "filter_column": "metaedge", 466 | "filter_value": "CtD", 467 | "headers": True, 468 | "data_transforms": { 469 | "source": lambda x: x.split("::")[-1], 470 | "target": lambda x: x.split(":")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:' 471 | }, 472 | }, 473 | merge=False, 474 | skip=False 475 | ) 476 | hetionet.parse_relationship_type( # Hetionet makes a messy distinction between 'treats' and 'palliates' which we ignore 477 | relationship_type=onto.drugTreatsDisease, 478 | source_filename="hetionet-v1.0-edges.sif", 479 | fmt="tsv", 480 | parse_config={ 481 | "subject_node_type": onto.Drug, 482 | "subject_column_name": "source", 483 | 
"subject_match_property": onto.xrefDrugbank, 484 | "object_node_type": onto.Disease, 485 | "object_column_name": "target", 486 | "object_match_property": onto.xrefDiseaseOntology, 487 | "filter_column": "metaedge", 488 | "filter_value": "CpD", 489 | "headers": True, 490 | "data_transforms": { 491 | "source": lambda x: x.split("::")[-1], 492 | "target": lambda x: x.split(":")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:' 493 | }, 494 | }, 495 | merge=False, 496 | skip=False 497 | ) 498 | hetionet.parse_relationship_type( 499 | relationship_type=onto.diseaseLocalizesToAnatomy, 500 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet 501 | fmt="tsv", 502 | parse_config={ 503 | "subject_node_type": onto.Disease, 504 | "subject_column_name": "source", 505 | "subject_match_property": onto.xrefDiseaseOntology, 506 | "object_node_type": onto.BodyPart, 507 | "object_column_name": "target", 508 | "object_match_property": onto.xrefUberon, 509 | "filter_column": "metaedge", 510 | "filter_value": "DlA", 511 | "headers": True, 512 | "data_transforms": { 513 | "source": lambda x: x.split(":")[-1], # Note: Because hetionet prefixes DOIDs with 'DOID:' 514 | "target": lambda x: x.split("::")[-1] 515 | }, 516 | }, 517 | merge=False, 518 | skip=False 519 | ) 520 | hetionet.parse_relationship_type( 521 | relationship_type=onto.diseaseAssociatesWithDisease, 522 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet 523 | fmt="tsv", 524 | parse_config={ 525 | "subject_node_type": onto.Disease, 526 | "subject_column_name": "source", 527 | "subject_match_property": onto.xrefDiseaseOntology, 528 | "object_node_type": onto.Disease, 529 | "object_column_name": "target", 530 | "object_match_property": onto.xrefDiseaseOntology, 531 | "filter_column": "metaedge", 532 | "filter_value": "DrD", 533 | "headers": True, 534 | "data_transforms": { 535 | "source": lambda x: x.split(":")[-1], # Note: Because hetionet prefixes DOIDs with 'DOID:' 536 | "target": lambda x: x.split(":")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:' 537 | }, 538 | }, 539 | merge=False, 540 | skip=False 541 | ) 542 | hetionet.parse_relationship_type( 543 | relationship_type=onto.geneParticipatesInBiologicalProcess, 544 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet 545 | fmt="tsv", 546 | parse_config={ 547 | "subject_node_type": onto.Gene, 548 | "subject_column_name": "source", 549 | "subject_match_property": onto.xrefNcbiGene, 550 | "object_node_type": onto.BiologicalProcess, 551 | "object_column_name": "target", 552 | "object_match_property": onto.xrefGeneOntology, 553 | "filter_column": "metaedge", 554 | "filter_value": "GpBP", 555 | "headers": True, 556 | "data_transforms": { 557 | "source": lambda x: int(x.split("::")[-1]), # Note: Because hetionet prefixes DOIDs with 'DOID:' 558 | "target": lambda x: x.split("::")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:' 559 | }, 560 | }, 561 | merge=False, 562 | skip=False 563 | ) 564 | hetionet.parse_relationship_type( 565 | relationship_type=onto.geneAssociatedWithCellularComponent, 566 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet 567 | fmt="tsv", 568 | parse_config={ 569 | "subject_node_type": onto.Gene, 570 | "subject_column_name": "source", 571 | "subject_match_property": onto.xrefNcbiGene, 572 | "object_node_type": onto.CellularComponent, 573 | "object_column_name": "target", 574 | "object_match_property": onto.xrefGeneOntology, 575 | "filter_column": "metaedge", 576 | 
"filter_value": "GpCC", 577 | "headers": True, 578 | "data_transforms": { 579 | "source": lambda x: int(x.split("::")[-1]), # Note: Because hetionet prefixes DOIDs with 'DOID:' 580 | "target": lambda x: x.split("::")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:' 581 | }, 582 | }, 583 | merge=False, 584 | skip=False 585 | ) 586 | hetionet.parse_relationship_type( 587 | relationship_type=onto.geneHasMolecularFunction, 588 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet 589 | fmt="tsv", 590 | parse_config={ 591 | "subject_node_type": onto.Gene, 592 | "subject_column_name": "source", 593 | "subject_match_property": onto.xrefNcbiGene, 594 | "object_node_type": onto.MolecularFunction, 595 | "object_column_name": "target", 596 | "object_match_property": onto.xrefGeneOntology, 597 | "filter_column": "metaedge", 598 | "filter_value": "GpMF", 599 | "headers": True, 600 | "data_transforms": { 601 | "source": lambda x: int(x.split("::")[-1]), # Note: Because hetionet prefixes DOIDs with 'DOID:' 602 | "target": lambda x: x.split("::")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:' 603 | }, 604 | }, 605 | merge=False, 606 | skip=False 607 | ) 608 | 609 | aopdb.parse_relationship_type( 610 | relationship_type=onto.geneInPathway, 611 | inverse_relationship_type=onto.PathwayContainsGene, 612 | parse_config = { 613 | "subject_node_type": onto.Gene, 614 | "subject_column_name": "entrez", 615 | "subject_match_property": onto.xrefNcbiGene, 616 | "object_node_type": onto.Pathway, 617 | "object_column_name": "path_name", 618 | "object_match_property": onto.pathwayName, 619 | "custom_sql_query": """SELECT DISTINCT entrez, path_id, TRIM(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(path_name, '', ''), '', ''), '', ''), '', ''), ' - Homo sapiens (human)', '')) as path_name 620 | FROM aopdb.pathway_gene 621 | WHERE tax_id = 9606;""", 622 | "source_table_type": "foreignKey", 623 | "source_table": "pathway_gene", 624 | }, 625 | merge=False, 626 | skip=False 627 | ) 628 | hetionet.parse_relationship_type( 629 | relationship_type=onto.bodyPartOverexpressesGene, 630 | source_filename="hetionet-v1.0-edges.sif", 631 | fmt="tsv", 632 | parse_config={ 633 | "subject_node_type": onto.BodyPart, 634 | "subject_column_name": "source", 635 | "subject_match_property": onto.xrefUberon, 636 | "object_node_type": onto.Gene, 637 | "object_column_name": "target", 638 | "object_match_property": onto.xrefNcbiGene, 639 | "filter_column": "metaedge", 640 | "filter_value": "AuG", # "anatomyUpregulatesGene" 641 | "headers": True, 642 | "data_transforms": { 643 | "source": lambda x: x.split("::")[-1], 644 | "target": lambda x: int(x.split("::")[-1]) 645 | }, 646 | }, 647 | merge=False, 648 | skip=False 649 | ) 650 | hetionet.parse_relationship_type( 651 | relationship_type=onto.bodyPartUnderexpressesGene, 652 | source_filename="hetionet-v1.0-edges.sif", 653 | fmt="tsv", 654 | parse_config={ 655 | "subject_node_type": onto.BodyPart, 656 | "subject_column_name": "source", 657 | "subject_match_property": onto.xrefUberon, 658 | "object_node_type": onto.Gene, 659 | "object_column_name": "target", 660 | "object_match_property": onto.xrefNcbiGene, 661 | "filter_column": "metaedge", 662 | "filter_value": "AdG", # "anatomyDownregulatesGene" 663 | "headers": True, 664 | "data_transforms": { 665 | "source": lambda x: x.split("::")[-1], 666 | "target": lambda x: int(x.split("::")[-1]) 667 | }, 668 | }, 669 | merge=False, 670 | skip=False 671 | ) 672 | 673 | # POSSIBLE ISSUE: Normalize Drug > Chemical or vice versa? 
Gonna have to look for 'gaps' 674 | # in Neo4j database stemming from inconsistency in node type. 675 | 676 | hetionet.parse_relationship_type( 677 | relationship_type=onto.geneCovariesWithGene, 678 | source_filename="hetionet-v1.0-edges.sif", 679 | fmt="tsv", 680 | parse_config={ 681 | "subject_node_type": onto.Gene, 682 | "subject_column_name": "source", 683 | "subject_match_property": onto.xrefNcbiGene, 684 | "object_node_type": onto.Gene, 685 | "object_column_name": "target", 686 | "object_match_property": onto.xrefNcbiGene, 687 | "filter_column": "metaedge", 688 | "filter_value": "GcG", 689 | "headers": True, 690 | "data_transforms": { 691 | "source": lambda x: int(x.split("::")[-1]), 692 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str? 693 | }, 694 | }, 695 | merge=False, 696 | skip=False 697 | ) 698 | 699 | hetionet.parse_relationship_type( 700 | relationship_type=onto.geneRegulatesGene, 701 | source_filename="hetionet-v1.0-edges.sif", 702 | fmt="tsv", 703 | parse_config={ 704 | "subject_node_type": onto.Gene, 705 | "subject_column_name": "source", 706 | "subject_match_property": onto.xrefNcbiGene, 707 | "object_node_type": onto.Gene, 708 | "object_column_name": "target", 709 | "object_match_property": onto.xrefNcbiGene, 710 | "filter_column": "metaedge", 711 | "filter_value": "Gr>G", 712 | "headers": True, 713 | "data_transforms": { 714 | "source": lambda x: int(x.split("::")[-1]), 715 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str? 716 | }, 717 | }, 718 | merge=False, 719 | skip=False 720 | ) 721 | 722 | dorothea.parse_node_type( 723 | node_type="TranscriptionFactor", 724 | source_filename="tf.tsv", 725 | fmt="tsv", 726 | parse_config={ 727 | "iri_column_name": "source", 728 | "headers": True, 729 | "data_property_map": { 730 | "source": onto.TF, 731 | #"source": onto.commonName, 732 | "sourceDB": onto.sourceDatabase, 733 | }, 734 | }, 735 | merge=False, 736 | skip=False 737 | ) 738 | 739 | 740 | dorothea.parse_relationship_type( 741 | relationship_type=onto.transcriptionFactorInteractsWithGene, 742 | source_filename="tf.tsv", 743 | fmt="tsv", 744 | parse_config={ 745 | "subject_node_type": onto.TranscriptionFactor, 746 | "subject_column_name": "source", 747 | "subject_match_property": onto.TF, 748 | "object_node_type": onto.Gene, 749 | "object_column_name": "target", 750 | "object_match_property": onto.geneSymbol, 751 | "headers": True, 752 | }, 753 | merge=False, 754 | skip=False 755 | ) 756 | 757 | print_onto_stats(onto) 758 | 759 | with open("./data/alzkb_v2-populated.rdf", 'wb') as fp: 760 | onto.save(file=fp, format="rdfxml") -------------------------------------------------------------------------------- /alzkb/rdf_to_memgraph_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | import pandas as pd 4 | import numpy as np 5 | from gqlalchemy import Memgraph 6 | import owlready2 7 | 8 | 9 | #read RDF 10 | path = './data/alzkb_v2-populated.rdf' 11 | onto = owlready2.get_ontology(path).load() 12 | 13 | 14 | #Load node and property 15 | def extract_node_details(label, node): 16 | details = { 17 | '_id': node.name, 18 | '_labels': label, 19 | 'commonName': node.commonName if node.commonName else np.nan, 20 | 'geneSymbol': node.geneSymbol if node.geneSymbol else np.nan, 21 | 'pathwayId': node.pathwayId if node.pathwayId else np.nan, 22 | 
'pathwayName': node.pathwayName if node.pathwayName else np.nan, 23 | 'sourceDatabase': node.sourceDatabase if node.sourceDatabase else np.nan, 24 | 'typeOfGene': node.typeOfGene if node.typeOfGene else np.nan, 25 | 'chromosome': node.chromosome if node.chromosome else np.nan, 26 | 'TF': node.TF if node.TF else np.nan, 27 | 'xrefCasRN': node.xrefCasRN if node.xrefCasRN else np.nan, 28 | 'xrefDiseaseOntology': node.xrefDiseaseOntology if node.xrefDiseaseOntology else np.nan, 29 | 'xrefDrugbank': node.xrefDrugbank if node.xrefDrugbank else np.nan, 30 | 'xrefEnsembl': node.xrefEnsembl if node.xrefEnsembl else np.nan, 31 | 'xrefGeneOntology': node.xrefGeneOntology if node.xrefGeneOntology else np.nan, 32 | 'xrefHGNC': node.xrefHGNC if node.xrefHGNC else np.nan, 33 | 'xrefMeSH': node.xrefMeSH if node.xrefMeSH else np.nan, 34 | 'xrefNcbiGene': node.xrefNcbiGene if node.xrefNcbiGene else np.nan, 35 | 'xrefNciThesaurus': node.xrefNciThesaurus if node.xrefNciThesaurus else np.nan, 36 | 'xrefOMIM': node.xrefOMIM if node.xrefOMIM else np.nan, 37 | 'xrefUberon': node.xrefUberon if node.xrefUberon else np.nan, 38 | 'xrefUmlsCUI': node.xrefUmlsCUI if node.xrefUmlsCUI else np.nan 39 | } 40 | 41 | for key, value in details.items(): 42 | if isinstance(value, list) and len(value) > 0: 43 | try: 44 | details[key] = str(value[-1]) 45 | except ValueError: 46 | details[key] = np.nan 47 | elif isinstance(value, list): 48 | details[key] = np.nan 49 | 50 | return details 51 | 52 | 53 | #Drug 54 | drug_details_list = [] 55 | for drug in onto.individuals(): 56 | if onto.Drug in drug.is_a: 57 | drug_details_list.append(extract_node_details(':Drug', drug)) 58 | drug_details_df = pd.DataFrame(drug_details_list) 59 | 60 | 61 | #Gene 62 | gene_details_list = [] 63 | for gene in onto.individuals(): 64 | if onto.Gene in gene.is_a: 65 | gene_details_list.append(extract_node_details(':Gene', gene)) 66 | gene_details_df = pd.DataFrame(gene_details_list) 67 | 68 | 69 | #BodyPart 70 | bodypart_details_list = [] 71 | for bodypart in onto.individuals(): 72 | if onto.BodyPart in bodypart.is_a: 73 | bodypart_details_list.append(extract_node_details(':BodyPart', bodypart)) 74 | bodypart_details_df = pd.DataFrame(bodypart_details_list) 75 | 76 | 77 | #Disease 78 | disease_details_list = [] 79 | for disease in onto.individuals(): 80 | if onto.Disease in disease.is_a: 81 | disease_details_list.append(extract_node_details(':Disease', disease)) 82 | disease_details_df = pd.DataFrame(disease_details_list) 83 | 84 | 85 | #DrugClass 86 | drugclass_details_list = [] 87 | for drugclass in onto.individuals(): 88 | if onto.DrugClass in drugclass.is_a: 89 | drugclass_details_list.append(extract_node_details(':DrugClass', drugclass)) 90 | drugclass_details_df = pd.DataFrame(drugclass_details_list) 91 | 92 | 93 | #CellularComponent 94 | cellular_details_list = [] 95 | for cellular in onto.individuals(): 96 | if onto.CellularComponent in cellular.is_a: 97 | cellular_details_list.append(extract_node_details(':CellularComponent', cellular)) 98 | cellular_details_df = pd.DataFrame(cellular_details_list) 99 | 100 | 101 | #MolecularFunction 102 | molecular_details_list = [] 103 | for molecular in onto.individuals(): 104 | if onto.MolecularFunction in molecular.is_a: 105 | molecular_details_list.append(extract_node_details(':MolecularFunction', molecular)) 106 | molecular_details_df = pd.DataFrame(molecular_details_list) 107 | 108 | 109 | #Pathway 110 | pathway_details_list = [] 111 | for pathway in onto.individuals(): 112 | if onto.Pathway in 
pathway.is_a:
113 |         pathway_details_list.append(extract_node_details(':Pathway', pathway))
114 | pathway_details_df = pd.DataFrame(pathway_details_list)
115 | 
116 | 
117 | # BiologicalProcess
118 | biological_details_list = []
119 | for biological in onto.individuals():
120 |     if onto.BiologicalProcess in biological.is_a:
121 |         biological_details_list.append(extract_node_details(':BiologicalProcess', biological))
122 | biological_details_df = pd.DataFrame(biological_details_list)
123 | 
124 | 
125 | # Symptom
126 | symptom_details_list = []
127 | for symptom in onto.individuals():
128 |     if onto.Symptom in symptom.is_a:
129 |         symptom_details_list.append(extract_node_details(':Symptom', symptom))
130 | symptom_details_df = pd.DataFrame(symptom_details_list)
131 | 
132 | 
133 | # TranscriptionFactor
134 | transcription_details_list = []
135 | for transcriptionfactor in onto.individuals():
136 |     if onto.TranscriptionFactor in transcriptionfactor.is_a:
137 |         transcription_details_list.append(extract_node_details(':TranscriptionFactor', transcriptionfactor))
138 | transcription_details_df = pd.DataFrame(transcription_details_list)
139 | 
140 | 
141 | # Merge all node DataFrames into one
142 | merged_node_df = pd.concat([drug_details_df, gene_details_df, bodypart_details_df, disease_details_df,
143 |                             drugclass_details_df, cellular_details_df, molecular_details_df, pathway_details_df,
144 |                             biological_details_df, symptom_details_df, transcription_details_df], ignore_index=True)
145 | merged_node_df.reset_index(drop=True, inplace=True)
146 | print(merged_node_df.shape)
147 | 
148 | 
149 | # Load relationships
150 | 
151 | # Drug: one relationship row per (drug, target) pair of each drug-centric object property
152 | relations = []
153 | def extract_rel_details_from_drug(node):
154 |     for gene in node.chemicalBindsGene:
155 |         relations.append({
156 |             '_start': node.name,
157 |             '_end': gene.name,
158 |             '_type': 'CHEMICALBINDSGENE'})
159 |     for gene in node.chemicalDecreasesExpression:
160 |         relations.append({
161 |             '_start': node.name,
162 |             '_end': gene.name,
163 |             '_type': 'CHEMICALDECREASESEXPRESSION'})
164 |     for gene in node.chemicalIncreasesExpression:
165 |         relations.append({
166 |             '_start': node.name,
167 |             '_end': gene.name,
168 |             '_type': 'CHEMICALINCREASESEXPRESSION'})
169 |     for disease in node.drugCausesEffect:
170 |         relations.append({
171 |             '_start': node.name,
172 |             '_end': disease.name,
173 |             '_type': 'DRUGCAUSESEFFECT'})
174 |     for disease in node.drugTreatsDisease:
175 |         relations.append({
176 |             '_start': node.name,
177 |             '_end': disease.name,
178 |             '_type': 'DRUGTREATSDISEASE'})
179 |     for drugclass in node.drugInClass:
180 |         relations.append({
181 |             '_start': node.name,
182 |             '_end': drugclass.name,
183 |             '_type': 'DRUGINCLASS'})
184 | 
185 | 
186 | for drug in onto.individuals():
187 |     if onto.Drug in drug.is_a:
188 |         extract_rel_details_from_drug(drug)
189 | 
190 | drug_rel = pd.DataFrame(relations)
191 | 
192 | 
193 | # Gene
194 | relations = []
195 | def extract_rel_details_from_gene(node):
196 |     for cellular in node.geneAssociatedWithCellularComponent:
197 |         relations.append({
198 |             '_start': node.name,
199 |             '_end': cellular.name,
200 |             '_type': 'GENEASSOCIATEDWITHCELLULARCOMPONENT'})
201 |     for disease in node.geneAssociatesWithDisease:
202 |         relations.append({
203 |             '_start': node.name,
204 |             '_end': disease.name,
205 |             '_type': 'GENEASSOCIATESWITHDISEASE'})
206 |     for molecular in node.geneHasMolecularFunction:
207 |         relations.append({
208 |             '_start': node.name,
209 |             '_end': molecular.name,
210 |             '_type': 'GENEHASMOLECULARFUNCTION'})
211 |     for biological in node.geneParticipatesInBiologicalProcess:
212 |         relations.append({
213 |             '_start': node.name,
214 |             '_end': biological.name,
215 |             '_type': 'GENEPARTICIPATESINBIOLOGICALPROCESS'})
216 | 
217 | 
218 | for gene in onto.individuals():
219 |     if onto.Gene in gene.is_a:
220 |         extract_rel_details_from_gene(gene)
221 | 
222 | gene_rel = pd.DataFrame(relations)
223 | 
224 | 
225 | # geneInteractsWithGene and related properties are read directly from the RDF triples (to avoid the inverse property problem)
226 | from rdflib import Graph, URIRef
227 | 
228 | g = Graph()
229 | 
230 | rdf_file = path
231 | g.parse(rdf_file, format='xml')
232 | 
233 | pred_uri_1 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneCovariesWithGene')
234 | pred_uri_2 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneInteractsWithGene')
235 | pred_uri_3 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneRegulatesGene')
236 | pred_uri_4 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneInPathway')
237 | 
238 | def extract_last_part(uri):
239 |     return uri.split('#')[-1]
240 | 
241 | triples = []
242 | for subj, pred, obj in g:
243 |     if pred == pred_uri_1:
244 |         triples.append([extract_last_part(subj), 'GENECOVARIESWITHGENE', extract_last_part(obj)])
245 |     elif pred == pred_uri_2:
246 |         triples.append([extract_last_part(subj), 'GENEINTERACTSWITHGENE', extract_last_part(obj)])
247 |     elif pred == pred_uri_3:
248 |         triples.append([extract_last_part(subj), 'GENEREGULATESGENE', extract_last_part(obj)])
249 |     elif pred == pred_uri_4:
250 |         triples.append([extract_last_part(subj), 'GENEINPATHWAY', extract_last_part(obj)])
251 | 
252 | gene_rel2 = pd.DataFrame(triples, columns=['_start', '_type', '_end'])
253 | 
254 | # Merge the two gene relationship DataFrames
255 | gene_rel2 = gene_rel2[gene_rel.columns]
256 | gene_rel = pd.concat([gene_rel, gene_rel2], ignore_index=True)
257 | 
258 | 
259 | # BodyPart
260 | relations = []
261 | def extract_rel_details_from_bodypart(node):
262 |     for gene in node.bodyPartOverexpressesGene:
263 |         relations.append({
264 |             '_start': node.name,
265 |             '_end': gene.name,
266 |             '_type': 'BODYPARTOVEREXPRESSESGENE'})
267 |     for gene in node.bodyPartUnderexpressesGene:
268 |         relations.append({
269 |             '_start': node.name,
270 |             '_end': gene.name,
271 |             '_type': 'BODYPARTUNDEREXPRESSESGENE'})
272 | 
273 | 
274 | for bodypart in onto.individuals():
275 |     if onto.BodyPart in bodypart.is_a:
276 |         extract_rel_details_from_bodypart(bodypart)
277 | 
278 | bodypart_rel = pd.DataFrame(relations)
279 | 
280 | 
281 | # Disease
282 | relations = []
283 | def extract_rel_details_from_disease(node):
284 |     for disease in node.diseaseAssociatesWithDisease:
285 |         relations.append({
286 |             '_start': node.name,
287 |             '_end': disease.name,
288 |             '_type': 'DISEASEASSOCIATESWITHDISEASE'})
289 |     for bodypart in node.diseaseLocalizesToAnatomy:
290 |         relations.append({
291 |             '_start': node.name,
292 |             '_end': bodypart.name,
293 |             '_type': 'DISEASELOCALIZESTOANATOMY'})
294 | 
295 | 
296 | for disease in onto.individuals():
297 |     if onto.Disease in disease.is_a:
298 |         extract_rel_details_from_disease(disease)
299 | 
300 | disease_rel = pd.DataFrame(relations)
301 | 
302 | 
303 | # Symptom
304 | relations = []
305 | def extract_rel_details_from_symptom(node):
306 |     for disease in node.symptomManifestationOfDisease:
307 |         relations.append({
308 |             '_start': node.name,
309 |             '_end': disease.name,
310 |             '_type': 'SYMPTOMMANIFESTATIONOFDISEASE'})
311 | 
312 | 
313 | for symptom in onto.individuals():
314 |     if onto.Symptom in symptom.is_a:
315 |         extract_rel_details_from_symptom(symptom)
316 | 
317 | symptom_rel = pd.DataFrame(relations)
318 | 
319 | 
320 | # TranscriptionFactor
321 | relations = []
322 | def extract_rel_details_from_transcriptionfactor(node):
323 |     for gene in node.transcriptionFactorInteractsWithGene:
324 |         relations.append({
325 |             '_start': node.name,
326 |             '_end': gene.name,
327 |             '_type': 'TRANSCRIPTIONFACTORINTERACTSWITHGENE'})
328 | 
329 | 
330 | for transcriptionfactor in onto.individuals():
331 |     if onto.TranscriptionFactor in transcriptionfactor.is_a:
332 |         extract_rel_details_from_transcriptionfactor(transcriptionfactor)
333 | 
334 | transcriptionfactor_rel = pd.DataFrame(relations)
335 | 
336 | 
337 | # Merge all relationship DataFrames into one
338 | merged_rel_df = pd.concat([drug_rel, gene_rel, bodypart_rel, disease_rel, symptom_rel, transcriptionfactor_rel], ignore_index=True)
339 | merged_rel_df.reset_index(drop=True, inplace=True)
340 | print(merged_rel_df.shape)
341 | 
342 | 
343 | # Concatenate nodes and relationships and write the combined CSV
344 | df_all = pd.concat([merged_node_df, merged_rel_df], axis=0, ignore_index=True)
345 | df_all.to_csv('./data/alzkb_v2-populated.csv', index=False)
346 | 
347 | 
348 | 
349 | 
350 | 
--------------------------------------------------------------------------------
/img/build-abstract.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/EpistasisLab/AlzKB/3ce9515b3172e4356edc83e3ea37cd1a0df3d7ed/img/build-abstract.png
--------------------------------------------------------------------------------
/scripts/alzkb_parse_disgenet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | ## created by Yun Hao and Joe Romano @MooreLab 2022
3 | ## This script parses DisGeNET gene-disease relationship data to extract relationships specific to Alzheimer's disease
4 | 
5 | # NOTE: This file must be run from the `disgenet/` directory containing the original TSV files referenced below!
6 | # Both output files will be deposited into the `disgenet/CUSTOM/` directory.
7 | 
8 | import pandas as pd
9 | 
10 | from pathlib import Path
11 | 
12 | disgenet_df = pd.read_csv("./disease_mappings_to_attributes.tsv", sep="\t", header=0)
13 | disgenet_do_df = pd.read_csv("./disease_mappings.tsv", sep="\t", header=0)
14 | 
15 | # case-insensitive match
16 | disgenet_ad_df = disgenet_df.loc[disgenet_df["name"].str.contains("Alzheimer", case=False), :]
17 | cuis = list(disgenet_ad_df.diseaseId.unique())
18 | 
19 | # For adding Disease Ontology identifiers
20 | disgenet_ad_do_df = disgenet_do_df.loc[disgenet_do_df.diseaseId.isin(cuis), :]
21 | 
22 | # clean data
23 | # Creutzfeldt-Jakob disease (CJD) and Familial Alzheimer Disease (FAD) are distinct diseases, but they were merged into a single AlzKB node because the DisGeNET "UMLS CUI to several disease vocabularies" mapping file maps the Disease Ontology entry for CJD to FAD; the filter below removes that spurious mapping row.
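24 | # A quick illustration of the exclusion idiom used on the next line (hypothetical
25 | # column names and values, not DisGeNET data): df[~((df['a'] == 1) & (df['b'] == 'x'))]
26 | # keeps every row EXCEPT those where both conditions hold at once.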
27 | disgenet_ad_do_df = disgenet_ad_do_df[~((disgenet_ad_do_df['name'] == 'Familial Alzheimer Disease (FAD)') & (disgenet_ad_do_df['vocabularyName'] == 'Creutzfeldt-Jakob disease'))]
28 | 
29 | # add a "data_source" column to both tables
30 | disgenet_ad_do_df['data_source'] = 'DisGeNET'
31 | disgenet_ad_df['data_source'] = 'DisGeNET'
32 | 
33 | # if we don't have the CUSTOM subdirectory, create it
34 | Path("CUSTOM").mkdir(exist_ok=True)
35 | 
36 | disgenet_ad_df.to_csv("./CUSTOM/disease_mappings_to_attributes_alzheimer.tsv", sep="\t", header=True, index=False)
37 | disgenet_ad_do_df.to_csv("./CUSTOM/disease_mappings_alzheimer.tsv", sep="\t", header=True, index=False)
--------------------------------------------------------------------------------
/scripts/alzkb_parse_dorothea.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import rpy2.robjects as robjects
3 | from rpy2.robjects import pandas2ri
4 | 
5 | # dorothea
6 | # Source the DoRothEA R script (created and saved separately, e.g. in RStudio) and run it
7 | r = robjects.r
8 | r['source']('./dorothea.R')
9 | 
10 | # Load the `net` object defined in the R script
11 | #list(robjects.globalenv.keys())
12 | net_r = robjects.globalenv['net']
13 | 
14 | # Convert the R data frame to a pandas DataFrame
15 | with (robjects.default_converter + pandas2ri.converter).context():
16 |     dorothea = robjects.conversion.get_conversion().rpy2py(net_r)
17 | #dorothea['source'].nunique() #643 TFs
18 | 
19 | 
20 | # trrust
21 | trrust_rawdata = pd.read_csv('./trrust_rawdata.human.tsv', sep='\t', header=None, names=["TF","Gene","Interaction","PMID"])
22 | #trrust_rawdata['TF'].nunique() #795 TFs, matches https://www.grnpedia.org/trrust/downloadnetwork.php
23 | 
24 | 
25 | # combine: keep only the TF-gene pairs supported by both TRRUST and DoRothEA
26 | df_comb = trrust_rawdata.merge(dorothea, left_on=["TF","Gene"], right_on=["source","target"], how='inner')
27 | df_comb['sourceDB'] = 'DoRothEA & TRRUST'
28 | df_comb.to_csv('./tf.tsv', sep="\t", header=True, index=False)
--------------------------------------------------------------------------------
/scripts/alzkb_parse_drugbank.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pathlib import Path
3 | 
4 | df = pd.read_csv('./drug_links.csv')
5 | print(df.shape)
6 | 
7 | # add a "data_resource" column
8 | df['data_resource'] = 'DrugBank'
9 | 
10 | # if we don't have the CUSTOM subdirectory, create it
11 | Path("CUSTOM").mkdir(exist_ok=True)
12 | 
13 | df.to_csv("./CUSTOM/drug_links.tsv", sep="\t", header=True, index=False)
14 | print(df.shape)
--------------------------------------------------------------------------------
/scripts/alzkb_parse_ncbigene.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | ## created by Van Truong @RitchieWherryLabs 2022
3 | ## This script parses NCBI human gene data and Bgee expression data for knowledge related to Alzheimer's disease
4 | 
5 | 
6 | my_set = set()
7 | 
8 | def processLargeTextFile(source, compare_index, separator):
9 |     # Collect the Ensembl IDs of genes with brain expression calls in the Bgee file
10 |     with open(source, "r") as r:
11 |         for line in r:
12 |             if 'brain' in line:
13 |                 columns = line.split(separator)
14 |                 my_set.add(columns[compare_index].replace('Ensembl:', ''))
15 | 
16 | def keepDesiredColumns(row, keep_index, separator):
17 |     columns = row.split(separator)
18 | 
19 |     output_str = []
20 |     for index in keep_index:
21 |         output_str.append(columns[index])
22 | 
23 |     return separator.join(output_str)
24 | 
25 | def filterLargeTextFile(source, destination, delimiter, keep_index):
26 |     with open(source, "r") as r, open(destination, "w") as w:
27 |         # write the header row, keeping only the desired columns
28 |         w.write(keepDesiredColumns(r.readline(), keep_index, delimiter) + '\n')
29 | 
30 |         # copy the body, keeping only Homo sapiens rows (NCBI taxonomy ID 9606)
31 |         for line in r:
32 |             if line.startswith('9606'):
33 |                 w.write(keepDesiredColumns(line, keep_index, delimiter) + '\n')
34 | 
35 | def fileIndexFinder(source, destination, keep_set, compare_column_index, separator):
36 |     count_rows = 0
37 |     with open(source, "r") as r, open(destination, "w") as w:
38 |         w.write('data_resource' + separator + 'Ensembl' + separator + r.readline())
39 | 
40 |         for line in r:
41 |             columns = line.split(separator)
42 |             parsed_column = columns[compare_column_index]
43 | 
44 |             if '|' in parsed_column:
45 |                 parsed_column_split = parsed_column.split('|')
46 |                 if len(parsed_column_split) > 2:
47 |                     parsed_column = parsed_column_split[2].replace('Ensembl:', '')
48 | 
49 |             #if parsed_column in keep_set: # keep all instead of filtering to brain
50 |             w.write('NCBI Gene' + separator + parsed_column + separator + line)
51 |             count_rows += 1
52 | 
53 |     print(count_rows)
54 | 
55 | 
56 | brain_file = './Homo_sapiens_expr_advanced.tsv' # https://bgee.org/?page=download&action=expr_calls#id1 Homo_sapiens_expr_advanced_development
57 | gene_file = '../Homo_sapiens.gene_info' # https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz
58 | gene_dest_file = './Homo_sapiens.gene_info_filtered'
59 | 
60 | final_out = './output.tsv'
61 | 
62 | delimiter = '\t'
63 | keep_index = [1,2,4,5,6,8,9,11]
64 | compare_index = 0
65 | 
66 | processLargeTextFile(brain_file, compare_index, delimiter)
67 | print(len(my_set))
68 | 
69 | filterLargeTextFile(gene_file, gene_dest_file, delimiter, keep_index)
70 | fileIndexFinder(gene_dest_file, final_out, my_set, 3, delimiter)
--------------------------------------------------------------------------------
/scripts/dorothea.R:
--------------------------------------------------------------------------------
1 | library(dorothea)
2 | library(decoupleR)
3 | library(ggplot2)
4 | library(dplyr)
5 | 
6 | net <- decoupleR::get_dorothea(levels = c('A', 'B', 'C', 'D'))
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | __all__ = [
4 |     'VERSION'
5 | ]
6 | 
7 | import setuptools
8 | 
9 | MAJOR = 0
10 | MINOR = 1
11 | MICRO = 0
12 | TEST_VER = 'a'
13 | ISRELEASED = True
14 | VERSION = '%d.%d.%d%s' % (MAJOR, MINOR, MICRO, TEST_VER)
15 | 
16 | setuptools.setup(
17 |     name="AlzKB",
18 |     version=VERSION,
19 |     author="Joseph D. Romano, Van Truong, Yun Hao, Li Shen, and Jason H. Moore",
20 |     description="A graph knowledge base for Alzheimer's disease",
21 |     url="https://github.com/EpistasisLab/AlzKB.git",
22 |     packages=setuptools.find_packages(),
23 |     python_requires=">=3.7",
24 |     include_package_data=True,
25 |     install_requires=[
26 |         'ista @ git+https://github.com/JDRomano2/ista@c036c1074e0b59df704a0aeb097862108b012b45'
27 |     ],
28 |     entry_points={
29 |         'console_scripts': [
30 |             'alzkb=alzkb.build:main'
31 |         ]
32 |     }
33 | )
34 | 
--------------------------------------------------------------------------------
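A minimal downstream sanity check on the combined CSV written by alzkb/rdf_to_memgraph_csv.py above (an illustrative sketch, not a file in this repository; it assumes that only relationship rows carry a '_type' value, since node rows come from extract_node_details and receive '_type' only as NaN padding during the final concat):

import pandas as pd

df = pd.read_csv('./data/alzkb_v2-populated.csv', low_memory=False)
rel_rows = df['_type'].notna()                   # relationship rows carry a '_type' label
print('relationship rows:', rel_rows.sum())
print('node rows:', (~rel_rows).sum())
print(df.loc[rel_rows, '_type'].value_counts())  # row counts per relationship type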