├── .gitignore
├── BUILD.org
├── README.md
├── alzkb
│   ├── __init__.py
│   ├── build.py
│   ├── data
│   │   ├── alzkb.rdf
│   │   └── alzkb_v2.rdf
│   ├── populate_edge_weights.py
│   ├── populate_ontology.py
│   └── rdf_to_memgraph_csv.py
├── img
│   └── build-abstract.png
├── scripts
│   ├── alzkb_parse_disgenet.py
│   ├── alzkb_parse_dorothea.py
│   ├── alzkb_parse_drugbank.py
│   ├── alzkb_parse_ncbigene.py
│   └── dorothea.R
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/python,macos,emacs,linux
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,emacs,linux
3 |
4 | ### Emacs ###
5 | # -*- mode: gitignore; -*-
6 | *~
7 | \#*\#
8 | /.emacs.desktop
9 | /.emacs.desktop.lock
10 | *.elc
11 | auto-save-list
12 | tramp
13 | .\#*
14 |
15 | # Org-mode
16 | .org-id-locations
17 | *_archive
18 |
19 | # flymake-mode
20 | *_flymake.*
21 |
22 | # eshell files
23 | /eshell/history
24 | /eshell/lastdir
25 |
26 | # elpa packages
27 | /elpa/
28 |
29 | # reftex files
30 | *.rel
31 |
32 | # AUCTeX auto folder
33 | /auto/
34 |
35 | # cask packages
36 | .cask/
37 | dist/
38 |
39 | # Flycheck
40 | flycheck_*.el
41 |
42 | # server auth directory
43 | /server/
44 |
45 | # projectiles files
46 | .projectile
47 |
48 | # directory configuration
49 | .dir-locals.el
50 |
51 | # network security
52 | /network-security.data
53 |
54 |
55 | ### Linux ###
56 |
57 | # temporary files which can be created if a process still has a handle open of a deleted file
58 | .fuse_hidden*
59 |
60 | # KDE directory preferences
61 | .directory
62 |
63 | # Linux trash folder which might appear on any partition or disk
64 | .Trash-*
65 |
66 | # .nfs files are created when an open file is removed but is still being accessed
67 | .nfs*
68 |
69 | ### macOS ###
70 | # General
71 | .DS_Store
72 | .AppleDouble
73 | .LSOverride
74 |
75 | # Icon must end with two \r
76 | Icon
77 |
78 |
79 | # Thumbnails
80 | ._*
81 |
82 | # Files that might appear in the root of a volume
83 | .DocumentRevisions-V100
84 | .fseventsd
85 | .Spotlight-V100
86 | .TemporaryItems
87 | .Trashes
88 | .VolumeIcon.icns
89 | .com.apple.timemachine.donotpresent
90 |
91 | # Directories potentially created on remote AFP share
92 | .AppleDB
93 | .AppleDesktop
94 | Network Trash Folder
95 | Temporary Items
96 | .apdisk
97 |
98 | ### Python ###
99 | # Byte-compiled / optimized / DLL files
100 | __pycache__/
101 | *.py[cod]
102 | *$py.class
103 |
104 | # C extensions
105 | *.so
106 |
107 | # Distribution / packaging
108 | .Python
109 | build/
110 | develop-eggs/
111 | downloads/
112 | eggs/
113 | .eggs/
114 | lib/
115 | lib64/
116 | parts/
117 | sdist/
118 | var/
119 | wheels/
120 | share/python-wheels/
121 | *.egg-info/
122 | .installed.cfg
123 | *.egg
124 | MANIFEST
125 |
126 | # PyInstaller
127 | # Usually these files are written by a python script from a template
128 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
129 | *.manifest
130 | *.spec
131 |
132 | # Installer logs
133 | pip-log.txt
134 | pip-delete-this-directory.txt
135 |
136 | # Unit test / coverage reports
137 | htmlcov/
138 | .tox/
139 | .nox/
140 | .coverage
141 | .coverage.*
142 | .cache
143 | nosetests.xml
144 | coverage.xml
145 | *.cover
146 | *.py,cover
147 | .hypothesis/
148 | .pytest_cache/
149 | cover/
150 |
151 | # Translations
152 | *.mo
153 | *.pot
154 |
155 | # Django stuff:
156 | *.log
157 | local_settings.py
158 | db.sqlite3
159 | db.sqlite3-journal
160 |
161 | # Flask stuff:
162 | instance/
163 | .webassets-cache
164 |
165 | # Scrapy stuff:
166 | .scrapy
167 |
168 | # Sphinx documentation
169 | docs/_build/
170 |
171 | # PyBuilder
172 | .pybuilder/
173 | target/
174 |
175 | # Jupyter Notebook
176 | .ipynb_checkpoints
177 |
178 | # IPython
179 | profile_default/
180 | ipython_config.py
181 |
182 | # pyenv
183 | # For a library or package, you might want to ignore these files since the code is
184 | # intended to run in multiple environments; otherwise, check them in:
185 | # .python-version
186 |
187 | # pipenv
188 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
189 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
190 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
191 | # install all needed dependencies.
192 | #Pipfile.lock
193 |
194 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
195 | __pypackages__/
196 |
197 | # Celery stuff
198 | celerybeat-schedule
199 | celerybeat.pid
200 |
201 | # SageMath parsed files
202 | *.sage.py
203 |
204 | # Environments
205 | .env
206 | .venv
207 | env/
208 | venv/
209 | ENV/
210 | env.bak/
211 | venv.bak/
212 |
213 | # Spyder project settings
214 | .spyderproject
215 | .spyproject
216 |
217 | # Rope project settings
218 | .ropeproject
219 |
220 | # mkdocs documentation
221 | /site
222 |
223 | # mypy
224 | .mypy_cache/
225 | .dmypy.json
226 | dmypy.json
227 |
228 | # Pyre type checker
229 | .pyre/
230 |
231 | # pytype static type analyzer
232 | .pytype/
233 |
234 | # Cython debug symbols
235 | cython_debug/
236 |
237 | # End of https://www.toptal.com/developers/gitignore/api/python,macos,emacs,linux
238 |
239 | # Ignore files from the alzkb-site
240 | node_modules
241 | default.conf
242 | *.dump
243 | *.key
244 | *.pem
245 | *.crt
246 |
--------------------------------------------------------------------------------
/BUILD.org:
--------------------------------------------------------------------------------
1 | #+TITLE: Building AlzKB (from scratch)
2 | #+AUTHOR: Joseph D. Romano
3 | #+EMAIL: joseph.romano@pennmedicine.upenn.edu
4 | #+LANGUAGE: en
5 | #+OPTIONS: toc:nil author
6 |
7 | * Overview
8 | This guide will teach you the complete process of building the
9 | Alzheimer's Knowledge Base (AlzKB). It's not a concise process, but it
10 | is extensible to other applications of knowledge engineering. We use
11 | the same process for our other knowledge bases (such as [[https://comptox.ai][ComptoxAI]]), so
12 | this guide can also be used to teach you how to build your own.
13 |
14 | The following diagram gives an overview of the build process:
15 |
16 | #+CAPTION: Summary of how to build AlzKB
17 | [[./img/build-abstract.png]]
18 |
19 | 1. First, you use domain knowledge to create the ontology
20 | 2. Then, you collect the data sources and use them to populate the
21 | ontology
22 | 3. Finally, you convert the ontology into a graph database
23 |
24 | * 1.: Creating the AlzKB Ontology
25 | _Important note_: Most users don't need to follow these steps, since
26 | it is already done! Unless you want to extend AlzKB or make major
27 | modifications to its node/edge types, you should skip to the [[Obtaining the third-party data sources][next
28 | section]]. If you DO want to do those things, then keep reading.
29 |
30 | AlzKB uses an OWL 2 ontology to act something like a 'template' for
31 | the nodes and relationships in the final knowledge graph. While the
32 | actual nodes and relationships are added automatically according to
33 | the 'rules' defined in the ontology, the ontology itself is
34 | constructed manually, using domain knowledge about AD. We do this
35 | using the Protégé ontology editor. If you don't already have it,
36 | download and install [[https://protege.stanford.edu/software.php][Protégé Desktop]] on your computer.
37 |
38 | * 2.: Obtaining the third-party data sources
39 | The next step is to collect the source data files that will eventually
40 | become the nodes, relationships, and properties in AlzKB's knowledge
41 | graph. Since databases are distributed in a variety of formats and
42 | modalities, you will have to work with a mix of plain-text "flat"
43 | files as well as relational (SQL) databases. All of the SQL databases
44 | parsed to build AlzKB are distributed for MySQL (as opposed to some
45 | other flavor of SQL).
46 |
47 | ** Flat file data sources
48 |
49 | |-----------+----------------+-----------------------------------+---------------------------+--------------------|
50 | | Source | Directory name | Entity type(s) | URL | Extra instructions |
51 | |-----------+----------------+-----------------------------------+---------------------------+--------------------|
52 | | Hetionet | =hetionet= | Many - see =populate-ontology.py= | [[https://github.com/hetio/hetionet/tree/master/hetnet/tsv][GitHub]] | [[https://het.io][Hetionet]] |
53 | | NCBI Gene | =ncbigene= | Genes | [[https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz][Homo_sapiens.gene_info.gz]] | [[https://www.ncbi.nlm.nih.gov/gene/][NCBI Gene]] |
54 | | Drugbank | =drugbank= | Drugs / drug candidates | [[https://go.drugbank.com/releases/latest#open-data][DrugBank website]] | [[https://go.drugbank.com][Drugbank]] |
55 | | DisGeNET | =disgenet= | Diseases and disease-gene edges | [[https://www.disgenet.org/][DisGeNET]] | [[https://disgenet.com][DisGeNET]] |
57 |
58 | *** Hetionet
59 | Download the =hetionet-v1.0-edges.sif.gz= (extract it using =gunzip=)
60 | and =hetionet-v1.0-nodes.tsv= files from the Hetionet Github
61 | repository. Both of them are, essentially, TSV files, even though one
62 | has the =.sif= extension.
63 |
64 | Hetionet is, itself, a knowledge base, and contains many of the core
65 | biological entities used in AlzKB. Accordingly, it contains data
66 | derived from many other third-party sources.
67 |
68 | *** NCBI Gene
69 | Download the =Homo_sapiens.gene_info.gz= file from the NCBI FTP page
70 | and extract it (e.g., using =gunzip=).
71 |
72 | Create a =CUSTOM= subdirectory inside the =ncbigene= directory. Inside
73 | of that subdirectory, place the following two files:
74 | - [[https://github.com/EpistasisLab/AlzKB/blob/a9db2602e3e7960ec09749b99944fbf675323497/scripts/alzkb_parse_ncbigene.py][alzkb_parse_ncbigene.py]]
75 | - [[https://bgee.org/ftp/bgee_v15_0/download/calls/expr_calls/Homo_sapiens_expr_advanced.tsv.gz][Homo_sapiens_expr_advanced.tsv]] (from the Bgee database)
76 | Then, run =alzkb_parse_ncbigene.py= (no external Python packages
77 | should be needed). You'll notice that it creates two output files
78 | that are used while populating the ontology.
79 |
80 | *** Drugbank
81 | In order to download the Academic DrugBank datasets, you first need to create a free DrugBank account and verify your email address. After verification, DrugBank may request more information about your account, such as a description of how you plan to use DrugBank, a description of your organization, who is sponsoring the research, and what its end goal is. In our experience, account approval can take anywhere from several business days to a few weeks.
82 |
83 | After your access has been approved, navigate to the Academic Download page on the Drugbank website (linked
84 | above) by selecting the "Download" tab and "Academic Download". Select the "External Links" tab. In the table titled "External
85 | Drug Links", click the "Download" button on the row labeled
86 | "All". This will download a zip file. Extract the contents of that zip
87 | file, and make sure it is named =drug_links.csv= (some versions use a
88 | space instead of an underscore in the filename).
89 |
90 | *** DisGeNET
91 | Although DisGeNET is available under a Creative Commons license, the
92 | database requires users to create a free account to download the
93 | tab-delimited data files. Therefore, you should create a user account
94 | and log in. Then, navigate to the Downloads page on the DisGeNET
95 | website. Now, download the two necessary files by clicking on the
96 | corresponding links:
97 | - "UMLS CUI to several disease vocabularies" (under the "UMLS CUI to
98 | several disease vocabularies" section heading - the resulting file
99 | name will be =disease_mappings.tsv.gz=)
100 | - "UMLS CUI to top disease classes" (the resulting file will be named
101 | =disease_mappings_to_attributes.tar.gz=)
102 | Next, download =curated_gene_disease_associations.tsv.gz= directly by
103 | copying the following URL into your web browser:
104 | https://www.disgenet.org/static/disgenet_ap1/files/downloads/curated_gene_disease_associations.tsv.gz
105 |
106 | All three files are gzipped, so extract them into the =disgenet/=
107 | directory using your favorite method (e.g., gunzip from the command
108 | line, 7zip from within Windows, etc.).
109 |
110 | Now that you have the three necessary data files, you should run the
111 | AlzKB script we wrote to filter for rows in those files corresponding
112 | to Alzheimer's Disease, named =alzkb_parse_disgenet.py=. This script
113 | is in the =scripts/= directory of the AlzKB repository, so either find
114 | it on your local filesystem if you already have a copy of the
115 | repository, or find it on the AlzKB GitHub repository in your web
116 | browser.
117 |
118 | You can then run the Python script from within the =disgenet/=
119 | directory, which should deposit two filtered data files in the
120 | =disgenet/CUSTOM/= subdirectory. These will be automatically detected
121 | and used when you run the ontology population script, along with the
122 | unmodified =curated_gene_disease_associations.tsv= file.
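
As a rough illustration of what the filtering step does (a minimal sketch, not the actual script; the =name= column is an assumption based on the DisGeNET file format):
#+begin_src python
import pandas as pd

# Sketch only: keep rows whose disease name mentions Alzheimer.
# alzkb_parse_disgenet.py is the real script and also handles the
# attributes file; the output filename here is illustrative.
mappings = pd.read_csv("disease_mappings.tsv", sep="\t")
alz = mappings[mappings["name"].str.contains("Alzheimer", case=False, na=False)]
alz.to_csv("CUSTOM/disease_mappings_alzheimer.tsv", sep="\t", index=False)
#+end_src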
123 |
124 | If you haven't already, create a directory to hold all of the raw data files. It can be =D:\data\= or anything else you prefer. Within it, there is one folder for each third-party database, and in those folders you'll put the individual CSV/TSV/TXT files, as in the example layout below.
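
A sketch of the resulting layout, using the flat-file sources described above (extracted filenames may vary slightly by release):
#+begin_example
D:\data\
├── disgenet\
│   ├── disease_mappings.tsv
│   ├── disease_mappings_to_attributes.tsv
│   ├── curated_gene_disease_associations.tsv
│   └── CUSTOM\
├── drugbank\
│   └── drug_links.csv
├── hetionet\
│   ├── hetionet-v1.0-nodes.tsv
│   └── hetionet-v1.0-edges.sif
└── ncbigene\
    ├── Homo_sapiens.gene_info
    └── CUSTOM\
#+end_example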
125 |
126 | ** SQL data sources
127 | If you don't already have MySQL installed, install it. We recommend
128 | using either a package manager (if one is available on your OS), or
129 | installing MySQL Community Server from the mysql.com website (e.g., by
130 | visiting https://dev.mysql.com/downloads/mysql/). Make sure it's
131 | running and you have the ability to create and modify new databases.
132 |
133 | *** AOP-DB
134 | The Adverse Outcome Pathway Database (AOP-DB) is the only MySQL
135 | database you need to install to build the current version of AlzKB. It
136 | can be downloaded at: https://gaftp.epa.gov/EPADataCommons/ORD/AOP-DB/
137 |
138 | *WARNING:* This is a big download (7.2G while compressed)! Make sure
139 | you have enough disk space before proceeding.
140 |
141 | You'll have to extract two archives - first, unzip the =AOP-DB_v2.zip=
142 | archive, which should contain two *.tar.gz archives and another .zip
143 | archive. Now, extract the *.tar.gz archive containing =nogi= in its
144 | name (the smaller of the two). Windows doesn't natively support
145 | extracting .tar.gz archives, so you'll either have to download another
146 | program that does this (e.g., 7-zip) or extract it in a Unix-based
147 | environment (Linux, MacOS, Windows Subsystem for Linux, Cygwin, etc.)
148 | that has the =tar= program available on the command line. Once you've
149 | extracted it, you should have a file named something like
150 | =aopdb_no-orthoscores.sql=.
151 |
152 | Now, create an empty database in MySQL, and name it =aopdb=. Make sure
153 | you have full admin privileges on the database. Then, load the (newly
154 | extracted) =.sql= file into the empty database. I always find this
155 | easiest from the command line, by running a command such as:
156 | #+begin_src bash
157 | $ mysql -u username -p database_name < aopdb_no-orthoscores.sql
158 | #+end_src
159 | Substitute your username after the =-u= option and enter your password
160 | when prompted. If you prefer to import it from a GUI, you can use a
161 | tool like MySQL Workbench or DataGrip.
162 |
163 | *WARNING:* It can take a while to import, so be ready to take a break
164 | or do something else while you wait.
165 |
166 | * 2.5: Populating the ontology
167 | Now that we have an ontology (currently 'unpopulated', consisting of a
168 | class hierarchy, object property types, data property types, and
169 | possibly annotations), we can populate it with records from the
170 | third-party databases we collected in the previous step. Fortunately,
171 | this is a largely automated process, facilitated by a tool we call
172 | =ista= (/ista/ is the Sindarin word for /knowledge/). With =ista=, you
173 | write a Python script that first tells =ista= where to find the
174 | third-party data sources, and then maps each of those data sources to
175 | one or two node or edge types defined in the ontology (as classes or
176 | object properties, respectively). Here, we'll walk through the
177 | different parts of AlzKB's =ista= build script and discuss what each
178 | component does. If you are reading this guide to modify or extend
179 | AlzKB, you should be able to use the information in the following few
180 | sections to write your own build script.
181 |
182 | For reference, an up-to-date, complete copy of this build file can be
183 | found in the [[https://github.com/EpistasisLab/AlzKB][AlzKB source repository]] at the location
184 | =alzkb/populate_ontology.py=.
185 |
186 | ** Installing ista
187 | - Keep MySQL Server running
188 | - Install mysqlclient via Anaconda-Navigator
189 | - Clone the ista repository onto your computer (=git clone https://github.com/RomanoLab/ista=)
190 | - =cd ista=
191 | - =pip install .=
192 |
193 | ** Build file top-matter
194 | At the top of the file, we do some imports of necessary Python
195 | packages. First comes =ista=. We don't import the whole package, just
196 | the classes and functions that we actually interact with.
197 | #+begin_src python
198 | from ista import FlatFileDatabaseParser, MySQLDatabaseParser
199 | from ista.util import print_onto_stats
200 | #+end_src
201 | In order to interact with OWL 2 ontology files, we bring in the
202 | =owlready2= library.
203 | #+begin_src python
204 | import owlready2
205 | #+end_src
206 | We put private data for our local MySQL databases (hostname, username,
207 | and password) in a file named =secrets.py=, and then make sure the
208 | file is added to our =.gitignore= file so it isn't checked into
209 | version control. You'll have to create that file yourself, and define
210 | the variables =MYSQL_HOSTNAME=, =MYSQL_USERNAME=, and
211 | =MYSQL_PASSWORD=. Then, in the build script, you'll import the file
212 | containing those variables and wrap them into a configuration dict.
213 | #+begin_src python
214 | import secrets
215 |
216 | mysql_config = {
217 | 'host': secrets.MYSQL_HOSTNAME,
218 | 'user': secrets.MYSQL_USERNAME,
219 | 'passwd': secrets.MYSQL_PASSWORD
220 | }
221 | #+end_src
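
For reference, a minimal =secrets.py= might look like the following (the values are placeholders; substitute your own MySQL credentials):
#+begin_src python
# secrets.py -- keep this file out of version control (list it in .gitignore)
MYSQL_HOSTNAME = "localhost"
MYSQL_USERNAME = "alzkb"
MYSQL_PASSWORD = "change-me"
#+end_src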
222 | ** Telling =ista= where to find your data sources
223 | Since we are populating an ontology, we need to load the ontology into
224 | =owlready2=. Make sure to modify this path to fit the location of the
225 | AlzKB ontology file on your system! Future versions of AlzKB will
226 | source the path dynamically. Also note the =file://= prefix, which
227 | tells =owlready2= to look on the local file system rather than load a
228 | web URL. Since this guide was made on a Windows desktop, you'll notice
229 | that we have to use escaped backslashes to specify file paths that the
230 | Python interpreter will parse correctly.
231 | #+begin_src python
232 | onto = owlready2.get_ontology("file://D:\\projects\\ista\\tests\\projects\\alzkb\\alzkb.rdf").load()
233 | #+end_src
234 | We also set the 'base' directory for all of the flat files that =ista=
235 | will be loading. You will have determined this location already (see
236 | [[Obtaining the third-party data sources]]).
237 | #+begin_src python
238 | data_dir = "D:\\data\\"
239 | #+end_src
240 | Now, we can actually register the source databases with =ista='s
241 | parser classes. We use =FlatFileDatabaseParser= for data sources
242 | stored as one or more delimited flat files, and =MySQLDatabaseParser=
243 | for data sources in a MySQL database. For flat file-based sources, the
244 | first argument given to the parser's constructor MUST be the
245 | subdirectory (within =data_dir=) where that source's data files are
246 | contained, and for MySQL sources it MUST be the name of the MySQL
247 | database. If not, =ista= won't know where to find the files. The
248 | second argument is always the ontology object loaded using
249 | =owlready2=, and the third is either the base data directory or the
250 | MySQL config dictionary, both of which were defined above.
251 | #+begin_src python
252 | epa = FlatFileDatabaseParser("epa", onto, data_dir)
253 | ncbigene = FlatFileDatabaseParser("ncbigene", onto, data_dir)
254 | drugbank = FlatFileDatabaseParser("drugbank", onto, data_dir)
255 | hetionet = FlatFileDatabaseParser("hetionet", onto, data_dir)
256 | aopdb = MySQLDatabaseParser("aopdb", onto, mysql_config)
257 | aopwiki = FlatFileDatabaseParser("aopwiki", onto, data_dir)
258 | tox21 = FlatFileDatabaseParser("tox21", onto, data_dir)
259 | disgenet = FlatFileDatabaseParser("disgenet", onto, data_dir)
260 | #+end_src
261 | In the following two sections, we'll go over a few examples of how to
262 | define mappings using these parser objects. We won't replicate every
263 | mapping in this guide for brevity, but you can see all of them in the
264 | full AlzKB build script.
265 | *** Configuration for 'flat file' (e.g., CSV) data sources
266 | #+begin_src python
267 | hetionet.parse_node_type(
268 | node_type="Symptom",
269 | source_filename="hetionet-v1.0-nodes.tsv",
270 | fmt="tsv",
271 | parse_config={
272 | "iri_column_name": "name",
273 | "headers": True,
274 | "filter_column": "kind",
275 | "filter_value": "Symptom",
276 | "data_transforms": {
277 | "id": lambda x: x.split("::")[-1]
278 | },
279 | "data_property_map": {
280 | "id": onto.xrefMeSH,
281 | "name": onto.commonName
282 | }
283 | },
284 | merge=False,
285 | skip=False
286 | )
287 | #+end_src
288 | This block indicates that the third-party database is Hetionet and the source file is =hetionet-v1.0-nodes.tsv=.
289 |
290 | So the file =ista= will look for is =D:\data\hetionet\hetionet-v1.0-nodes.tsv=.
291 |
292 | Some configuration blocks have a =CUSTOM/= prefix on the filename. This means the file was created by us manually and must be stored in a =CUSTOM= subdirectory of that database's folder. For example:
293 | #+begin_src python
294 | disgenet.parse_node_type(
295 | node_type="Disease",
296 | source_filename="CUSTOM/disease_mappings_to_attributes_alzheimer.tsv", # Filtered for just Alzheimer disease
297 | fmt="tsv-pandas",
298 | parse_config={
299 | "iri_column_name": "diseaseId",
300 | "headers": True,
301 | "data_property_map": {
302 | "diseaseId": onto.xrefUmlsCUI,
303 | "name": onto.commonName,
304 | }
305 | },
306 | merge=False,
307 | skip=False
308 | )
309 | #+end_src
310 | This file will be =D:\data\disgenet\CUSTOM\disease_mappings_to_attributes_alzheimer.tsv=.
311 |
312 | *** Configuration for SQL server data sources
313 | #+begin_src python
314 | aopdb.parse_node_type(
315 | node_type="Drug",
316 | source_table="chemical_info",
317 | parse_config={
318 | "iri_column_name": "DTX_id",
319 | "data_property_map": {"ChemicalID": onto.xrefMeSH},
320 | "merge_column": {
321 | "source_column_name": "DTX_id",
322 | "data_property": onto.xrefDTXSID
323 | }
324 | },
325 | merge=True,
326 | skip=False
327 | )
328 | #+end_src
329 | This block indicates that the third-party database is AOP-DB and the source table is =chemical_info=. Because =merge= is set to =True=, rows are merged into existing nodes (matched via the =merge_column= configuration) rather than inserted as duplicates.
330 |
331 |
332 | ** Mapping data sources to ontology components
333 | Every flat file or SQL table from a third-party data source can be
334 | mapped to a single node or relationship type. For example, a file
335 | describing diseases can be mapped to the =Disease= node type, where
336 | each line in the file corresponds to a disease to be inserted (or
337 | 'merged'---see the AOP-DB example above) into the knowledge graph. If the source is being
338 | mapped to a node type (rather than a relationship type), =ista=
339 | additionally can populate one or more /node properties/ from the
340 | feature columns in the source file.
341 |
342 | Each mapping is defined using a method call in the =ista= Python
343 | script.
344 |
345 | ** Running =ista=
346 | Now that you have set the locations of the data resources and the ontology and defined the mapping methods, run =populate_ontology.py=.
347 |
348 | The =alzkb-populated.rdf= file is the output of this step and will be used to set up the Neo4j graph database.
349 |
350 | * 3.: Converting the ontology into a Neo4j graph database
351 |
352 | ** Installing Neo4j
353 | If you haven't done so already, download Neo4j from the [[https://neo4j.com/download-center/][Neo4j Download
354 | Center]]. Most users should select Neo4j Desktop, but advanced users can
355 | instead opt for Community Server (the instructions for which are well
356 | outside of the scope of this guide).
357 | ** Configuring an empty graph database for AlzKB
358 | You should now create a new graph database that will be populated with
359 | the contents of AlzKB. In Neo4j Desktop, this can be done as follows:
360 | - Create a new project by clicking the "New" button in the upper left,
361 | then selecting "Create project".
362 | - In the project panel (on the right of the screen), you will see the
363 | default name "Project" populates automatically. Hover over this
364 | name and click the edit icon, then change the name to =AlzKB=.
365 | - To the right of the project name, click "Add", and select "Local
366 | DBMS". Change the Name to =AlzKB DBMS=, specify a password that you will
367 | remember, and use the Version dropdown to select "4.4.0" (if it is
368 | not already selected). Click "Create". Wait for the operation to
369 | finish.
370 | - Install plugins:
371 | - Click the name of the DBMS ("AlzKB DBMS", if you have followed the
372 | guide), and in the new panel to the right click the "Plugins" tab.
373 | - Expand the "APOC" option, click "Install", and wait for the
374 | operation to complete.
375 | - Do the same for the "Graph Data Science Library" and "Neosemantics
376 | (n10s)" plugins.
377 | - Before starting the DBMS, click the ellipsis immediately to the
378 | right of the "Open" button, and then click "Settings...". Make the
379 | following changes to the configuration file:
380 | - Set =dbms.memory.heap.initial_size= to =2048m=.
381 | - Set =dbms.memory.heap.max_size= to =4G=.
382 | - Set =dbms.memory.pagecache.size= to =2048m=.
383 | - Uncomment the line containing
384 | =dbms.security.procedures.allowlist=apoc.coll.*,apoc.load.*,gds.*=
385 | to activate it.
386 | - Add =n10s.*,apoc.cypher.*,apoc.help= to the =dbms.security.procedures.allowlist= value (a consolidated snippet of these settings is shown after this list).
387 | - Click the "Apply" button, then "Close".
388 | - Click "Start" to start the graph database.
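
After these edits, the relevant lines of the DBMS settings file should look roughly like this (other settings keep their defaults):
#+begin_src conf
dbms.memory.heap.initial_size=2048m
dbms.memory.heap.max_size=4G
dbms.memory.pagecache.size=2048m
dbms.security.procedures.allowlist=apoc.coll.*,apoc.load.*,gds.*,n10s.*,apoc.cypher.*,apoc.help
#+end_src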
389 | ** Importing the =ista= RDF output into Neo4j
390 | - Open Neo4j Browser and run the following Cypher queries to import the RDF data
391 | #+begin_src cypher
392 | // Clean out any existing nodes
393 | MATCH (n) DETACH DELETE n
394 | #+end_src
395 |
396 | #+begin_src cypher
397 | // Create a uniqueness constraint on resource URIs
398 | CREATE CONSTRAINT n10s_unique_uri FOR (r:Resource) REQUIRE r.uri IS UNIQUE
399 | #+end_src
400 |
401 | #+begin_src cypher
402 | // Create a graph configuration
403 | CALL n10s.graphconfig.init()
404 | CALL n10s.graphconfig.set({applyNeo4jNaming: true, handleVocabUris: 'IGNORE'})
405 | #+end_src
406 |
407 | #+begin_src cypher
408 | // Import the RDF file
409 | CALL n10s.rdf.import.fetch( "file://D:\\data\\alzkb-populated.rdf", "RDF/XML")
410 | #+end_src
411 |
412 | - Run the Cypher queries below to remove the leftover OWL bookkeeping labels
413 | #+begin_src cypher
414 | MATCH (n:Resource) REMOVE n:Resource;
415 | MATCH (n:NamedIndividual) REMOVE n:NamedIndividual;
416 | MATCH (n:AllDisjointClasses) REMOVE n:AllDisjointClasses;
417 | MATCH (n:AllDisjointProperties) REMOVE n:AllDisjointProperties;
418 | MATCH (n:DatatypeProperty) REMOVE n:DatatypeProperty;
419 | MATCH (n:FunctionalProperty) REMOVE n:FunctionalProperty;
420 | MATCH (n:ObjectProperty) REMOVE n:ObjectProperty;
421 | MATCH (n:AnnotationProperty) REMOVE n:AnnotationProperty;
422 | MATCH (n:SymmetricProperty) REMOVE n:SymmetricProperty;
423 | MATCH (n:_GraphConfig) REMOVE n:_GraphConfig;
424 | MATCH (n:Ontology) REMOVE n:Ontology;
425 | MATCH (n:Restriction) REMOVE n:Restriction;
426 | MATCH (n:Class) REMOVE n:Class;
427 | MATCH (n) WHERE size(labels(n)) = 0 DETACH DELETE n; // Removes nodes without labels
428 | #+end_src
429 |
430 | You have now built AlzKB from scratch. You can count the nodes and relationships of each type with:
431 | #+begin_src cypher
432 | CALL db.labels() YIELD label
433 | CALL apoc.cypher.run('MATCH (:`'+label+'`) RETURN count(*) as count',{}) YIELD value
434 | RETURN label, value.count ORDER BY label
435 | #+end_src
436 | #+begin_src cypher
437 | CALL db.relationshipTypes() YIELD relationshipType as type
438 | CALL apoc.cypher.run('MATCH ()-[:`'+type+'`]->() RETURN count(*) as count',{}) YIELD value
439 | RETURN type, value.count ORDER BY type
440 | #+end_src
441 |
442 | * 4.: Adding new data resources, nodes, relationships, and properties
443 |
444 | In version 2.0, we added "TranscriptionFactor" nodes and "TRANSCRIPTIONFACTORINTERACTSWITHGENE" relationships; the node properties "chromosome" (chromosome number) and "sourcedatabase"; and the relationship properties "correlation", "score", "p_fisher", "z_score", "affinity_nm", "confidence", "sourcedatabase", and "unbiased".
445 |
446 | To achieve this, we added the above entities to the ontology RDF, which is now named =alzkb_v2.rdf= and located in the =alzkb/data= directory. Then, collect the additional source data files detailed in the table below.
447 | | Source | Directory name | Entity type(s) | URL | Extra instructions |
448 | |-----------|----------------|---------------------------------------------|-----------------------|--------------------|
449 | | TRRUST    | =dorothea=     | Transcription factors (TF) and TF-gene edges | [[https://www.grnpedia.org/trrust/downloadnetwork.php][TRRUST Download]]       | [[https://www.grnpedia.org/trrust/][TRRUST]]             |
450 | | DoRothEA  | =dorothea=     | Transcription factors (TF) and TF-gene edges | [[https://saezlab.github.io/dorothea/][DoRothEA Installation]] | [[https://bioconductor.org/packages/release/data/experiment/vignettes/dorothea/inst/doc/dorothea.R][DoRothEA RScript]]   |
451 |
452 | ** Prepare Source Data
453 | Download =trrust_rawdata.human.tsv= from the TRRUST Download page. Install DoRothEA within R by following the DoRothEA Installation instructions. Place =trrust_rawdata.human.tsv= and =alzkb_parse_dorothea.py= inside the =dorothea/= subdirectory of your raw data directory (e.g., =D:\data=). Run =alzkb_parse_dorothea.py=. You'll notice that it creates a =tf.tsv= file that is used while populating the ontology.
454 |
455 | ** Replicate Hetionet Resources
456 | Since Hetionet does not have an ongoing update plan, we replicated its resources using the Rephetio paper and source code to ensure AlzKB has current data. Follow the steps in the [[https://github.com/EpistasisLab/AlzKB-updates][AlzKB-updates]] GitHub repository to create =hetionet-custom-nodes.tsv= and =hetionet-custom-edges.tsv=. Place these files in the =hetionet/= subdirectory.
457 |
458 | ** Process Data Files
459 | Place the updated =alzkb_parse_ncbigene.py=, =alzkb_parse_drugbank.py=, and =alzkb_parse_disgenet.py= from the =scripts/= directory into their respective raw data subdirectories. Run each script to process the data for the next step.
460 |
461 | ** Populate Ontology
462 | Now that we have the updated ontology and updated data files, run the updated =alzkb/populate_ontology.py= to populate the records. It creates an =alzkb_v2-populated.rdf= file that will be used in the next step.
463 |
464 | * 5.: Converting the ontology into a Memgraph graph database
465 | ** Installing Memgraph
466 | If you haven't done so already, download Memgraph from the [[https://memgraph.com/docs/getting-started/install-memgraph][Install Memgraph]] page. Most users install Memgraph using a pre-prepared =docker-compose.yml= file by executing:
467 | - for Linux and macOS:
468 | =curl https://install.memgraph.com | sh=
469 | - for Windows:
470 | =iwr https://windows.memgraph.com | iex=
471 |
472 | More details are in [[https://memgraph.com/docs/getting-started/install-memgraph/docker][Install Memgraph with Docker]]
473 |
474 | ** Generating the CSV File
475 | Before uploading the file to Memgraph, run =alzkb/rdf_to_memgraph_csv.py= on the =alzkb_v2-populated.rdf= file to generate =alzkb-populated.csv=.
476 | Then, if you want to add edge properties to the knowledge graph, run =populate_edge_weights.py= to create the =alzkb_with_edge_properties.csv= file.
477 |
478 | ** Starting Memgraph with Docker
479 | Follow Step 1 ("Starting Memgraph with Docker") of the instructions at [[https://memgraph.com/docs/data-migration/migrate-from-neo4j#importing-data-into-memgraph][importing-data-into-memgraph]] to upload the =alzkb-populated.csv= or =alzkb_with_edge_properties.csv= file to the container.
480 |
481 | Open Memgraph Lab, which is available at =http://localhost:3000=. Click =Query Execution= in the menu on the left bar. Then, you can type a Cypher query into the =Cypher Editor=.
482 |
483 | ** Gaining speed with indexes and analytical storage mode
484 | - To create indexes, run the following Cypher queries:
485 | #+begin_src cypher
486 | CREATE INDEX ON :Drug(nodeID);
487 | CREATE INDEX ON :Gene(nodeID);
488 | CREATE INDEX ON :BiologicalProcess(nodeID);
489 | CREATE INDEX ON :Pathway(nodeID);
490 | CREATE INDEX ON :MolecularFunction(nodeID);
491 | CREATE INDEX ON :CellularComponent(nodeID);
492 | CREATE INDEX ON :Symptom(nodeID);
493 | CREATE INDEX ON :BodyPart(nodeID);
494 | CREATE INDEX ON :DrugClass(nodeID);
495 | CREATE INDEX ON :Disease(nodeID);
496 | CREATE INDEX ON :TranscriptionFactor(nodeID);
497 | #+end_src
498 |
499 | - To check the current storage mode, run:
500 | #+begin_src cypher
501 | SHOW STORAGE INFO;
502 | #+end_src
503 |
504 | - Change the storage mode to analytical before import:
505 | #+begin_src cypher
506 | STORAGE MODE IN_MEMORY_ANALYTICAL;
507 | #+end_src
508 |
509 | ** Importing data into Memgraph
510 | - Drug nodes
511 | #+begin_src cypher
512 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
513 | WITH row WHERE row._labels = ':Drug' AND row.commonName <> ''
514 | CREATE (d:Drug {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase,
515 | xrefCasRN: row.xrefCasRN, xrefDrugbank: row.xrefDrugbank});
516 |
517 | MATCH (d:Drug)
518 | RETURN count(d);
519 | #+end_src
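
If you prefer to script these imports rather than paste each block into Memgraph Lab, the same Cypher can be sent over Bolt. Below is a minimal sketch using the =neo4j= Python driver (Memgraph speaks the Bolt protocol; the connection details assume a default local install with no authentication, so adjust them for your deployment):
#+begin_src python
from neo4j import GraphDatabase

# Memgraph speaks Bolt, so the standard neo4j Python driver can connect to it.
# A default local Memgraph listens on bolt://localhost:7687 without auth.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("", ""))

# Same Drug-node import as the Cypher block above.
query = """
LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
WITH row WHERE row._labels = ':Drug' AND row.commonName <> ''
CREATE (d:Drug {nodeID: row._id, commonName: row.commonName,
                sourceDatabase: row.sourceDatabase,
                xrefCasRN: row.xrefCasRN, xrefDrugbank: row.xrefDrugbank})
"""

with driver.session() as session:
    session.run(query)
    total = session.run("MATCH (d:Drug) RETURN count(d) AS n").single()["n"]
    print(f"Imported {total} Drug nodes")

driver.close()
#+end_src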
520 |
521 | - Gene nodes
522 | #+begin_src cypher
523 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
524 | WITH row WHERE row._labels = ':Gene'
525 | CREATE (g:Gene {nodeID: row._id, commonName: row.commonName, geneSymbol: row.geneSymbol, sourceDatabase: row.sourceDatabase,
526 | typeOfGene: row.typeOfGene, chromosome: row.chromosome, xrefEnsembl: row.xrefEnsembl,
527 | xrefHGNC: row.xrefHGNC, xrefNcbiGene: toInteger(row.xrefNcbiGene), xrefOMIM: row.xrefOMIM});
528 |
529 | MATCH (g:Gene)
530 | RETURN count(g);
531 | #+end_src
532 |
533 | - BiologicalProcess nodes
534 | #+begin_src cypher
535 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
536 | WITH row WHERE row._labels = ':BiologicalProcess'
537 | CREATE (b:BiologicalProcess {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase,
538 | xrefGeneOntology: row.xrefGeneOntology});
539 |
540 | MATCH (b:BiologicalProcess)
541 | RETURN count(b)
542 | #+end_src
543 |
544 | - Pathway nodes
545 | #+begin_src cypher
546 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
547 | WITH row WHERE row._labels = ':Pathway'
548 | CREATE (p:Pathway {nodeID: row._id, pathwayId: row.pathwayId, pathwayName: row.pathwayName, sourceDatabase: row.sourceDatabase});
549 |
550 | MATCH (p:Pathway)
551 | RETURN count(p)
552 | #+end_src
553 |
554 | - MolecularFunction nodes
555 | #+begin_src cypher
556 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
557 | WITH row WHERE row._labels = ':MolecularFunction'
558 | CREATE (m:MolecularFunction {nodeID: row._id, commonName: row.commonName, xrefGeneOntology: row.xrefGeneOntology});
559 |
560 | MATCH (m:MolecularFunction)
561 | RETURN count(m)
562 | #+end_src
563 |
564 | - CellularComponent nodes
565 | #+begin_src cypher
566 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
567 | WITH row WHERE row._labels = ':CellularComponent'
568 | CREATE (c:CellularComponent {nodeID: row._id, commonName: row.commonName, xrefGeneOntology: row.xrefGeneOntology});
569 |
570 | MATCH (c:CellularComponent)
571 | RETURN count(c)
572 | #+end_src
573 |
574 | - Symptom nodes
575 | #+begin_src cypher
576 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
577 | WITH row WHERE row._labels = ':Symptom'
578 | CREATE (s:Symptom {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase, xrefMeSH: row.xrefMeSH});
579 |
580 | MATCH (s:Symptom)
581 | RETURN count(s)
582 | #+end_src
583 |
584 | - BodyPart nodes
585 | #+begin_src cypher
586 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
587 | WITH row WHERE row._labels = ':BodyPart'
588 | CREATE (b:BodyPart {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase, xrefUberon: row.xrefUberon});
589 |
590 | MATCH (b:BodyPart)
591 | RETURN count(b)
592 | #+end_src
593 |
594 | - DrugClass nodes
595 | #+begin_src cypher
596 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
597 | WITH row WHERE row._labels = ':DrugClass'
598 | CREATE (d:DrugClass {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase, xrefNciThesaurus: row.xrefNciThesaurus});
599 |
600 | MATCH (d:DrugClass)
601 | RETURN count(d)
602 | #+end_src
603 |
604 | - Disease nodes
605 | #+begin_src cypher
606 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
607 | WITH row WHERE row._labels = ':Disease'
608 | CREATE (d:Disease {nodeID: row._id, commonName: row.commonName, sourceDatabase: row.sourceDatabase,
609 | xrefDiseaseOntology: row.xrefDiseaseOntology, xrefUmlsCUI: row.xrefUmlsCUI});
610 |
611 | MATCH (d:Disease)
612 | RETURN count(d)
613 | #+end_src
614 |
615 | - Transcription Factor nodes
616 | #+begin_src cypher
617 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
618 | WITH row WHERE row._labels = ':TranscriptionFactor'
619 | CREATE (t:TranscriptionFactor {nodeID: row._id, sourceDatabase: row.sourceDatabase, TF: row.TF});
620 | MATCH (t:TranscriptionFactor)
621 | RETURN count(t)
622 | #+end_src
623 |
624 | - GENEPARTICIPATESINBIOLOGICALPROCESS relationships
625 | #+begin_src cypher
626 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
627 | WITH row WHERE row._type = 'GENEPARTICIPATESINBIOLOGICALPROCESS'
628 | MATCH (g:Gene {nodeID: row._start}) MATCH (b:BiologicalProcess {nodeID: row._end})
629 | MERGE (g)-[rel:GENEPARTICIPATESINBIOLOGICALPROCESS]->(b)
630 | RETURN count(rel)
631 | #+end_src
632 |
633 | - GENEREGULATESGENE relationships
634 | #+begin_src cypher
635 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
636 | WITH row WHERE row._type = 'GENEREGULATESGENE'
637 | MATCH (g:Gene {nodeID: row._start}) MATCH (g2:Gene {nodeID: row._end})
638 | MERGE (g)-[rel:GENEREGULATESGENE]->(g2)
639 | RETURN count(rel)
640 | #+end_src
641 |
642 | - GENEINPATHWAY relationships
643 | #+begin_src cypher
644 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
645 | WITH row WHERE row._type = 'GENEINPATHWAY'
646 | MATCH (g:Gene {nodeID: row._start}) MATCH (p:Pathway {nodeID: row._end})
647 | MERGE (g)-[rel:GENEINPATHWAY]->(p)
648 | RETURN count(rel)
649 | #+end_src
650 |
651 | - GENEINTERACTSWITHGENE relationships
652 | #+begin_src cypher
653 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
654 | WITH row WHERE row._type = 'GENEINTERACTSWITHGENE'
655 | MATCH (g:Gene {nodeID: row._start}) MATCH (g2:Gene {nodeID: row._end})
656 | MERGE (g)-[rel:GENEINTERACTSWITHGENE]->(g2)
657 | RETURN count(rel)
658 | #+end_src
659 |
660 | - BODYPARTUNDEREXPRESSESGENE relationships
661 | #+begin_src cypher
662 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
663 | WITH row WHERE row._type = 'BODYPARTUNDEREXPRESSESGENE'
664 | MATCH (b:BodyPart {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
665 | MERGE (b)-[rel:BODYPARTUNDEREXPRESSESGENE]->(g)
666 | RETURN count(rel)
667 | #+end_src
668 |
669 | - BODYPARTOVEREXPRESSESGENE relationships
670 | #+begin_src cypher
671 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
672 | WITH row WHERE row._type = 'BODYPARTOVEREXPRESSESGENE'
673 | MATCH (b:BodyPart {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
674 | MERGE (b)-[rel:BODYPARTOVEREXPRESSESGENE]->(g)
675 | RETURN count(rel)
676 | #+end_src
677 |
678 | - GENEHASMOLECULARFUNCTION relationships
679 | #+begin_src cypher
680 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
681 | WITH row WHERE row._type = 'GENEHASMOLECULARFUNCTION'
682 | MATCH (g:Gene {nodeID: row._start}) MATCH (m:MolecularFunction {nodeID: row._end})
683 | MERGE (g)-[rel:GENEHASMOLECULARFUNCTION]->(m)
684 | RETURN count(rel)
685 | #+end_src
686 |
687 | - GENEASSOCIATEDWITHCELLULARCOMPONENT relationships
688 | #+begin_src cypher
689 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
690 | WITH row WHERE row._type = 'GENEASSOCIATEDWITHCELLULARCOMPONENT'
691 | MATCH (g:Gene {nodeID: row._start}) MATCH (c:CellularComponent {nodeID: row._end})
692 | MERGE (g)-[rel:GENEASSOCIATEDWITHCELLULARCOMPONENT]->(c)
693 | RETURN count(rel)
694 | #+end_src
695 |
696 | - GENECOVARIESWITHGENE relationships
697 | #+begin_src cypher
698 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
699 | WITH row WHERE row._type = 'GENECOVARIESWITHGENE'
700 | MATCH (g:Gene {nodeID: row._start}) MATCH (g2:Gene {nodeID: row._end})
701 | MERGE (g)-[rel:GENECOVARIESWITHGENE {sourceDB: row.sourceDB, unbiased: row.unbiased, correlation: row.correlation}]->(g2)
702 | RETURN count(rel)
703 | #+end_src
704 |
705 | - CHEMICALDECREASESEXPRESSION relationships
706 | #+begin_src cypher
707 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
708 | WITH row WHERE row._type = 'CHEMICALDECREASESEXPRESSION'
709 | MATCH (d:Drug {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
710 | MERGE (d)-[rel:CHEMICALDECREASESEXPRESSION {sourceDB: row.sourceDB, unbiased: row.unbiased, z_score: row.z_score}]->(g)
711 | RETURN count(rel)
712 | #+end_src
713 |
714 | - CHEMICALINCREASESEXPRESSION relationships
715 | #+begin_src cypher
716 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
717 | WITH row WHERE row._type = 'CHEMICALINCREASESEXPRESSION'
718 | MATCH (d:Drug {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
719 | MERGE (d)-[rel:CHEMICALINCREASESEXPRESSION {sourceDB: row.sourceDB, unbiased: row.unbiased, z_score: row.z_score}]->(g)
720 | RETURN count(rel)
721 | #+end_src
722 |
723 | - CHEMICALBINDSGENE relationships
724 | #+begin_src cypher
725 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
726 | WITH row WHERE row._type = 'CHEMICALBINDSGENE'
727 | MATCH (d:Drug {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
728 | MERGE (d)-[rel:CHEMICALBINDSGENE {sourceDB: row.sourceDB, unbiased: row.unbiased, affinity_nM: row.affinity_nM}]->(g)
729 | RETURN count(rel)
730 | #+end_src
731 |
732 | - DRUGINCLASS relationships
733 | #+begin_src cypher
734 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
735 | WITH row WHERE row._type = 'DRUGINCLASS'
736 | MATCH (d:Drug {nodeID: row._start}) MATCH (d2:DrugClass {nodeID: row._end})
737 | MERGE (d)-[rel:DRUGINCLASS]->(d2)
738 | RETURN count(rel)
739 | #+end_src
740 |
741 | - GENEASSOCIATESWITHDISEASE relationships
742 | #+begin_src cypher
743 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
744 | WITH row WHERE row._type = 'GENEASSOCIATESWITHDISEASE'
745 | MATCH (g:Gene {nodeID: row._start}) MATCH (d:Disease {nodeID: row._end})
746 | MERGE (g)-[rel:GENEASSOCIATESWITHDISEASE {sourceDB: row.sourceDB, score: row.score}]->(d)
747 | RETURN count(rel)
748 | #+end_src
749 |
750 | - SYMPTOMMANIFESTATIONOFDISEASE relationships
751 | #+begin_src cypher
752 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
753 | WITH row WHERE row._type = 'SYMPTOMMANIFESTATIONOFDISEASE'
754 | MATCH (s:Symptom {nodeID: row._start}) MATCH (d:Disease {nodeID: row._end})
755 | MERGE (s)-[rel:SYMPTOMMANIFESTATIONOFDISEASE {sourceDB: row.sourceDB, unbiased: row.unbiased, p_fisher: row.p_fisher}]->(d)
756 | RETURN count(rel)
757 | #+end_src
758 |
759 | - DISEASELOCALIZESTOANATOMY relationships
760 | #+begin_src cypher
761 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
762 | WITH row WHERE row._type = 'DISEASELOCALIZESTOANATOMY'
763 | MATCH (d:Disease {nodeID: row._start}) MATCH (b:BodyPart {nodeID: row._end})
764 | MERGE (d)-[rel:DISEASELOCALIZESTOANATOMY {sourceDB: row.sourceDB, unbiased: row.unbiased, p_fisher: row.p_fisher}]->(b)
765 | RETURN count(rel)
766 | #+end_src
767 |
768 | - DRUGTREATSDISEASE relationships
769 | #+begin_src cypher
770 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
771 | WITH row WHERE row._type = 'DRUGTREATSDISEASE'
772 | MATCH (d:Drug {nodeID: row._start}) MATCH (d2:Disease {nodeID: row._end})
773 | MERGE (d)-[rel:DRUGTREATSDISEASE]->(d2)
774 | RETURN count(rel)
775 | #+end_src
776 |
777 | - DRUGCAUSESEFFECT relationships
778 | #+begin_src cypher
779 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
780 | WITH row WHERE row._type = 'DRUGCAUSESEFFECT'
781 | MATCH (d:Drug {nodeID: row._start}) MATCH (d2:Disease {nodeID: row._end})
782 | MERGE (d)-[rel:DRUGCAUSESEFFECT]->(d2)
783 | RETURN count(rel)
784 | #+end_src
785 |
786 | - TRANSCRIPTIONFACTORINTERACTSWITHGENE relationships
787 | #+begin_src cypher
788 | LOAD CSV FROM "/usr/lib/memgraph/alzkb-populated.csv" WITH HEADER AS row
789 | WITH row WHERE row._type = 'TRANSCRIPTIONFACTORINTERACTSWITHGENE'
790 | MATCH (t:TranscriptionFactor {nodeID: row._start}) MATCH (g:Gene {nodeID: row._end})
791 | MERGE (t)-[rel:TRANSCRIPTIONFACTORINTERACTSWITHGENE {sourceDB: row.sourceDB, confidence: row.confidence}]->(g)
792 | RETURN count(rel)
793 | #+end_src
794 |
795 | ** Switching Back to Transactional Storage Mode
796 | After importing the data, follow these steps to switch back to the transactional storage mode:
797 | - Switch to Transactional Storage Mode:
798 | #+begin_src cypher
799 | STORAGE MODE IN_MEMORY_TRANSACTIONAL;
800 | #+end_src
801 |
802 | - Verify the Storage Mode Switch:
803 | #+begin_src cypher
804 | SHOW STORAGE INFO;
805 | #+end_src
806 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AlzKB (http://alzkb.ai/)
2 |
3 | A knowledge base for AI research in Alzheimer Disease, based on graph databases.
4 | 
5 |
6 | _Please note: DRUGCAUSESEFFECT in AlzKB refers to a drug causing side effects._
7 |
8 | ### Authors
9 |
10 | AlzKB is designed and developed by the following authors (in alphabetical order):
11 |
12 | - Britney Graham, PhD (Cedars-Sinai)
13 | - Yun Hao, MS (UPenn)
14 | - Rachit Kumar (UPenn)
15 | - Xi Li, MD (Cedars-Sinai)
16 | - Nick Matsumoto (Cedars-Sinai)
17 | - Jason H. Moore, PhD, FACMI (Cedars-Sinai)
18 | - Jay Moran, MS (Cedars-Sinai)
19 | - Marylyn Ritchie, PhD (UPenn)
20 | - Joseph D. Romano, PhD (UPenn)
21 | - Li Shen, PhD, FAIMBE (UPenn)
22 | - Van Truong, MS (UPenn)
23 | - Mythreye Venkatesan, MS (Cedars-Sinai)
24 | - Paul Wang, PhD (Cedars-Sinai)
25 |
26 |
27 | ## Deprecation Note
28 | Versions of AlzKB prior to v1.3.0 used Neo4j. Use of Neo4j is now deprecated. Legacy versions of the knowledge graph will continue to be provided on the Releases page to support existing research.
29 |
30 | ## Prerequisites
31 | - Memgraph Lab (Desktop application)
32 | - Starting with AlzKB v1.3.0, Memgraph is used as the knowledge graph server.
33 | - Memgraph offers a variety of [installation options](https://memgraph.com/docs/getting-started/install-memgraph).
34 | - Memgraph Lab is the easiest way to get up and running with AlzKB. But you may use Memgraph Server if your deployment requires it.
35 | - Python (version 3.7 or later)
36 |
37 | ## Installation
38 |
39 | To build a copy of AlzKB's graph database, you can either:
40 | - Download a copy of the latest CYPHERL file and import it into Memgraph
41 | - Build the knowledge base from its original third-party sources and import it into Memgraph
42 |
43 | ### Install from CYPHERL file (easy)
44 | - Visit the [Releases page](https://github.com/EpistasisLab/AlzKB/releases) and find the version of AlzKB you want to install. Unless you have a particular reason to do otherwise, this should probably be the most recent release. Follow the link in the release notes to the corresponding database dump (it will redirect to an external page).
45 | - Using Memgraph Lab, import the downloaded CYPHERL file by navigating to _Import & Export_ and then clicking the _Import Data_ button.
46 | - For other ways to import the CYPHERL file into a Memgraph server, see [here](https://memgraph.com/docs/data-migration/cypherl)
47 | - In Memgraph Lab, navigate to _Query execution_ to start querying the knowledge graph.
48 |
49 | ### Build from scratch (less easy)
50 |
51 | **For detailed instructions on building AlzKB from scratch, see [here](https://github.com/EpistasisLab/AlzKB/blob/master/BUILD.org)**
52 |
53 | Start by installing the Python package, which includes the necessary scripts:
54 |
55 | ```{bash}
56 | $ git clone https://github.com/EpistasisLab/AlzKB
57 | $ cd AlzKB
58 | $ pip install .
59 | ```
60 |
61 | #### Download the third-party database sources
62 |
63 | First, install MySQL and make sure it is running, as some of the source
64 | databases are only available as MySQL dumps.
65 |
66 | We've created a script that will fetch all of the source files and put them into
67 | the expected directory structure. We will try to keep this script as updated as
68 | possible, but if you encounter any issues we suggest looking at the script and
69 | making sure it points to entities that still exist.
70 |
71 | ```{bash}
72 | $ alzkb bootstrap
73 | ```
74 |
75 | #### Populate the ontology
76 |
77 | We use the external `ista` library to populate the OWL ontology. This should
78 | be pretty much entirely automated:
79 |
80 | ```{bash}
81 | $ alzkb build
82 | ```
83 |
84 | #### Load the ontology contents into Neo4j
85 |
86 | This script will import the OWL 2 ontology contents into an empty Neo4j database
87 | and clean up unnecessary artifacts left over by the OWL 2 standard:
88 |
89 | ```{bash}
90 | $ alzkb install
91 | ```
92 |
93 | After this, check the Neo4j database (which will now be turned on) and make sure
94 | everything looks alright.
95 |
--------------------------------------------------------------------------------
/alzkb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/AlzKB/3ce9515b3172e4356edc83e3ea37cd1a0df3d7ed/alzkb/__init__.py
--------------------------------------------------------------------------------
/alzkb/build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from argparse import ArgumentError
4 | import warnings
5 | import os, sys
6 |
7 | def confirm_y_n():
8 | while True:
9 | resp = str(input("Continue with operation? (Y/n): ")).lower().strip()
10 |         if resp[:1] == 'y':  # slice (not index) so empty input doesn't raise IndexError
11 |             return True
12 |         if resp[:1] == 'n':
13 | print("Exiting application...")
14 | sys.exit(0)
15 | print("Please enter y or n.")
16 |
17 |
18 | def bootstrap():
19 | """
20 | Retrieve data files needed to build AlzKB from scratch and organize them
21 | into the required directory structure.
22 | """
23 | pass
24 |
25 | def build():
26 | """
27 | Populate the AlzKB ontology using the local copies of the source databases.
28 | """
29 | pass
30 |
31 | def install():
32 | """
33 |     Import the contents of the AlzKB populated ontology into Neo4j.
34 | """
35 | pass
36 |
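# Entry point for the `alzkb` command-line script (see the README): dispatches
# the first positional argument to bootstrap(), build(), or install().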
37 | def main():
38 | args = sys.argv
39 |
40 | try:
41 | assert len(args) > 1
42 | except AssertionError:
43 |         raise ArgumentError(None, "Error - must provide one of `bootstrap`, `build`, or `install` as an argument to `alzkb`. See the README for more information.")
44 |
45 | if len(args) > 2:
46 | warnings.warn("Multiple arguments provided - only the first will be used.")
47 |
48 | op_arg = args[1].lower()
49 |
50 | if op_arg == 'bootstrap':
51 | bootstrap()
52 | elif op_arg == 'build':
53 | build()
54 | elif op_arg == 'install':
55 | install()
56 | else:
57 |         raise ArgumentError(None, "Error - must provide one of `bootstrap`, `build`, or `install` as an argument to `alzkb`. See the README for more information.")
58 |
--------------------------------------------------------------------------------
/alzkb/data/alzkb.rdf:
--------------------------------------------------------------------------------
[alzkb.rdf is the AlzKB OWL ontology serialized as RDF/XML. The XML markup was lost when this dump was generated; only the ontology's annotation text survives, summarized below.]

Ontology metadata:
- Description: An ontology describing entities relevant to Alzheimer's disease etiology and entities relevant to drug discovery for Alzheimer's disease.
- Version: 0.1.0a
- Language: English

A note on classes vs. individuals:

In this ontology, individuals are modeled as the idealised entities corresponding to examples of a certain class. For example, 'paroxetine' is an individual of the class Chemical, and 'hsdl1' is an individual of the class Gene. Other ontologies may choose to model these as subclasses instead (e.g., Paroxetine is a subclass of Chemical), with individuals as the physical realizations of those classes (e.g., a specific molecule of Paroxetine in the real world).

Modeling idealised entities as individuals rather than classes gives stricter control over the logical assumptions applied to all entities of a related type; for many use cases, the alternative approach is appropriate. It is also beneficial when the ontology is used to populate a graph database: each individual in the ontology corresponds to a node in the graph database, data properties on those individuals correspond to node attributes, and object properties correspond to edges in the graph.

Recovered property annotations:
- individualLabelPrefix: String prefix that precedes the unique label on a named individual of the corresponding class. This is necessary to avoid duplicate labels, which result in invalid RDF/XML. The data property "commonName" should also be set, preserving punctuation and whitespace and omitting the individualLabelPrefix.
- commonName: A string used to name the entity. This provides a more useful way to label nodes without the prefixes needed to prevent conflicts when nodes in different classes have the same common name. This common name should also be safe to use with punctuation and whitespace characters (which should be removed from node names).
- Key event target: The entity that is being altered via the KE.
- Isozymes: Two proteins are isozymes if they are comprised of different amino acid sequences but catalyze the same enzymatic reaction. Isozymes often have different reaction rates and respond differently in various environmental settings. They may also be regulated through different regulatory mechanisms.
- Drug flag: If True, chemical is a known pharmaceutical drug. Effectively, this means that it is present in DrugBank, but this de facto definition may be modified in the future.
- CTD flag: If True, chemical is present in the Comparative Toxicogenomics Database.
- Xenobiotic flag: If True, chemical is considered foreign to the human body.
- MACCS fingerprint: stored as a string where each character is a bit representing an individual feature.

Recovered cross-reference (xref) annotations:
- Term from the Cell Ontology (CL).
- Term from Chemical Entities of Biological Interest (ChEBI).
- DSSTox substance identifier: the main 'unit of ground truth' for chemicals in ComptoxAI.
- Disease Ontology ID. Note: due to inconsistencies in granularity, some diseases may have multiple DOIDs.
- Term from the Foundational Model of Anatomy (FMA).
- EPA GSID: largely superseded by DSSTox IDs, but web services and Invitrodb still identify chemicals using GSID.
- MeSH heading: any medical subject heading (does not include supplemental terms, such as the UIs that point to specific compounds).
- MeSH "Unique ID": different from a true subject heading, and usually a controlled term for a chemical substance.
- Pathway ID from an unknown (or deprecated) pathway database.

Recovered class annotations and individualLabelPrefix values, in document order (the owning class names were lost with the markup):
- Detrimental phenotypic effect resulting from exposure to a chemical.
- Label prefix: 'se_'.
- Label prefix: 'chem_'.
- Chemical Lists defined by the US EPA and used in the EPA's Comptox Dashboard web application. These lists are scraped from the Dashboard's public-facing API. A chemical list is loosely defined, but lists can be thought of in broad terms as functional classes of chemicals; they range in size from a few chemicals to tens of thousands of chemicals.
- Label prefix: 'phen_'.
- Any database that is relevant to computational toxicology (not necessarily intended to be used primarily for toxicology, e.g., PubChem).
- Label prefix: 'dis_'. A disease is defined as a medical condition with a deleterious effect.
- A chemical substance that causes a change in an organism's physiology or psychology when consumed or administered. This ontology primarily considers the effects of drugs when administered to humans.
- Label prefix: 'gene_'.
- Pathways are taken from AOP-DB's pathway_gene table, which includes pathways from many taxa taken from a large number of source databases. Only pathways with taxid 9606 (human) are included, with no filtering by source database; some duplication is therefore possible, and individual node properties have not been created for each source-database xref. Generally, the 'sourceDatabase' property can be used along with the format of the pathwayId to determine where a pathway originally came from. Note that each gene can be in potentially many pathways, and each pathway can contain potentially many genes.
- A drug with an intended beneficial effect, intended to 'treat, cure, prevent, or diagnose a disease or promote well-being.'
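To make the individuals-as-nodes convention concrete, here is a minimal owlready2 sketch, assuming the class and property names used by the populate scripts below (Drug, Gene, commonName, chemicalBindsGene); the two individuals are hypothetical:

import owlready2

# Load the AlzKB ontology (the same path populate_ontology.py uses)
onto = owlready2.get_ontology("./data/alzkb_v2.rdf").load()

with onto:
    # Idealised entities are individuals (graph nodes), not subclasses; the
    # individualLabelPrefix ('chem_', 'gene_', ...) keeps their labels unique.
    paroxetine = onto.Drug("chem_paroxetine")  # hypothetical individual
    paroxetine.commonName = ["Paroxetine"]     # data property -> node attribute
    hsdl1 = onto.Gene("gene_hsdl1")            # hypothetical individual
    hsdl1.commonName = ["HSDL1"]
    paroxetine.chemicalBindsGene = [hsdl1]     # object property -> edge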
--------------------------------------------------------------------------------
/alzkb/populate_edge_weights.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 |
4 |
5 | path = './data/alzkb_v2-populated.csv'
6 | df = pd.read_csv(path)
7 | df = pd.concat([df, pd.DataFrame(columns=['sourceDB', 'unbiased', 'affinity_nM', 'p_fisher', 'z_score', 'correlation', 'score', 'confidence'])])
8 |
9 | # hetionet-custom-edges.tsv
10 | data_dir = "./AlzKB_Raw_Data"
11 | hetionet_custom = pd.read_table(os.path.join(data_dir,'hetionet/hetionet-custom-edges.tsv'))
12 |
13 | hetio_custom = {
14 | 'CbG':'CHEMICALBINDSGENE',
15 | 'DrD':'DISEASEASSOCIATESWITHDISEASE', # no results
16 | 'DlA':'DISEASELOCALIZESTOANATOMY',
17 | 'DpS':'SYMPTOMMANIFESTATIONOFDISEASE'
18 | }
19 |
20 |
21 | affinity_nM = hetionet_custom[hetionet_custom['metaedge']=='CbG'].copy() # .copy() so the column assignments below don't trigger SettingWithCopyWarning
22 | affinity_nM['xrefDrugbank'] = affinity_nM['source'].str.split('::').str[-1]
23 | affinity_nM['xrefNcbiGene'] = affinity_nM['target'].str.split('::').str[-1].astype(int)
24 | affinity_nM = affinity_nM.merge(df[['_id','xrefDrugbank']].rename(columns={'_id':'_start'}), on='xrefDrugbank', how='left')
25 | affinity_nM = affinity_nM.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_end'}), on='xrefNcbiGene', how='left')
26 | affinity_nM['_type'] = hetio_custom['CbG']
27 | merged_df = df.merge(affinity_nM, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
28 | for column in ['sourceDB', 'unbiased', 'affinity_nM']:
29 | df[column] = merged_df[column + '_new'].combine_first(df[column])
30 | print(df.shape)
31 |
32 |
33 | disgenet = pd.read_table('./AlzKB_Raw_Data/disgenet/CUSTOM/disease_mappings_alzheimer.tsv')
34 | disgenet = disgenet[disgenet['vocabulary']=='DO']
35 |
36 |
37 | p_fisher_DlA = hetionet_custom[hetionet_custom['metaedge']=='DlA'].copy() # .copy() avoids SettingWithCopyWarning on the assignments below
38 |
39 | p_fisher_DlA['do_id'] = p_fisher_DlA['source'].str.split('::').str[-1].str.split(':').str[-1]
40 | p_fisher_DlA['xrefUberon'] = p_fisher_DlA['target'].str.split('::').str[-1]
41 |
42 | p_fisher_DlA = p_fisher_DlA.merge(disgenet, left_on='do_id', right_on= 'code')
43 | p_fisher_DlA['_start'] = 'disease_'+p_fisher_DlA['diseaseId'].str.lower()
44 | p_fisher_DlA = p_fisher_DlA.merge(df[['_id','xrefUberon']].rename(columns={'_id':'_end'}), on='xrefUberon', how='left')
45 | p_fisher_DlA['_type'] = hetio_custom['DlA']
46 |
47 | p_fisher_DpS = hetionet_custom[hetionet_custom['metaedge']=='DpS'].copy()
48 |
49 | p_fisher_DpS['xrefMeSH'] = p_fisher_DpS['target'].str.split('::').str[-1]
50 | p_fisher_DpS['do_id'] = p_fisher_DpS['source'].str.split('::').str[-1].str.split(':').str[-1]
51 |
52 | p_fisher_DpS = p_fisher_DpS.merge(df[['_id','xrefMeSH']].rename(columns={'_id':'_start'}), on='xrefMeSH', how='left')
53 | p_fisher_DpS = p_fisher_DpS.merge(disgenet, left_on='do_id', right_on= 'code')
54 | p_fisher_DpS['_end'] = 'disease_'+p_fisher_DpS['diseaseId'].str.lower()
55 | p_fisher_DpS['_type'] = hetio_custom['DpS']
56 |
57 | p_fisher = pd.concat([p_fisher_DlA, p_fisher_DpS])
58 |
59 | merged_df = df.merge(p_fisher, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
60 | for column in ['sourceDB', 'unbiased', 'p_fisher']:
61 | df[column] = merged_df[column + '_new'].combine_first(df[column])
62 | print(df.shape)
63 |
64 |
65 | # hetionet-v1.0-edges.sif
66 | #https://github.com/dhimmel/integrate/blob/master/integrate.ipynb
67 |
68 | import hetio.hetnet
69 | import hetio.readwrite
70 | import hetio.stats
71 |
72 | path = 'https://raw.githubusercontent.com/dhimmel/integrate/master/data/hetnet.json.bz2'
73 | graph = hetio.readwrite.read_graph(path, formatting=None)
74 |
75 |
76 | #https://github.com/hetio/hetnetpy/blob/main/hetnetpy/readwrite.py
77 | import collections
78 | import operator
79 | import pandas as pd
80 |
81 | def write_nodetable(graph):
82 | """Write a tabular encoding of the graph nodes."""
83 | rows = list()
84 | for node in graph.node_dict.values():
85 | row = collections.OrderedDict()
86 | row["kind"] = node.metanode.identifier
87 | row["id"] = str(node)
88 | row["name"] = node.name
89 | row["source"] = node.data['source']
90 | rows.append(row)
91 | rows.sort(key=operator.itemgetter("kind", "id"))
92 | fieldnames = ["id", "name", "kind", "source"]
93 | df_nodes_tsv = pd.DataFrame(rows, columns=fieldnames)
94 | print(df_nodes_tsv.shape)
95 | return df_nodes_tsv
96 |
97 |
98 | def write_edgetable(graph):
99 | """Write a tsv of the graph edges."""
100 | rows = list()
101 | edge_properties=["sourceDB", "unbiased", "affinity_nM", "z_score", "p_fisher", "correlation"]
102 | fieldnames =["source", "metaedge", "target"]
103 | fieldnames = fieldnames+edge_properties
104 | metaedge_to_edges = graph.get_metaedge_to_edges(exclude_inverts=True)
105 | for metaedge, edges in metaedge_to_edges.items():
106 | for edge in edges:
107 | row = collections.OrderedDict()
108 | row["source"] = edge.source
109 | row["metaedge"] = edge.metaedge.abbrev
110 | row["target"] = edge.target
111 | for pro in edge_properties:
112 | if pro =='sourceDB':
113 | if 'source' in edge.data.keys():
114 | row[pro]=edge.data['source']
115 | else:
116 | row[pro]=None
117 | else:
118 | if pro in edge.data.keys():
119 | row[pro]=edge.data[pro]
120 | else:
121 | row[pro]=None
122 | rows.append(row)
123 | df_edges_tsv = pd.DataFrame(rows, columns=fieldnames)
124 | print(df_edges_tsv.shape)
125 | return df_edges_tsv
126 |
127 | hetionet = write_edgetable(graph)
128 | hetionet['source'] = hetionet['source'].astype(str)
129 | hetionet['target'] = hetionet['target'].astype(str)
130 | print(hetionet)
131 |
132 | hetio = {
133 | 'CuG':'CHEMICALINCREASESEXPRESSION',
134 | 'CdG':'CHEMICALDECREASESEXPRESSION',
135 | 'GcG':'GENECOVARIESWITHGENE',
136 | 'Gr>G':'GENEREGULATESGENE'
137 | }
138 |
139 |
140 | z_score = hetionet[hetionet['metaedge']=='CuG'].copy() # .copy() avoids SettingWithCopyWarning on the assignments below
141 | z_score['xrefDrugbank'] = z_score['source'].str.split('::').str[-1]
142 | z_score['xrefNcbiGene'] = z_score['target'].str.split('::').str[-1].astype(int)
143 |
144 | z_score = z_score.merge(df[['_id','xrefDrugbank']].rename(columns={'_id':'_start'}), on='xrefDrugbank', how='left')
145 | z_score = z_score.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_end'}), on='xrefNcbiGene', how='left')
146 | z_score['_type'] = hetio['CuG']
147 |
148 | z_score_all = z_score
149 |
150 | z_score = hetionet[hetionet['metaedge']=='CdG'].copy()
151 | z_score['xrefDrugbank'] = z_score['source'].str.split('::').str[-1]
152 | z_score['xrefNcbiGene'] = z_score['target'].str.split('::').str[-1].astype(int)
153 |
154 | z_score = z_score.merge(df[['_id','xrefDrugbank']].rename(columns={'_id':'_start'}), on='xrefDrugbank', how='left')
155 | z_score = z_score.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_end'}), on='xrefNcbiGene', how='left')
156 | z_score['_type'] = hetio['CdG']
157 |
158 | z_score_all = pd.concat([z_score_all,z_score])
159 |
160 | merged_df = df.merge(z_score_all, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
161 | for column in ['sourceDB', 'unbiased', 'z_score']:
162 | df[column] = merged_df[column + '_new'].combine_first(df[column])
163 | print(df.shape)
164 |
165 |
166 | correlation = pd.read_table(os.path.join(data_dir,'hetionet/geneCovariesWithGene_correlation.tsv'))
167 |
168 | correlation = correlation.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_start'}), left_on='source_entrez', right_on='xrefNcbiGene', how='left')
169 | correlation = correlation.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_end'}), left_on='target_entrez', right_on='xrefNcbiGene', how='left')
170 | correlation['_type'] = hetio['GcG']
171 | correlation['sourceDB'] = 'Hetionet - ERC'
172 | correlation['unbiased'] = True
173 |
174 | merged_df = df.merge(correlation, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
175 | for column in ['sourceDB', 'unbiased', 'correlation']:
176 | df[column] = merged_df[column + '_new'].combine_first(df[column])
177 | print(df.shape)
178 | print(df.loc[~df['correlation'].isna()])
179 |
180 |
181 | #DisGeNET
182 | score = pd.read_table('./AlzKB_Raw_Data/disgenet/curated_gene_disease_associations.tsv')
183 | score['sourceDB'] = 'DisGeNET - '+score['source']
184 |
185 | score = score.merge(df[['_id','xrefNcbiGene']].rename(columns={'_id':'_start'}), left_on='geneId', right_on='xrefNcbiGene', how='left')
186 | score['_end'] = 'disease_'+score['diseaseId'].str.lower()
187 | score['_type'] = 'GENEASSOCIATESWITHDISEASE'
188 |
189 | merged_df = df.merge(score, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
190 | for column in ['sourceDB', 'score']:
191 | df[column] = merged_df[column + '_new'].combine_first(df[column])
192 | print(df.shape)
193 |
194 |
195 | #TF
196 | confidence = pd.read_table('./AlzKB_Raw_Data/dorothea/tf.tsv')
200 |
201 | confidence = confidence.merge(df[['_id','TF']].rename(columns={'_id':'_start'}), on='TF', how='left')
202 | confidence = confidence.merge(df[['_id','geneSymbol']].rename(columns={'_id':'_end'}), left_on='Gene', right_on='geneSymbol', how='left')
203 |
204 | confidence['_type'] = 'TRANSCRIPTIONFACTORINTERACTSWITHGENE'
205 |
206 | merged_df = df.merge(confidence, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
207 | for column in ['sourceDB', 'confidence']:
208 | df[column] = merged_df[column + '_new'].combine_first(df[column])
209 | print(df.shape)
210 |
211 | #save data file
212 | df.to_csv('./data/alzkb_v2.0.0_with_edge_properties.csv')
213 |
214 |
215 |
216 |
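The pattern repeated throughout this script - left-merge a property table onto the (_start, _end, _type) edge keys, then back-fill each column with combine_first - can be seen in isolation below. A minimal sketch with hypothetical toy rows; only the column names match the script:

import pandas as pd

# Toy edge table in the (_start, _end, _type) layout; identifiers are made up
df = pd.DataFrame({
    '_start': ['chem_a', 'chem_b'],
    '_end': ['gene_x', 'gene_y'],
    '_type': ['CHEMICALBINDSGENE', 'CHEMICALBINDSGENE'],
    'affinity_nM': [None, None],
})

# Property table keyed on the same three columns
props = pd.DataFrame({
    '_start': ['chem_a'],
    '_end': ['gene_x'],
    '_type': ['CHEMICALBINDSGENE'],
    'affinity_nM': [12.5],  # hypothetical binding affinity
})

# Left-merge, then prefer the new value where present and keep the old otherwise
merged = df.merge(props, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
df['affinity_nM'] = merged['affinity_nM_new'].combine_first(df['affinity_nM'])
print(df)  # the (chem_a, gene_x) row now carries affinity_nM = 12.5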
--------------------------------------------------------------------------------
/alzkb/populate_ontology.py:
--------------------------------------------------------------------------------
1 | from ista import FlatFileDatabaseParser, MySQLDatabaseParser
2 | from ista.util import print_onto_stats
3 |
4 | import owlready2
5 |
6 | import mysecrets
7 |
8 | import ipdb
9 |
10 | onto = owlready2.get_ontology("./data/alzkb_v2.rdf").load()
11 | data_dir = "./AlzKB_Raw_Data/"
12 |
13 | mysql_config = {
14 | 'host': mysecrets.MYSQL_HOSTNAME,
15 | 'user': mysecrets.MYSQL_USERNAME,
16 | 'passwd': mysecrets.MYSQL_PASSWORD
17 | }
18 |
19 | ncbigene = FlatFileDatabaseParser("ncbigene", onto, data_dir)
20 | drugbank = FlatFileDatabaseParser("drugbank", onto, data_dir)
21 | hetionet = FlatFileDatabaseParser("hetionet", onto, data_dir)
22 | aopdb = MySQLDatabaseParser("aopdb", onto, mysql_config)
23 | disgenet = FlatFileDatabaseParser("disgenet", onto, data_dir)
24 | dorothea = FlatFileDatabaseParser("dorothea", onto, data_dir)
25 |
26 | drugbank.parse_node_type(
27 | node_type="Drug", # Switch from "Chemical" in ComptoxAI to "Drug" in AlzKB
28 | source_filename="CUSTOM/drug_links.tsv",
29 | fmt="tsv",
30 | parse_config={
31 | "iri_column_name": "DrugBank ID",
32 | "headers": True,
33 | "data_property_map": {
34 | "DrugBank ID": onto.xrefDrugbank,
35 | "CAS Number": onto.xrefCasRN,
36 | "Name": onto.commonName,
37 | "data_resource": onto.sourceDatabase,
38 | },
39 | "merge_column": {
40 | "source_column_name": "CAS Number",
41 | "data_property": onto.xrefCasRN,
42 | },
43 | },
44 | merge=False,
45 | skip=False
46 | )
47 |
48 | ncbigene.parse_node_type(
49 | node_type="Gene",
50 | source_filename="CUSTOM/output.tsv",
51 | fmt="tsv-pandas",
52 | parse_config={
53 | "compound_fields": {
54 | "dbXrefs": {"delimiter": "|", "field_split_prefix": ":"}
55 | },
56 | "iri_column_name": "Symbol",
57 | "headers": True,
58 | "data_property_map": {
59 | "GeneID": onto.xrefNcbiGene,
60 | "Symbol": onto.geneSymbol,
61 | "type_of_gene": onto.typeOfGene,
62 | "Full_name_from_nomenclature_authority": onto.commonName,
63 | "MIM": onto.xrefOMIM,
64 | "HGNC": onto.xrefHGNC,
65 | "Ensembl": onto.xrefEnsembl,
66 | "chromosome": onto.chromosome,
67 | "data_resource": onto.sourceDatabase,
68 | # TODO: Parse Feature_type and other columns
69 | },
70 | },
71 | merge=False,
72 | skip=False
73 | )
74 |
75 | hetionet.parse_node_type(
76 | node_type="DrugClass",
77 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet
78 | fmt="tsv",
79 | parse_config={
80 | "iri_column_name": "name",
81 | "headers": True,
82 | "filter_column": "kind",
83 | "filter_value": "Pharmacologic Class",
84 | "data_transforms": {
85 | "id": lambda x: x.split("::")[-1]
86 | },
87 | "data_property_map": {
88 | "id": onto.xrefNciThesaurus,
89 | "name": onto.commonName,
90 | "sourceDB": onto.sourceDatabase,
91 | }
92 | },
93 | merge=False,
94 | skip=False
95 | )
96 |
97 | hetionet.parse_node_type(
98 | node_type="Symptom",
99 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet
100 | fmt="tsv",
101 | parse_config={
102 | "iri_column_name": "name",
103 | "headers": True,
104 | "filter_column": "kind",
105 | "filter_value": "Symptom",
106 | "data_transforms": {
107 | "id": lambda x: x.split("::")[-1]
108 | },
109 | "data_property_map": {
110 | "id": onto.xrefMeSH,
111 | "name": onto.commonName,
112 | "sourceDB": onto.sourceDatabase,
113 | }
114 | },
115 | merge=False,
116 | skip=False
117 | )
118 | hetionet.parse_node_type( # ANATOMY RESOLUTION NEEDS TO BE REFINED!
119 | node_type="BodyPart",
120 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet
121 | fmt="tsv",
122 | parse_config={
123 | "iri_column_name": "name",
124 | "headers": True,
125 | "filter_column": "kind",
126 | "filter_value": "Anatomy",
127 | "data_transforms": {
128 | "id": lambda x: x.split("::")[-1]
129 | },
130 | "data_property_map": {
131 | "id": onto.xrefUberon,
132 | "name": onto.commonName,
133 | "sourceDB": onto.sourceDatabase,
134 | }
135 | },
136 | merge=False,
137 | skip=False
138 | )
139 | hetionet.parse_node_type(
140 | node_type="BiologicalProcess",
141 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet
142 | fmt="tsv",
143 | parse_config={
144 | "iri_column_name": "name",
145 | "headers": True,
146 | "filter_column": "kind",
147 | "filter_value": "Biological Process",
148 | "data_transforms": {
149 | "id": lambda x: x.split("::")[-1]
150 | },
151 | "data_property_map": {
152 | "id": onto.xrefGeneOntology,
153 | "name": onto.commonName,
154 | "sourceDB": onto.sourceDatabase,
155 | }
156 | },
157 | merge=False,
158 | skip=False
159 | )
160 | hetionet.parse_node_type(
161 | node_type="MolecularFunction",
162 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet
163 | fmt="tsv",
164 | parse_config={
165 | "iri_column_name": "name",
166 | "headers": True,
167 | "filter_column": "kind",
168 | "filter_value": "Molecular Function",
169 | "data_transforms": {
170 | "id": lambda x: x.split("::")[-1]
171 | },
172 | "data_property_map": {
173 | "id": onto.xrefGeneOntology,
174 | "name": onto.commonName,
175 | "source": onto.sourceDatabase,
176 | }
177 | },
178 | merge=False,
179 | skip=False
180 | )
181 | hetionet.parse_node_type(
182 | node_type="CellularComponent",
183 | source_filename="hetionet-custom-nodes.tsv", #use customized hetionet
184 | fmt="tsv",
185 | parse_config={
186 | "iri_column_name": "name",
187 | "headers": True,
188 | "filter_column": "kind",
189 | "filter_value": "Cellular Component",
190 | "data_transforms": {
191 | "id": lambda x: x.split("::")[-1]
192 | },
193 | "data_property_map": {
194 | "id": onto.xrefGeneOntology,
195 | "name": onto.commonName,
196 | "source": onto.sourceDatabase,
197 | }
198 | },
199 | merge=False,
200 | skip=False
201 | )
202 |
203 | """
204 | aopdb.parse_node_type(
205 | node_type="Drug",
206 | source_table="chemical_info",
207 | parse_config={
208 | "iri_column_name": "DTX_id",
209 | "data_property_map": {"ChemicalID": onto.xrefMeSH},
210 | "merge_column": {
211 | "source_column_name": "DTX_id",
212 | "data_property": onto.xrefDTXSID
213 | }
214 | },
215 | merge=True,
216 | skip=False
217 | )
218 | """
219 |
220 | aopdb.parse_node_type(
221 | node_type="Pathway",
222 | source_table="stressor_info",
223 | parse_config={
224 | "iri_column_name": "path_name",
225 | "data_property_map": {
226 | "path_id": onto.pathwayId,
227 | #"path_name": onto.commonName,
228 | "path_name": onto.pathwayName,
229 | "ext_source": onto.sourceDatabase,
230 | },
231 | "custom_sql_query": """SELECT path_name, GROUP_CONCAT(DISTINCT path_id) as path_id, CONCAT('AOPDB - ', GROUP_CONCAT(DISTINCT ext_source)) as ext_source
232 | FROM(
233 | SELECT DISTINCT path_id, TRIM(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(path_name, '', ''), '', ''), '', ''), '', ''), ' - Homo sapiens (human)', '')) as path_name, ext_source
234 | FROM aopdb.pathway_gene
235 | WHERE tax_id = 9606
236 | )data
237 | GROUP BY path_name;""" #clean duplicated pathway
238 | },
239 | merge=False,
240 | skip=False
241 | )
242 |
243 | disgenet.parse_node_type(
244 | node_type="Disease",
245 | source_filename="CUSTOM/disease_mappings_to_attributes_alzheimer.tsv", # Filtered for just Alzheimer disease
246 | fmt="tsv-pandas",
247 | parse_config={
248 | "iri_column_name": "diseaseId",
249 | "headers": True,
250 | "data_property_map": {
251 | "diseaseId": onto.xrefUmlsCUI,
252 | "name": onto.commonName,
253 | "data_source": onto.sourceDatabase,
254 | }
255 | },
256 | merge=False,
257 | skip=False
258 | )
259 | disgenet.parse_node_type(
260 | node_type="Disease",
261 | source_filename="CUSTOM/disease_mappings_alzheimer.tsv", # Filtered, as above
262 | fmt="tsv-pandas",
263 | parse_config={
264 | "iri_column_name": "diseaseId",
265 | "headers": True,
266 | "filter_column": "vocabulary",
267 | "filter_value": "DO",
268 | "merge_column": {
269 | "source_column_name": "diseaseId",
270 | "data_property": onto.xrefUmlsCUI,
271 | "data_source": onto.sourceDatabase,
272 | },
273 | "data_property_map": {
274 | "code": onto.xrefDiseaseOntology
275 | }
276 | },
277 | merge=True,
278 | skip=False
279 | )
280 |
281 | disgenet.parse_relationship_type(
282 | relationship_type=onto.geneAssociatesWithDisease,
283 | source_filename="curated_gene_disease_associations.tsv",
284 | fmt="tsv",
285 | parse_config={
286 | "subject_node_type": onto.Gene,
287 | "subject_column_name": "geneSymbol",
288 | "subject_match_property": onto.geneSymbol,
289 | "object_node_type": onto.Disease,
290 | "object_column_name": "diseaseId",
291 | "object_match_property": onto.xrefUmlsCUI,
292 | "filter_column": "diseaseType",
293 | "filter_value": "disease",
294 | "headers": True
295 | },
296 | merge=False,
297 | skip=False
298 | )
299 |
300 | hetionet.parse_relationship_type(
301 | relationship_type=onto.chemicalIncreasesExpression,
302 | source_filename="hetionet-v1.0-edges.sif",
303 | fmt="tsv",
304 | parse_config={
305 | "subject_node_type": onto.Drug,
306 | "subject_column_name": "source",
307 | "subject_match_property": onto.xrefDrugbank,
308 | "object_node_type": onto.Gene,
309 | "object_column_name": "target",
310 | "object_match_property": onto.xrefNcbiGene,
311 | "filter_column": "metaedge",
312 | "filter_value": "CuG",
313 | "headers": True,
314 | "data_transforms": {
315 | "source": lambda x: x.split("::")[-1],
316 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str?
317 | },
318 | },
319 | merge=True, # Merge with aopdb/ctd chemical-gene ixns
320 | skip=False
321 | )
322 | hetionet.parse_relationship_type(
323 | relationship_type=onto.chemicalDecreasesExpression,
324 | source_filename="hetionet-v1.0-edges.sif",
325 | fmt="tsv",
326 | parse_config={
327 | "subject_node_type": onto.Drug,
328 | "subject_column_name": "source",
329 | "subject_match_property": onto.xrefDrugbank,
330 | "object_node_type": onto.Gene,
331 | "object_column_name": "target",
332 | "object_match_property": onto.xrefNcbiGene,
333 | "filter_column": "metaedge",
334 | "filter_value": "CdG",
335 | "headers": True,
336 | "data_transforms": {
337 | "source": lambda x: x.split("::")[-1],
338 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str?
339 | },
340 | },
341 | merge=True,
342 | skip=False
343 | )
344 | hetionet.parse_relationship_type(
345 | relationship_type=onto.chemicalBindsGene,
346 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet
347 | fmt="tsv",
348 | parse_config={
349 | "subject_node_type": onto.Drug,
350 | "subject_column_name": "source",
351 | "subject_match_property": onto.xrefDrugbank,
352 | "object_node_type": onto.Gene,
353 | "object_column_name": "target",
354 | "object_match_property": onto.xrefNcbiGene,
355 | "filter_column": "metaedge",
356 | "filter_value": "CbG",
357 | "headers": True,
358 | "data_transforms": {
359 | "source": lambda x: x.split("::")[-1],
360 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str?
361 | },
362 | },
363 | merge=False,
364 | skip=False
365 | )
366 | hetionet.parse_relationship_type(
367 | relationship_type=onto.geneInteractsWithGene,
368 | source_filename="hetionet-v1.0-edges.sif",
369 | fmt="tsv",
370 | parse_config={
371 | "subject_node_type": onto.Gene,
372 | "subject_column_name": "source",
373 | "subject_match_property": onto.xrefNcbiGene,
374 | "object_node_type": onto.Gene,
375 | "object_column_name": "target",
376 | "object_match_property": onto.xrefNcbiGene,
377 | "filter_column": "metaedge",
378 | "filter_value": "GiG",
379 | "headers": True,
380 | "data_transforms": {
381 | "source": lambda x: int(x.split("::")[-1]),
382 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str?
383 | },
384 | },
385 | merge=False,
386 | skip=False
387 | )
388 | hetionet.parse_relationship_type(
389 | relationship_type=onto.drugInClass,
390 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet
391 | fmt="tsv",
392 | parse_config={
393 | "subject_node_type": onto.Drug,
394 | "subject_column_name": "target", # Note how we reverse the direction of the relationship here
395 | "subject_match_property": onto.xrefDrugbank,
396 | "object_node_type": onto.DrugClass,
397 | "object_column_name": "source",
398 | "object_match_property": onto.xrefNciThesaurus,
399 | "filter_column": "metaedge",
400 | "filter_value": "PCiC",
401 | "headers": True,
402 | "data_transforms": {
403 | "source": lambda x: x.split("::")[-1],
404 | "target": lambda x: x.split("::")[-1] # I foresee this causing problems in the future - should all IDs be cast to str?
405 | },
406 | },
407 | merge=False,
408 | skip=False
409 | )
410 | hetionet.parse_relationship_type(
411 | relationship_type=onto.drugCausesEffect,
412 | source_filename="hetionet-v1.0-edges.sif",
413 | fmt="tsv",
414 | parse_config={
415 | "subject_node_type": onto.Drug,
416 | "subject_column_name": "source",
417 | "subject_match_property": onto.xrefDrugbank,
418 | "object_node_type": onto.ChemicalEffect,
419 | "object_column_name": "target",
420 | "object_match_property": onto.xrefUmlsCUI,
421 | "filter_column": "metaedge",
422 | "filter_value": "CcSE",
423 | "headers": True,
424 | "data_transforms": {
425 | "source": lambda x: x.split("::")[-1],
426 | "target": lambda x: x.split("::")[-1]
427 | },
428 | },
429 | merge=False,
430 | skip=False
431 | )
432 | hetionet.parse_relationship_type(
433 | relationship_type=onto.symptomManifestationOfDisease,
434 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet
435 | fmt="tsv",
436 | parse_config={
437 | "subject_node_type": onto.Symptom,
438 | "subject_column_name": "target", # Flip target and source
439 | "subject_match_property": onto.xrefMeSH,
440 | "object_node_type": onto.Disease,
441 | "object_column_name": "source",
442 | "object_match_property": onto.xrefDiseaseOntology,
443 | "filter_column": "metaedge",
444 | "filter_value": "DpS",
445 | "headers": True,
446 | "data_transforms": {
447 | "source": lambda x: x.split("DOID:")[-1], # Note: Because hetionet prefixes DOIDs with 'DOID:'
448 | "target": lambda x: x.split("::")[-1]
449 | },
450 | },
451 | merge=False,
452 | skip=False
453 | )
454 | hetionet.parse_relationship_type(
455 | relationship_type=onto.drugTreatsDisease,
456 | source_filename="hetionet-v1.0-edges.sif",
457 | fmt="tsv",
458 | parse_config={
459 | "subject_node_type": onto.Drug,
460 | "subject_column_name": "source",
461 | "subject_match_property": onto.xrefDrugbank,
462 | "object_node_type": onto.Disease,
463 | "object_column_name": "target",
464 | "object_match_property": onto.xrefDiseaseOntology,
465 | "filter_column": "metaedge",
466 | "filter_value": "CtD",
467 | "headers": True,
468 | "data_transforms": {
469 | "source": lambda x: x.split("::")[-1],
470 | "target": lambda x: x.split(":")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:'
471 | },
472 | },
473 | merge=False,
474 | skip=False
475 | )
476 | hetionet.parse_relationship_type( # Hetionet makes a messy distinction between 'treats' and 'palliates' which we ignore
477 | relationship_type=onto.drugTreatsDisease,
478 | source_filename="hetionet-v1.0-edges.sif",
479 | fmt="tsv",
480 | parse_config={
481 | "subject_node_type": onto.Drug,
482 | "subject_column_name": "source",
483 | "subject_match_property": onto.xrefDrugbank,
484 | "object_node_type": onto.Disease,
485 | "object_column_name": "target",
486 | "object_match_property": onto.xrefDiseaseOntology,
487 | "filter_column": "metaedge",
488 | "filter_value": "CpD",
489 | "headers": True,
490 | "data_transforms": {
491 | "source": lambda x: x.split("::")[-1],
492 | "target": lambda x: x.split(":")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:'
493 | },
494 | },
495 | merge=False,
496 | skip=False
497 | )
498 | hetionet.parse_relationship_type(
499 | relationship_type=onto.diseaseLocalizesToAnatomy,
500 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet
501 | fmt="tsv",
502 | parse_config={
503 | "subject_node_type": onto.Disease,
504 | "subject_column_name": "source",
505 | "subject_match_property": onto.xrefDiseaseOntology,
506 | "object_node_type": onto.BodyPart,
507 | "object_column_name": "target",
508 | "object_match_property": onto.xrefUberon,
509 | "filter_column": "metaedge",
510 | "filter_value": "DlA",
511 | "headers": True,
512 | "data_transforms": {
513 | "source": lambda x: x.split(":")[-1], # Note: Because hetionet prefixes DOIDs with 'DOID:'
514 | "target": lambda x: x.split("::")[-1]
515 | },
516 | },
517 | merge=False,
518 | skip=False
519 | )
520 | hetionet.parse_relationship_type(
521 | relationship_type=onto.diseaseAssociatesWithDisease,
522 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet
523 | fmt="tsv",
524 | parse_config={
525 | "subject_node_type": onto.Disease,
526 | "subject_column_name": "source",
527 | "subject_match_property": onto.xrefDiseaseOntology,
528 | "object_node_type": onto.Disease,
529 | "object_column_name": "target",
530 | "object_match_property": onto.xrefDiseaseOntology,
531 | "filter_column": "metaedge",
532 | "filter_value": "DrD",
533 | "headers": True,
534 | "data_transforms": {
535 | "source": lambda x: x.split(":")[-1], # Note: Because hetionet prefixes DOIDs with 'DOID:'
536 | "target": lambda x: x.split(":")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:'
537 | },
538 | },
539 | merge=False,
540 | skip=False
541 | )
542 | hetionet.parse_relationship_type(
543 | relationship_type=onto.geneParticipatesInBiologicalProcess,
544 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet
545 | fmt="tsv",
546 | parse_config={
547 | "subject_node_type": onto.Gene,
548 | "subject_column_name": "source",
549 | "subject_match_property": onto.xrefNcbiGene,
550 | "object_node_type": onto.BiologicalProcess,
551 | "object_column_name": "target",
552 | "object_match_property": onto.xrefGeneOntology,
553 | "filter_column": "metaedge",
554 | "filter_value": "GpBP",
555 | "headers": True,
556 | "data_transforms": {
557 | "source": lambda x: int(x.split("::")[-1]), # Note: Because hetionet prefixes DOIDs with 'DOID:'
558 | "target": lambda x: x.split("::")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:'
559 | },
560 | },
561 | merge=False,
562 | skip=False
563 | )
564 | hetionet.parse_relationship_type(
565 | relationship_type=onto.geneAssociatedWithCellularComponent,
566 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet
567 | fmt="tsv",
568 | parse_config={
569 | "subject_node_type": onto.Gene,
570 | "subject_column_name": "source",
571 | "subject_match_property": onto.xrefNcbiGene,
572 | "object_node_type": onto.CellularComponent,
573 | "object_column_name": "target",
574 | "object_match_property": onto.xrefGeneOntology,
575 | "filter_column": "metaedge",
576 | "filter_value": "GpCC",
577 | "headers": True,
578 | "data_transforms": {
579 | "source": lambda x: int(x.split("::")[-1]), # Note: Because hetionet prefixes DOIDs with 'DOID:'
580 | "target": lambda x: x.split("::")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:'
581 | },
582 | },
583 | merge=False,
584 | skip=False
585 | )
586 | hetionet.parse_relationship_type(
587 | relationship_type=onto.geneHasMolecularFunction,
588 | source_filename="hetionet-custom-edges.tsv", #use customized hetionet
589 | fmt="tsv",
590 | parse_config={
591 | "subject_node_type": onto.Gene,
592 | "subject_column_name": "source",
593 | "subject_match_property": onto.xrefNcbiGene,
594 | "object_node_type": onto.MolecularFunction,
595 | "object_column_name": "target",
596 | "object_match_property": onto.xrefGeneOntology,
597 | "filter_column": "metaedge",
598 | "filter_value": "GpMF",
599 | "headers": True,
600 | "data_transforms": {
601 | "source": lambda x: int(x.split("::")[-1]), # Note: Because hetionet prefixes DOIDs with 'DOID:'
602 | "target": lambda x: x.split("::")[-1] # Note: Because hetionet prefixes DOIDs with 'DOID:'
603 | },
604 | },
605 | merge=False,
606 | skip=False
607 | )
608 |
609 | aopdb.parse_relationship_type(
610 | relationship_type=onto.geneInPathway,
611 | inverse_relationship_type=onto.PathwayContainsGene,
612 | parse_config = {
613 | "subject_node_type": onto.Gene,
614 | "subject_column_name": "entrez",
615 | "subject_match_property": onto.xrefNcbiGene,
616 | "object_node_type": onto.Pathway,
617 | "object_column_name": "path_name",
618 | "object_match_property": onto.pathwayName,
619 | "custom_sql_query": """SELECT DISTINCT entrez, path_id, TRIM(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(path_name, '', ''), '', ''), '', ''), '', ''), ' - Homo sapiens (human)', '')) as path_name
620 | FROM aopdb.pathway_gene
621 | WHERE tax_id = 9606;""",
622 | "source_table_type": "foreignKey",
623 | "source_table": "pathway_gene",
624 | },
625 | merge=False,
626 | skip=False
627 | )
628 | hetionet.parse_relationship_type(
629 | relationship_type=onto.bodyPartOverexpressesGene,
630 | source_filename="hetionet-v1.0-edges.sif",
631 | fmt="tsv",
632 | parse_config={
633 | "subject_node_type": onto.BodyPart,
634 | "subject_column_name": "source",
635 | "subject_match_property": onto.xrefUberon,
636 | "object_node_type": onto.Gene,
637 | "object_column_name": "target",
638 | "object_match_property": onto.xrefNcbiGene,
639 | "filter_column": "metaedge",
640 | "filter_value": "AuG", # "anatomyUpregulatesGene"
641 | "headers": True,
642 | "data_transforms": {
643 | "source": lambda x: x.split("::")[-1],
644 | "target": lambda x: int(x.split("::")[-1])
645 | },
646 | },
647 | merge=False,
648 | skip=False
649 | )
650 | hetionet.parse_relationship_type(
651 | relationship_type=onto.bodyPartUnderexpressesGene,
652 | source_filename="hetionet-v1.0-edges.sif",
653 | fmt="tsv",
654 | parse_config={
655 | "subject_node_type": onto.BodyPart,
656 | "subject_column_name": "source",
657 | "subject_match_property": onto.xrefUberon,
658 | "object_node_type": onto.Gene,
659 | "object_column_name": "target",
660 | "object_match_property": onto.xrefNcbiGene,
661 | "filter_column": "metaedge",
662 | "filter_value": "AdG", # "anatomyDownregulatesGene"
663 | "headers": True,
664 | "data_transforms": {
665 | "source": lambda x: x.split("::")[-1],
666 | "target": lambda x: int(x.split("::")[-1])
667 | },
668 | },
669 | merge=False,
670 | skip=False
671 | )
672 |
673 | # POSSIBLE ISSUE: Normalize Drug > Chemical or vice versa? Gonna have to look for 'gaps'
674 | # in Neo4j database stemming from inconsistency in node type.
675 |
676 | hetionet.parse_relationship_type(
677 | relationship_type=onto.geneCovariesWithGene,
678 | source_filename="hetionet-v1.0-edges.sif",
679 | fmt="tsv",
680 | parse_config={
681 | "subject_node_type": onto.Gene,
682 | "subject_column_name": "source",
683 | "subject_match_property": onto.xrefNcbiGene,
684 | "object_node_type": onto.Gene,
685 | "object_column_name": "target",
686 | "object_match_property": onto.xrefNcbiGene,
687 | "filter_column": "metaedge",
688 | "filter_value": "GcG",
689 | "headers": True,
690 | "data_transforms": {
691 | "source": lambda x: int(x.split("::")[-1]),
692 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str?
693 | },
694 | },
695 | merge=False,
696 | skip=False
697 | )
698 |
699 | hetionet.parse_relationship_type(
700 | relationship_type=onto.geneRegulatesGene,
701 | source_filename="hetionet-v1.0-edges.sif",
702 | fmt="tsv",
703 | parse_config={
704 | "subject_node_type": onto.Gene,
705 | "subject_column_name": "source",
706 | "subject_match_property": onto.xrefNcbiGene,
707 | "object_node_type": onto.Gene,
708 | "object_column_name": "target",
709 | "object_match_property": onto.xrefNcbiGene,
710 | "filter_column": "metaedge",
711 | "filter_value": "Gr>G",
712 | "headers": True,
713 | "data_transforms": {
714 | "source": lambda x: int(x.split("::")[-1]),
715 | "target": lambda x: int(x.split("::")[-1]) # I foresee this causing problems in the future - should all IDs be cast to str?
716 | },
717 | },
718 | merge=False,
719 | skip=False
720 | )
721 |
722 | dorothea.parse_node_type(
723 | node_type="TranscriptionFactor",
724 | source_filename="tf.tsv",
725 | fmt="tsv",
726 | parse_config={
727 | "iri_column_name": "source",
728 | "headers": True,
729 | "data_property_map": {
730 | "source": onto.TF,
731 | #"source": onto.commonName,
732 | "sourceDB": onto.sourceDatabase,
733 | },
734 | },
735 | merge=False,
736 | skip=False
737 | )
738 |
739 |
740 | dorothea.parse_relationship_type(
741 | relationship_type=onto.transcriptionFactorInteractsWithGene,
742 | source_filename="tf.tsv",
743 | fmt="tsv",
744 | parse_config={
745 | "subject_node_type": onto.TranscriptionFactor,
746 | "subject_column_name": "source",
747 | "subject_match_property": onto.TF,
748 | "object_node_type": onto.Gene,
749 | "object_column_name": "target",
750 | "object_match_property": onto.geneSymbol,
751 | "headers": True,
752 | },
753 | merge=False,
754 | skip=False
755 | )
756 |
757 | print_onto_stats(onto)
758 |
759 | with open("./data/alzkb_v2-populated.rdf", 'wb') as fp:
760 | onto.save(file=fp, format="rdfxml")
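Nearly every parse_config above performs the same identifier surgery, so a small sketch of what those data_transforms do to raw Hetionet IDs may help; the example identifiers are illustrative:

# Hetionet writes node identifiers as 'Kind::LocalId', and Disease IDs embed a
# further 'DOID:' prefix, so the lambdas above peel the prefixes off:
strip_kind = lambda x: x.split("::")[-1]
strip_doid = lambda x: x.split("DOID:")[-1]

print(strip_kind("Compound::DB00193"))    # -> 'DB00193' (DrugBank xref)
print(int(strip_kind("Gene::5617")))      # -> 5617 (NCBI Gene xref, cast to int)
print(strip_doid("Disease::DOID:10652"))  # -> '10652' (Disease Ontology xref)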
--------------------------------------------------------------------------------
/alzkb/rdf_to_memgraph_csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | import pandas as pd
4 | import numpy as np
5 | from gqlalchemy import Memgraph
6 | import owlready2
7 |
8 |
9 | #read RDF
10 | path = './data/alzkb_v2-populated.rdf'
11 | onto = owlready2.get_ontology(path).load()
12 |
13 |
14 | #Load node and property
15 | def extract_node_details(label, node):
16 | details = {
17 | '_id': node.name,
18 | '_labels': label,
19 | 'commonName': node.commonName if node.commonName else np.nan,
20 | 'geneSymbol': node.geneSymbol if node.geneSymbol else np.nan,
21 | 'pathwayId': node.pathwayId if node.pathwayId else np.nan,
22 | 'pathwayName': node.pathwayName if node.pathwayName else np.nan,
23 | 'sourceDatabase': node.sourceDatabase if node.sourceDatabase else np.nan,
24 | 'typeOfGene': node.typeOfGene if node.typeOfGene else np.nan,
25 | 'chromosome': node.chromosome if node.chromosome else np.nan,
26 | 'TF': node.TF if node.TF else np.nan,
27 | 'xrefCasRN': node.xrefCasRN if node.xrefCasRN else np.nan,
28 | 'xrefDiseaseOntology': node.xrefDiseaseOntology if node.xrefDiseaseOntology else np.nan,
29 | 'xrefDrugbank': node.xrefDrugbank if node.xrefDrugbank else np.nan,
30 | 'xrefEnsembl': node.xrefEnsembl if node.xrefEnsembl else np.nan,
31 | 'xrefGeneOntology': node.xrefGeneOntology if node.xrefGeneOntology else np.nan,
32 | 'xrefHGNC': node.xrefHGNC if node.xrefHGNC else np.nan,
33 | 'xrefMeSH': node.xrefMeSH if node.xrefMeSH else np.nan,
34 | 'xrefNcbiGene': node.xrefNcbiGene if node.xrefNcbiGene else np.nan,
35 | 'xrefNciThesaurus': node.xrefNciThesaurus if node.xrefNciThesaurus else np.nan,
36 | 'xrefOMIM': node.xrefOMIM if node.xrefOMIM else np.nan,
37 | 'xrefUberon': node.xrefUberon if node.xrefUberon else np.nan,
38 | 'xrefUmlsCUI': node.xrefUmlsCUI if node.xrefUmlsCUI else np.nan
39 | }
40 |
41 | for key, value in details.items():
42 | if isinstance(value, list) and len(value) > 0:
43 | try:
44 | details[key] = str(value[-1])
45 | except ValueError:
46 | details[key] = np.nan
47 | elif isinstance(value, list):
48 | details[key] = np.nan
49 |
50 | return details
51 |
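# A hedged usage sketch of extract_node_details: owlready2 returns data-property
# values as lists, so the helper keeps the last element and NaN-fills empty ones.
# The IRI searched for here is hypothetical and may not exist in a given build.
example = onto.search_one(iri="*chem_paroxetine")
if example is not None:
    print(extract_node_details(':Drug', example)['commonName'])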
52 |
53 | #Drug
54 | drug_details_list = []
55 | for drug in onto.individuals():
56 | if onto.Drug in drug.is_a:
57 | drug_details_list.append(extract_node_details(':Drug', drug))
58 | drug_details_df = pd.DataFrame(drug_details_list)
59 |
60 |
61 | #Gene
62 | gene_details_list = []
63 | for gene in onto.individuals():
64 | if onto.Gene in gene.is_a:
65 | gene_details_list.append(extract_node_details(':Gene', gene))
66 | gene_details_df = pd.DataFrame(gene_details_list)
67 |
68 |
69 | #BodyPart
70 | bodypart_details_list = []
71 | for bodypart in onto.individuals():
72 | if onto.BodyPart in bodypart.is_a:
73 | bodypart_details_list.append(extract_node_details(':BodyPart', bodypart))
74 | bodypart_details_df = pd.DataFrame(bodypart_details_list)
75 |
76 |
77 | #Disease
78 | disease_details_list = []
79 | for disease in onto.individuals():
80 | if onto.Disease in disease.is_a:
81 | disease_details_list.append(extract_node_details(':Disease', disease))
82 | disease_details_df = pd.DataFrame(disease_details_list)
83 |
84 |
85 | #DrugClass
86 | drugclass_details_list = []
87 | for drugclass in onto.individuals():
88 | if onto.DrugClass in drugclass.is_a:
89 | drugclass_details_list.append(extract_node_details(':DrugClass', drugclass))
90 | drugclass_details_df = pd.DataFrame(drugclass_details_list)
91 |
92 |
93 | #CellularComponent
94 | cellular_details_list = []
95 | for cellular in onto.individuals():
96 | if onto.CellularComponent in cellular.is_a:
97 | cellular_details_list.append(extract_node_details(':CellularComponent', cellular))
98 | cellular_details_df = pd.DataFrame(cellular_details_list)
99 |
100 |
101 | #MolecularFunction
102 | molecular_details_list = []
103 | for molecular in onto.individuals():
104 | if onto.MolecularFunction in molecular.is_a:
105 | molecular_details_list.append(extract_node_details(':MolecularFunction', molecular))
106 | molecular_details_df = pd.DataFrame(molecular_details_list)
107 |
108 |
109 | #Pathway
110 | pathway_details_list = []
111 | for pathway in onto.individuals():
112 | if onto.Pathway in pathway.is_a:
113 | pathway_details_list.append(extract_node_details(':Pathway', pathway))
114 | pathway_details_df = pd.DataFrame(pathway_details_list)
115 |
116 |
117 | #BiologicalProcess
118 | biological_details_list = []
119 | for biological in onto.individuals():
120 | if onto.BiologicalProcess in biological.is_a:
121 | biological_details_list.append(extract_node_details(':BiologicalProcess', biological))
122 | biological_details_df = pd.DataFrame(biological_details_list)
123 |
124 |
125 | #Symptom
126 | symptom_details_list = []
127 | for symptom in onto.individuals():
128 | if onto.Symptom in symptom.is_a:
129 | symptom_details_list.append(extract_node_details(':Symptom', symptom))
130 | symptom_details_df = pd.DataFrame(symptom_details_list)
131 |
132 |
133 | # TranscriptionFactor
134 | transcription_details_list = []
135 | for transcriptionfactor in onto.individuals():
136 | if onto.TranscriptionFactor in transcriptionfactor.is_a:
137 | transcription_details_list.append(extract_node_details(':TranscriptionFactor', transcriptionfactor))
138 | transcription_details_df = pd.DataFrame(transcription_details_list)
139 |
140 |
141 | # Merge all node dfs
142 | merged_node_df = pd.concat([drug_details_df, gene_details_df, bodypart_details_df, disease_details_df,
143 | drugclass_details_df, cellular_details_df, molecular_details_df, pathway_details_df,
144 | biological_details_df, symptom_details_df, transcription_details_df], ignore_index=True)
145 | merged_node_df.reset_index(drop=True, inplace=True)
146 | print(merged_node_df.shape)
147 |
148 |
149 | # Load relationships
150 |
151 | #Drug
152 | relations = []
153 | def extract_rel_details_from_drug(node):
154 | for gene in node.chemicalBindsGene:
155 | relations.append({
156 | '_start': node.name,
157 | '_end': gene.name,
158 | '_type': 'CHEMICALBINDSGENE'})
159 | for gene in node.chemicalDecreasesExpression:
160 | relations.append({
161 | '_start': node.name,
162 | '_end': gene.name,
163 | '_type': 'CHEMICALDECREASESEXPRESSION'})
164 | for gene in node.chemicalIncreasesExpression:
165 | relations.append({
166 | '_start': node.name,
167 | '_end': gene.name,
168 | '_type': 'CHEMICALINCREASESEXPRESSION'})
169 | for disease in node.drugCausesEffect:
170 | relations.append({
171 | '_start': node.name,
172 | '_end': disease.name,
173 | '_type': 'DRUGCAUSESEFFECT'})
174 | for disease in node.drugTreatsDisease:
175 | relations.append({
176 | '_start': node.name,
177 | '_end': disease.name,
178 | '_type': 'DRUGTREATSDISEASE'})
179 | for drugclass in node.drugInClass:
180 | relations.append({
181 | '_start': node.name,
182 | '_end': drugclass.name,
183 | '_type': 'DRUGINCLASS'})
184 |
185 |
186 | for drug in onto.individuals():
187 | if onto.Drug in drug.is_a:
188 | extract_rel_details_from_drug(drug)
189 |
190 | drug_rel = pd.DataFrame(relations)
191 |
192 |
193 | #Gene
194 | relations = []
195 | def extract_rel_details_from_gene(node):
196 | for cellular in node.geneAssociatedWithCellularComponent:
197 | relations.append({
198 | '_start': node.name,
199 | '_end': cellular.name,
200 | '_type': 'GENEASSOCIATEDWITHCELLULARCOMPONENT'})
201 | for disease in node.geneAssociatesWithDisease:
202 | relations.append({
203 | '_start': node.name,
204 | '_end': disease.name,
205 | '_type': 'GENEASSOCIATESWITHDISEASE'})
206 | for molecular in node.geneHasMolecularFunction:
207 | relations.append({
208 | '_start': node.name,
209 | '_end': molecular.name,
210 | '_type': 'GENEHASMOLECULARFUNCTION'})
211 | for biological in node.geneParticipatesInBiologicalProcess:
212 | relations.append({
213 | '_start': node.name,
214 | '_end': biological.name,
215 | '_type': 'GENEPARTICIPATESINBIOLOGICALPROCESS'})
216 |
217 |
218 | for gene in onto.individuals():
219 | if onto.Gene in gene.is_a:
220 | extract_rel_details_from_gene(gene)
221 |
222 | gene_rel = pd.DataFrame(relations)
223 |
224 |
225 | # geneInteractsWithGene and related gene-gene predicates: parse the raw RDF directly to avoid the inverse-property problem
226 | from rdflib import Graph, URIRef
227 |
228 | g = Graph()
229 |
230 | rdf_file = path
231 | g.parse(rdf_file, format='xml')
232 |
233 | pred_uri_1 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneCovariesWithGene')
234 | pred_uri_2 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneInteractsWithGene')
235 | pred_uri_3 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneRegulatesGene')
236 | pred_uri_4 = URIRef('http://jdr.bio/ontologies/alzkb.owl#geneInPathway')
237 |
238 | def extract_last_part(uri):
239 | return uri.split('#')[-1]
240 |
241 | triples = []
242 | for subj, pred, obj in g:
243 | if pred == pred_uri_1:
244 | triples.append([extract_last_part(subj), 'GENECOVARIESWITHGENE', extract_last_part(obj)])
245 | elif pred == pred_uri_2:
246 | triples.append([extract_last_part(subj), 'GENEINTERACTSWITHGENE', extract_last_part(obj)])
247 | elif pred == pred_uri_3:
248 | triples.append([extract_last_part(subj), 'GENEREGULATESGENE', extract_last_part(obj)])
249 | elif pred == pred_uri_4:
250 | triples.append([extract_last_part(subj), 'GENEINPATHWAY', extract_last_part(obj)])
251 |
252 | gene_rel2 = pd.DataFrame(triples, columns=['_start', '_type', '_end'])
253 |
254 | # Merge gene_rel and gene_rel2
255 | gene_rel2 = gene_rel2[gene_rel.columns]
256 | gene_rel = pd.concat([gene_rel, gene_rel2], ignore_index=True)
257 |
258 |
259 | #Body Part
260 | relations = []
261 | def extract_rel_details_from_bodypart(node):
262 | for gene in node.bodyPartOverexpressesGene:
263 | relations.append({
264 | '_start': node.name,
265 | '_end': gene.name,
266 | '_type': 'BODYPARTOVEREXPRESSESGENE'})
267 | for gene in node.bodyPartUnderexpressesGene:
268 | relations.append({
269 | '_start': node.name,
270 | '_end': gene.name,
271 | '_type': 'BODYPARTUNDEREXPRESSESGENE'})
272 |
273 |
274 | for bodypart in onto.individuals():
275 | if onto.BodyPart in bodypart.is_a:
276 | extract_rel_details_from_bodypart(bodypart)
277 |
278 | bodypart_rel = pd.DataFrame(relations)
279 |
280 |
281 | #Disease
282 | relations = []
283 | def extract_rel_details_from_disease(node):
284 | for disease in node.diseaseAssociatesWithDisease:
285 | relations.append({
286 | '_start': node.name,
287 | '_end': disease.name,
288 | '_type': 'DISEASEASSOCIATESWITHDISEASE'})
289 | for bodypart in node.diseaseLocalizesToAnatomy:
290 | relations.append({
291 | '_start': node.name,
292 | '_end': bodypart.name,
293 | '_type': 'DISEASELOCALIZESTOANATOMY'})
294 |
295 |
296 | for disease in onto.individuals():
297 | if onto.Disease in disease.is_a:
298 | extract_rel_details_from_disease(disease)
299 |
300 | disease_rel = pd.DataFrame(relations)
301 |
302 |
303 | #Symptom
304 | relations = []
305 | def extract_rel_details_from_symptom(node):
306 | for disease in node.symptomManifestationOfDisease:
307 | relations.append({
308 | '_start': node.name,
309 | '_end': disease.name,
310 | '_type': 'SYMPTOMMANIFESTATIONOFDISEASE'})
311 |
312 |
313 | for symptom in onto.individuals():
314 | if onto.Symptom in symptom.is_a:
315 | extract_rel_details_from_symptom(symptom)
316 |
317 | symptom_rel = pd.DataFrame(relations)
318 |
319 |
320 | # Transcription Factor
321 | relations = []
322 | def extract_rel_details_from_transcriptionfactor(node):
323 | for transcriptionfactor in node.transcriptionFactorInteractsWithGene:
324 | relations.append({
325 | '_start': node.name,
326 | '_end': transcriptionfactor.name,
327 | '_type': 'TRANSCRIPTIONFACTORINTERACTSWITHGENE'})
328 |
329 |
330 | for transcriptionfactor in onto.individuals():
331 | if onto.TranscriptionFactor in transcriptionfactor.is_a:
332 | extract_rel_details_from_transcriptionfactor(transcriptionfactor)
333 |
334 | transcriptionfactor_rel = pd.DataFrame(relations)
335 |
336 |
337 | # Merge all rel dfs
338 | merged_rel_df = pd.concat([drug_rel, gene_rel, bodypart_rel, disease_rel, symptom_rel, transcriptionfactor_rel], ignore_index=True)
339 | merged_rel_df.reset_index(drop=True, inplace=True)
340 | print(merged_rel_df.shape)
341 |
342 |
343 | # Merge node and rel dfs into the final export frame
344 | df_all = pd.concat([merged_node_df, merged_rel_df], axis=0, ignore_index=True)
345 | df_all.to_csv('./data/alzkb_v2-populated.csv', index=False)
346 |
347 |
348 |
349 |
350 |
--------------------------------------------------------------------------------
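Note: the eleven per-class loops above each rescan onto.individuals() in full, so the node table is built in eleven passes. A minimal single-pass refactor sketch (not part of the repository source; it reuses the onto handle and the extract_node_details helper defined in this file):

    import pandas as pd

    # map each node label to its ontology class once, then do a single pass
    node_classes = {
        ':Drug': onto.Drug, ':Gene': onto.Gene, ':BodyPart': onto.BodyPart,
        ':Disease': onto.Disease, ':DrugClass': onto.DrugClass,
        ':CellularComponent': onto.CellularComponent,
        ':MolecularFunction': onto.MolecularFunction, ':Pathway': onto.Pathway,
        ':BiologicalProcess': onto.BiologicalProcess, ':Symptom': onto.Symptom,
        ':TranscriptionFactor': onto.TranscriptionFactor,
    }

    details_by_label = {label: [] for label in node_classes}
    for individual in onto.individuals():  # one pass over all individuals
        for label, cls in node_classes.items():
            if cls in individual.is_a:
                details_by_label[label].append(extract_node_details(label, individual))

    merged_node_df = pd.concat(
        [pd.DataFrame(rows) for rows in details_by_label.values()],
        ignore_index=True,
    )

The same pattern would collapse the per-class relationship extractors in this file as well.
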
/img/build-abstract.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EpistasisLab/AlzKB/3ce9515b3172e4356edc83e3ea37cd1a0df3d7ed/img/build-abstract.png
--------------------------------------------------------------------------------
/scripts/alzkb_parse_disgenet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | ## created by Yun Hao and Joe Romano @MooreLab 2022
3 | ## This script parses DisGeNET gene-disease relationship data to extract relationships specific to Alzheimer's disease
4 |
5 | # NOTE: This file must be run from the `disgenet/` directory containing the original TSV files referenced below!
6 | # Both output files will be deposited into the `disgenet/CUSTOM/` directory.
7 |
8 | import pandas as pd
9 |
10 | from pathlib import Path
11 |
12 | disgenet_df = pd.read_csv("./disease_mappings_to_attributes.tsv", sep="\t", header=0)
13 | disgenet_do_df = pd.read_csv("./disease_mappings.tsv", sep="\t", header=0)
14 |
15 | # case insensitive match
16 | disgenet_ad_df = disgenet_df.loc[disgenet_df["name"].str.contains("Alzheimer",case=False),:]
17 | cuis = list(disgenet_ad_df.diseaseId.unique())
18 |
19 | # For adding disease ontology identifiers
20 | disgenet_ad_do_df = disgenet_do_df.loc[disgenet_do_df.diseaseId.isin(cuis),:]
21 |
22 | # clean data
23 | # Creutzfeldt-Jakob disease (CJD) and Familial Alzheimer Disease (FAD) are different diseases, but they were merged into the same node in AlzKB because the DisGeNET disease mappings file ("UMLS CUI to several disease vocabularies") maps the DO entry for Creutzfeldt-Jakob disease to FAD.
24 | disgenet_ad_do_df = disgenet_ad_do_df[~((disgenet_ad_do_df['name']=='Familial Alzheimer Disease (FAD)') & (disgenet_ad_do_df['vocabularyName']=='Creutzfeldt-Jakob disease'))]
25 |
26 | # add a "data_source" column to both tables
27 | disgenet_ad_do_df['data_source'] = 'DisGeNET'
28 | disgenet_ad_df['data_source'] = 'DisGeNET'
29 |
30 | # if we don't have the CUSTOM subdirectory, create it
31 | Path("CUSTOM").mkdir(exist_ok=True)
32 |
33 | disgenet_ad_df.to_csv("./CUSTOM/disease_mappings_to_attributes_alzheimer.tsv", sep="\t", header=True, index=False)
34 | disgenet_ad_do_df.to_csv("./CUSTOM/disease_mappings_alzheimer.tsv", sep="\t", header=True, index=False)
--------------------------------------------------------------------------------
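A quick sanity check on the two files the script writes (a sketch, not part of the repository; it assumes the script has already been run from the disgenet/ directory):

    import pandas as pd

    attrs = pd.read_csv("./CUSTOM/disease_mappings_to_attributes_alzheimer.tsv", sep="\t")
    maps = pd.read_csv("./CUSTOM/disease_mappings_alzheimer.tsv", sep="\t")

    # every retained attribute row should mention Alzheimer (case-insensitive)
    assert attrs["name"].str.contains("Alzheimer", case=False).all()

    # the spurious FAD <-> Creutzfeldt-Jakob mapping should have been dropped
    bad = ((maps["name"] == "Familial Alzheimer Disease (FAD)")
           & (maps["vocabularyName"] == "Creutzfeldt-Jakob disease"))
    assert not bad.any()
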
/scripts/alzkb_parse_dorothea.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import rpy2.robjects as robjects
3 | from rpy2.robjects import pandas2ri
4 |
5 | # dorothea
6 | # Source the R script so its objects are available in Python (the script itself was created and saved in RStudio)
7 | r = robjects.r
8 | r['source']('./dorothea.R')
9 |
10 | # Retrieve the 'net' object defined in the R script.
11 | #list(robjects.globalenv.keys())
12 | net_r = robjects.globalenv['net']
13 |
14 | # Convert the R data.frame to a pandas DataFrame
15 | # (rpy2.robjects is already imported above as robjects)
16 | with (robjects.default_converter + pandas2ri.converter).context():
17 |     dorothea = robjects.conversion.get_conversion().rpy2py(net_r)
18 | #dorothea['source'].nunique() #643 TFs
19 |
20 |
21 | #trrust
22 | trrust_rawdata = pd.read_csv('./trrust_rawdata.human.tsv', sep='\t', header=None, names=["TF","Gene","Interaction","PMID"])
23 | #trrust_rawdata['TF'].nunique() #795 TFs matches with https://www.grnpedia.org/trrust/downloadnetwork.php
24 |
25 |
26 | #combine
27 | df_comb = trrust_rawdata.merge(dorothea, left_on=["TF","Gene"], right_on=["source","target"], how='inner')
28 | df_comb['sourceDB'] = 'DoRothEA & TRRUST'
29 | df_comb.to_csv('./tf.tsv', sep="\t", header=True, index=False)
--------------------------------------------------------------------------------
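The combined network lands in tf.tsv with the TRRUST columns (TF, Gene, Interaction, PMID), the DoRothEA columns from the merge, and the added sourceDB tag. A small inspection sketch (not part of the repository):

    import pandas as pd

    tf = pd.read_csv("./tf.tsv", sep="\t")
    assert (tf["sourceDB"] == "DoRothEA & TRRUST").all()
    print(tf["TF"].nunique(), "transcription factors kept after the inner join")
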
/scripts/alzkb_parse_drugbank.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pathlib import Path
3 |
4 | df = pd.read_csv('./drug_links.csv')
5 | print(df.shape)
6 |
7 | # add a "data_resource" column
8 | df['data_resource'] = 'DrugBank'
9 |
10 | # if we don't have the CUSTOM subdirectory, create it
11 | Path("CUSTOM").mkdir(exist_ok=True)
12 |
13 | df.to_csv("./CUSTOM/drug_links.tsv", sep="\t", header=True, index=False)
14 | print(df.shape)
--------------------------------------------------------------------------------
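For reference, the whole transformation above is one column add plus re-serialization, so it can be written as a single pandas chain; an equivalent sketch (same file names as the script):

    import pandas as pd
    from pathlib import Path

    Path("CUSTOM").mkdir(exist_ok=True)
    (pd.read_csv("./drug_links.csv")
       .assign(data_resource="DrugBank")
       .to_csv("./CUSTOM/drug_links.tsv", sep="\t", index=False))
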
/scripts/alzkb_parse_ncbigene.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | ## created by Van Truong @RitchieWherryLabs 2022
3 | ## This script parses NCBI human gene data and Bgee expression data for knowledge related to Alzheimer's disease
4 |
5 |
6 | my_set = set()  # Ensembl IDs of genes with brain expression in Bgee
7 |
8 | def processLargeTextFile(source, compare_index, separator):
9 | with open(source, "r") as r:
10 | for line in r:
11 | if 'brain' in line:
12 | columns = line.split(separator)
13 | my_set.add(columns[compare_index].replace('Ensembl:', '') )
14 |     # (file handle is closed automatically by the with-statement)
15 |
16 | def keepDesiredColumns(row, keep_index, separator):
17 | columns = row.split(separator)
18 |
19 | output_str = []
20 | for index in keep_index:
21 | output_str.append(columns[index])
22 |
23 | return separator.join(output_str)
24 |
25 | def filterLargeTextFile(source, destination, delimiter, keep_index):
26 | with open(source, "r") as r, open(destination, "w") as w:
27 | #load header row
28 |         w.write(keepDesiredColumns(r.readline(), keep_index, delimiter) + '\n')
29 |
30 | #load body
31 | for line in r:
32 | #if line is not None:
33 | if line.startswith('9606'): #filter to Homo sapiens (human)
34 |             w.write(keepDesiredColumns(line, keep_index, delimiter) + '\n')
35 |     # (both file handles are closed automatically by the with-statement)
36 |
37 | def fileIndexFinder(source, destination, keep_set, compare_column_index, separator):
38 |     count_rows = 0
39 | with open(source, "r") as r, open(destination, "w") as w:
40 | w.write('data_resource' + separator + 'Ensembl' + separator + r.readline())
41 |
42 | for line in r:
43 | columns = line.split(separator)
44 | parsed_column = columns[compare_column_index]
45 |
46 | if '|' in parsed_column:
47 | parsed_column_split = parsed_column.split('|')
48 | if len(parsed_column_split) > 2:
49 | parsed_column = parsed_column_split[2].replace('Ensembl:', '')
50 |
51 | #if parsed_column in keep_set: # keep all instead of filtering to brain
52 | w.write('NCBI Gene' + separator + parsed_column + separator + line)
53 |             count_rows += 1
54 |
55 | print(count_rows)
56 |     # (file handle is closed automatically by the with-statement)
57 |
58 |
59 | brain_file='./Homo_sapiens_expr_advanced.tsv' #https://bgee.org/?page=download&action=expr_calls#id1 Homo_sapiens_expr_advanced_development
60 | gene_file='../Homo_sapiens.gene_info' #https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz
61 | gene_dest_file='./Homo_sapiens.gene_info_filtered'
62 |
63 | final_out='./output.tsv'
64 |
65 | delimiter = '\t'
66 | keep_index = [1,2,4,5,6,8,9,11]
67 | compare_index = 0
68 |
69 | processLargeTextFile(brain_file, compare_index, delimiter)
70 | print(len(my_set))
71 |
72 | filterLargeTextFile(gene_file, gene_dest_file, delimiter, keep_index)
73 | fileIndexFinder(gene_dest_file, final_out, my_set, 3, delimiter)
--------------------------------------------------------------------------------
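The first filtering stage (filterLargeTextFile) can also be expressed with pandas, which makes the column selection easier to audit. A sketch under the same file-name assumptions, assuming the standard gene_info header whose first field is #tax_id:

    import pandas as pd

    # read the NCBI gene_info file; tab-separated with a header row
    genes = pd.read_csv("../Homo_sapiens.gene_info", sep="\t", dtype=str)

    human = genes[genes["#tax_id"] == "9606"]         # Homo sapiens only
    human = human.iloc[:, [1, 2, 4, 5, 6, 8, 9, 11]]  # same columns as keep_index
    human.to_csv("./Homo_sapiens.gene_info_filtered", sep="\t", index=False)
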
/scripts/dorothea.R:
--------------------------------------------------------------------------------
1 | library(dorothea)
2 | library(decoupleR)
3 | library(ggplot2)
4 | library(dplyr)
5 |
6 | net <- decoupleR::get_dorothea(levels = c('A', 'B', 'C', 'D'))
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | __all__ = [
4 | 'VERSION'
5 | ]
6 |
7 | import setuptools
8 |
9 | MAJOR = 0
10 | MINOR = 1
11 | MICRO = 0
12 | TEST_VER = 'a'
13 | ISRELEASED = True
14 | VERSION = '%d.%d.%d%s' % (MAJOR, MINOR, MICRO, TEST_VER)
15 |
16 | setuptools.setup(
17 | name="AlzKB",
18 | version=VERSION,
19 | author="Joseph D. Romano, Van Truong, Yun Hao, Li Shen, and Jason H. Moore",
20 | description="A graph knowledge base for Alzheimer disease",
21 | url="https://github.com/EpistasisLab/AlzKB.git",
22 | packages=setuptools.find_packages(),
23 | python_requires=">=3.7",
24 | include_package_data=True,
25 | install_requires=[
26 | 'ista @ git+https://github.com/JDRomano2/ista@c036c1074e0b59df704a0aeb097862108b012b45'
27 | ],
28 | entry_points={
29 | 'console_scripts': [
30 | 'alzkb=alzkb.build:main'
31 | ]
32 | }
33 | )
34 |
--------------------------------------------------------------------------------