├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── archs4py ├── __init__.py ├── align.py ├── data.py ├── data │ └── config.json ├── download.py ├── meta.py └── utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | testing.ipynb 2 | dist/* 3 | archs4py.egg* 4 | pypi_push.sh 5 | .DS_Store 6 | example* 7 | age* 8 | build* 9 | *.parquet 10 | *.csv 11 | setup_old.py 12 | test/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include archs4py

recursive-exclude data
recursive-exclude archs4py.egg-info
recursive-exclude .ipynb_checkpoints

global-exclude *.pkl
global-exclude *.tsv
global-exclude *.zip
global-exclude *.parquet
global-exclude *push.sh
global-exclude *testing.ipynb
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
archs4py

# archs4py - Official Python package to load and query ARCHS4 data

Official ARCHS4 companion package. This package is a wrapper for basic H5 commands performed on the ARCHS4 data files. Some of the data access is optimized for specific query strategies and should make this implementation faster than manually querying the data. The package supports automated file download, multithreading, and convenience functions such as data normalization.

archs4py also supports the ARCHS4 alignment pipeline. When FASTQ files are aligned with archs4py, the resulting gene and transcript counts are compatible with the preprocessed ARCHS4 samples.

[Installation](#installation) | [Download H5 Files](#usage) | [List H5 Contents](#list-data-fields-in-h5) | [Extract Counts](#data-access) | [Extract Meta Data](#meta-data) | [Normalize Samples](#normalizing-data) | [Filter Genes](#filter-genes-with-low-expression) | [Aggregate Duplicate Genes](#aggregate-duplicate-genes) | [FASTQ Alignment](#sequence-alignment) | [Versions](#list-versions)

## ARCHS4 data

ARCHS4 data is regularly updated to include publicly available gene expression samples from RNA-seq. ARCHS4 processes the major platforms for human and mouse. As of 6/2023 ARCHS4 encompasses more than 1.5 million RNA-seq samples, all of which are homogeneously processed. ARCHS4 does not currently discern whether samples are bulk or single-cell; it crawls all of GEO. Since samples are not always correctly annotated as single-cell, ARCHS4 uses a machine learning approach to predict single-cell samples and assigns a `singlecellprobability` to each sample. Samples with a value larger than 0.5 can be removed from queries if needed.

## Installation

The Python package can be installed directly using the following command (pip or pip3 depending on system setup):

```
pip3 install archs4py
```

## Usage

### Download data file

The data is stored in large HDF5 files, which first need to be downloaded. HDF5 stores matrix information in a compressed data structure that allows efficient access to slices of the data. There are separate files for `human` and `mouse` data. The supported files are `gene counts` and `transcript counts`. As of 6/2023 the files are larger than 30GB and, depending on network speed, will take some time to download.

```python
import archs4py as a4

file_path = a4.download.counts("human", path="", version="latest")
```

## List data fields in H5

The H5 files contain data and metadata information. To list the contents of ARCHS4 H5 files use the built-in `ls` function.

```python
import archs4py as a4

file = "human_gene_v2.6.h5"
a4.ls(file)
```
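For a gene counts file, `ls` prints the groups and fields sketched below. This is an indicative summary based only on the fields this package accesses, not an exhaustive listing; the exact fields vary between file versions (older files, for example, store gene symbols under `gene_symbol` instead of `symbol`):

```
data
│ expression                 (gene × sample count matrix)
meta
│ genes
│ │ symbol
│ samples
│ │ geo_accession, series_id, title,
│ │ characteristics_ch1, extract_protocol_ch1, source_name_ch1,
│ │ singlecellprobability, ...
```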
## Data access

archs4py supports several ways to load gene expression data. When querying ARCHS4, be aware that loading too many samples can exhaust system memory (e.g., when the metadata search term is very broad). In most cases loading several thousand samples simultaneously is no problem. To find relevant samples there are 5 main functions in the `archs4py.data` module: a function to extract N random samples `archs4py.data.rand()`, a function to extract samples by index `archs4py.data.index()`, a function to extract samples based on metadata search `archs4py.data.meta()`, a function to extract samples based on a list of GEO accessions `archs4py.data.samples()`, and lastly a function to extract all samples belonging to a series `archs4py.data.series()`.

#### Extract a random set of samples

To extract a random gene expression matrix use the `archs4py.data.rand()` function. The function returns a pandas DataFrame with samples as columns and genes as rows.

```python
import archs4py as a4

#path to file
file = "human_gene_v2.6.h5"

# extract 100 random samples and remove single cell data
rand_counts = a4.data.rand(file, 100, remove_sc=True)

```

#### Extract samples at specified index positions

Extract samples based on their index positions in the H5 file. An optional `gene_idx` list can additionally restrict the returned genes.

```python
import archs4py as a4

#path to file
file = "human_gene_v2.6.h5"

# get counts for samples at position [0,1,2,3,4]
pos_counts = a4.data.index(file, [0,1,2,3,4])

```

#### Extract samples matching search term in meta data

The ARCHS4 H5 file contains all metadata of the samples. Using the metadata search, all matching samples can be extracted via search terms. There is also an `archs4py.meta` module that returns only metadata. The metadata fields to be searched can be specified with `meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"]`

```python
import archs4py as a4

#path to file
file = "human_gene_v2.6.h5"

# search and extract samples matching regex (case-insensitive)
meta_counts = a4.data.meta(file, "myoblast", remove_sc=True)

```

#### Extract samples in a list of GEO accession IDs

Samples can be extracted directly by providing a list of GSM IDs. Samples not contained in ARCHS4 will be ignored.

```python
import archs4py as a4

#path to file
file = "human_gene_v2.6.h5"

#get sample counts
sample_counts = a4.data.samples(file, ["GSM1158284","GSM1482938","GSM1562817"])

```

#### Extract samples belonging to a GEO series

To extract all samples of a GEO series, for example `GSE64016`, use the series function.

```python
import archs4py as a4

#path to file
file = "human_gene_v2.6.h5"

#get sample counts
series_counts = a4.data.series(file, "GSE64016")

```
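The functions in the `archs4py.data` module also accept the URL of a remotely hosted H5 file in place of a local path, in which case the requested slices are streamed via S3 rather than read from disk. A minimal sketch, assuming the primary download URL of the latest human gene counts file from the package configuration; remote queries are considerably slower than local ones:

```python
import archs4py as a4

# query the remotely hosted file directly, without downloading it first
url = "https://s3.dev.maayanlab.cloud/archs4/files/human_gene_v2.latest.h5"
remote_counts = a4.data.rand(url, 10)
```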
## Meta data

In addition to the data module, archs4py also supports the extraction of metadata. It offers endpoints similar to those of the `archs4py.data` module. The metadata fields to extract can be specified with: `meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"]`

```python
import archs4py as a4

#path to file
file = "human_gene_v2.6.h5"

# get sample meta data based on search term
meta_meta = a4.meta.meta(file, "myoblast", meta_fields=["characteristics_ch1", "source_name_ch1"])

# get sample meta data
sample_meta = a4.meta.samples(file, ["GSM1158284","GSM1482938","GSM1562817"])

# get series meta data
series_meta = a4.meta.series(file, "GSE64016")

# get all entries of a meta data field for all samples. In this example get all sample ids and gene symbols in H5 file
all_samples = a4.meta.field(file, "geo_accession")
all_symbols = a4.meta.field(file, "symbol")
```

## Normalizing data

The package also supports simple normalization. Currently supported are quantile normalization, log2 quantile normalization, CPM, and TMM. In the example below we load 100 random samples and apply log quantile normalization.

```python
import archs4py as a4

file = "human_gene_v2.6.h5"
rand_counts = a4.data.rand(file, 100)

#normalize using log quantile (method options for now = ["log_quantile", "quantile", "cpm", "tmm"])
norm_exp = a4.normalize(rand_counts, method="log_quantile")

```

## Filter genes with low expression

To filter genes with low expression use the `utils.filter_genes()` function. It uses two parameters to determine which genes to keep: `readThreshold` and `sampleThreshold`. In the example below, genes are removed that don't have at least 50 reads in at least 2% of samples. Setting `aggregate=True` additionally deals with duplicate gene symbols in the ARCHS4 data by aggregating their counts.

```python
import archs4py as a4

file = "human_gene_v2.6.h5"
rand_counts = a4.data.rand(file, 100)

# filter genes with low expression
filtered_exp = a4.utils.filter_genes(rand_counts, readThreshold=50, sampleThreshold=0.02, deterministic=True, aggregate=True)
```

## Aggregate duplicate genes

Some gene symbols are duplicated, which is an artifact of the Ensembl gene annotation. The transcript sequences are often identical, and reads are split between the different entries. The `utils.aggregate_duplicate_genes()` function sums all counts of duplicate gene symbols and eliminates the duplicate entries.

```python
import archs4py as a4

file = "human_gene_v2.6.h5"
rand_counts = a4.data.rand(file, 100)

# aggregate duplicate genes
agg_exp = a4.utils.aggregate_duplicate_genes(rand_counts)

```
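The data, filtering, and normalization functions compose naturally. A minimal end-to-end sketch, assuming the same file as above: extract bulk samples by metadata search, filter and aggregate genes, then normalize.

```python
import archs4py as a4

file = "human_gene_v2.6.h5"

# extract bulk samples matching a search term
counts = a4.data.meta(file, "myoblast", remove_sc=True)

# remove lowly expressed genes and aggregate duplicate symbols, then normalize
filtered_exp = a4.utils.filter_genes(counts, readThreshold=50, sampleThreshold=0.02, aggregate=True)
norm_exp = a4.normalize(filtered_exp, method="log_quantile")
```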
## Sequence alignment

The `align` module contains a replication of the ARCHS4 alignment pipeline. When used on FASTQ files, the resulting gene or transcript counts are compatible with the previously aligned samples in ARCHS4. The module is highly automated and only requires a path to a FASTQ file or a folder containing multiple FASTQ files. All file dependencies are downloaded automatically, and the alignment index is built when needed.

### Align FASTQ file

Pass either a single FASTQ file or a pair of FASTQ files. This function can return transcript counts, gene counts, or transcript-level TPM data.

```python

import archs4py as a4

a4.align.load(["SRR14457464"], "data/example_1")

result = a4.align.fastq("human", "data/example_1/SRR14457464.fastq", return_type="gene", identifier="symbol")

```

The next example is an SRA accession that extracts into a pair of paired-end FASTQ files. They can be passed to archs4py like this:

```python
import archs4py as a4

# the sample is paired-end and will result in two files (SRR15972519_1.fastq, SRR15972519_2.fastq)
a4.align.load(["SRR15972519"], "data/example_2")

result = a4.align.fastq("mouse", ["data/example_2/SRR15972519_1.fastq", "data/example_2/SRR15972519_2.fastq"], return_type="transcript")

```

### Align FASTQ files from folder

Align all FASTQ files in a folder using the function `a4.align.folder()`. archs4py will automatically match FASTQ files belonging to the same sample if the data is paired-end.

```python

import archs4py as a4

a4.align.load(["SRR15972519", "SRR15972520", "SRR15972521"], "data/example_3")

result = a4.align.folder("mouse", "data/example_3", return_type="gene", identifier="symbol")

```

## List versions

ARCHS4 offers several data versions for download. The default setting is recommended; it downloads the latest data release.

```python
import archs4py as a4

print(a4.versions())

```

# Citation

When using ARCHS4 please cite the following reference:

Lachmann, Alexander, Denis Torre, Alexandra B. Keenan, Kathleen M. Jagodnik, Hoyjin J. Lee, Lily Wang, Moshe C. Silverstein, and Avi Ma’ayan. "Massive mining of publicly available RNA-seq data from human and mouse." Nature Communications 9, no. 1 (2018): 1366.
https://www.nature.com/articles/s41467-018-03751-6
--------------------------------------------------------------------------------
/archs4py/__init__.py:
--------------------------------------------------------------------------------
import archs4py.data
import archs4py.download
import archs4py.meta
import archs4py.utils
import archs4py.align

import importlib
importlib.reload(archs4py.data)
importlib.reload(archs4py.download)
importlib.reload(archs4py.meta)
importlib.reload(archs4py.utils)
importlib.reload(archs4py.align)

from archs4py.utils import versions
from archs4py.utils import normalize
from archs4py.utils import ls

__version__="0.2.18"
--------------------------------------------------------------------------------
/archs4py/align.py:
--------------------------------------------------------------------------------
import xalign
import archs4py
import biomart
import numpy as np
import pandas as pd

conf = archs4py.utils.get_config()
gene_mapping = {}
gene_mapping["homo_sapiens"] = None
gene_mapping["mus_musculus"] = None

def fastq(species, fastq, release="latest", t=8, overwrite=False, return_type="transcript", identifier="symbol"):
    if species == "mouse":
        species = "mus_musculus"
    elif species == "human":
        species = "homo_sapiens"
    result = xalign.align_fastq(species, fastq, release=conf["ALIGNMENT"][str(release)]["release"], t=t, noncoding=True, overwrite=overwrite)
    result.set_index("transcript", inplace=True)
    result.index = [x.split(".")[0] for x in result.index]
    if return_type == "gene":
        return aggregate(result.loc[:,"reads"], species, release, identifier)
    elif return_type == "tpm":
        return result.loc[:,"tpm"]
    else:
        return result.loc[:,"reads"]

def folder(species, folder, return_type="transcript", release="latest", overwrite=False, t=8, identifier="symbol"):
    if species == "mouse":
        species = "mus_musculus"
    elif species == "human":
        species = "homo_sapiens"

    gene_count, transcript_count = xalign.align_folder(species, folder, release=conf["ALIGNMENT"][str(release)]["release"], t=t, noncoding=True, overwrite=overwrite)
    del gene_count
    if return_type == "transcript":
        return transcript_count
    else:
        return aggregate(transcript_count, species, release, identifier)

def aggregate(transcript_count, species, release, identifier):
    if gene_mapping[species] is None:
        gene_mapping[species] = get_ensembl_mappings(species, release)
    trans = transcript_count.copy()
    trans.index = [x.split(".")[0] for x in transcript_count.index]
    trans.index = gene_mapping[species].loc[trans.index, "ensembl_gene"]
    trans = trans.groupby(trans.index).sum().astype(np.uint64)
    if identifier == "symbol":
        gm = gene_mapping[species].copy()
        gm.index = gm.loc[:, "ensembl_gene"]
        gm = gm[~gm.index.duplicated(keep='first')]
        trans.index = gm.loc[trans.index, "symbol"]
    return trans

def get_ensembl_mappings(species, release):
    server = biomart.BiomartServer(conf["ALIGNMENT"][str(release)]["biomart"])
    if species == "mus_musculus":
        mart = server.datasets['mmusculus_gene_ensembl']
        attributes = ['ensembl_transcript_id', 'mgi_symbol', 'ensembl_gene_id', 'gene_biotype']
    else:
        mart = server.datasets['hsapiens_gene_ensembl']
        attributes = ['ensembl_transcript_id', 'hgnc_symbol', 'ensembl_gene_id', 'gene_biotype']
    # Get the mapping between the attributes
    response = mart.search({'attributes': attributes})
    data = response.raw.data.decode('ascii')
    ensembl_ids = []
    # Store the rows in a list
    for line in data.splitlines():
        line = line.split('\t')
        ensembl_ids.append(line)
    gene_map = pd.DataFrame(ensembl_ids)
    gene_map.index = gene_map.iloc[:,0]
    nn = np.where(gene_map.iloc[:,1] == "")[0]
    gene_map.iloc[nn, 1] = gene_map.iloc[nn, 2]
    gene_map.columns = ["ensembl_transcript", "symbol", "ensembl_gene", "biotype"]
    gene_map = gene_map[~gene_map.index.duplicated(keep='first')]
    return gene_map

def load(sras, outfolder):
    xalign.sra.load_sras(sras, outfolder)
    print("download complete")
--------------------------------------------------------------------------------
/archs4py/data.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

import h5py as h5
import s3fs
import tqdm
import re

import multiprocessing
import random

def resolve_url(url):
    u1 = url.rsplit('/', 1)
    u2 = u1[0].rsplit('/', 1)
    file_name = u1[-1]
    bucket_name = u2[-1]
    endpoint = u2[0]
    S3_URL = "s3://"+bucket_name+"/"+file_name
    return(S3_URL, endpoint)

def fetch_meta_remote(field, s3_url, endpoint):
    s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': endpoint})
    with h5.File(s3.open(s3_url, 'rb'), 'r', lib_version='latest') as f:
        meta = [x.decode("UTF-8") for x in list(np.array(f[field]))]
    return np.array(meta)

def meta(file, search_term, meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"], remove_sc=False, silent=False):
    """
    Search for samples in a file based on a search term in specified metadata fields.

    Args:
        file (str): The file path or object containing the data.
        search_term (str): The term to search for. The search is case-insensitive and supports regular expressions.
        meta_fields (list, optional): The list of metadata fields to search within.
            Defaults to ["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"].
        remove_sc (bool, optional): Whether to filter single-cell samples from the results.
            Defaults to False.
        silent (bool, optional): If True, disables the progress bar.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the gene expression data for the matching samples.
    """
    #search_term = re.sub(r"_|-|'|/| |\.", "", search_term.upper())
    if not silent:
        print("Searches for any occurrence of", search_term, "as regular expression")
    if file.startswith("http"):
        return meta_remote(file, search_term, meta_fields, remove_sc, silent)
    else:
        return meta_local(file, search_term, meta_fields, remove_sc, silent)

def meta_local(file, search_term, meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"], remove_sc=False, silent=False):
    f = h5.File(file, "r")
    idx = []
    for field in meta_fields:
        if field in f["meta"]["samples"].keys():
            meta = [x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"][field]))]
            #idx.extend([i for i, item in enumerate(meta) if re.search(search_term, re.sub(r"_|-|'|/| |\.", "", item.upper()))])
            idx.extend([i for i, item in enumerate(meta) if re.search(search_term, item, re.IGNORECASE)])
    if remove_sc:
        singleprob = np.where(np.array(f["meta/samples/singlecellprobability"]) < 0.5)[0]
    f.close()
    if remove_sc:
        idx = sorted(list(set(idx).intersection(set(singleprob))))
    else:
        idx = sorted(list(set(idx)))
    counts = index(file, idx, silent=silent)
    return counts

def meta_remote(url, search_term, meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"], remove_sc=False, silent=False):
    s3_url, endpoint = resolve_url(url)
    idx = []
    s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': endpoint})
    with h5.File(s3.open(s3_url, 'rb'), 'r', lib_version='latest') as f:
        for field in meta_fields:
            if field in f["meta"]["samples"].keys():
                meta = [x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"][field]))]
                #idx.extend([i for i, item in enumerate(meta) if re.search(search_term, re.sub(r"_|-|'|/| |\.", "", item.upper()))])
                idx.extend([i for i, item in enumerate(meta) if re.search(search_term, item, re.IGNORECASE)])
        if remove_sc:
            singleprob = np.where(np.array(f["meta/samples/singlecellprobability"]) < 0.5)[0]
    if remove_sc:
        idx = sorted(list(set(idx).intersection(set(singleprob))))
    else:
        idx = sorted(list(set(idx)))
    counts = index_remote(url, idx, silent=silent)
    return counts

def rand(file, number, seed=1, remove_sc=False, silent=False):
    """
    Randomly select a specified number of samples from a file.

    Args:
        file (str): The file path or object containing the data.
        number (int): The number of samples to select randomly.
        seed (int, optional): The seed value for the random number generator. Defaults to 1.
        remove_sc (bool, optional): Whether to remove single-cell samples from the selection. Defaults to False.
        silent (bool, optional): If True, disables the progress bar.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the randomly selected samples' gene expression data.
    """
    random.seed(seed)
    if file.startswith("http"):
        return rand_remote(file, number, remove_sc, silent)
    else:
        return rand_local(file, number, remove_sc, silent)

def rand_local(file, number, remove_sc, silent=False):
    f = h5.File(file, "r")
    gsm_ids = [x.decode("UTF-8") for x in np.array(f["meta/samples/geo_accession"])]
    if remove_sc:
        singleprob = np.array(f["meta/samples/singlecellprobability"])
    f.close()
    if remove_sc:
        idx = sorted(random.sample(list(np.where(singleprob < 0.5)[0]), number))
    else:
        idx = sorted(random.sample(range(len(gsm_ids)), number))
    return index(file, idx, silent=silent)

def rand_remote(url, number, remove_sc, silent=False):
    s3_url, endpoint = resolve_url(url)
    s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': endpoint})
    with h5.File(s3.open(s3_url, 'rb'), 'r', lib_version='latest') as f:
        number_samples = len(f["meta/samples/geo_accession"])
        if remove_sc:
            singleprob = np.array(f["meta/samples/singlecellprobability"])
    if remove_sc:
        idx = sorted(random.sample(list(np.where(singleprob < 0.5)[0]), number))
    else:
        idx = sorted(random.sample(range(number_samples), number))
    return index_remote(url, idx, silent=silent)

def series(file, series_id, silent=False):
    """
    Retrieve samples belonging to a specific series from a file.

    Args:
        file (str): The file path or object containing the data.
        series_id (str): The ID of the series to retrieve samples from.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the gene expression data for the samples belonging to the specified series.
    """
    if file.startswith("http"):
        return series_remote(file, series_id, silent=silent)
    else:
        return series_local(file, series_id, silent=silent)

def series_local(file, series_id, silent=False):
    f = h5.File(file, "r")
    series = [x.decode("UTF-8") for x in np.array(f["meta/samples/series_id"])]
    f.close()
    idx = [i for i,x in enumerate(series) if x == series_id]
    if len(idx) > 0:
        return index(file, idx, silent=silent)

def series_remote(url, series_id, silent=False):
    s3_url, endpoint = resolve_url(url)
    s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': endpoint})
    with h5.File(s3.open(s3_url, 'rb'), 'r', lib_version='latest') as f:
        series = [x.decode("UTF-8") for x in np.array(f["meta/samples/series_id"])]
    idx = [i for i,x in enumerate(series) if x == series_id]
    if len(idx) > 0:
        return index_remote(url, idx, silent=silent)

def samples(file, sample_ids, silent=False):
    if file.startswith("http"):
        return samples_remote(file, sample_ids, silent=silent)
    else:
        return samples_local(file, sample_ids, silent=silent)

def samples_local(file, sample_ids, silent=False):
    sample_ids = set(sample_ids)
    f = h5.File(file, "r")
    samples = [x.decode("UTF-8") for x in np.array(f["meta/samples/geo_accession"])]
    f.close()
    idx = [i for i,x in enumerate(samples) if x in sample_ids]
    if len(idx) > 0:
        return index(file, idx, silent=silent)

def samples_remote(url, sample_ids, silent=False):
    sample_ids = set(sample_ids)
    s3_url, endpoint = resolve_url(url)
    s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': endpoint})
    with h5.File(s3.open(s3_url, 'rb'), 'r', lib_version='latest') as f:
        # read the accessions into their own variable so the sample_ids argument is not overwritten
        samples = [x.decode("UTF-8") for x in np.array(f["meta/samples/geo_accession"])]
    idx = [i for i,x in enumerate(samples) if x in sample_ids]
    if len(idx) > 0:
        return index_remote(url, idx, silent=silent)

def index(file, sample_idx, gene_idx = [], silent=False):
    """
    Retrieve gene expression data from a specified file for the given sample and gene indices.

    Args:
        file (str): The file path or object containing the data.
        sample_idx (list): A list of sample indices to retrieve expression data for.
        gene_idx (list, optional): A list of gene indices to retrieve expression data for. Defaults to an empty list (return all).
        silent (bool, optional): Whether to disable the progress bar. Defaults to False.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the gene expression data.
    """
    sample_idx = sorted(sample_idx)
    gene_idx = sorted(gene_idx)
    row_encoding = get_encoding(file)
    f = h5.File(file, "r")
    genes = np.array([x.decode("UTF-8") for x in np.array(f[row_encoding])])
    if len(sample_idx) == 0:
        f.close()
        return pd.DataFrame(index=genes[gene_idx])
    gsm_ids = np.array([x.decode("UTF-8") for x in np.array(f["meta/samples/geo_accession"])])[sample_idx]
    f.close()
    if len(gene_idx) == 0:
        gene_idx = list(range(len(genes)))
    exp = []
    PROCESSES = 16
    with multiprocessing.Pool(PROCESSES) as pool:
        results = [pool.apply_async(get_sample, (file, i, gene_idx)) for i in sample_idx]
        for r in tqdm.tqdm(results, disable=silent):
            res = r.get()
            exp.append(res)
    exp = np.array(exp).T
    exp = pd.DataFrame(exp, index=genes[gene_idx], columns=gsm_ids, dtype=np.uint32)
    return exp

def index_remote(url, sample_idx, gene_idx = [], silent=False):
    s3_url, endpoint = resolve_url(url)
    sample_idx = sorted(sample_idx)
    gene_idx = sorted(gene_idx)
    s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': endpoint})
    row_encoding = get_encoding_remote(s3, s3_url)
    genes = fetch_meta_remote(row_encoding, s3_url, endpoint)
    if len(sample_idx) == 0:
        return pd.DataFrame(index=genes[gene_idx])
    if len(gene_idx) == 0:
        gene_idx = np.array(list(range(len(genes))))
    gsm_ids = fetch_meta_remote("meta/samples/geo_accession", s3_url, endpoint)[sample_idx]
    with h5.File(s3.open(s3_url, 'rb'), 'r', lib_version='latest') as f:
        exp = np.array(f["data/expression"][:,np.array(sample_idx)], dtype=np.uint32)[gene_idx]
    exp = pd.DataFrame(exp, index=genes[gene_idx], columns=gsm_ids, dtype=np.uint32)
    return exp

def get_sample(file, i, gene_idx):
    try:
        f = h5.File(file, "r")
        temp = np.array(f["data/expression"][:,i], dtype=np.uint32)[gene_idx]
        f.close()
    except Exception:
        dd = np.array([0]*len(gene_idx))  # return zero counts if the sample cannot be read
        return dd
    return temp

def get_sample_remote(s3_url, endpoint, i, gene_idx):
    try:
        s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': endpoint})
        with h5.File(s3.open(s3_url, 'rb'), 'r', lib_version='latest') as f:
            temp = np.array(f["data/expression"][:,i], dtype=np.uint32)[gene_idx]
            return temp
    except Exception:
        dd = np.array([0]*len(gene_idx))  # return zero counts if the sample cannot be read
        return dd

def get_encoding(file):
    with h5.File(file, "r") as f:
        if "genes" in list(f["meta"].keys()):
list(f["meta"].keys()): 266 | if "gene_symbol" in list(f["meta/genes"].keys()): 267 | return "meta/genes/gene_symbol" 268 | elif "symbol" in list(f["meta/genes"].keys()): 269 | return "meta/genes/symbol" 270 | elif "transcripts" in list(f["meta"].keys()): 271 | if "ensembl_id" in list(f["meta/transcripts"].keys()): 272 | return "meta/transcripts/ensembl_id" 273 | else: 274 | raise Exception("error in gene/transcript meta data") 275 | 276 | def get_encoding_remote(s3, s3_url): 277 | with h5.File(s3.open(s3_url, 'rb'), 'r', lib_version='latest') as f: 278 | if "genes" in list(f["meta"].keys()): 279 | if "gene_symbol" in list(f["meta/genes"].keys()): 280 | return "meta/genes/gene_symbol" 281 | elif "symbol" in list(f["meta/genes"].keys()): 282 | return "meta/gene_symbol" 283 | elif "transcripts" in list(f["meta"].keys()): 284 | if "ensembl_id" in list(f["meta/transcripts"].keys()): 285 | return "meta/transcripts/ensembl_id" 286 | else: 287 | raise Exception("error in gene/transcript meta data") -------------------------------------------------------------------------------- /archs4py/data/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "GENE_COUNTS":{ 3 | "HUMAN": { 4 | "latest": { 5 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/human_gene_v2.latest.h5", 6 | "fallback": "https://s3.amazonaws.com/mssm-data/human_gene_v2.latest.h5" 7 | }, 8 | "2.5": { 9 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/human_gene_v2.5.h5", 10 | "fallback": "https://s3.amazonaws.com/mssm-data/human_gene_v2.5.h5" 11 | }, 12 | "2.4": { 13 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/human_gene_v2.4.h5", 14 | "fallback": "https://s3.amazonaws.com/mssm-data/human_gene_v2.4.h5" 15 | }, 16 | "2.3": { 17 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/human_gene_v2.3.h5", 18 | "fallback": "https://s3.amazonaws.com/mssm-data/human_gene_v2.3.h5" 19 | }, 20 | "2.2": { 21 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/human_gene_v2.2.h5", 22 | "fallback": "https://s3.amazonaws.com/mssm-data/human_gene_v2.2.h5" 23 | }, 24 | "2.1.2": { 25 | "primary": "https://s3.dev.maayanlab.cloud/archs4/archs4_gene_human_v2.1.2.h5", 26 | "falback": "https://s3.amazonaws.com/mssm-data/archs4_gene_human_v2.1.2.h5" 27 | }, 28 | "1.11": { 29 | "primary": "https://s3.amazonaws.com/mssm-seq-matrix/human_matrix_v11.h5", 30 | "fallback":"https://s3.amazonaws.com/mssm-seq-matrix/human_matrix_v11.h5" 31 | } 32 | }, 33 | "MOUSE": { 34 | "latest": { 35 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/mouse_gene_v2.latest.h5", 36 | "fallback": "https://s3.amazonaws.com/mssm-data/mouse_gene_v2.latest.h5" 37 | }, 38 | "2.5": { 39 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/mouse_gene_v2.5.h5", 40 | "fallback": "https://s3.amazonaws.com/mssm-data/mouse_gene_v2.5.h5" 41 | }, 42 | "2.4": { 43 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/mouse_gene_v2.4.h5", 44 | "fallback": "https://s3.amazonaws.com/mssm-data/mouse_gene_v2.4.h5" 45 | }, 46 | "2.3": { 47 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/mouse_gene_v2.3.h5", 48 | "fallback": "https://s3.amazonaws.com/mssm-data/mouse_gene_v2.3.h5" 49 | }, 50 | "2.2": { 51 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/mouse_gene_v2.2.h5", 52 | "fallback": "https://s3.amazonaws.com/mssm-data/mouse_gene_v2.2.h5" 53 | }, 54 | "2.1.2": { 55 | "primary": "https://s3.dev.maayanlab.cloud/archs4/archs4_gene_mouse_v2.1.2.h5", 56 | "fallback": 
"https://s3.amazonaws.com/mssm-data/archs4_gene_mouse_v2.1.2.h5" 57 | }, 58 | "1.11": { 59 | "primary": "https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix_v11.h5", 60 | "fallback":"https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix_v11.h5" 61 | } 62 | } 63 | }, 64 | "TRANSCRIPT_COUNTS": { 65 | "HUMAN": { 66 | "latest": { 67 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/human_gene_v2.2.h5", 68 | "fallback": "https://s3.amazonaws.com/mssm-data/human_gene_v2.2.h5" 69 | }, 70 | "2.2": { 71 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/human_gene_v2.2.h5", 72 | "fallback": "https://s3.amazonaws.com/mssm-data/human_gene_v2.2.h5" 73 | } 74 | }, 75 | "MOUSE": { 76 | "latest": { 77 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/mouse_transcript_v2.2.h5", 78 | "fallback": "https://s3.amazonaws.com/mssm-data/mouse_transcript_v2.2.h5" 79 | }, 80 | "2.2": { 81 | "primary": "https://s3.dev.maayanlab.cloud/archs4/files/mouse_transcript_v2.2.h5", 82 | "fallback": "https://s3.amazonaws.com/mssm-data/mouse_transcript_v2.2.h5" 83 | } 84 | } 85 | }, 86 | "ALIGNMENT": { 87 | "latest":{ 88 | "release": 107, 89 | "biomart": "http://jul2022.archive.ensembl.org/biomart" 90 | }, 91 | "2.2":{ 92 | "release": 107, 93 | "biomart": "http://jul2022.archive.ensembl.org/biomart" 94 | } 95 | }, 96 | "DOWNLOAD_URL": "https://maayanlab.cloud/archs4/search/downloadcounter.php" 97 | } -------------------------------------------------------------------------------- /archs4py/download.py: -------------------------------------------------------------------------------- 1 | import wget 2 | import sys 3 | import requests 4 | import archs4py.utils 5 | import requests 6 | import os 7 | 8 | def bar_progress(current, total, width=80, update_interval=10): 9 | current_gb = current / (1024**3) # Convert current bytes to GB 10 | total_gb = total / (1024**3) # Convert total bytes to GB 11 | 12 | if current % (update_interval * 1024**2) == 0: # Update progress every 10 MB 13 | progress_message = "Downloading: %d%% [%.2f GB / %.2f GB]" % (current / total * 100, current_gb, total_gb) 14 | sys.stdout.write("\r" + progress_message) 15 | sys.stdout.flush() 16 | 17 | def counts(species, path="", type="GENE_COUNTS", version="latest"): 18 | """ 19 | Download count files for a given species and count type. 20 | 21 | Args: 22 | species (str): The species for which count files are being downloaded. ["human", "mouse"] 23 | path (str, optional): The path where the downloaded file will be saved. Defaults to "". 24 | type (str, optional): The type of count file to be downloaded. Defaults to "GENE_COUNTS". 25 | version (str, optional): The version of the count file to be downloaded. Defaults to "latest". Versions can be listed with archs4py.versions() 26 | 27 | Returns: 28 | str: The path where the count file is downloaded. 29 | 30 | Raises: 31 | Exception: If an error occurs during the download process. 32 | 33 | Notes: 34 | The function first tries to download the count file using the primary URL specified in the configuration file. 35 | If the download fails, it falls back to the fallback URL specified in the configuration file. 36 | 37 | Supported count types: 38 | - GENE_COUNTS: Gene-level count files. 39 | - TRANSCRIPT_COUNTS: Transcript-level count files. 
40 | """ 41 | conf = archs4py.utils.get_config() 42 | 43 | try: 44 | file_name = os.path.basename(conf[type][species.upper()][version]["primary"]) 45 | download_url = conf["DOWNLOAD_URL"] 46 | url = f"{download_url}?&file={file_name}&version=1337" 47 | response = requests.get(url) 48 | except: 49 | x = "just continue" 50 | 51 | try: 52 | fpath = wget.download(conf[type][species.upper()][version]["primary"], out=path, bar=bar_progress) 53 | print("file downloaded to", fpath) 54 | except Exception: 55 | fpath = wget.download(conf[type][species.upper()][version]["fallback"], out=path, bar=bar_progress) 56 | print("file downloaded to", fpath) 57 | 58 | -------------------------------------------------------------------------------- /archs4py/meta.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import Counter 4 | 5 | import h5py as h5 6 | import re 7 | import numpy as np 8 | import pandas as pd 9 | import tqdm 10 | 11 | def meta(file, search_term, meta_fields=["characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"], remove_sc=False, silent=False): 12 | """ 13 | Search for samples in a file based on a search term in specified metadata fields. 14 | 15 | Args: 16 | file (str): The file path or object containing the data. 17 | search_term (str): The term to search for. Case-insensitive. 18 | meta_fields (list, optional): The list of metadata fields to search within. 19 | Defaults to ["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"]. 20 | remove_sc (bool, optional): Whether to remove single-cell samples from the results. 21 | Defaults to False. 22 | silent (bool, optional): Print progress bar. 23 | 24 | Returns: 25 | pd.DataFrame: DataFrame containing the extracted metadata, with metadata fields as columns and samples as rows. 
26 | """ 27 | #search_term = search_term.upper() 28 | with h5.File(file, "r") as f: 29 | meta = [] 30 | idx = [] 31 | mfields = [] 32 | for field in tqdm.tqdm(meta_fields, disable=not silent): 33 | if field in f["meta"]["samples"].keys(): 34 | try: 35 | meta.append([x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"][field]))]) 36 | mfields.append(field) 37 | except Exception: 38 | x=0 39 | meta = pd.DataFrame(meta, index=mfields ,columns=[x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"]["geo_accession"]))]) 40 | for i in tqdm.tqdm(range(meta.shape[0]), disable=silent): 41 | idx.extend([i for i, item in enumerate(meta.iloc[i,:]) if re.search(search_term, item, re.IGNORECASE)]) 42 | if remove_sc: 43 | singleprob = np.where(np.array(f["meta/samples/singlecellprobability"]) < 0.5)[0] 44 | idx = sorted(list(set(idx).intersection(set(singleprob)))) 45 | else: 46 | idx = sorted(list(set(idx))) 47 | return meta.iloc[:,idx].T 48 | 49 | def field(file, field): 50 | gene_meta = [] 51 | transcript_meta = [] 52 | with h5.File(file, 'r') as f: 53 | sample_meta = list(f["meta/samples"]) 54 | try: 55 | with h5.File(file, 'r') as f: 56 | gene_meta = list(f["meta/genes"]) 57 | except Exception: 58 | x = 0 59 | try: 60 | with h5.File(file, 'r') as f: 61 | transcript_meta = list(f["meta/transcripts"]) 62 | except Exception: 63 | x = 0 64 | with h5.File(file, 'r') as f: 65 | if field in sample_meta: 66 | try: 67 | return [x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"][field]))] 68 | except Exception: 69 | return list(np.array(f["meta"]["samples"][field])) 70 | elif field in gene_meta: 71 | return [x.decode("UTF-8") for x in list(np.array(f["meta"]["genes"][field]))] 72 | elif field in transcript_meta: 73 | return [x.decode("UTF-8") for x in list(np.array(f["meta"]["transcripts"][field]))] 74 | else: 75 | raise("specified field does not exist. Choose from supported sample meta fields or gene meta fields. List fields ysing archs4py.ls(filename) function") 76 | 77 | def samples(file, samples, meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"], silent=False): 78 | """ 79 | Extracts metadata for specified samples from an HDF5 file. 80 | 81 | Args: 82 | file (str): Path to the HDF5 file. 83 | samples (list): List of samples to extract metadata for. 84 | meta_fields (list, optional): List of metadata fields to extract. Defaults to ["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"]. 85 | silent (bool, optional): If True, disables the progress bar. Defaults to False. 86 | 87 | Returns: 88 | pandas.DataFrame: DataFrame containing the extracted metadata, with metadata fields as columns and samples as rows. 
89 | """ 90 | samples = set(samples) 91 | with h5.File(file, "r") as f: 92 | meta = [] 93 | mfields = [] 94 | meta_samples = np.array([x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"]["geo_accession"]))]) 95 | idx = [i for i,x in enumerate(meta_samples) if x in samples] 96 | for field in tqdm.tqdm(meta_fields, disable=not silent): 97 | if field in f["meta"]["samples"].keys(): 98 | try: 99 | meta.append([x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"][field][idx]))]) 100 | mfields.append(field) 101 | except Exception: 102 | meta.append(list(np.array(f["meta"]["samples"][field][idx]))) 103 | mfields.append(field) 104 | meta = pd.DataFrame(meta, index=mfields ,columns=[x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"]["geo_accession"][idx]))]) 105 | inter = meta.columns.intersection(set(samples)) 106 | return meta.loc[:,inter].T 107 | 108 | def series(file, series, meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"], silent=False): 109 | """ 110 | Extracts metadata for specified series from an HDF5 file. 111 | 112 | Args: 113 | file (str): Path to the HDF5 file. 114 | series: Series to extract metadata for. 115 | meta_fields (list, optional): List of metadata fields to extract. Defaults to ["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"]. 116 | silent (bool, optional): If True, disables the progress bar. Defaults to False. 117 | 118 | Returns: 119 | pandas.DataFrame: DataFrame containing the extracted metadata, with metadata fields as columns and samples as rows. 120 | """ 121 | with h5.File(file, "r") as f: 122 | meta = [] 123 | mfields = [] 124 | meta_series = np.array([x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"]["series_id"]))]) 125 | idx = [i for i,x in enumerate(meta_series) if x == series] 126 | for field in tqdm.tqdm(meta_fields, disable=not silent): 127 | if field in f["meta"]["samples"].keys(): 128 | try: 129 | meta.append([x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"][field][idx]))]) 130 | mfields.append(field) 131 | except Exception: 132 | meta.append(list(np.array(f["meta"]["samples"][field][idx]))) 133 | mfields.append(field) 134 | meta = pd.DataFrame(meta, index=mfields ,columns=[x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"]["geo_accession"][idx]))]) 135 | return meta.T 136 | 137 | def get_meta(file): 138 | with h5.File(file, "r") as f: 139 | meta = {} 140 | for field in list(f["meta"]["samples"].keys()): 141 | try: 142 | meta[field] = [x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"][field]))] 143 | except Exception: 144 | meta[field] = list(np.array(f["meta"]["samples"][field])) 145 | f.close() 146 | return meta 147 | 148 | def get_meta_sample_field(file, field): 149 | with h5.File(file, "r") as f: 150 | try: 151 | meta_data = [x.decode("UTF-8") for x in list(np.array(f["meta"]["samples"][field]))] 152 | except Exception: 153 | meta_data = list(np.array(f["meta"]["samples"][field])) 154 | return meta_data 155 | 156 | def get_meta_gene_field(file, field): 157 | with h5.File(file, "r") as f: 158 | meta_data = [x.decode("UTF-8") for x in list(np.array(f["meta"]["genes"][field]))] 159 | return meta_data 160 | -------------------------------------------------------------------------------- /archs4py/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import h5py as h5 4 | 
import random

import os
import json

import qnorm

def get_config():
    config_url = os.path.join(
        os.path.dirname(__file__),
        'data/config.json')
    with open(config_url) as json_file:
        data = json.load(json_file)
    return data

def versions():
    """
    Get the available versions of the gene count files.

    Returns:
        list: A list of available version identifiers.
    """
    config = get_config()
    versions = config["GENE_COUNTS"]["HUMAN"].keys()
    return list(versions)

def normalize(counts, method="log_quantile", tmm_outlier=0.05):
    """
    Normalize the count matrix using a specified method.

    Args:
        counts (pd.DataFrame): A pandas DataFrame representing the count matrix.
        method (str, optional): The normalization method to be applied. Default is "log_quantile".
            - "quantile": Perform quantile normalization on the counts.
            - "log_quantile": Perform quantile normalization on the log-transformed counts.
            - "cpm": Perform counts per million (CPM) normalization.
            - "tmm": Perform trimmed mean normalization (tmm_outlier sets the trimmed fraction, default 0.05).

    Returns:
        pd.DataFrame: A normalized count matrix as a pandas DataFrame with the same index and columns as the input.

    Raises:
        ValueError: If an unsupported normalization method is provided.
    """
    norm_exp = 0
    if method == "quantile":
        norm_exp = qnorm.quantile_normalize(np.array(counts))
    elif method == "log_quantile":
        norm_exp = qnorm.quantile_normalize(np.log2(1+np.array(counts)))
    elif method == "cpm":
        norm_exp = cpm_normalization(counts)
    elif method == "tmm":
        norm_exp = tmm_norm(counts, tmm_outlier)
    else:
        raise ValueError("Unsupported normalization method: " + method)
    norm_exp = pd.DataFrame(norm_exp, index=counts.index, columns=counts.columns, dtype=np.float32)
    return norm_exp

def tmm_norm(exp, percentage=0.05):
    lexp = np.log2(1+exp).astype(np.float32)
    tmm = trimmed_mean(lexp, percentage)
    nf = pd.DataFrame(np.tile(tmm, (exp.shape[0], 1)), index=lexp.index, columns=lexp.columns)
    temp = (lexp/nf)
    return temp

def trimmed_mean(matrix, percentage):
    matrix = np.array(matrix)
    trimmed_means = []
    for col in range(matrix.shape[1]):
        data = matrix[:, col].copy()
        data = data[data > 0]
        n_trim = int(len(data) * percentage)
        sorted_values = np.sort(data)
        trimmed_values = sorted_values[n_trim:len(sorted_values)-n_trim]  # robust when n_trim is 0
        trimmed_mean = np.mean(trimmed_values)
        trimmed_means.append(trimmed_mean)
    return trimmed_means

def cpm_normalization(df):
    sample_sum = df.sum(axis=0)
    scaling_factor = sample_sum / 1e6
    normalized_df = df / scaling_factor
    return normalized_df

def ls(file):
    """
    List all meta data groups and meta data fields in the specified H5 file.

    Args:
        file: H5 input file
    """
    def print_data(name, obj, prefix):
        if isinstance(obj, h5.Dataset):
            data_type = str(obj.dtype)
            if data_type == "object":
                data_type = "str"
            shape = obj.shape
            print("{}{:<20} {:<6} | {}".format(prefix, name, data_type, shape))
        else:
            print("{}{:<26}".format(prefix, name))

        for key, val in obj.attrs.items():
            print("{} {:<11} : {}".format(prefix, key, val))

    with h5.File(file, 'r') as f:
        for name in f:
            print_data(name, f[name], "")
            if isinstance(f[name], h5.Group):
                for sub_name in f[name]:
                    print_data(sub_name, f[name][sub_name], "│ ")
                    if isinstance(f[name][sub_name], h5.Group):
                        for sub_sub_name in f[name][sub_name]:
                            print_data(sub_sub_name, f[name][sub_name][sub_sub_name], "│ ")

def aggregate_duplicate_genes(exp):
    return exp.groupby(exp.index).sum()

def filter_genes(exp, readThreshold: int=20, sampleThreshold: float=0.02, deterministic: bool=True, aggregate=True):
    '''
    Returns filtered genes with sufficient read support
    Parameters:
        exp (pd.DataFrame): expression count matrix with genes as rows and samples as columns
        readThreshold (int): minimum number of reads required for a gene to be counted as expressed in a sample
        sampleThreshold (float): fraction of samples required with a read count larger than readThreshold
        deterministic (bool): seed the random number generator for reproducibility
        aggregate (bool): sum counts of duplicate gene symbols before filtering

    Returns:
        (pd.DataFrame): expression matrix containing only the genes passing the criteria
    '''
    if deterministic:
        random.seed(42)

    if aggregate:
        exp = aggregate_duplicate_genes(exp)

    kk = exp[exp > readThreshold].count(axis=1)
    ii = [idx for idx, val in enumerate(kk) if val >= exp.shape[1]*sampleThreshold]
    return exp.iloc[ii,:]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
h5py
numpy
pandas
qnorm
setuptools
tqdm
wget
boto3
s3fs
xalign
biomart
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="archs4py",
    version="1.0.1",
    author="Alexander Lachmann",
    author_email="alexander.lachmann@mssm.edu",
    description="ARCHS4 Python package supporting data loading and data queries.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/maayanlab/archs4py",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    package_data={
        "archs4py": ["data/*"]
    },
    include_package_data=True,
    install_requires=[
        'h5py',
        'numpy',
        'pandas',
        'qnorm',
        'setuptools',
        'tqdm',
        'wget',
        's3fs',
        'biomart',
        'xalign'
    ],
    python_requires='>=3.7',
)
--------------------------------------------------------------------------------