├── .coveragerc ├── .editorconfig ├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── c4gt_community.yml │ └── feature_request.md ├── c4gt_community.yml ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ ├── publish.yml │ ├── pull_request.yml │ └── push.yml ├── .gitignore ├── AUTHORS.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docs ├── Makefile ├── _static │ ├── demo.svg │ ├── e1.png │ ├── e2.png │ ├── e3.png │ ├── pysradb_v3.png │ └── pysradb_v3.svg ├── authors.md ├── case_studies.md ├── cmdline.md ├── commands.md ├── conf.py ├── contributing.md ├── history.md ├── index.rst ├── installation.md ├── make.bat ├── modules.md ├── modules.rst ├── pysradb.rst ├── python-api-usage.md └── quickstart.md ├── notebooks ├── 01.Python-API_demo.ipynb ├── 02.Commandline_download.ipynb ├── 03.ParallelDownload.ipynb ├── 04.SRA_to_fastq_conda.ipynb ├── 05.Downloading_subsets_of_a_project.ipynb ├── 06.Download_BAMs.ipynb ├── 07.Multiple_SRPs.ipynb ├── 08.pysradb_ascp_multithreaded.ipynb ├── 09.Query_Search.ipynb └── README.md ├── pyproject.toml ├── pysradb ├── __init__.py ├── basedb.py ├── cli.py ├── download.py ├── exceptions.py ├── filter_attrs.py ├── geodb.py ├── geoweb.py ├── search.py ├── sradb.py ├── sraweb.py ├── taxid2name.py └── utils.py ├── requirements.txt ├── setup.cfg └── tests ├── _test_geodb.py ├── _test_pcli.py ├── _test_sradb.py ├── conftest.py ├── data └── test_search │ ├── ena_search_test1.txt │ ├── ena_test_verbosity_0.csv │ ├── ena_test_verbosity_0.json │ ├── ena_test_verbosity_1.csv │ ├── ena_test_verbosity_1.json │ ├── ena_test_verbosity_2.csv │ ├── ena_test_verbosity_2.json │ ├── ena_test_verbosity_3.csv │ ├── ena_test_verbosity_3.json │ ├── geo_search_test1.txt │ ├── sra_search_test1.txt │ ├── sra_test.xml │ ├── sra_test_2_verbosity_0.csv │ ├── sra_test_2_verbosity_1.csv │ ├── sra_test_2_verbosity_2.csv │ ├── sra_test_2_verbosity_3.csv │ ├── sra_test_ERS3331676.xml │ ├── sra_test_verbosity_0.csv │ ├── sra_test_verbosity_1.csv │ ├── sra_test_verbosity_2.csv │ ├── sra_test_verbosity_3.csv │ └── sra_uids.txt ├── test_geoweb.py ├── test_search.py ├── test_sradb.py ├── test_sraweb.py └── test_utils.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | pysradb/filter_attrs.py 4 | pysradb/geodb.py 5 | pysradb/sradb.py 6 | pysradb/taxid2name.py 7 | pysradb/utils.py 8 | 9 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.rst linguist-documentation 2 | *.html linguist-documentation 3 | *.ipynb linguist-language=python 4 | 5 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [saketkc] 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * pysradb version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 15 | ``` 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | `pysradb SRPxxx` 16 | 17 | 18 | **Desktop (please complete the following information):** 19 | - OS: [e.g. Ubuntu 20.04] 20 | - Python version [e.g. 3.8] 21 | 22 | **Additional context** 23 | Add any other context about the problem here. 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/c4gt_community.yml: -------------------------------------------------------------------------------- 1 | name: C4GT Community Template 2 | description: Create a new Ticket for C4GT Community 3 | title: "[C4GT Community]: " 4 | labels: ["Please add the C4GT Community Label on all tickets that you will list. In addition to this, please add whichever labels best describe your project from this list: C4GT Community, C4GT Coding, C4GT Design, C4GT Mentorship, C4GT Bounty, C4GT Advisory"] 5 | body: 6 | - type: textarea 7 | id: ticket-description 8 | validations: 9 | required: true 10 | attributes: 11 | label: Ticket Contents 12 | value: | 13 | ## Description 14 | [Provide a brief project description, outlining the need and measurable goals of the feature to be developed. Kindly, specify the number of users or scale of the product and feature, articulating its anticipated impact and intended use. Kindly also add the bounty amount you are willing to pay in case of this being a bounty ticket. For Design, Mentorship & Advisory tickets, we request you to make a copy of this google form (https://docs.google.com/forms/d/18EPdoqBUFS6lRoQQbRKKjTvzA711nNTdmU8jinKTXWs/edit) in your own drive and attach a link to it in the issue ticket for contributors to fill in. This way you will be able to see all the applications that come in and assign it to the selected contributor.] 15 | 16 | - type: textarea 17 | id: ticket-goals 18 | validations: 19 | required: true 20 | attributes: 21 | label: Goals 22 | description: List the goals of the feature. 23 | value: | 24 | ## Goals 25 | - [ ] [Goal 1] 26 | - [ ] [Goal 2] 27 | - [ ] [Goal 3] 28 | - [ ] [Goal 4] 29 | - [ ] [Goal 5] 30 | [Kindly, state and link installation guide/ set-up steps (if any) in this section] 31 | 32 | - type: textarea 33 | id: ticket-expected-outcome 34 | validations: 35 | required: true 36 | attributes: 37 | label: Expected Outcome 38 | description: Describe in detail what the final product or result should look like and how it should behave. 39 | 40 | - type: textarea 41 | id: ticket-acceptance-criteria 42 | attributes: 43 | label: Acceptance Criteria 44 | description: List the acceptance criteria for this feature. 45 | 46 | - type: textarea 47 | id: ticket-implementation-details 48 | validations: 49 | required: true 50 | attributes: 51 | label: Implementation Details 52 | description: List any technical details about the proposed implementation, including any specific technologies that will be used. 53 | 54 | - type: textarea 55 | id: ticket-mockups 56 | attributes: 57 | label: Mockups/Wireframes 58 | description: Include links to any visual aids, mockups, wireframes, or diagrams that help illustrate what the final product should look like. This is not always necessary, but can be very helpful in many cases. 59 | 60 | - type: input 61 | id: ticket-product 62 | attributes: 63 | label: Product Name 64 | placeholder: Enter Product Name 65 | validations: 66 | required: true 67 | 68 | - type: dropdown 69 | id: ticket-organisation 70 | attributes: 71 | label: Organisation Name 72 | description: Enter Organisation Name 73 | multiple: false 74 | options: 75 | - C4GT 76 | - Dhiway 77 | - FIDE 78 | - SamagraX 79 | - ShikshaLokam 80 | - Tech4Dev 81 | - Tibil 82 | validations: 83 | required: true 84 | 85 | - type: input 86 | id: ticket-governance-domain 87 | attributes: 88 | label: Domain 89 | placeholder: Enter Area of Governance 90 | 91 | - type: dropdown 92 | id: ticket-technical-skills-required 93 | attributes: 94 | label: Tech Skills Needed 95 | description: Select the technologies needed for this ticket (use Ctrl or Command to select multiple) 96 | multiple: true 97 | options: 98 | - .NET 99 | - Agile 100 | - Angular 101 | - Artificial Intelligence 102 | - ASP.NET 103 | - AWS 104 | - Babel 105 | - Bootstrap 106 | - C# 107 | - Chart.js 108 | - CI/CD 109 | - Computer Vision 110 | - CORS 111 | - cURL 112 | - Cypress 113 | - D3.js 114 | - Database 115 | - Debugging 116 | - Design 117 | - DevOps 118 | - Django 119 | - Docker 120 | - Electron 121 | - ESLint 122 | - Express.js 123 | - Feature 124 | - Flask 125 | - Go 126 | - GraphQL 127 | - HTML 128 | - Ionic 129 | - Jest 130 | - Java 131 | - JavaScript 132 | - Jenkins 133 | - JWT 134 | - Kubernetes 135 | - Laravel 136 | - Machine Learning 137 | - Maintenance 138 | - Markdown 139 | - Material-UI 140 | - Microservices 141 | - MongoDB 142 | - Mobile 143 | - Mockups 144 | - Mocha 145 | - Natural Language Processing 146 | - NestJS 147 | - Node.js 148 | - NUnit 149 | - OAuth 150 | - Performance Improvement 151 | - Prettier 152 | - Python 153 | - Question 154 | - React 155 | - React Native 156 | - Redux 157 | - RESTful APIs 158 | - Ruby 159 | - Ruby on Rails 160 | - Rust 161 | - Scala 162 | - Security 163 | - Selenium 164 | - SEO 165 | - Serverless 166 | - Solidity 167 | - Spring Boot 168 | - SQL 169 | - Swagger 170 | - Tailwind CSS 171 | - Test 172 | - Testing Library 173 | - Three.js 174 | - TypeScript 175 | - UI/UX/Design 176 | - Virtual Reality 177 | - Vue.js 178 | - WebSockets 179 | - Webpack 180 | - Other 181 | validations: 182 | required: true 183 | 184 | - type: textarea 185 | id: ticketorg-mentors 186 | attributes: 187 | label: Organizational Mentor 188 | description: Please tag relevant mentors for the ticket 189 | validations: 190 | required: true 191 | 192 | - type: textarea 193 | id: ticketangel-mentors 194 | attributes: 195 | label: Angel Mentor 196 | description: Please tag relevant mentors for the ticket 197 | validations: 198 | required: false 199 | 200 | - type: dropdown 201 | id: ticket-complexity 202 | attributes: 203 | label: Complexity 204 | description: Choose a complexity describing the complexity of your ticket 205 | multiple: false 206 | options: 207 | - Low 208 | - Medium 209 | - High 210 | validations: 211 | required: true 212 | 213 | - type: dropdown 214 | id: ticket-category 215 | attributes: 216 | label: Category 217 | description: Choose the categories that best describe your ticket 218 | multiple: true 219 | options: 220 | - API 221 | - Analytics 222 | - Accessibility 223 | - Backend 224 | - Breaking Change 225 | - Beginner Friendly 226 | - Configuration 227 | - CI/CD 228 | - Database 229 | - Data Science 230 | - Deprecation 231 | - Documentation 232 | - Delpoyment 233 | - Frontend 234 | - Internationalization 235 | - Localization 236 | - Machine Learning 237 | - Maintenance 238 | - Mobile 239 | - Performance Improvement 240 | - Question 241 | - Refactoring 242 | - Research 243 | - Needs Reproduction 244 | - SEO 245 | - Security 246 | - Testing 247 | - Other 248 | validations: 249 | required: true 250 | 251 | 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[ENH]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | -------------------------------------------------------------------------------- /.github/c4gt_community.yml: -------------------------------------------------------------------------------- 1 | name: C4GT Community Template 2 | description: Create a new Ticket for C4GT Community 3 | title: "[C4GT Community]: " 4 | labels: ["Please add the C4GT Community Label on all tickets that you will list. In addition to this, please add whichever labels best describe your project from this list: C4GT Community, C4GT Coding, C4GT Design, C4GT Mentorship, C4GT Bounty, C4GT Advisory"] 5 | body: 6 | - type: textarea 7 | id: ticket-description 8 | validations: 9 | required: true 10 | attributes: 11 | label: Ticket Contents 12 | value: | 13 | ## Description 14 | [Provide a brief project description, outlining the need and measurable goals of the feature to be developed. Kindly, specify the number of users or scale of the product and feature, articulating its anticipated impact and intended use. Kindly also add the bounty amount you are willing to pay in case of this being a bounty ticket. For Design, Mentorship & Advisory tickets, we request you to make a copy of this google form (https://docs.google.com/forms/d/18EPdoqBUFS6lRoQQbRKKjTvzA711nNTdmU8jinKTXWs/edit) in your own drive and attach a link to it in the issue ticket for contributors to fill in. This way you will be able to see all the applications that come in and assign it to the selected contributor.] 15 | 16 | - type: textarea 17 | id: ticket-goals 18 | validations: 19 | required: true 20 | attributes: 21 | label: Goals 22 | description: List the goals of the feature. 23 | value: | 24 | ## Goals 25 | - [ ] [Goal 1] 26 | - [ ] [Goal 2] 27 | - [ ] [Goal 3] 28 | - [ ] [Goal 4] 29 | - [ ] [Goal 5] 30 | [Kindly, state and link installation guide/ set-up steps (if any) in this section] 31 | 32 | - type: textarea 33 | id: ticket-expected-outcome 34 | validations: 35 | required: true 36 | attributes: 37 | label: Expected Outcome 38 | description: Describe in detail what the final product or result should look like and how it should behave. 39 | 40 | - type: textarea 41 | id: ticket-acceptance-criteria 42 | attributes: 43 | label: Acceptance Criteria 44 | description: List the acceptance criteria for this feature. 45 | 46 | - type: textarea 47 | id: ticket-implementation-details 48 | validations: 49 | required: true 50 | attributes: 51 | label: Implementation Details 52 | description: List any technical details about the proposed implementation, including any specific technologies that will be used. 53 | 54 | - type: textarea 55 | id: ticket-mockups 56 | attributes: 57 | label: Mockups/Wireframes 58 | description: Include links to any visual aids, mockups, wireframes, or diagrams that help illustrate what the final product should look like. This is not always necessary, but can be very helpful in many cases. 59 | 60 | - type: input 61 | id: ticket-product 62 | attributes: 63 | label: Product Name 64 | placeholder: Enter Product Name 65 | validations: 66 | required: true 67 | 68 | - type: dropdown 69 | id: ticket-organisation 70 | attributes: 71 | label: Organisation Name 72 | description: Enter Organisation Name 73 | multiple: false 74 | options: 75 | - C4GT 76 | - Dhiway 77 | - FIDE 78 | - SamagraX 79 | - ShikshaLokam 80 | - Tech4Dev 81 | - Tibil 82 | validations: 83 | required: true 84 | 85 | - type: input 86 | id: ticket-governance-domain 87 | attributes: 88 | label: Domain 89 | placeholder: Enter Area of Governance 90 | 91 | - type: dropdown 92 | id: ticket-technical-skills-required 93 | attributes: 94 | label: Tech Skills Needed 95 | description: Select the technologies needed for this ticket (use Ctrl or Command to select multiple) 96 | multiple: true 97 | options: 98 | - .NET 99 | - Agile 100 | - Angular 101 | - Artificial Intelligence 102 | - ASP.NET 103 | - AWS 104 | - Babel 105 | - Bootstrap 106 | - C# 107 | - Chart.js 108 | - CI/CD 109 | - Computer Vision 110 | - CORS 111 | - cURL 112 | - Cypress 113 | - D3.js 114 | - Database 115 | - Debugging 116 | - Design 117 | - DevOps 118 | - Django 119 | - Docker 120 | - Electron 121 | - ESLint 122 | - Express.js 123 | - Feature 124 | - Flask 125 | - Go 126 | - GraphQL 127 | - HTML 128 | - Ionic 129 | - Jest 130 | - Java 131 | - JavaScript 132 | - Jenkins 133 | - JWT 134 | - Kubernetes 135 | - Laravel 136 | - Machine Learning 137 | - Maintenance 138 | - Markdown 139 | - Material-UI 140 | - Microservices 141 | - MongoDB 142 | - Mobile 143 | - Mockups 144 | - Mocha 145 | - Natural Language Processing 146 | - NestJS 147 | - Node.js 148 | - NUnit 149 | - OAuth 150 | - Performance Improvement 151 | - Prettier 152 | - Python 153 | - Question 154 | - React 155 | - React Native 156 | - Redux 157 | - RESTful APIs 158 | - Ruby 159 | - Ruby on Rails 160 | - Rust 161 | - Scala 162 | - Security 163 | - Selenium 164 | - SEO 165 | - Serverless 166 | - Solidity 167 | - Spring Boot 168 | - SQL 169 | - Swagger 170 | - Tailwind CSS 171 | - Test 172 | - Testing Library 173 | - Three.js 174 | - TypeScript 175 | - UI/UX/Design 176 | - Virtual Reality 177 | - Vue.js 178 | - WebSockets 179 | - Webpack 180 | - Other 181 | validations: 182 | required: true 183 | 184 | - type: textarea 185 | id: ticketorg-mentors 186 | attributes: 187 | label: Organizational Mentor 188 | description: Please tag relevant mentors for the ticket 189 | validations: 190 | required: true 191 | 192 | - type: textarea 193 | id: ticketangel-mentors 194 | attributes: 195 | label: Angel Mentor 196 | description: Please tag relevant mentors for the ticket 197 | validations: 198 | required: false 199 | 200 | - type: dropdown 201 | id: ticket-complexity 202 | attributes: 203 | label: Complexity 204 | description: Choose a complexity describing the complexity of your ticket 205 | multiple: false 206 | options: 207 | - Low 208 | - Medium 209 | - High 210 | validations: 211 | required: true 212 | 213 | - type: dropdown 214 | id: ticket-category 215 | attributes: 216 | label: Category 217 | description: Choose the categories that best describe your ticket 218 | multiple: true 219 | options: 220 | - API 221 | - Analytics 222 | - Accessibility 223 | - Backend 224 | - Breaking Change 225 | - Beginner Friendly 226 | - Configuration 227 | - CI/CD 228 | - Database 229 | - Data Science 230 | - Deprecation 231 | - Documentation 232 | - Delpoyment 233 | - Frontend 234 | - Internationalization 235 | - Localization 236 | - Machine Learning 237 | - Maintenance 238 | - Mobile 239 | - Performance Improvement 240 | - Question 241 | - Refactoring 242 | - Research 243 | - Needs Reproduction 244 | - SEO 245 | - Security 246 | - Testing 247 | - Other 248 | validations: 249 | required: true 250 | 251 | 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '35 5 * * 1' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: pull_request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.9, '3.10', '3.11'] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v1 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -U pip 24 | pip install -r requirements.txt 25 | - name: Lint with flake8 26 | run: | 27 | pip install -U pytest coverage pytest-cov codecov black flake8 28 | # stop the build if there are Python syntax errors or undefined names 29 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 30 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 31 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 32 | black --check . 33 | - name: Test with pytest 34 | run: | 35 | pip install --editable . 36 | pip install pytest 37 | pytest 38 | make coverage 39 | codecov 40 | - uses: ammaraskar/sphinx-action@master 41 | with: 42 | pre-build-command: "pip install -r requirements.txt && pip install . && pip install -U sphinx myst-parser && pip install sphinxcontrib-gtagjs ipython numpydoc sphinx-tabs sphinx_rtd_theme nbsphinx ipython pydata-sphinx-theme nbsphinx-link sphinx-panels" 43 | docs-folder: "docs/" 44 | -------------------------------------------------------------------------------- /.github/workflows/push.yml: -------------------------------------------------------------------------------- 1 | name: push 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.9, '3.10', '3.11'] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v1 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -U pip 24 | pip install -r requirements.txt 25 | - name: Lint with flake8 26 | run: | 27 | pip install -U pytest coverage pytest-cov codecov black flake8 28 | # stop the build if there are Python syntax errors or undefined names 29 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 30 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 31 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 32 | black --check . 33 | - name: Test with pytest 34 | run: | 35 | pip install --editable . 36 | pip install pytest 37 | pytest 38 | make coverage 39 | codecov 40 | - uses: ammaraskar/sphinx-action@master 41 | with: 42 | pre-build-command: "pip install -r requirements.txt && pip install . && pip install sphinx myst-parser && pip install sphinxcontrib-gtagjs ipython numpydoc sphinx-tabs sphinx_rtd_theme nbsphinx ipython pydata-sphinx-theme nbsphinx-link sphinx-panels" 43 | docs-folder: "docs/" 44 | - name: Deploy 45 | uses: peaceiris/actions-gh-pages@v3 46 | with: 47 | github_token: ${{ secrets.GITHUB_TOKEN }} 48 | publish_dir: ./docs/_build/html/ 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | *.sqlite 110 | *.sqlite.gz 111 | 112 | geoweb_downloads/ -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Credits 2 | 3 | ## Contributors 4 | 5 | - [Boshen Yan](https://github.com/bscrow) 6 | - [Maarten van der Sande](https://github.com/Maarten-vd-Sande) 7 | - [Dibya Gautam](https://github.com/dibyaaaaax) 8 | - [Marius van den Beek](https://github.com/mvdbeek) 9 | - [Devang Thakkar](https://github.com/DevangThakkar) 10 | 11 | ## Maintainer 12 | 13 | - Saket Choudhary \<\> 14 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Choudhary" 5 | given-names: "Saket" 6 | orcid: "https://orcid.org/0000-0001-5202-7633" 7 | title: "pysradb" 8 | version: 2.0.0 9 | doi: 10.12688/f1000research.18676.1 10 | date-released: 2023-04-05 11 | url: "https://github.com/saketkc/pysradb" 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at saketkc@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every 4 | little bit helps, and credit will always be given. 5 | 6 | You can contribute in many ways: 7 | 8 | ## Types of Contributions 9 | 10 | ### Report Bugs 11 | 12 | Report bugs at . 13 | 14 | If you are reporting a bug, please include: 15 | 16 | - Your operating system name and version. 17 | - Any details about your local setup that might be helpful in 18 | troubleshooting. 19 | - Detailed steps to reproduce the bug. 20 | 21 | ### Fix Bugs 22 | 23 | Look through the GitHub issues for bugs. Anything tagged with \"bug\" 24 | and \"help wanted\" is open to whoever wants to implement it. 25 | 26 | ### Implement Features 27 | 28 | Look through the GitHub issues for features. Anything tagged with 29 | \"enhancement\" and \"help wanted\" is open to whoever wants to 30 | implement it. 31 | 32 | ### Write Documentation 33 | 34 | pysradb could always use more documentation, whether as part of the 35 | official pysradb docs, in docstrings, or even on the web in blog posts, 36 | articles, and such. 37 | 38 | ### Submit Feedback 39 | 40 | The best way to send feedback is to file an issue at 41 | . 42 | 43 | If you are proposing a feature: 44 | 45 | - Explain in detail how it would work. 46 | - Keep the scope as narrow as possible, to make it easier to 47 | implement. 48 | - Remember that this is a volunteer-driven project, and that 49 | contributions are welcome :) 50 | 51 | ## Get Started! 52 | 53 | Ready to contribute? Here\'s how to set up [pysradb]{.title-ref} for 54 | local development. 55 | 56 | 1. Fork the [pysradb]{.title-ref} repo on GitHub. 57 | 58 | 2. Clone your fork locally: 59 | 60 | ``` shell 61 | $ git clone git@github.com:your_name_here/pysradb.git 62 | ``` 63 | 64 | 3. Install your local copy into a virtualenv. Assuming you have 65 | virtualenvwrapper installed, this is how you set up your fork for 66 | local development (If python \--version is less than 3.0, run [\$ 67 | mkvirtualenv pysradb \--python=py3]{.title-ref} instead): 68 | 69 | ``` shell 70 | $ mkvirtualenv pysradb 71 | $ cd pysradb/ 72 | $ python setup.py develop 73 | ``` 74 | 75 | 4. Create a branch for local development: 76 | 77 | ``` shell 78 | $ git checkout -b name-of-your-bugfix-or-feature 79 | ``` 80 | 81 | Now you can make your changes locally. 82 | 83 | 5. When you\'re done making changes, check that your changes pass 84 | flake8 and the tests, including testing other Python versions with 85 | tox: 86 | 87 | ``` shell 88 | $ flake8 pysradb tests 89 | $ python setup.py test or py.test 90 | $ tox 91 | ``` 92 | 93 | To get flake8 and tox, just pip install them into your virtualenv. 94 | 95 | 6. Commit your changes and push your branch to GitHub: 96 | 97 | ``` shell 98 | $ git add . 99 | $ git commit -m "Your detailed description of your changes." 100 | $ git push origin name-of-your-bugfix-or-feature 101 | ``` 102 | 103 | 7. Submit a pull request through the GitHub website. 104 | 105 | ## Pull Request Guidelines 106 | 107 | Before you submit a pull request, check that it meets these guidelines: 108 | 109 | 1. The pull request should include tests. 110 | 2. If the pull request adds functionality, the docs should be updated. 111 | Put your new functionality into a function with a docstring, and add 112 | the feature to the list in README.rst. 113 | 3. The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and 114 | for PyPy. Check 115 | and make sure 116 | that the tests pass for all supported Python versions. 117 | 118 | ## Tips 119 | 120 | To run a subset of tests: 121 | 122 | ``` shell 123 | $ py.test tests.test_pysradb 124 | ``` 125 | 126 | ## Deploying 127 | 128 | A reminder for the maintainers on how to deploy. Make sure all your 129 | changes are committed (including an entry in HISTORY.rst). Then run: 130 | 131 | ``` shell 132 | $ bumpversion patch # possible: major / minor / patch 133 | $ git push 134 | $ git push --tags 135 | ``` 136 | 137 | Travis will then deploy to PyPI if tests pass. 138 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | # History 2 | 3 | # UNRELEASED 4 | - Fix for handling missing metadata keys [#223](https://github.com/saketkc/pysradb/pull/223). Thanks [@andrewdavidsmith](https://github.com/andrewdavidsmith) 5 | 6 | # 2.2.2 (2024-10-03) 7 | - Fix for handling ENA urls for paired end data 8 | 9 | # 2.2.1 (2024-08-21) 10 | - Fix for handling ENA urls 11 | - Migrated to pyproject.toml 12 | 13 | 14 | # 2.2.0 (2023-09-17) 15 | 16 | - Add support for Biosamples and bioproject [#199](https://github.com/saketkc/pysradb/pull/198) 17 | - Use retmode xml for Geo search [#200](https://github.com/saketkc/pysradb/pull/200) 18 | - Documentation fixes 19 | 20 | ## 2.1.0 (2023-05-16) 21 | 22 | - Fix for [gse-to-srp] returning unrequested GSEs [#186](https://github.com/saketkc/pysradb/issues/190) 23 | - Fix for [download] using [public_urls] 24 | - Fix for [gsm-to-srx] returning false positives [#165](https://github.com/saketkc/pysradb/issues/165) 25 | - Fix for delimiter not being consistent when metadata is printed on 26 | terminal [#147](https://github.com/saketkc/pysradb/issues/147) 27 | - ENA search is currently broken because of an API change 28 | 29 | ## 2.0.2 (2023-04-09) 30 | 31 | - Fix for [gse-to-srp] to handle cases where a project is 32 | missing but SRXs are returned [#186](https://github.com/saketkc/pysradb/issues/186) 33 | - Fix gse-to-gsm [#187](https://github.com/saketkc/pysradb/issues/187) 34 | 35 | ## 2.0.1 (2023-03-18) 36 | 37 | - Fix for [pysradb download] - using [public_url] 38 | - Fix for SRX -\> SRR and related conversions [#183](https://github.com/saketkc/pysradb/pull/183) 39 | 40 | ## 2.0.0 (2023-02-23) 41 | 42 | - BREAKING change: Overhaul of how urls and associated metadata are 43 | returned (not backward compatible); all column names are lower cased 44 | by default 45 | - Fix extra space in \"organism_taxid\" column 46 | - Added support for Experiment attributes [#89](https://github.com/saketkc/pysradb/issues/89#issuecomment-1439319532) 47 | 48 | ## 1.4.2 (06-17-2022) 49 | 50 | - Fix ENA fastq fetching [#163](https://github.com/saketkc/pysradb/issues/163) 51 | 52 | ## 1.4.1 (06-04-2022) 53 | 54 | - Fix for fetching alternative URLs 55 | 56 | ## 1.4.0 (06-04-2022) 57 | 58 | - Added ability to fetch alternative URLs (GCP/AWS) for metadata 59 | [#161](https://github.com/saketkc/pysradb/issues/161) 60 | - Fix for xmldict 0.13.0 no longer defaulting to OrderedDict [#159](https://github.com/saketkc/pysradb/pull/159) 61 | - Fix for missing experiment model and description in metadata [#160](https://github.com/saketkc/pysradb/issues/160) 62 | 63 | ## 1.3.0 (02-18-2022) 64 | 65 | - Add [study_title] to [\--detailed] flag 66 | ([#152](https://github.com/saketkc/pysradb/issues/152)) 67 | - Fix [KeyError] in [metadata] where some new 68 | IDs do not have any metadata 69 | ([#151](https://github.com/saketkc/pysradb/issues/151)) 70 | 71 | ## 1.2.0 (01-10-2022) 72 | 73 | - Do not exit if a qeury returns no hits ([#149](https://github.com/saketkc/pysradb/pull/149)) 74 | 75 | ## 1.1.0 (12-12-2021) 76 | 77 | - Fixed [gsm-to-gse] failure 78 | ([#128](https://github.com/saketkc/pysradb/pull/128)) 79 | - Fixed case sensitivity bug for ENA search 80 | ([#144](https://github.com/saketkc/pysradb/pull/144)) 81 | - Fixed publication date bug for search 82 | ([#146](https://github.com/saketkc/pysradb/pull/146)) 83 | - Added support for downloading data from GEO [pysradb dowload -g 84 | GSE] 85 | ([#129](https://github.com/saketkc/pysradb/pull/129)) 86 | 87 | ## 1.0.1 (01-10-2021) 88 | 89 | - Dropped Python 3.6 since pandas 1.2 is not supported 90 | 91 | ## 1.0.0 (01-09-2021) 92 | 93 | - Retired `metadb` and `SRAdb` based search through CLI - everything 94 | defaults to `SRAweb` 95 | - `SRAweb` now supports 96 | [search](https://saket-choudhary.me/pysradb/quickstart.html#search) 97 | - [N/A] is now replaced with [pd.NA] 98 | - Two new fields in \`\--detailed\`: [instrument_model] 99 | and [instrument_model_desc] 100 | [#75](https://github.com/saketkc/pysradb/issues/75) 101 | - Updated documentation 102 | 103 | ## 0.11.1 (09-18-2020) 104 | 105 | - [library_layout] is now outputted in metadata #56 106 | - [-detailed] unifies columns for ENA fastq links instead 107 | of appending \_x/\_y #59 108 | - bugfix for parsing namespace in xml outputs #65 109 | - XML errors from NCBI are now handled more gracefully #69 110 | - Documentation and dependency updates 111 | 112 | ## 0.11.0 (09-04-2020) 113 | 114 | - [pysradb download] now supports multiple threads for 115 | paralle downloads 116 | - [pysradb download] also supports ultra fast downloads of 117 | FASTQs from ENA using aspera-client 118 | 119 | ## 0.10.3 (03-26-2020) 120 | 121 | - Added test cases for SRAweb 122 | - API limit exceeding errors are automagically handled 123 | - Bug fixes for GSE \<=\> SRR 124 | - Bug fix for metadata - supports multiple SRPs 125 | 126 | Contributors 127 | 128 | - Dibya Gautam 129 | - Marius van den Beek 130 | 131 | ## 0.10.2 (02-05-2020) 132 | 133 | - Bug fix: Handle API-rate limit exceeding =\> Retries 134 | - Enhancement: \'Alternatives\' URLs are now part of 135 | [\--detailed] 136 | 137 | ## 0.10.1 (02-04-2020) 138 | 139 | - Bug fix: Handle Python3.6 for capture_output in subprocess.run 140 | 141 | ## 0.10.0 (01-31-2020) 142 | 143 | - All the subcommands (srx-to-srr, srx-to-srs) will now print 144 | additional columns where the first two columns represent the 145 | relevant conversion 146 | - Fixed a bug where for fetching entries with single efetch record 147 | 148 | ## 0.9.9 (01-15-2020) 149 | 150 | - Major fix: some SRRs would go missing as the experiment dict was 151 | being created only once per SRR (See #15) 152 | - Features: More detailed metadata by default in the SRAweb mode 153 | - See notebook: 154 | 155 | ## 0.9.7 (01-20-2020) 156 | 157 | - Feature: instrument, run size and total spots are now printed in the 158 | metadata by default (SRAweb mode only) 159 | - Issue: Fixed an issue with srapath failing on SRP. srapath is now 160 | run on individual SRRs. 161 | 162 | ## 0.9.6 (07-20-2019) 163 | 164 | - Introduced [SRAweb] to perform queries over the web if 165 | the SQLite is missing or does not contain the relevant record. 166 | 167 | ## 0.9.0 (02-27-2019) 168 | 169 | ### Others 170 | 171 | - This release completely changes the command line interface replacing 172 | click with argparse ([#3](https://github.com/saketkc/pysradb/pull/3)) 173 | - Removed Python 2 comptaible stale code 174 | 175 | ## 0.8.0 (02-26-2019) 176 | 177 | ### New methods/functionality 178 | 179 | - \`srr-to-gsm\`: convert SRR to GSM 180 | - SRAmetadb.sqlite.gz file is deleted by default after extraction 181 | - When SRAmetadb is not found a confirmation is seeked before 182 | downloading 183 | - Confirmation option before SRA downloads 184 | 185 | ### Bugfix 186 | 187 | - download() works with wget 188 | 189 | ### Others 190 | 191 | - [\--out_dir] is now [out-dir] 192 | 193 | ## 0.7.1 (02-18-2019) 194 | 195 | Important: Python2 is no longer supported. Please consider moving to 196 | Python3. 197 | 198 | ### Bugfix 199 | 200 | - Included docs in the index whihch were missed out in the previous 201 | release 202 | 203 | ## 0.7.0 (02-08-2019) 204 | 205 | ### New methods/functionality 206 | 207 | - \`gsm-to-srr\`: convert GSM to SRR 208 | - \`gsm-to-srx\`: convert GSM to SRX 209 | - \`gsm-to-gse\`: convert GSM to GSE 210 | 211 | ### Renamed methods 212 | 213 | The following commad line options have been renamed and the changes are 214 | not compatible with 0.6.0 release: 215 | 216 | - [sra-metadata] -\> [metadata]. 217 | - [sra-search] -\> [search]. 218 | - [srametadb] -\> [metadb]. 219 | 220 | ## 0.6.0 (12-25-2018) 221 | 222 | ### Bugfix 223 | 224 | - Fixed bugs introduced in 0.5.0 with API changes where multiple 225 | redundant columns were output in [sra-metadata] 226 | 227 | ### New methods/functionality 228 | 229 | - [download] now allows piped inputs 230 | 231 | ## 0.5.0 (12-24-2018) 232 | 233 | ### New methods/functionality 234 | 235 | - Support for filtering by SRX Id for SRA downloads. 236 | - \`srr_to_srx\`: Convert SRR to SRX/SRP 237 | - \`srp_to_srx\`: Convert SRP to SRX 238 | - Stripped down [sra-metadata] to give minimal information 239 | - Added [\--assay], [\--desc], 240 | [\--detailed] flag for [sra-metadata] 241 | - Improved table printing on terminal 242 | 243 | ## 0.4.2 (12-16-2018) 244 | 245 | ### Bugfix 246 | 247 | - Fixed unicode error in tests for Python2 248 | 249 | ## 0.4.0 (12-12-2018) 250 | 251 | ### New methods/functionality 252 | 253 | - Added a new [BASEdb] class to handle common database 254 | connections 255 | - Initial support for GEOmetadb through GEOdb class 256 | - Initial support or a command line interface: 257 | - download Download SRA project (SRPnnnn) 258 | - gse-metadata Fetch metadata for GEO ID (GSEnnnn) 259 | - gse-to-gsm Get GSM(s) for GSE 260 | - gsm-metadata Fetch metadata for GSM ID (GSMnnnn) 261 | - sra-metadata Fetch metadata for SRA project (SRPnnnn) 262 | - Added three separate notebooks for SRAdb, GEOdb, CLI usage 263 | 264 | ## 0.3.0 (12-05-2018) 265 | 266 | ### New methods/functionality 267 | 268 | - [sample_attribute] and 269 | [experiment_attribute] are now included by default in 270 | the df returned by [sra_metadata()] 271 | - [expand_sample_attribute_columns: expand metadata dataframe based on 272 | attributes in \`sample_attribute] column 273 | - New methods to guess cell/tissue/strain: 274 | [guess_cell_type()]/[guess_tissue_type()]/[guess_strain_type()] 275 | - Improved README and usage instructions 276 | 277 | ## 0.2.2 (12-03-2018) 278 | 279 | ### New methods/functionality 280 | 281 | - [search_sra()] allows full text search on SRA metadata. 282 | 283 | ## 0.2.0 (12-03-2018) 284 | 285 | ### Renamed methods 286 | 287 | The following methods have been renamed and the changes are not 288 | compatible with 0.1.0 release: 289 | 290 | - [get_query()] -\> [query()]. 291 | - [sra_convert()] -\> [sra_metadata()]. 292 | - [get_table_counts()] -\> [all_row_counts()]. 293 | 294 | ### New methods/functionality 295 | 296 | - [download_sradb_file()] makes fetching [SRAmetadb.sqlite] file easy; wget is no longer required. 297 | - [ftp] protocol is now supported besides [fsp] and hence [aspera-client] is now optional. We however, strongly recommend [aspera-client] for faster downloads. 298 | 299 | ### Bug fixes 300 | 301 | - Silenced [SettingWithCopyWarning] by excplicitly doing 302 | operations on a copy of the dataframe instead of the original. 303 | 304 | Besides these, all methods now follow a [numpydoc] 305 | compatible documentation. 306 | 307 | ## 0.1.0 (12-01-2018) 308 | 309 | - First release on PyPI. 310 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020-2023, Saket Choudhary 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.md 2 | include CONTRIBUTING.md 3 | include HISTORY.md 4 | include LICENSE 5 | include README.md 6 | include requirements.txt 7 | 8 | recursive-include tests * 9 | recursive-exclude * __pycache__ 10 | recursive-exclude * *.py[co] 11 | recursive-exclude * *.sqlite 12 | recursive-exclude * *.sqlite.gz 13 | 14 | recursive-include docs *.md conf.py Makefile make.bat *.jpg *.png *.gif *.rst 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | 4 | define BROWSER_PYSCRIPT 5 | import os, webbrowser, sys 6 | 7 | try: 8 | from urllib import pathname2url 9 | except: 10 | from urllib.request import pathname2url 11 | 12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 13 | endef 14 | export BROWSER_PYSCRIPT 15 | 16 | define PRINT_HELP_PYSCRIPT 17 | import re, sys 18 | 19 | for line in sys.stdin: 20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 21 | if match: 22 | target, help = match.groups() 23 | print("%-20s %s" % (target, help)) 24 | endef 25 | export PRINT_HELP_PYSCRIPT 26 | 27 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 28 | 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 33 | 34 | clean-build: ## remove build artifacts 35 | rm -fr build/ 36 | rm -fr dist/ 37 | rm -fr .eggs/ 38 | find . -name '*.egg-info' -exec rm -fr {} + 39 | find . -name '*.egg' -exec rm -f {} + 40 | 41 | clean-pyc: ## remove Python file artifacts 42 | find . -name '*.pyc' -exec rm -f {} + 43 | find . -name '*.pyo' -exec rm -f {} + 44 | find . -name '*~' -exec rm -f {} + 45 | find . -name '__pycache__' -exec rm -fr {} + 46 | 47 | clean-test: ## remove test and coverage artifacts 48 | rm -fr .tox/ 49 | rm -f .coverage 50 | rm -fr htmlcov/ 51 | rm -fr .pytest_cache 52 | 53 | lint: ## check style with flake8 54 | flake8 pysradb tests 55 | 56 | test: ## run tests quickly with the default Python 57 | pytest -s -v tests 58 | 59 | test-all: ## run tests on every Python version with tox 60 | tox 61 | 62 | coverage: ## check code coverage quickly with the default Python 63 | coverage run --source pysradb -m pytest 64 | coverage report -m 65 | coverage html 66 | 67 | docs: ## generate Sphinx HTML documentation, including API docs 68 | rm -f docs/pysradb.rst 69 | rm -f docs/modules.rst 70 | sphinx-apidoc -o docs/ pysradb 71 | $(MAKE) -C docs clean 72 | $(MAKE) -C docs html 73 | 74 | servedocs: docs ## compile the docs watching for changes 75 | #watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 76 | watchmedo shell-command -p '*.md|*.rst' -c '$(MAKE) -C docs html' -R -D . 77 | 78 | release: dist ## package and upload a release 79 | python -m build 80 | twine upload dist/* 81 | 82 | dist: clean ## builds source and wheel package 83 | python -m build 84 | ls -l dist 85 | 86 | install: clean ## install the package to the active Python's site-packages 87 | pip install -e . 88 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = pysradb 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/e1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saketkc/pysradb/bc7d54827903477bd59b7ec02774e6c7f41b7c3d/docs/_static/e1.png -------------------------------------------------------------------------------- /docs/_static/e2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saketkc/pysradb/bc7d54827903477bd59b7ec02774e6c7f41b7c3d/docs/_static/e2.png -------------------------------------------------------------------------------- /docs/_static/e3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saketkc/pysradb/bc7d54827903477bd59b7ec02774e6c7f41b7c3d/docs/_static/e3.png -------------------------------------------------------------------------------- /docs/_static/pysradb_v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saketkc/pysradb/bc7d54827903477bd59b7ec02774e6c7f41b7c3d/docs/_static/pysradb_v3.png -------------------------------------------------------------------------------- /docs/_static/pysradb_v3.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 43 | 50 | 51 | 53 | 54 | 56 | image/svg+xml 57 | 59 | 60 | 61 | 62 | 63 | 68 | 71 | 77 | 82 | 83 | p 94 | sradb 105 | 109 | 112 | 118 | 123 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /docs/authors.md: -------------------------------------------------------------------------------- 1 | # Credits 2 | 3 | ## Contributors 4 | 5 | - [Boshen Yan](https://github.com/bscrow) 6 | - [Maarten van der Sande](https://github.com/Maarten-vd-Sande) 7 | - [Dibya Gautam](https://github.com/dibyaaaaax) 8 | - [Marius van den Beek](https://github.com/mvdbeek) 9 | - [Devang Thakkar](https://github.com/DevangThakkar) 10 | 11 | ## Maintainer 12 | 13 | - Saket Choudhary \<\> 14 | -------------------------------------------------------------------------------- /docs/cmdline.md: -------------------------------------------------------------------------------- 1 | # CLI 2 | 3 | $ pysradb 4 | usage: pysradb [-h] [--version] [--citation] 5 | {metadb,metadata,download,search,gse-to-gsm,gse-to-srp,gsm-to-gse,gsm-to-srp,gsm-to-srr,gsm-to-srs,gsm-to-srx,srp-to-gse,srp-to-srr,srp-to-srs,srp-to-srx,srr-to-gsm,srr-to-srp,srr-to-srs,srr-to-srx,srs-to-gsm,srs-to-srx,srx-to-srp,srx-to-srr,srx-to-srs} 6 | ... 7 | 8 | pysradb: Query NGS metadata and data from NCBI Sequence Read Archive. 9 | Citation: 10.12688/f1000research.18676.1 10 | 11 | optional arguments: 12 | -h, --help show this help message and exit 13 | --version show program's version number and exit 14 | --citation how to cite 15 | 16 | subcommands: 17 | {metadb,metadata,download,search,gse-to-gsm,gse-to-srp,gsm-to-gse,gsm-to-srp,gsm-to-srr,gsm-to-srs,gsm-to-srx,srp-to-gse,srp-to-srr,srp-to-srs,srp-to-srx,srr-to-gsm,srr-to-srp,srr-to-srs,srr-to-srx,srs-to-gsm,srs-to-srx,srx-to-srp,srx-to-srr,srx-to-srs} 18 | metadata Fetch metadata for SRA project (SRPnnnn) 19 | download Download SRA project (SRPnnnn) 20 | search Search SRA/ENA for matching text 21 | gse-to-gsm Get GSM for a GSE 22 | gse-to-srp Get SRP for a GSE 23 | gsm-to-gse Get GSE for a GSM 24 | gsm-to-srp Get SRP for a GSM 25 | gsm-to-srr Get SRR for a GSM 26 | gsm-to-srs Get SRS for a GSM 27 | gsm-to-srx Get SRX for a GSM 28 | srp-to-gse Get GSE for a SRP 29 | srp-to-srr Get SRR for a SRP 30 | srp-to-srs Get SRS for a SRP 31 | srp-to-srx Get SRX for a SRP 32 | srr-to-gsm Get GSM for a SRR 33 | srr-to-srp Get SRP for a SRR 34 | srr-to-srs Get SRS for a SRR 35 | srr-to-srx Get SRX for a SRR 36 | srs-to-gsm Get GSM for a SRS 37 | srs-to-srx Get SRX for a SRS 38 | srx-to-srp Get SRP for a SRX 39 | srx-to-srr Get SRR for a SRX 40 | srx-to-srs Get SRS for a SRX 41 | 42 | ## Getting metadata for a SRA project (SRP) 43 | 44 | The most basic information associated with any SRA project is its list 45 | of experiments and run accessions. 46 | 47 | $ pysradb metadata SRP098789 48 | 49 | study_accession experiment_accession sample_accession run_accession 50 | SRP098789 SRX2536403 SRS1956353 SRR5227288 51 | SRP098789 SRX2536404 SRS1956354 SRR5227289 52 | SRP098789 SRX2536405 SRS1956355 SRR5227290 53 | SRP098789 SRX2536406 SRS1956356 SRR5227291 54 | SRP098789 SRX2536407 SRS1956357 SRR5227292 55 | SRP098789 SRX2536408 SRS1956358 SRR5227293 56 | SRP098789 SRX2536409 SRS1956359 SRR5227294 57 | 58 | Listing SRX and SRRs for a SRP is often not useful. We might want to 59 | take a quick look at the metadata associated with the samples: 60 | 61 | $ pysradb metadata SRP098789 62 | 63 | study_accession experiment_accession sample_accession run_accession sample_attribute 64 | SRP098789 SRX2536403 SRS1956353 SRR5227288 source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq 65 | SRP098789 SRX2536404 SRS1956354 SRR5227289 source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq 66 | SRP098789 SRX2536405 SRS1956355 SRR5227290 source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq 67 | SRP098789 SRX2536406 SRS1956356 SRR5227291 source_name: Huh7_0.3 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq 68 | SRP098789 SRX2536407 SRS1956357 SRR5227292 source_name: Huh7_0.3 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq 69 | SRP098789 SRX2536408 SRS1956358 SRR5227293 source_name: Huh7_0.3 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq 70 | 71 | The example here came from a Ribosome profiling study and consists of a 72 | collection of both Ribo-seq and RNA-seq samples. We can filter out only 73 | the RNA-seq samples: 74 | 75 | $ pysradb metadata SRP098789 --detailed | grep 'study|RNA-Seq' 76 | 77 | SRP098789 SRX2536422 SRR5227307 RNA-Seq SINGLE - 78 | SRP098789 SRX2536424 SRR5227309 RNA-Seq SINGLE - 79 | SRP098789 SRX2536426 SRR5227311 RNA-Seq SINGLE - 80 | SRP098789 SRX2536428 SRR5227313 RNA-Seq SINGLE - 81 | 82 | A more complicated example will consist of multiple assays. For example 83 | \`SRP000941\`: 84 | 85 | $ pysradb metadata SRP000941 --detailed | tr -s ' ' | cut -f5 -d ' ' | sort | uniq -c 86 | 999 Bisulfite-Seq 87 | 768 ChIP-Seq 88 | 1 library_strategy 89 | 121 OTHER 90 | 353 RNA-Seq 91 | 28 WGS 92 | 93 | ## Experiment accessions for a project (SRP =\> SRX) 94 | 95 | A frequently encountered task involves getting all the experiments (SRX) 96 | for a particular study accession (SRP). Consider project \`SRP048759\`: 97 | 98 | $ pysradb srp-to-srx SRP048759 99 | 100 | ## Sample accessions for a project (SRP =\> SRS) 101 | 102 | Each experiment involves one or multiple biological samples (SRS), that 103 | are put through different experiments (SRX). 104 | 105 | $ pysradb srp-to-srs --detailed SRP048759 106 | 107 | study_accession sample_accession 108 | SRP048759 SRS718878 109 | SRP048759 SRS718879 110 | SRP048759 SRS718880 111 | SRP048759 SRS718881 112 | SRP048759 SRS718882 113 | SRP048759 SRS718883 114 | SRP048759 SRS718884 115 | SRP048759 SRS718885 116 | SRP048759 SRS718886 117 | 118 | This is very limited information. It can again be detailed out using the 119 | [\--detailed]{.title-ref} flag: 120 | 121 | $ pysradb srp-to-srs --detailed SRP048759 122 | 123 | study_accession sample_accession experiment_accession run_accession study_alias sample_alias experiment_alias run_alias 124 | SRP048759 SRS718878 SRX729552 SRR1608490 GSE62190 GSM1521543 GSM1521543 GSM1521543_r1 125 | SRP048759 SRS718878 SRX729552 SRR1608491 GSE62190 GSM1521543 GSM1521543 GSM1521543_r2 126 | SRP048759 SRS718878 SRX729552 SRR1608492 GSE62190 GSM1521543 GSM1521543 GSM1521543_r3 127 | SRP048759 SRS718878 SRX729552 SRR1608493 GSE62190 GSM1521543 GSM1521543 GSM1521543_r4 128 | SRP048759 SRS718879 SRX729553 SRR1608494 GSE62190 GSM1521544 GSM1521544 GSM1521544_r1 129 | SRP048759 SRS718879 SRX729553 SRR1608495 GSE62190 GSM1521544 GSM1521544 GSM1521544_r2 130 | 131 | ## Run accessions for experiments (SRX =\> SRR) 132 | 133 | Another frequently encountered task involves fetching the run accessions 134 | (SRR) for a particular experiment (SRX). Consider experiments 135 | [SRX217956]{.title-ref} and [SRX2536403]{.title-ref}. We want to be able 136 | to resolve the run accessions for these experiments: 137 | 138 | $ pysradb srx-to-srr SRX217956 SRX2536403 --detailed 139 | 140 | experiment_accession run_accession study_accession sample_attribute 141 | SRX217956 SRR649752 SRP017942 source_name: 3T3 cells || treatment: control || cell line: 3T3 cells || assay type: Riboseq 142 | SRX2536403 SRR5227288 SRP098789 source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq 143 | 144 | ## Experiment accessions for runs (SRR =\> SRX) 145 | 146 | For fetching experiment accessions (SRX) for one or multiple run 147 | accessions (SRR): 148 | 149 | $ pysradb srr-to-srx SRR5227288 SRR649752 --detailed 150 | run_accession study_accession experiment_accession sample_attribute 151 | SRR649752 SRP017942 SRX217956 source_name: 3T3 cells || treatment: control || cell line: 3T3 cells || assay type: Riboseq 152 | SRR5227288 SRP098789 SRX2536403 source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq 153 | 154 | ## Downaloading entire project 155 | 156 | $ pysradb metadata --detailed SRP098789 | pysradb download 157 | 158 | ## GEO accessions for studies (SRP =\> GSE) 159 | 160 | $ pysradb srp-to-gse SRP090415 161 | 162 | study_accession study_alias 163 | SRP090415 GSE87328 164 | 165 | But not all SRPs will have an associated GEO id (GSE): 166 | 167 | $ pysradb srp-to-gse SRP029589 168 | 169 | study_accession study_alias 170 | SRP029589 PRJNA218051 171 | 172 | ## SRA accessions for GEO studies (GSE =\> SRP) 173 | 174 | $ pysradb gse-to-srp GSE87328i 175 | 176 | study_alias study_accession 177 | GSE87328 SRP090415 178 | 179 | Please see 180 | [quickstart](https://www.saket-choudhary.me/pysradb/quickstart.html#the-full-list-of-possible-pysradb-operations) 181 | for all possible operations available through `pysradb`. 182 | -------------------------------------------------------------------------------- /docs/commands.md: -------------------------------------------------------------------------------- 1 | # API Documentation 2 | 3 | ::: {.toctree maxdepth="2"} 4 | commands/download commands/metadata commands/search commands/gse-to-gsm 5 | commands/gse-to-srp commands/srp-to-gse commands/srp-to-srr 6 | commands/srp-to-srs commands/srp-to-srx commands/srr-to-srs 7 | commands/srr-to-srx commands/srs-to-srx commands/srx-to-srr 8 | commands/srx-to-srs 9 | ::: 10 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # pysradb documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | # 21 | import os 22 | import sys 23 | 24 | # import guzzle_sphinx_theme 25 | import pysradb 26 | 27 | autodoc_mock_imports = ["xmltodict", "numpy", "pandas", "requests", "tqdm"] 28 | 29 | sys.path.insert(0, os.path.abspath("..")) 30 | 31 | 32 | # -- General configuration --------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 40 | extensions = [ 41 | "IPython.sphinxext.ipython_directive", 42 | "IPython.sphinxext.ipython_console_highlighting", 43 | "sphinx.ext.mathjax", 44 | "sphinx.ext.autodoc", 45 | "sphinx.ext.autosummary", 46 | "sphinx.ext.doctest", 47 | "sphinx.ext.viewcode", 48 | "sphinx.ext.inheritance_diagram", 49 | "numpydoc", 50 | "sphinx_tabs.tabs", 51 | "sphinx_panels", 52 | "sphinxcontrib.gtagjs", 53 | "myst_parser", 54 | ] 55 | gtagjs_ids = [ 56 | "G-CKQZFCEENZ", 57 | ] 58 | 59 | panels_add_bootstrap_css = False 60 | 61 | # Add any paths that contain templates here, relative to this directory. 62 | templates_path = ["_templates"] 63 | 64 | # The suffix(es) of source filenames. 65 | # You can specify multiple suffix as a list of string: 66 | # 67 | source_suffix = [".rst", ".md"] 68 | # source_suffix = ".md" 69 | 70 | # The master toctree document. 71 | master_doc = "index" 72 | 73 | # General information about the project. 74 | project = "pysradb" 75 | copyright = "2023, Saket Choudhary" 76 | author = "Saket Choudhary" 77 | # The version info for the project you're documenting, acts as replacement 78 | # for |version| and |release|, also used in various other places throughout 79 | # the built documents. 80 | # 81 | # The short X.Y version. 82 | version = pysradb.__version__ 83 | # The full version, including alpha/beta/rc tags. 84 | release = pysradb.__version__ 85 | 86 | # The language for content autogenerated by Sphinx. Refer to documentation 87 | # for a list of supported languages. 88 | # 89 | # This is also used if you do content translation via gettext catalogs. 90 | # Usually you set "language" from the command line for these cases. 91 | language = None 92 | 93 | # List of patterns, relative to source directory, that match files and 94 | # directories to ignore when looking for source files. 95 | # This patterns also effect to html_static_path and html_extra_path 96 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 97 | 98 | # The name of the Pygments (syntax highlighting) style to use. 99 | pygments_style = "sphinx" 100 | 101 | # If true, `todo` and `todoList` produce output, else they produce nothing. 102 | todo_include_todos = False 103 | 104 | 105 | # -- Options for HTML output ------------------------------------------- 106 | 107 | # The theme to use for HTML and HTML Help pages. See the documentation for 108 | # a list of builtin themes. 109 | # 110 | html_theme = "pydata_sphinx_theme" 111 | 112 | # Theme options are theme-specific and customize the look and feel of a 113 | # theme further. For a list of options available for each theme, see the 114 | # documentation. 115 | # 116 | # html_theme_options = {} 117 | 118 | # Add any paths that contain custom static files (such as style sheets) here, 119 | # relative to this directory. They are copied after the builtin static files, 120 | # so a file named "default.css" will overwrite the builtin "default.css". 121 | html_static_path = ["_static"] 122 | 123 | 124 | # -- Options for HTMLHelp output --------------------------------------- 125 | 126 | # Output file base name for HTML help builder. 127 | htmlhelp_basename = "pysradbdoc" 128 | 129 | 130 | # -- Options for LaTeX output ------------------------------------------ 131 | 132 | latex_elements = { 133 | # The paper size ('letterpaper' or 'a4paper'). 134 | # 135 | # 'papersize': 'letterpaper', 136 | # The font size ('10pt', '11pt' or '12pt'). 137 | # 138 | # 'pointsize': '10pt', 139 | # Additional stuff for the LaTeX preamble. 140 | # 141 | # 'preamble': '', 142 | # Latex figure (float) alignment 143 | # 144 | # 'figure_align': 'htbp', 145 | } 146 | 147 | # Grouping the document tree into LaTeX files. List of tuples 148 | # (source start file, target name, title, author, documentclass 149 | # [howto, manual, or own class]). 150 | latex_documents = [ 151 | (master_doc, "pysradb.tex", "pysradb Documentation", "Saket Choudhary", "manual") 152 | ] 153 | 154 | 155 | # -- Options for manual page output ------------------------------------ 156 | 157 | # One entry per manual page. List of tuples 158 | # (source start file, name, description, authors, manual section). 159 | man_pages = [(master_doc, "pysradb", "pysradb Documentation", [author], 1)] 160 | 161 | 162 | # -- Options for Texinfo output ---------------------------------------- 163 | 164 | # Grouping the document tree into Texinfo files. List of tuples 165 | # (source start file, target name, title, author, 166 | # dir menu entry, description, category) 167 | texinfo_documents = [ 168 | ( 169 | master_doc, 170 | "pysradb", 171 | "pysradb Documentation", 172 | author, 173 | "pysradb", 174 | "One line description of project.", 175 | "Miscellaneous", 176 | ) 177 | ] 178 | 179 | 180 | numpydoc_show_class_members = False 181 | 182 | 183 | ##html_theme_path = guzzle_sphinx_theme.html_theme_path() 184 | ##html_theme = "guzzle_sphinx_theme" 185 | ## 186 | ### Register the theme as an extension to generate a sitemap.xml 187 | ##extensions.append("guzzle_sphinx_theme") 188 | ## 189 | ### Guzzle theme options (see theme.conf for more information) 190 | ##html_theme_options = { 191 | ## # Set the name of the project to appear in the sidebar 192 | ## "project_nav_name": "pysradb" 193 | ##} 194 | 195 | scv_greatest_tag = True 196 | scv_show_banner = True 197 | 198 | html_logo = "_static/pysradb_v3.png" 199 | html_theme_options = { 200 | "github_url": "https://github.com/saketkc/pysradb", 201 | "google_analytics_id": "G-CKQZFCEENZ", 202 | # "navbar_align": "left", 203 | } 204 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every 4 | little bit helps, and credit will always be given. 5 | 6 | You can contribute in many ways: 7 | 8 | ## Types of Contributions 9 | 10 | ### Report Bugs 11 | 12 | Report bugs at . 13 | 14 | If you are reporting a bug, please include: 15 | 16 | - Your operating system name and version. 17 | - Any details about your local setup that might be helpful in 18 | troubleshooting. 19 | - Detailed steps to reproduce the bug. 20 | 21 | ### Fix Bugs 22 | 23 | Look through the GitHub issues for bugs. Anything tagged with \"bug\" 24 | and \"help wanted\" is open to whoever wants to implement it. 25 | 26 | ### Implement Features 27 | 28 | Look through the GitHub issues for features. Anything tagged with 29 | \"enhancement\" and \"help wanted\" is open to whoever wants to 30 | implement it. 31 | 32 | ### Write Documentation 33 | 34 | pysradb could always use more documentation, whether as part of the 35 | official pysradb docs, in docstrings, or even on the web in blog posts, 36 | articles, and such. 37 | 38 | ### Submit Feedback 39 | 40 | The best way to send feedback is to file an issue at 41 | . 42 | 43 | If you are proposing a feature: 44 | 45 | - Explain in detail how it would work. 46 | - Keep the scope as narrow as possible, to make it easier to 47 | implement. 48 | - Remember that this is a volunteer-driven project, and that 49 | contributions are welcome :) 50 | 51 | ## Get Started! 52 | 53 | Ready to contribute? Here\'s how to set up [pysradb]{.title-ref} for 54 | local development. 55 | 56 | 1. Fork the [pysradb]{.title-ref} repo on GitHub. 57 | 58 | 2. Clone your fork locally: 59 | 60 | ``` shell 61 | $ git clone git@github.com:your_name_here/pysradb.git 62 | ``` 63 | 64 | 3. Install your local copy into a virtualenv. Assuming you have 65 | virtualenvwrapper installed, this is how you set up your fork for 66 | local development (If python \--version is less than 3.0, run [\$ 67 | mkvirtualenv pysradb \--python=py3]{.title-ref} instead): 68 | 69 | ``` shell 70 | $ mkvirtualenv pysradb 71 | $ cd pysradb/ 72 | $ python setup.py develop 73 | ``` 74 | 75 | 4. Create a branch for local development: 76 | 77 | ``` shell 78 | $ git checkout -b name-of-your-bugfix-or-feature 79 | ``` 80 | 81 | Now you can make your changes locally. 82 | 83 | 5. When you\'re done making changes, check that your changes pass 84 | flake8 and the tests, including testing other Python versions with 85 | tox: 86 | 87 | ``` shell 88 | $ flake8 pysradb tests 89 | $ python setup.py test or py.test 90 | $ tox 91 | ``` 92 | 93 | To get flake8 and tox, just pip install them into your virtualenv. 94 | 95 | 6. Commit your changes and push your branch to GitHub: 96 | 97 | ``` shell 98 | $ git add . 99 | $ git commit -m "Your detailed description of your changes." 100 | $ git push origin name-of-your-bugfix-or-feature 101 | ``` 102 | 103 | 7. Submit a pull request through the GitHub website. 104 | 105 | ## Pull Request Guidelines 106 | 107 | Before you submit a pull request, check that it meets these guidelines: 108 | 109 | 1. The pull request should include tests. 110 | 2. If the pull request adds functionality, the docs should be updated. 111 | Put your new functionality into a function with a docstring, and add 112 | the feature to the list in README.rst. 113 | 3. The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and 114 | for PyPy. Check 115 | and make sure 116 | that the tests pass for all supported Python versions. 117 | 118 | ## Tips 119 | 120 | To run a subset of tests: 121 | 122 | ``` shell 123 | $ py.test tests.test_pysradb 124 | ``` 125 | 126 | ## Deploying 127 | 128 | A reminder for the maintainers on how to deploy. Make sure all your 129 | changes are committed (including an entry in HISTORY.rst). Then run: 130 | 131 | ``` shell 132 | $ bumpversion patch # possible: major / minor / patch 133 | $ git push 134 | $ git push --tags 135 | ``` 136 | 137 | Travis will then deploy to PyPI if tests pass. 138 | -------------------------------------------------------------------------------- /docs/history.md: -------------------------------------------------------------------------------- 1 | # History 2 | 3 | ## 2.1.0 (2023-05-16) 4 | 5 | - Fix for [gse-to-srp]{.title-ref} returning unrequested GSEs ([#186 6 | \]{.title-ref}) 7 | - Fix for [download]{.title-ref} using [public_urls]{.title-ref} 8 | - Fix for [gsm-to-srx]{.title-ref} returning false positives ([#165 9 | \]{.title-ref}) 10 | - Fix for delimiter not being consistent when metadata is printed on 11 | terminal ([#147 12 | \]{.title-ref}) 13 | - ENA search is currently broken because of an API change 14 | 15 | ## 2.0.2 (2023-04-09) 16 | 17 | - Fix for [gse-to-srp]{.title-ref} to handle cases where a project is 18 | missing but SRXs are returned ([#186 19 | \]{.title-ref}) 20 | - Fix gse-to-gsm ([#187 21 | \]{.title-ref}) 22 | 23 | ## 2.0.1 (2023-03-18) 24 | 25 | - Fix for [pysradb download]{.title-ref} - using 26 | [public_url]{.title-ref} 27 | - Fix for SRX -\> SRR and related conversions ([#183 28 | \]{.title-ref}) 29 | 30 | ## 2.0.0 (2023-02-23) 31 | 32 | - BREAKING change: Overhaul of how urls and associated metadata are 33 | returned (not backward compatible); all column names are lower cased 34 | by default 35 | - Fix extra space in \"organism_taxid\" column 36 | - Added support for Experiment attributes ([#89 37 | \]{.title-ref}) 38 | 39 | ## 1.4.2 (06-17-2022) 40 | 41 | - Fix ENA fastq fetching ([#163 42 | \]{.title-ref}) 43 | 44 | ## 1.4.1 (06-04-2022) 45 | 46 | - Fix for fetchin alternative URLs 47 | 48 | ## 1.4.0 (06-04-2022) 49 | 50 | - Added ability to fetch alternative URLs (GCP/AWS) for metadata 51 | ([#161 52 | \]{.title-ref}) 53 | - Fix for xmldict 0.13.0 no longer defaulting to OrderedDict ([#159 54 | \]{.title-ref}) 55 | - Fix for missing experiment model and description in metadata ([#160 56 | \]{.title-ref}) 57 | 58 | ## 1.3.0 (02-18-2022) 59 | 60 | - Add [study_title]{.title-ref} to [\--detailed]{.title-ref} flag 61 | ([#152](https://github.com/saketkc/pysradb/issues/152)) 62 | - Fix [KeyError]{.title-ref} in [metadata]{.title-ref} where some new 63 | IDs do not have any metadata 64 | ([#151](https://github.com/saketkc/pysradb/issues/151)) 65 | 66 | ## 1.2.0 (01-10-2022) 67 | 68 | - Do not exit if a qeury returns no hits ([#149 69 | \]{.title-ref}) 70 | 71 | ## 1.1.0 (12-12-2021) 72 | 73 | - Fixed [gsm-to-gse]{.title-ref} failure 74 | ([#128](https://github.com/saketkc/pysradb/pull/128)) 75 | - Fixed case sensitivity bug for ENA search 76 | ([#144](https://github.com/saketkc/pysradb/pull/144)) 77 | - Fixed publication date bug for search 78 | ([#146](https://github.com/saketkc/pysradb/pull/146)) 79 | - Added support for downloading data from GEO [pysradb dowload -g 80 | \]{.title-ref} 81 | ([#129](https://github.com/saketkc/pysradb/pull/129)) 82 | 83 | ## 1.0.1 (01-10-2021) 84 | 85 | - Dropped Python 3.6 since pandas 1.2 is not supported 86 | 87 | ## 1.0.0 (01-09-2021) 88 | 89 | - Retired `metadb` and `SRAdb` based search through CLI - everything 90 | defaults to `SRAweb` 91 | - `SRAweb` now supports 92 | [search](https://saket-choudhary.me/pysradb/quickstart.html#search) 93 | - [N/A]{.title-ref} is now replaced with [pd.NA]{.title-ref} 94 | - Two new fields in \`\--detailed\`: [instrument_model]{.title-ref} 95 | and [instrument_model_desc]{.title-ref} 96 | [#75](https://github.com/saketkc/pysradb/issues/75) 97 | - Updated documentation 98 | 99 | ## 0.11.1 (09-18-2020) 100 | 101 | - [library_layout]{.title-ref} is now outputted in metadata #56 102 | - [-detailed]{.title-ref} unifies columns for ENA fastq links instead 103 | of appending \_x/\_y #59 104 | - bugfix for parsing namespace in xml outputs #65 105 | - XML errors from NCBI are now handled more gracefully #69 106 | - Documentation and dependency updates 107 | 108 | ## 0.11.0 (09-04-2020) 109 | 110 | - [pysradb download]{.title-ref} now supports multiple threads for 111 | paralle downloads 112 | - [pysradb download]{.title-ref} also supports ultra fast downloads of 113 | FASTQs from ENA using aspera-client 114 | 115 | ## 0.10.3 (03-26-2020) 116 | 117 | - Added test cases for SRAweb 118 | - API limit exceeding errors are automagically handled 119 | - Bug fixes for GSE \<=\> SRR 120 | - Bug fix for metadata - supports multiple SRPs 121 | 122 | Contributors 123 | 124 | - Dibya Gautam 125 | - Marius van den Beek 126 | 127 | ## 0.10.2 (02-05-2020) 128 | 129 | - Bug fix: Handle API-rate limit exceeding =\> Retries 130 | - Enhancement: \'Alternatives\' URLs are now part of 131 | [\--detailed]{.title-ref} 132 | 133 | ## 0.10.1 (02-04-2020) 134 | 135 | - Bug fix: Handle Python3.6 for capture_output in subprocess.run 136 | 137 | ## 0.10.0 (01-31-2020) 138 | 139 | - All the subcommands (srx-to-srr, srx-to-srs) will now print 140 | additional columns where the first two columns represent the 141 | relevant conversion 142 | - Fixed a bug where for fetching entries with single efetch record 143 | 144 | ## 0.9.9 (01-15-2020) 145 | 146 | - Major fix: some SRRs would go missing as the experiment dict was 147 | being created only once per SRR (See #15) 148 | - Features: More detailed metadata by default in the SRAweb mode 149 | - See notebook: 150 | 151 | ## 0.9.7 (01-20-2020) 152 | 153 | - Feature: instrument, run size and total spots are now printed in the 154 | metadata by default (SRAweb mode only) 155 | - Issue: Fixed an issue with srapath failing on SRP. srapath is now 156 | run on individual SRRs. 157 | 158 | ## 0.9.6 (07-20-2019) 159 | 160 | - Introduced [SRAweb]{.title-ref} to perform queries over the web if 161 | the SQLite is missing or does not contain the relevant record. 162 | 163 | ## 0.9.0 (02-27-2019) 164 | 165 | ### Others 166 | 167 | - This release completely changes the command line interface replacing 168 | click with argparse () 169 | - Removed Python 2 comptaible stale code 170 | 171 | ## 0.8.0 (02-26-2019) 172 | 173 | ### New methods/functionality 174 | 175 | - \`srr-to-gsm\`: convert SRR to GSM 176 | - SRAmetadb.sqlite.gz file is deleted by default after extraction 177 | - When SRAmetadb is not found a confirmation is seeked before 178 | downloading 179 | - Confirmation option before SRA downloads 180 | 181 | ### Bugfix 182 | 183 | - download() works with wget 184 | 185 | ### Others 186 | 187 | - [\--out_dir]{.title-ref} is now [out-dir]{.title-ref} 188 | 189 | ## 0.7.1 (02-18-2019) 190 | 191 | Important: Python2 is no longer supported. Please consider moving to 192 | Python3. 193 | 194 | ### Bugfix 195 | 196 | - Included docs in the index whihch were missed out in the previous 197 | release 198 | 199 | ## 0.7.0 (02-08-2019) 200 | 201 | ### New methods/functionality 202 | 203 | - \`gsm-to-srr\`: convert GSM to SRR 204 | - \`gsm-to-srx\`: convert GSM to SRX 205 | - \`gsm-to-gse\`: convert GSM to GSE 206 | 207 | ### Renamed methods 208 | 209 | The following commad line options have been renamed and the changes are 210 | not compatible with 0.6.0 release: 211 | 212 | - [sra-metadata]{.title-ref} -\> [metadata]{.title-ref}. 213 | - [sra-search]{.title-ref} -\> [search]{.title-ref}. 214 | - [srametadb]{.title-ref} -\> [metadb]{.title-ref}. 215 | 216 | ## 0.6.0 (12-25-2018) 217 | 218 | ### Bugfix 219 | 220 | - Fixed bugs introduced in 0.5.0 with API changes where multiple 221 | redundant columns were output in [sra-metadata]{.title-ref} 222 | 223 | ### New methods/functionality 224 | 225 | - [download]{.title-ref} now allows piped inputs 226 | 227 | ## 0.5.0 (12-24-2018) 228 | 229 | ### New methods/functionality 230 | 231 | - Support for filtering by SRX Id for SRA downloads. 232 | - \`srr_to_srx\`: Convert SRR to SRX/SRP 233 | - \`srp_to_srx\`: Convert SRP to SRX 234 | - Stripped down [sra-metadata]{.title-ref} to give minimal information 235 | - Added [\--assay]{.title-ref}, [\--desc]{.title-ref}, 236 | [\--detailed]{.title-ref} flag for [sra-metadata]{.title-ref} 237 | - Improved table printing on terminal 238 | 239 | ## 0.4.2 (12-16-2018) 240 | 241 | ### Bugfix 242 | 243 | - Fixed unicode error in tests for Python2 244 | 245 | ## 0.4.0 (12-12-2018) 246 | 247 | ### New methods/functionality 248 | 249 | - Added a new [BASEdb]{.title-ref} class to handle common database 250 | connections 251 | - Initial support for GEOmetadb through GEOdb class 252 | - Initial support or a command line interface: 253 | - download Download SRA project (SRPnnnn) 254 | - gse-metadata Fetch metadata for GEO ID (GSEnnnn) 255 | - gse-to-gsm Get GSM(s) for GSE 256 | - gsm-metadata Fetch metadata for GSM ID (GSMnnnn) 257 | - sra-metadata Fetch metadata for SRA project (SRPnnnn) 258 | - Added three separate notebooks for SRAdb, GEOdb, CLI usage 259 | 260 | ## 0.3.0 (12-05-2018) 261 | 262 | ### New methods/functionality 263 | 264 | - [sample_attribute]{.title-ref} and 265 | [experiment_attribute]{.title-ref} are now included by default in 266 | the df returned by [sra_metadata()]{.title-ref} 267 | - [expand_sample_attribute_columns: expand metadata dataframe based on 268 | attributes in \`sample_attribute]{.title-ref} column 269 | - New methods to guess cell/tissue/strain: 270 | [guess_cell_type()]{.title-ref}/[guess_tissue_type()]{.title-ref}/[guess_strain_type()]{.title-ref} 271 | - Improved README and usage instructions 272 | 273 | ## 0.2.2 (12-03-2018) 274 | 275 | ### New methods/functionality 276 | 277 | - [search_sra()]{.title-ref} allows full text search on SRA metadata. 278 | 279 | ## 0.2.0 (12-03-2018) 280 | 281 | ### Renamed methods 282 | 283 | The following methods have been renamed and the changes are not 284 | compatible with 0.1.0 release: 285 | 286 | - [get_query()]{.title-ref} -\> [query()]{.title-ref}. 287 | - [sra_convert()]{.title-ref} -\> [sra_metadata()]{.title-ref}. 288 | - [get_table_counts()]{.title-ref} -\> [all_row_counts()]{.title-ref}. 289 | 290 | ### New methods/functionality 291 | 292 | - [download_sradb_file()]{.title-ref} makes fetching 293 | [SRAmetadb.sqlite]{.title-ref} file easy; wget is no longer 294 | required. 295 | - [ftp]{.title-ref} protocol is now supported besides 296 | [fsp]{.title-ref} and hence [aspera-client]{.title-ref} is now 297 | optional. We however, strongly recommend [aspera-client]{.title-ref} 298 | for faster downloads. 299 | 300 | ### Bug fixes 301 | 302 | - Silenced [SettingWithCopyWarning]{.title-ref} by excplicitly doing 303 | operations on a copy of the dataframe instead of the original. 304 | 305 | Besides these, all methods now follow a [numpydoc]{.title-ref} 306 | compatible documentation. 307 | 308 | ## 0.1.0 (12-01-2018) 309 | 310 | - First release on PyPI. 311 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Stable release 4 | 5 | To install pysradb, run this command in your terminal: 6 | 7 | ``` console 8 | $ pip install pysradb 9 | ``` 10 | 11 | This is the preferred method to install pysradb, as it will always 12 | install the most recent stable release. 13 | 14 | If you don\'t have [pip](https://pip.pypa.io) installed, this [Python 15 | installation 16 | guide](http://docs.python-guide.org/en/latest/starting/installation/) 17 | can guide you through the process. 18 | 19 | Alternatively, you may use conda: 20 | 21 | ``` bash 22 | conda install -c bioconda pysradb 23 | ``` 24 | 25 | This step will install all the dependencies except aspera-client (which 26 | is not required, but highly recommended). If you have an existing 27 | environment with a lot of pre-installed packages, conda might be 28 | [slow](https://github.com/bioconda/bioconda-recipes/issues/13774). 29 | Please consider creating a new enviroment for `pysradb`: 30 | 31 | ``` bash 32 | conda create -c bioconda -n pysradb PYTHON=3 pysradb 33 | ``` 34 | 35 | ## From sources 36 | 37 | The source files for pysradb can be downloaded from the [Github 38 | repo](https://github.com/saketkc/pysradb). 39 | 40 | You can either clone the public repository: 41 | 42 | ``` console 43 | $ git clone git://github.com/saketkc/pysradb 44 | ``` 45 | 46 | Or download the 47 | [tarball](https://github.com/saketkc/pysradb/tarball/master): 48 | 49 | ``` console 50 | $ curl -OL https://github.com/saketkc/pysradb/tarball/master 51 | ``` 52 | 53 | Once you have a copy of the source, you can install it with: 54 | 55 | ``` console 56 | $ python setup.py install 57 | ``` 58 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=pysradb 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/modules.md: -------------------------------------------------------------------------------- 1 | # pysradb 2 | 3 | ::: {.toctree maxdepth="4"} 4 | pysradb 5 | ::: 6 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | pysradb 2 | ======= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | pysradb 8 | -------------------------------------------------------------------------------- /docs/pysradb.rst: -------------------------------------------------------------------------------- 1 | pysradb package 2 | =============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pysradb.basedb module 8 | --------------------- 9 | 10 | .. automodule:: pysradb.basedb 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pysradb.cli module 16 | ------------------ 17 | 18 | .. automodule:: pysradb.cli 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pysradb.download module 24 | ----------------------- 25 | 26 | .. automodule:: pysradb.download 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pysradb.exceptions module 32 | ------------------------- 33 | 34 | .. automodule:: pysradb.exceptions 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pysradb.filter\_attrs module 40 | ---------------------------- 41 | 42 | .. automodule:: pysradb.filter_attrs 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | pysradb.geodb module 48 | -------------------- 49 | 50 | .. automodule:: pysradb.geodb 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | pysradb.geoweb module 56 | --------------------- 57 | 58 | .. automodule:: pysradb.geoweb 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | pysradb.search module 64 | --------------------- 65 | 66 | .. automodule:: pysradb.search 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | pysradb.sradb module 72 | -------------------- 73 | 74 | .. automodule:: pysradb.sradb 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | pysradb.sraweb module 80 | --------------------- 81 | 82 | .. automodule:: pysradb.sraweb 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | pysradb.taxid2name module 88 | ------------------------- 89 | 90 | .. automodule:: pysradb.taxid2name 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | pysradb.utils module 96 | -------------------- 97 | 98 | .. automodule:: pysradb.utils 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | Module contents 104 | --------------- 105 | 106 | .. automodule:: pysradb 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | -------------------------------------------------------------------------------- /docs/python-api-usage.md: -------------------------------------------------------------------------------- 1 | # Python API 2 | 3 | ## Use Case 1: Fetch the metadata table (SRA-runtable) 4 | 5 | The simplest use case of [pysradb]{.title-ref} is when you know the SRA 6 | project ID (SRP) and would simply want to fetch the metadata associated 7 | with it. This is generally reflected in the 8 | [SraRunTable.txt]{.title-ref} that you get from NCBI\'s website. See an 9 | [example](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP098789) of a 10 | SraRunTable. 11 | 12 | ``` python 13 | from pysradb import SRAweb 14 | db = SRAweb() 15 | df = db.sra_metadata('SRP098789') 16 | df.head() 17 | ``` 18 | 19 | =============== ==================== ====================================================================== ============= ======== ================= ============== ================ ============== ============ ========== ======== ============ =============== 20 | study_accession experiment_accession experiment_title run_accession taxon_id library_selection library_layout library_strategy library_source library_name bases spots adapter_spec avg_read_length 21 | =============== ==================== ====================================================================== ============= ======== ================= ============== ================ ============== ============ ========== ======== ============ =============== 22 | SRP098789 SRX2536403 GSM2475997: 1.5 µM PF-067446846, 10 min, rep 1; Homo sapiens; OTHER SRR5227288 9606 other SINGLE - OTHER TRANSCRIPTOMIC 2104142750 42082855 50 23 | SRP098789 SRX2536404 GSM2475998: 1.5 µM PF-067446846, 10 min, rep 2; Homo sapiens; OTHER SRR5227289 9606 other SINGLE - OTHER TRANSCRIPTOMIC 2082873050 41657461 50 24 | SRP098789 SRX2536405 GSM2475999: 1.5 µM PF-067446846, 10 min, rep 3; Homo sapiens; OTHER SRR5227290 9606 other SINGLE - OTHER TRANSCRIPTOMIC 2023148650 40462973 50 25 | SRP098789 SRX2536406 GSM2476000: 0.3 µM PF-067446846, 10 min, rep 1; Homo sapiens; OTHER SRR5227291 9606 other SINGLE - OTHER TRANSCRIPTOMIC 2057165950 41143319 50 26 | SRP098789 SRX2536407 GSM2476001: 0.3 µM PF-067446846, 10 min, rep 2; Homo sapiens; OTHER SRR5227292 9606 other SINGLE - OTHER TRANSCRIPTOMIC 3027621850 60552437 50 27 | =============== ==================== ====================================================================== ============= ======== ================= ============== ================ ============== ============ ========== ======== ============ =============== 28 | 29 | The metadata is returned as a [pandas]{.title-ref} dataframe and hence 30 | allows you to perform all regular select/query operations available 31 | through [pandas]{.title-ref}. 32 | 33 | ## Use Case 2: Downloading an entire project arranged experiment wise 34 | 35 | Once you have fetched the metadata and made sure, this is the project 36 | you were looking for, you would want to download everything at once. 37 | NCBI follows this hiererachy: [SRP =\> SRX =\> SRR]{.title-ref}. Each 38 | [SRP]{.title-ref} (project) has multiple [SRX]{.title-ref} (experiments) 39 | and each [SRX]{.title-ref} in turn has multiple [SRR]{.title-ref} (runs) 40 | inside it. We want to mimick this hiereachy in our downloads. The reason 41 | to do that is simple: in most cases you care about [SRX]{.title-ref} the 42 | most, and would want to \"merge\" your SRRs in one way or the other. 43 | Having this hierearchy ensures your downstream code can handle such 44 | cases easily, without worrying about which runs (SRR) need to be merged. 45 | 46 | We strongly recommend installing [aspera-client]{.title-ref} which uses 47 | UDP and is [designed to be faster](http://www.skullbox.net/tcpudp.php). 48 | 49 | ``` python 50 | from pysradb import SRAweb 51 | db = SRAweb() 52 | df = db.sra_metadata('SRP017942') 53 | db.download(df) 54 | ``` 55 | 56 | ## Use Case 3: Downloading a subset of experiments 57 | 58 | Often, you need to process only a smaller set of samples from a project 59 | (SRP). Consider this project which has data spanning four assays. 60 | 61 | ``` python 62 | df = db.sra_metadata('SRP000941') 63 | print(df.library_strategy.unique()) 64 | ['ChIP-Seq' 'Bisulfite-Seq' 'RNA-Seq' 'WGS' 'OTHER'] 65 | ``` 66 | 67 | But, you might be only interested in analyzing the [RNA-seq]{.title-ref} 68 | samples and would just want to download that subset. This is simple 69 | using [pysradb]{.title-ref} since the metadata can be subset just as you 70 | would subset a dataframe in pandas. 71 | 72 | ``` python 73 | df_rna = df[df.library_strategy == 'RNA-Seq'] 74 | db.download(df=df_rna, out_dir='/pysradb_downloads')() 75 | ``` 76 | 77 | ## Use Case 4: Getting cell-type/treatment information from sample_attributes 78 | 79 | Cell type/tissue informations is usually hidden in the 80 | [sample_attributes]{.title-ref} column, which can be expanded: 81 | 82 | ``` python 83 | from pysradb.filter_attrs import expand_sample_attribute_columns 84 | df = db.sra_metadata('SRP017942') 85 | expand_sample_attribute_columns(df).head() 86 | ``` 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 123 | 126 | 135 | 136 | 139 | 140 | 141 | 142 | 144 | 145 | 146 | 155 | 156 | 159 | 160 | 161 | 162 | 163 | 165 | 166 | 167 |
study_accessionexperiment_accessionexperiment_titleexperiment_attributesample_attributerun_accessiontaxon_idlibrary_selectionlibrary_layoutlibrary_strategylibrary_sourcelibrary_namebasesspotsadapter_specavg_read_lengthassay_typecell_linesource_nametransfected_withtreatment

SRP017942 SRP017942 SRP017942 SRP017942 SRP017942

SRX217028 SRX217029 SRX217030 SRX217031 SRX217956

GSM1063575: 293T_GFP; Homo sapiens; RNA-Seq GSM1063576: 119 | 293T_GFP_2hrs_severe_Heat_Shock; Homo sapiens; RNA-Seq GSM1063577: 120 | 293T_Hspa1a; Homo sapiens; RNA-Seq GSM1063578: 121 | 293T_Hspa1a_2hrs_severe_Heat_Shock; Homo sapiens; RNA-Seq GSM794854: 122 | 3T3-Control-Riboseq; Mus musculus; RNA-Seq

GEO Accession: GSM1063575 GEO Accession: GSM1063576 GEO 124 | Accession: GSM1063577 GEO Accession: GSM1063578 GEO Accession: 125 | GSM794854

source_name: 293T cells || cell line: 293T cells || transfected 127 | with: 3XFLAG-GFP || assay type: Riboseq source_name: 293T cells || cell 128 | line: 293T cells || transfected with: 3XFLAG-GFP || treatment: severe 129 | heat shock (44C 2 hours) || assay type: Riboseq source_name: 293T cells 130 | || cell line: 293T cells || transfected with: 3XFLAG-Hspa1a || assay 131 | type: Riboseq source_name: 293T cells || cell line: 293T cells || 132 | transfected with: 3XFLAG-Hspa1a || treatment: severe heat shock (44C 2 133 | hours) || assay type: Riboseq source_name: 3T3 cells || treatment: 134 | control || cell line: 3T3 cells || assay type: Riboseq

SRR648667 SRR648668 SRR648669 SRR648670 SRR649752

137 |

9606 9606 9606 9606 10090

138 |

other other other other cDNA

SINGLE -SINGLE -SINGLE -SINGLE -SINGLE -

RNA-Seq RNA-Seq RNA-Seq RNA-Seq RNA-Seq

TRANSCRIPTOMIC TRANSCRIPTOMIC TRANSCRIPTOMIC TRANSCRIPTOMIC 143 | TRANSCRIPTOMIC

1806641316 3436984836 3330909216 3622123512 594945396

147 |

50184481 95471801 92525256

148 |
149 |
150 |
100614542
151 |
152 |

16526261

153 |
154 |
157 |

36 36 36 36 36

158 |

riboseq riboseq riboseq riboseq riboseq

293t cells 293t cells 293t cells 293t cells 3t3 cells

293t cells 293t cells 293t cells 293t cells 3t3 cells

3xflag-gfp 3xflag-gfp 3xflag-hspa1a 3xflag-hspa1a NaN

NaN severe heat shock (44c 2 hours) NaN severe heat shock (44c 2 164 | hours) control

168 | 169 | ## Use Case 5: Searching for datasets 170 | 171 | Another common operation that we do on SRA is seach, plain text search. 172 | 173 | If you want to look up for all projects where [ribosome 174 | profiling]{.title-ref} appears somewhere in the description: 175 | 176 | ``` python 177 | df = db.search_sra(search_str='"ribosome profiling"') 178 | df.head() 179 | ``` 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 207 | 208 | 209 | 210 | 211 | 212 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 225 | 226 | 227 | 228 | 229 | 230 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 243 | 244 | 245 | 246 | 247 | 248 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 |
study_accessionexperiment_accessionexperiment_titlerun_accessiontaxon_idlibrary_selectionlibrary_layoutlibrary_strategylibrary_sourcelibrary_namebasesspots
DRP003075DRX019536Illumina Genome Analyzer IIx sequencing of SAMD00018584DRR021383
205 |

83333

206 |
otherSINGLE -OTHERTRANSCRIPTOMICGAII05_3
213 |

978776480

214 |
12234706
DRP003075DRX019537Illumina Genome Analyzer IIx sequencing of SAMD00018585DRR021384
223 |

83333

224 |
otherSINGLE -OTHERTRANSCRIPTOMICGAII05_4
231 |

894201680

232 |
11177521
DRP003075DRX019538Illumina Genome Analyzer IIx sequencing of SAMD00018586DRR021385
241 |

83333

242 |
otherSINGLE -OTHERTRANSCRIPTOMICGAII05_5
249 |

931536720

250 |
11644209
DRP003075DRX019540Illumina Genome Analyzer IIx sequencing of SAMD00018588DRR021387
259 |

83333

260 |
otherSINGLE -OTHERTRANSCRIPTOMICGAII07_4275939870027593987
DRP003075DRX019541Illumina Genome Analyzer IIx sequencing of SAMD00018589DRR021388
275 |

83333

276 |
otherSINGLE -OTHERTRANSCRIPTOMICGAII07_5238619650023861965
287 | 288 | Again, the results are available as a [pandas]{.title-ref} dataframe and 289 | hence you can perform all subset operations post your query. Your query 290 | doesn\'t need to be exact. 291 | -------------------------------------------------------------------------------- /notebooks/07.Multiple_SRPs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "07.Multiple_SRPs", 7 | "provenance": [], 8 | "mount_file_id": "1pNeuZJjjHliYFk582kGNRpGJ1Fa2h9cn", 9 | "authorship_tag": "ABX9TyMmcvA8kJPyf4bhs59mCISs", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "kPEeKLC38WGX", 32 | "colab_type": "code", 33 | "colab": { 34 | "base_uri": "https://localhost:8080/", 35 | "height": 401 36 | }, 37 | "outputId": "45055cff-1f41-4b1d-f67f-0a4dc37a8333" 38 | }, 39 | "source": [ 40 | "pip install git+https://github.com/saketkc/pysradb.git" 41 | ], 42 | "execution_count": null, 43 | "outputs": [ 44 | { 45 | "output_type": "stream", 46 | "text": [ 47 | "Collecting git+https://github.com/saketkc/pysradb.git\n", 48 | " Cloning https://github.com/saketkc/pysradb.git to /tmp/pip-req-build-hmpqrmad\n", 49 | " Running command git clone -q https://github.com/saketkc/pysradb.git /tmp/pip-req-build-hmpqrmad\n", 50 | "Requirement already satisfied (use --upgrade to upgrade): pysradb==0.10.5.dev0 from git+https://github.com/saketkc/pysradb.git in /usr/local/lib/python3.6/dist-packages\n", 51 | "Requirement already satisfied: pandas==1.0.1 in /usr/local/lib/python3.6/dist-packages (from pysradb==0.10.5.dev0) (1.0.1)\n", 52 | "Requirement already satisfied: requests==2.23.0 in /usr/local/lib/python3.6/dist-packages (from pysradb==0.10.5.dev0) (2.23.0)\n", 53 | "Requirement already satisfied: tqdm==4.43.0 in /usr/local/lib/python3.6/dist-packages (from pysradb==0.10.5.dev0) (4.43.0)\n", 54 | "Requirement already satisfied: xmltodict==0.12.0 in /usr/local/lib/python3.6/dist-packages (from pysradb==0.10.5.dev0) (0.12.0)\n", 55 | "Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas==1.0.1->pysradb==0.10.5.dev0) (2.8.1)\n", 56 | "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas==1.0.1->pysradb==0.10.5.dev0) (1.18.5)\n", 57 | "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas==1.0.1->pysradb==0.10.5.dev0) (2018.9)\n", 58 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests==2.23.0->pysradb==0.10.5.dev0) (2020.6.20)\n", 59 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests==2.23.0->pysradb==0.10.5.dev0) (2.9)\n", 60 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests==2.23.0->pysradb==0.10.5.dev0) (1.24.3)\n", 61 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests==2.23.0->pysradb==0.10.5.dev0) (3.0.4)\n", 62 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas==1.0.1->pysradb==0.10.5.dev0) (1.12.0)\n", 63 | "Building wheels for collected packages: pysradb\n", 64 | " Building wheel for pysradb (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 65 | " Created wheel for pysradb: filename=pysradb-0.10.5.dev0-cp36-none-any.whl size=148723 sha256=394afd5781d90d04f37bd5eb7c26ff245792e4f7a4140f6bd1796fe0b5b724be\n", 66 | " Stored in directory: /tmp/pip-ephem-wheel-cache-y1mkqxnq/wheels/d5/24/42/81dccabc3a4aac9757e23b7175ad7270090a4b3c203cd4fc8f\n", 67 | "Successfully built pysradb\n" 68 | ], 69 | "name": "stdout" 70 | } 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "n-lUysUE8edh", 77 | "colab_type": "code", 78 | "colab": {} 79 | }, 80 | "source": [ 81 | "SRP_list = \"\"\"ERP009675\n", 82 | "ERP007116\n", 83 | "ERP007115\n", 84 | "ERP004563\n", 85 | "ERP005660\n", 86 | "ERP001266\n", 87 | "ERP002072\n", 88 | "ERP001882\n", 89 | "ERP004883\n", 90 | "ERP004508\n", 91 | "ERP004393\n", 92 | "ERP005409\n", 93 | "ERP001464\n", 94 | "ERP004042\n", 95 | "ERP004375\n", 96 | "ERP003293\n", 97 | "ERP004689\n", 98 | "ERP001094\n", 99 | "ERP003728\n", 100 | "ERP000730\n", 101 | "ERP000411\n", 102 | "ERP000319\n", 103 | "SRP041183\n", 104 | "SRP011912\n", 105 | "SRP058392\n", 106 | "SRP044705\n", 107 | "SRP036841\n", 108 | "SRP050120\n", 109 | "ERP004116\n", 110 | "SRP030662\n", 111 | "SRP047217\n", 112 | "SRP045505\n", 113 | "ERP001556\n", 114 | "ERP000546\n", 115 | "SRP045252\n", 116 | "SRP044714\n", 117 | "DRP000524\n", 118 | "SRP044907\n", 119 | "SRP019970\n", 120 | "SRP044131\n", 121 | "SRP044042\n", 122 | "SRP043602\n", 123 | "SRP043523\n", 124 | "SRP014570\n", 125 | "SRP043067\n", 126 | "SRP042370\n", 127 | "SRP042360\n", 128 | "SRP042159\n", 129 | "SRP042085\n", 130 | "SRP042053\n", 131 | "SRP041992\n", 132 | "SRP041738\n", 133 | "SRP041679\n", 134 | "SRP041669\n", 135 | "SRP041622\n", 136 | "SRP041182\n", 137 | "SRP041377\n", 138 | "SRP021009\n", 139 | "SRP041216\n", 140 | "SRP041129\n", 141 | "SRP041119\n", 142 | "SRP041044\n", 143 | "SRP040761\n", 144 | "SRP040479\n", 145 | "SRP040072\n", 146 | "SRP040070\n", 147 | "SRP040121\n", 148 | "SRP040044\n", 149 | "SRP039841\n", 150 | "SRP039779\n", 151 | "SRP039717\n", 152 | "SRP039699\n", 153 | "SRP039672\n", 154 | "SRP039661\n", 155 | "SRP039646\n", 156 | "SRP039634\n", 157 | "SRP035451\n", 158 | "SRP039551\n", 159 | "SRP039478\n", 160 | "SRP039440\n", 161 | "SRP039448\n", 162 | "SRP030474\n", 163 | "SRP034507\n", 164 | "SRP038004\n", 165 | "SRP037780\n", 166 | "SRP037583\n", 167 | "SRP036068\n", 168 | "SRP036637\n", 169 | "SRP036632\n", 170 | "SRP035368\n", 171 | "SRP035278\n", 172 | "SRP034930\n", 173 | "SRP034844\n", 174 | "ERP004159\n", 175 | "SRP017087\n", 176 | "SRP034444\n", 177 | "SRP033229\n", 178 | "SRP033198\n", 179 | "SRP033021\n", 180 | "ERP000964\n", 181 | "ERP002429\n", 182 | "SRP032928\n", 183 | "SRP032833\n", 184 | "SRP032792\n", 185 | "SRP032766\n", 186 | "ERP003855\n", 187 | "ERP000904\n", 188 | "SRP028229\n", 189 | "SRP026361\n", 190 | "SRP023111\n", 191 | "SRP021139\n", 192 | "SRP013319\n", 193 | "SRP020006\n", 194 | "SRP019994\n", 195 | "SRP019500\n", 196 | "SRP019241\n", 197 | "SRP018672\n", 198 | "SRP018358\n", 199 | "SRP016875\n", 200 | "SRP016501\n", 201 | "SRP015460\n", 202 | "SRP015370\n", 203 | "SRP015135\n", 204 | "SRP014437\n", 205 | "SRP012378\n", 206 | "SRP012018\n", 207 | "SRP010103\n", 208 | "SRP007831\n", 209 | "SRP007412\n", 210 | "SRP007400\n", 211 | "SRP002090\n", 212 | "SRP029153\n", 213 | "SRP029445\n", 214 | "SRP029427\n", 215 | "SRP029380\n", 216 | "SRP029330\n", 217 | "SRP029333\n", 218 | "ERP000606\n", 219 | "ERP000415\n", 220 | "ERP001977\n", 221 | "ERP000373\n", 222 | "SRP029172\n", 223 | "SRP028766\n", 224 | "ERP000668\n", 225 | "ERP003627\n", 226 | "SRP017364\n", 227 | "SRP025757\n", 228 | "SRP021189\n", 229 | "SRP011154\n", 230 | "SRP014574\n", 231 | "SRP017935\n", 232 | "SRP016889\n", 233 | "SRP002016\n", 234 | "SRP018826\n", 235 | "SRP018753\n", 236 | "SRP009821\n", 237 | "SRP012925\n", 238 | "SRP012850\n", 239 | "SRP009870\n", 240 | "SRP007799\n", 241 | "SRP006748\n", 242 | "SRP000373\"\"\"\n", 243 | "SRP_list = SRP_list.split(\"\\n\")" 244 | ], 245 | "execution_count": null, 246 | "outputs": [] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "metadata": { 251 | "id": "1CWmd5rF8yny", 252 | "colab_type": "code", 253 | "colab": { 254 | "base_uri": "https://localhost:8080/", 255 | "height": 72 256 | }, 257 | "outputId": "519c91be-e3b9-48a3-fd47-fbe415bd0b2a" 258 | }, 259 | "source": [ 260 | "import sys\n", 261 | "import time\n", 262 | "\n", 263 | "from pysradb import SRAweb\n", 264 | "\n", 265 | "db = SRAweb()\n", 266 | "\n", 267 | "for srp in SRP_list:\n", 268 | " try:\n", 269 | " df = db.sra_metadata(srp)\n", 270 | " df.to_csv(\"{}.tsv\".format(srp), sep=\"\\t\", index=False)\n", 271 | " except:\n", 272 | " sys.stderr.write(\"Error with {}\\n\".format(srp))\n", 273 | " time.sleep(0.5)\n", 274 | " time.sleep(0.5)" 275 | ], 276 | "execution_count": null, 277 | "outputs": [ 278 | { 279 | "output_type": "stream", 280 | "text": [ 281 | "/usr/local/lib/python3.6/dist-packages/pysradb/utils.py:13: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", 282 | " from tqdm.autonotebook import tqdm\n" 283 | ], 284 | "name": "stderr" 285 | } 286 | ] 287 | } 288 | ] 289 | } -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebooks demonstrating functionalities of pysradb 2 | 3 | 1. [Python API](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/01.Python-API_demo.ipynb) 4 | 2. [Downloading datasets from SRA - command line](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/02.Commandline_download.ipynb) 5 | 3. [Parallely download multiple datasets - Python API](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/03.ParallelDownload.ipynb) 6 | 4. [Converting SRA-to-fastq - command line (requires conda)](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/04.SRA_to_fastq_conda.ipynb) 7 | 5. [Downloading subsets of a project - Python API](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/05.Downloading_subsets_of_a_project.ipynb) 8 | 6. [Download BAMs](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/06.Download_BAMs.ipynb) 9 | 7. [Metadata for multiple SRPs](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/07.Multiple_SRPs.ipynb) 10 | 8. [Multithreaded fastq downloads using Aspera Client](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/08.pysradb_ascp_multithreaded.ipynb) 11 | 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "pysradb" 7 | dynamic = ["version"] 8 | description = "A Python package for interacting with SRAdb and downloading datasets from SRA/ENA/GEO" 9 | readme = "README.md" 10 | license = {file = "LICENSE"} 11 | requires-python = ">=3.9" 12 | authors = [ 13 | { name = "Saket Choudhary", email = "saketkc@gmail.com" }, 14 | ] 15 | keywords = [ 16 | "pysradb", 17 | ] 18 | classifiers = [ 19 | "Development Status :: 5 - Production/Stable", 20 | "Intended Audience :: Healthcare Industry", 21 | "Intended Audience :: Science/Research", 22 | "License :: OSI Approved :: BSD License", 23 | "Natural Language :: English", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Topic :: Scientific/Engineering :: Bio-Informatics", 29 | ] 30 | dependencies = [ 31 | "lxml>=4.6.3", 32 | "pandas>=1.3.2", 33 | "requests-ftp>=0.3.1", 34 | "requests>=2.26.0", 35 | "tqdm>=4.62.1", 36 | "xmltodict>=0.12.0", 37 | ] 38 | 39 | [project.scripts] 40 | pysradb = "pysradb.cli:parse_args" 41 | 42 | [project.urls] 43 | Homepage = "https://saket-choudhary.me/pysradb" 44 | 45 | [tool.distutils.bdist_wheel] 46 | universal = true 47 | 48 | [tool.hatch.version] 49 | path = "pysradb/__init__.py" 50 | 51 | [tool.hatch.build.targets.sdist] 52 | include = [ 53 | "/pysradb", 54 | ] 55 | -------------------------------------------------------------------------------- /pysradb/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Top-level package for pysradb.""" 3 | 4 | __author__ = """Saket Choudhary""" 5 | __email__ = "saketkc@gmail.com" 6 | __version__ = "2.2.2" 7 | 8 | from .filter_attrs import expand_sample_attribute_columns 9 | from .geodb import GEOdb 10 | from .geodb import download_geodb_file 11 | from .sradb import SRAdb 12 | from .sradb import download_sradb_file 13 | from .sraweb import SRAweb 14 | -------------------------------------------------------------------------------- /pysradb/basedb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | import sys 4 | import warnings 5 | 6 | import pandas as pd 7 | 8 | from .utils import _extract_first_field 9 | 10 | warnings.simplefilter(action="ignore", category=FutureWarning) 11 | 12 | 13 | class BASEdb(object): 14 | def __init__(self, sqlite_file): 15 | """Initialize SRAdb. 16 | 17 | Parameters 18 | ---------- 19 | 20 | sqlite_file: string 21 | Path to unzipped SRAmetadb.sqlite file 22 | 23 | 24 | """ 25 | self.sqlite_file = sqlite_file 26 | self.open() 27 | self.cursor = self.db.cursor() 28 | 29 | def open(self): 30 | """Open sqlite connection.""" 31 | # Originally sqlite3.connect(self.sqlite_file) 32 | self.db = sqlite3.connect("file:{}?mode=ro".format(self.sqlite_file), uri=True) 33 | self.db.text_factory = str 34 | 35 | def close(self): 36 | """Close sqlite connection.""" 37 | self.db.close() 38 | 39 | def list_tables(self): 40 | """List all tables in the sqlite file. 41 | 42 | Returns 43 | ------- 44 | table_list: list 45 | List of all table names 46 | """ 47 | results = self.cursor.execute( 48 | 'SELECT name FROM sqlite_master WHERE type="table";' 49 | ).fetchall() 50 | return _extract_first_field(results) 51 | 52 | def list_fields(self, table): 53 | """List all fields in a given table. 54 | 55 | Parameters 56 | ---------- 57 | table: string 58 | Table name. 59 | See `list_tables` for getting all table names 60 | 61 | Returns 62 | ------- 63 | field_list: list 64 | A list of field names for the table 65 | """ 66 | results = self.cursor.execute("SELECT * FROM {}".format(table)) 67 | return _extract_first_field(results.description) 68 | 69 | def desc_table(self, table): 70 | """Describe all fields in a table. 71 | 72 | Parameters 73 | ---------- 74 | table: string 75 | Table name. 76 | See `list_tables` for getting all table names 77 | 78 | Returns 79 | ------- 80 | table_desc: DataFrame 81 | A DataFrame with field name and its 82 | schema description 83 | """ 84 | results = self.cursor.execute( 85 | 'PRAGMA table_info("{}")'.format(table) 86 | ).fetchall() 87 | columns = ["cid", "name", "dtype", "notnull", "dflt_value", "pk"] 88 | data = [] 89 | for result in results: 90 | data.append(list([str(x) for x in result])) 91 | table_desc = pd.DataFrame(data, columns=columns) 92 | return table_desc 93 | 94 | def query(self, sql_query): 95 | """Run SQL query. 96 | 97 | Parameters 98 | ---------- 99 | sql_query: string 100 | SQL query string 101 | 102 | Returns 103 | ------- 104 | results: DataFrame 105 | Query results formatted as dataframe 106 | 107 | """ 108 | results = self.cursor.execute(sql_query).fetchall() 109 | column_names = list([x[0] for x in self.cursor.description]) 110 | results = [dict(list(zip(column_names, result))) for result in results] 111 | df = pd.DataFrame(results) 112 | if not results: 113 | # sys.stderr.write("Found no matching results for query: {}".format(sql_query)) 114 | sys.stderr.write("Found no matching results for query.\n") 115 | return df 116 | 117 | def get_row_count(self, table): 118 | """Get row counts for a table. 119 | 120 | Parameters 121 | ---------- 122 | table: string 123 | Table name. 124 | See `list_tables` for getting all table names 125 | 126 | Returns 127 | ------- 128 | row_count: int 129 | Number of rows in table 130 | """ 131 | return self.cursor.execute( 132 | "SELECT max(rowid) FROM {}".format(table) 133 | ).fetchone()[0] 134 | 135 | def all_row_counts(self): 136 | """Get row counts of all tables in the db file. 137 | 138 | Returns 139 | ------- 140 | row_counts: DataFrame 141 | A dataframe with table names and corresponding 142 | row count. 143 | 144 | """ 145 | tables = self.list_tables() 146 | results = dict([(table, self.get_row_count(table)) for table in tables]) 147 | return pd.DataFrame.from_dict(results, orient="index", columns=["count"]) 148 | -------------------------------------------------------------------------------- /pysradb/download.py: -------------------------------------------------------------------------------- 1 | """Utility function to download data""" 2 | 3 | import hashlib 4 | import math 5 | import os 6 | import shutil 7 | import sys 8 | import warnings 9 | 10 | import numpy as np 11 | import requests 12 | import requests_ftp 13 | from tqdm.autonotebook import tqdm 14 | 15 | from .utils import requests_3_retries 16 | 17 | requests_ftp.monkeypatch_session() 18 | warnings.simplefilter(action="ignore", category=FutureWarning) 19 | import pandas as pd 20 | 21 | tqdm.pandas() 22 | 23 | 24 | def millify(n): 25 | """Convert integer to human readable format. 26 | 27 | Parameters 28 | ---------- 29 | n : int 30 | 31 | Returns 32 | ------- 33 | millidx : str 34 | Formatted integer 35 | """ 36 | millnames = ["", " KB", " MB", " GB", " TB"] 37 | # Source: http://stackoverflow.com/a/3155023/756986 38 | n = float(n) 39 | millidx = max( 40 | 0, 41 | min( 42 | len(millnames) - 1, int(math.floor(0 if n == 0 else math.log10(abs(n)) / 3)) 43 | ), 44 | ) 45 | 46 | return "{:.1f}{}".format(n / 10 ** (3 * millidx), millnames[millidx]) 47 | 48 | 49 | def get_file_size(row, url_col): 50 | """Get size of file to be downloaded. 51 | 52 | Parameters 53 | ---------- 54 | row: pd.DataFrame row 55 | 56 | url_col: str 57 | url_column 58 | 59 | Returns 60 | ------- 61 | content_length: int 62 | """ 63 | if row[url_col] is not None: 64 | url = row[url_col] 65 | else: 66 | url = row.download_url 67 | if url is pd.NA: 68 | return 0 69 | if not isinstance(url, str): 70 | return 0 71 | if url.startswith("ftp."): 72 | url = "ftp://" + url 73 | try: 74 | r = requests_3_retries().head(url) 75 | size = int(r.headers["content-length"]) 76 | r.raise_for_status() 77 | except requests.exceptions.Timeout: 78 | sys.exit(f"Connection to {url} has timed out. Please retry.") 79 | except requests.exceptions.HTTPError: 80 | print( 81 | f"The download URL: {url} is likely invalid.\n" 82 | f"Removing {row.run_accession} from the download list\n", 83 | flush=True, 84 | ) 85 | return np.NaN 86 | except KeyError: 87 | print("Key error for: " + url, flush=True) 88 | return 0 89 | return size 90 | 91 | 92 | def md5_validate_file(file_path, md5_hash): 93 | """Check file containt against an MD5. 94 | 95 | Parameters 96 | ---------- 97 | file_path: string 98 | Path to file 99 | md5_hash: string 100 | Expected md5 hash 101 | 102 | Returns 103 | ------- 104 | valid: bool 105 | True if expected and observed md5 match 106 | """ 107 | observed_md5 = hashlib.md5() 108 | with open(file_path, "rb") as f: 109 | while True: 110 | # read 1MB 111 | chunk = f.read(1000 * 1000) 112 | if not chunk: 113 | break 114 | observed_md5.update(chunk) 115 | return observed_md5.hexdigest() == md5_hash 116 | 117 | 118 | def download_file( 119 | url, 120 | file_path, 121 | md5_hash=None, 122 | timeout=10, 123 | block_size=1024 * 1024, 124 | show_progress=False, 125 | ): 126 | """Resumable download. 127 | Expect the server to support byte ranges. 128 | 129 | Parameters 130 | ---------- 131 | url: string 132 | URL 133 | file_path: string 134 | Local file path to store the downloaded file 135 | md5_hash: string 136 | Expected MD5 string of downloaded file 137 | timeout: int 138 | Seconds to wait before terminating request 139 | block_size: int 140 | Chunkx of bytes to read (default: 1024 * 1024 = 1MB) 141 | show_progress: bool 142 | Show progress bar 143 | """ 144 | if url.startswith("ftp."): 145 | url = "ftp://" + url 146 | session = requests.Session() 147 | else: 148 | session = requests 149 | if os.path.exists(file_path) and os.path.getsize(file_path): 150 | return 151 | tmp_file_path = file_path + ".part" 152 | first_byte = os.path.getsize(tmp_file_path) if os.path.exists(tmp_file_path) else 0 153 | file_mode = "ab" if first_byte else "wb" 154 | file_size = -1 155 | try: 156 | file_size = int(session.head(url).headers["Content-length"]) 157 | headers = {"Range": "bytes=%s-" % first_byte} 158 | r = session.get(url, headers=headers, stream=True) 159 | if show_progress: 160 | desc = "Downloading {}".format(url.split("/")[-1]) 161 | pbar = tqdm( 162 | total=file_size, 163 | initial=first_byte, 164 | unit="B", 165 | unit_scale=True, 166 | desc=desc, 167 | ) 168 | with open(tmp_file_path, file_mode) as f: 169 | for chunk in r.iter_content(chunk_size=block_size): 170 | if chunk: # filter out keep-alive new chunks 171 | f.write(chunk) 172 | if show_progress: 173 | pbar.update(block_size) 174 | if show_progress: 175 | pbar.close() 176 | except IOError as e: 177 | sys.stderr.write("IO Error - {}\n".format(e)) 178 | finally: 179 | # Move the temp file to desired location 180 | if file_size == os.path.getsize(tmp_file_path): 181 | # if there's a hash value, validate the file 182 | if md5_hash and not md5_validate_file(tmp_file_path, md5_hash): 183 | raise Exception("Error validating the file against its MD5 hash") 184 | shutil.move(tmp_file_path, file_path) 185 | elif file_size == -1: 186 | raise Exception("Error getting Content-Length from server: %s" % url) 187 | -------------------------------------------------------------------------------- /pysradb/exceptions.py: -------------------------------------------------------------------------------- 1 | """This file contains custom Exceptions for pysradb 2 | """ 3 | 4 | 5 | class MissingQueryException(Exception): 6 | """Exception raised when the user did not supply any query fields. 7 | 8 | Attributes: 9 | message: string 10 | Error message for this Exception 11 | 12 | """ 13 | 14 | def __init__(self): 15 | self.message = ( 16 | "No valid query has been supplied. \n" 17 | "A query must be supplied to one of the following fields:\n" 18 | "[--query, --accession, --organism, --layout, --mbases, --publication-date," 19 | " --platform, --selection, --source, --strategy, --title]" 20 | ) 21 | super().__init__(self.message) 22 | 23 | 24 | class IncorrectFieldException(Exception): 25 | """Exception raised when the user enters incorrect inputs for a flag.""" 26 | 27 | pass 28 | -------------------------------------------------------------------------------- /pysradb/filter_attrs.py: -------------------------------------------------------------------------------- 1 | import re 2 | import warnings 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def _get_sample_attr_keys(sample_attribute): 9 | if sample_attribute is None: 10 | return None, None 11 | sample_attribute_splitted = sample_attribute.split("||") 12 | split_by_colon = [ 13 | str(attr).strip().split(": ") for attr in sample_attribute_splitted 14 | ] 15 | 16 | # Iterate once more to consider first one as the key 17 | # and remaining as the value 18 | # This is because of bad annotations like in this example 19 | # Example: isolate: not applicable || organism: Mus musculus || cell_line: 17-Cl1 ||\ 20 | # infect: MHV-A59 || time point: 5: hour || compound: cycloheximide ||\ 21 | # sequencing protocol: RiboSeq || biological repeat: long read sequencing 22 | # Notice the `time: 5: hour` 23 | # sample_attribute: investigation type: metagenome || project name: Landsort Depth 20090415 transect || 24 | # sequencing method: 454 || collection date: 2009-04-15 || ammonium: 8.7: µM || chlorophyll: 0: µg/L || 25 | # dissolved oxygen: -1.33: µmol/kg || nitrate: 0.02: µM || nitrogen: 0: µM || 26 | # environmental package: water || geographic location (latitude): 58.6: DD || 27 | # geographic location (longitude): 18.2: DD || geographic location (country and/or sea,region): Baltic Sea || 28 | # environment (biome): 00002150 || environment (feature): 00002150 || environment (material): 00002150 || 29 | # depth: 400: m || Phosphate: || Total phosphorous: || Silicon: 30 | # Handle empty cases as above 31 | split_by_colon = [attr for attr in split_by_colon if len(attr) >= 2] 32 | 33 | for index, element in enumerate(split_by_colon): 34 | if len(element) > 2: 35 | key = element[0].strip() 36 | value = ":".join(element[1:]).strip() 37 | split_by_colon[index] = [key, value] 38 | 39 | try: 40 | sample_attribute_dict = dict(split_by_colon) 41 | except ValueError: 42 | print("This is most likely a bug, please report it upstream.") 43 | print(("sample_attribute: {}".format(sample_attribute))) 44 | raise 45 | sample_attribute_keys = list( 46 | map( 47 | lambda x: re.sub(r"\s+", " ", x.strip().replace(" ", "_").lower()), 48 | list(sample_attribute_dict.keys()), 49 | ) 50 | ) 51 | sample_attribute_values = list( 52 | map( 53 | lambda x: re.sub(r"\s+", " ", x.strip().lower().strip().replace(",", "__")), 54 | list(sample_attribute_dict.values()), 55 | ) 56 | ) 57 | return sample_attribute_keys, sample_attribute_values 58 | 59 | 60 | def expand_sample_attribute_columns(metadata_df): 61 | """Expand sample attribute columns to individual columns. 62 | 63 | Since the sample_attribute column content can be different 64 | for differnt rows even if coming from the same project (SRP), 65 | we explicitly iterate through the rows to first determine 66 | what additional columns need to be created. 67 | 68 | 69 | Parameters 70 | ---------- 71 | metadata_df: DataFrame 72 | Dataframe as obtained from sra_metadata 73 | or equivalent 74 | 75 | Returns 76 | ------- 77 | expanded_df: DataFrame 78 | Dataframe with additionals columns pertaining 79 | to sample_attribute appended 80 | """ 81 | additional_columns = [] 82 | metadata_df = metadata_df.copy() 83 | for idx, row in metadata_df.iterrows(): 84 | sample_attribute = row["sample_attribute"] 85 | if not sample_attribute: 86 | continue 87 | sample_attribute = sample_attribute.strip() 88 | sample_attribute_keys, _ = _get_sample_attr_keys(sample_attribute) 89 | if sample_attribute_keys: 90 | additional_columns += sample_attribute_keys 91 | additional_columns = list(sorted(set(additional_columns))) 92 | # if any of the additional column already exists 93 | # call the additional column as *_expanded 94 | additional_columns = list( 95 | map( 96 | lambda x: x if x not in metadata_df.columns.tolist() else x + "_expanded", 97 | additional_columns, 98 | ) 99 | ) 100 | additional_columns = list(sorted(additional_columns)) 101 | empty_df = pd.DataFrame(columns=additional_columns) 102 | metadata_df_expanded = pd.concat([metadata_df, empty_df], axis=1) 103 | for idx, row in metadata_df_expanded.iterrows(): 104 | sample_attribute = row["sample_attribute"] 105 | sample_attribute_keys, sample_attribute_values = _get_sample_attr_keys( 106 | sample_attribute 107 | ) 108 | if sample_attribute_keys: 109 | sample_attribute_keys = list( 110 | map( 111 | lambda x: ( 112 | x if x not in metadata_df.columns.tolist() else x + "_expanded" 113 | ), 114 | sample_attribute_keys, 115 | ) 116 | ) 117 | metadata_df_expanded.loc[idx, sample_attribute_keys] = sample_attribute_values 118 | if np.nan in metadata_df_expanded.columns.tolist(): 119 | del metadata_df_expanded[np.nan] 120 | return metadata_df_expanded 121 | 122 | 123 | def guess_cell_type(sample_attribute): 124 | """Guess possible cell line from sample_attribute data. 125 | 126 | Parameters 127 | ---------- 128 | sample_attribute: string 129 | sample_attribute string as in the metadata column 130 | 131 | Returns 132 | ------- 133 | cell_type: string 134 | Possible cell type of sample. 135 | Returns None if no match found. 136 | """ 137 | sample_attribute = str(sample_attribute) 138 | cell_type = None 139 | if "cell line:" in sample_attribute: 140 | x = re.search(r"cell line: \w+", sample_attribute) 141 | cell_type = re.sub(r"\s+", " ", x.group(0).lstrip("cell line:").lower().strip()) 142 | if "cell_line:" in sample_attribute: 143 | x = re.search(r"cell_line: \w+", sample_attribute) 144 | cell_type = re.sub(r"\s+", " ", x.group(0).lstrip("cell_line:").lower().strip()) 145 | if "cell-line:" in sample_attribute: 146 | x = re.search(r"cell-line: \w+", sample_attribute) 147 | cell_type = re.sub(r"\s+", " ", x.group(0).lstrip("cell-line:").lower().strip()) 148 | if "cell_type:" in sample_attribute: 149 | x = re.search(r"cell_type: \w+", sample_attribute) 150 | return re.sub(r"\s+", " ", x.group(0).lstrip("cell_type:").lower().strip()) 151 | if "source_name:" in sample_attribute: 152 | x = re.search(r"source_name: \w+", sample_attribute) 153 | cell_type = re.sub( 154 | r"\s+", " ", x.group(0).lstrip("source_name:").lower().strip() 155 | ) 156 | else: 157 | warnings.warn( 158 | "Couldn't parse {} for cell line".format(sample_attribute), UserWarning 159 | ) 160 | return cell_type 161 | 162 | 163 | def guess_tissue_type(sample_attribute): 164 | """Guess tissue type from sample_attribute data. 165 | 166 | Parameters 167 | ---------- 168 | sample_attribute: string 169 | sample_attribute string as in the metadata column 170 | 171 | Returns 172 | ------- 173 | tissue_type: string 174 | Possible cell type of sample. 175 | Returns None if no match found. 176 | """ 177 | sample_attribute = str(sample_attribute) 178 | tissue_type = None 179 | if "tissue: " in sample_attribute: 180 | x = re.search(r"tissue: \w+", sample_attribute) 181 | tissue_type = re.sub(r"\s+", " ", x.group(0).lstrip("tissue:").lower().strip()) 182 | else: 183 | warnings.warn( 184 | "Couldn't parse {} for tissue".format(sample_attribute), UserWarning 185 | ) 186 | return tissue_type 187 | 188 | 189 | def guess_strain_type(sample_attribute): 190 | """Guess strain type from sample_attribute data. 191 | 192 | Parameters 193 | ---------- 194 | sample_attribute: string 195 | sample_attribute string as in the metadata column 196 | 197 | Returns 198 | ------- 199 | strain_type: string 200 | Possible cell type of sample. 201 | Returns None if no match found. 202 | """ 203 | sample_attribute = str(sample_attribute) 204 | strain_type = None 205 | if "strain: " in sample_attribute: 206 | x = re.search(r"strain: \w+", sample_attribute) 207 | strain_type = re.sub(r"\s+", " ", x.group(0).lstrip("strain:").lower().strip()) 208 | else: 209 | warnings.warn( 210 | "Couldn't parse {} for strain".format(sample_attribute), UserWarning 211 | ) 212 | return strain_type 213 | -------------------------------------------------------------------------------- /pysradb/geodb.py: -------------------------------------------------------------------------------- 1 | """Methods to interact with SRA""" 2 | 3 | # This is now defunct and will be removed in a future release. 4 | 5 | import gzip 6 | import os 7 | import re 8 | import sys 9 | 10 | from .basedb import BASEdb 11 | from .utils import _get_url 12 | from .utils import copyfileobj 13 | from .utils import get_gzip_uncompressed_size 14 | 15 | PY3 = True 16 | if sys.version_info[0] < 3: 17 | PY3 = False 18 | 19 | GEOmetadb_URL = "http://starbuck1.s3.amazonaws.com/sradb/GEOmetadb.sqlite.gz" 20 | 21 | 22 | def download_geodb_file(download_dir=os.getcwd(), overwrite=True): 23 | """Download GEOmetadb.sqlite file. 24 | 25 | Parameters 26 | ---------- 27 | download_dir: string 28 | Directory to download SRAmetadb.sqlite 29 | overwrite: bool 30 | overwrite existing file(s). 31 | Set to True by default. 32 | 33 | """ 34 | download_location = os.path.join(download_dir, "GEOmetadb.sqlite.gz") 35 | download_location_unzip = download_location.rstrip(".gz") 36 | 37 | if os.path.isfile(download_location) and overwrite is False: 38 | raise RuntimeError( 39 | "{} already exists! Set `overwrite=True` to redownload.".forma( 40 | download_location 41 | ) 42 | ) 43 | if os.path.isfile(download_location_unzip) and overwrite is False: 44 | raise RuntimeError( 45 | "{} already exists! Set `overwrite=True` to redownload.".format( 46 | download_location_unzip 47 | ) 48 | ) 49 | 50 | try: 51 | _get_url(GEOmetadb_URL, download_location) 52 | except Exception as e: 53 | raise RuntimeError( 54 | "Could not use {}.\nException: {}.\n".format(GEOmetadb_URL, e) 55 | ) 56 | print("Extracting {} ...".format(download_location)) 57 | filesize = get_gzip_uncompressed_size(download_location) 58 | with gzip.open(download_location, "rb") as fh_in: 59 | with open(download_location_unzip, "wb") as fh_out: 60 | copyfileobj( 61 | fh_in, 62 | fh_out, 63 | filesize=filesize, 64 | desc="Extracting {}".format("GEOmetadb.sqlite.gz"), 65 | ) 66 | print("Done!") 67 | db = GEOdb(download_location_unzip) 68 | metadata = db.query("SELECT * FROM metaInfo") 69 | db.close() 70 | print("Metadata associated with {}:".format(download_location_unzip)) 71 | print(metadata) 72 | 73 | 74 | class GEOdb(BASEdb): 75 | def __init__(self, sqlite_file): 76 | """Initialize SRAdb. 77 | 78 | Parameters 79 | ---------- 80 | 81 | sqlite_file: string 82 | Path to unzipped SRAmetadb.sqlite file 83 | 84 | 85 | """ 86 | super(GEOdb, self).__init__(sqlite_file) 87 | self._db_type = "GEO" 88 | self.valid_in_type = ["GSE", "GPL", "GSM", "GDS"] 89 | 90 | def gse_metadata(self, gse): 91 | """Get metadata for GSE ID. 92 | 93 | Parameters 94 | ---------- 95 | gse: string 96 | GSE ID 97 | 98 | Returns 99 | ------- 100 | metadata_df: DataFrame 101 | A dataframe with relevant fields 102 | """ 103 | return self.query("SELECT * from gse WHERE gse='{}';".format(gse)) 104 | 105 | def gsm_metadata(self, gsm): 106 | """Get metadata for GSM ID. 107 | 108 | Parameters 109 | ---------- 110 | gsm: string 111 | GSM ID 112 | 113 | Returns 114 | ------- 115 | metadata_df: DataFrame 116 | A dataframe with relevant fields 117 | """ 118 | return self.query("SELECT * from gsm WHERE gsm='{}';".format(gsm)) 119 | 120 | def geo_convert(self, from_acc): 121 | """Convert one GEO accession to other. 122 | 123 | Parameters 124 | ---------- 125 | from_acc: string 126 | GPL/GSE/GSM accession ID 127 | 128 | Returns 129 | ------- 130 | mapping_df: DataFrame 131 | A dataframe with relevant mappings 132 | """ 133 | return self.query( 134 | "SELECT * FROM geoConvert WHERE from_acc='{}';".format(from_acc) 135 | ) 136 | 137 | def gse_to_gsm(self, gse): 138 | """Fetch GSMs for a GSE. 139 | 140 | Parameters 141 | ---------- 142 | gse: string 143 | GSE ID 144 | 145 | Returns 146 | ------- 147 | mapping_df: DataFrame 148 | A dataframe with relevant mappings 149 | """ 150 | return self.query("SELECT * FROM gse_gsm WHERE gse='{}'".format(gse)) 151 | 152 | def gsm_to_gse(self, gsm): 153 | """Fetch GSE for a GSM. 154 | 155 | Parameters 156 | ---------- 157 | gsm: string 158 | GSM ID 159 | 160 | Returns 161 | ------- 162 | mapping_df: DataFrame 163 | A dataframe with relevant mappings 164 | """ 165 | mapping_df = self.query("SELECT * FROM gse_gsm WHERE gsm='{}'".format(gsm)) 166 | return mapping_df.loc[:, ["gsm", "gse"]] 167 | 168 | def guess_srp_from_gse(self, gse): 169 | """Convert GSE to SRP id. 170 | 171 | Parameters 172 | ---------- 173 | gse: string 174 | GSE ID 175 | 176 | Returns 177 | ------- 178 | srp: string 179 | SRP ID 180 | """ 181 | results = self.query('SELECT * FROM gse WHERE gse = "' + gse + '"') 182 | if results.shape[0] == 1: 183 | supp_file = results["supplementary_file"][0] 184 | if supp_file: 185 | splitted = supp_file.split(";") 186 | if len(splitted): 187 | match = re.findall("SRP.*", splitted[-1]) 188 | if len(match): 189 | srp = match[0].split("/")[-1] 190 | return srp 191 | return None 192 | -------------------------------------------------------------------------------- /pysradb/geoweb.py: -------------------------------------------------------------------------------- 1 | """Utilities to interact with GEO online""" 2 | 3 | import gzip 4 | import os 5 | import re 6 | import requests 7 | import sys 8 | from lxml import html 9 | 10 | from .download import download_file 11 | from .geodb import GEOdb 12 | from .utils import _get_url 13 | from .utils import copyfileobj 14 | from .utils import get_gzip_uncompressed_size 15 | 16 | PY3 = True 17 | if sys.version_info[0] < 3: 18 | PY3 = False 19 | 20 | 21 | class GEOweb(GEOdb): 22 | def __init__(self): 23 | """Initialize GEOweb without any database.""" 24 | 25 | def get_download_links(self, gse): 26 | """Obtain all links from the GEO FTP page. 27 | 28 | Parameters 29 | ---------- 30 | gse: string 31 | GSE ID 32 | 33 | Returns 34 | ------- 35 | links: list 36 | List of all valid downloadable links present for a GEO ID 37 | """ 38 | prefix = gse[:-3] 39 | url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{prefix}nnn/{gse}/suppl/" 40 | link_objects = html.fromstring(requests.get(url).content).xpath("//a") 41 | links = [i.attrib["href"] for i in link_objects] 42 | # remove vulnerability link 43 | links = [ 44 | link 45 | for link in links 46 | if link != "https://www.hhs.gov/vulnerability-disclosure-policy/index.html" 47 | ] 48 | # Check if returned results are a valid page - a link to the 49 | # home page only exists where the GSE ID dow not exist 50 | if "/" in links: 51 | raise KeyError(f"The provided GEO ID {gse} does not exist.") 52 | 53 | # The list of links for a valid GSE ID also contains a link to 54 | # the parent directory - we do not want that 55 | links = [i for i in links if "geo/series/" not in i] 56 | 57 | # The links are relative, we need absolute links to download 58 | links = [i for i in links] 59 | 60 | return links, url 61 | 62 | def download(self, links, root_url, gse, verbose=False, out_dir=None): 63 | """Download GEO files. 64 | 65 | Parameters 66 | ---------- 67 | links: list 68 | List of all links valid downloadable present for a GEO ID 69 | root_url: string 70 | url for root directory for a GEO ID 71 | gse: string 72 | GEO ID 73 | verbose: bool 74 | Print file list 75 | out_dir: string 76 | Directory location for download 77 | """ 78 | if out_dir is None: 79 | out_dir = os.path.join(os.getcwd(), "pysradb_downloads") 80 | 81 | # store output in a separate directory 82 | out_dir = os.path.join(out_dir, gse) 83 | os.makedirs(out_dir, exist_ok=True) 84 | 85 | # Display files to be downloaded 86 | print("\nThe following files will be downloaded: \n") 87 | for link in links: 88 | print(link) 89 | print(os.linesep) 90 | # Check if we can access list of files in the tar file 91 | tar_list = [i for i in links if ".tar" in i] 92 | if "filelist.txt" in links: 93 | tar_file = tar_list[0] 94 | if verbose: 95 | print(f"\nThe tar file {tar_file} contains the following files:\n") 96 | file_list_contents = requests.get( 97 | root_url + "filelist.txt" 98 | ).content.decode("utf-8") 99 | print(file_list_contents) 100 | 101 | # Download files 102 | for link in links: 103 | # add a prefix to distinguish filelist.txt from different downloads 104 | prefix = "" 105 | if link == "filelist.txt": 106 | prefix = gse + "_" 107 | geo_path = os.path.join(out_dir, prefix + link) 108 | download_file( 109 | root_url.lstrip("https://") + link, geo_path, show_progress=True 110 | ) 111 | -------------------------------------------------------------------------------- /pysradb/utils.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import gzip 3 | import io 4 | import ntpath 5 | import os 6 | import shlex 7 | import subprocess 8 | import urllib.request as urllib_request 9 | import warnings 10 | 11 | import requests 12 | from requests.adapters import HTTPAdapter 13 | from requests.packages.urllib3.util.retry import Retry 14 | from tqdm.autonotebook import tqdm 15 | 16 | from .exceptions import IncorrectFieldException 17 | 18 | warnings.simplefilter(action="ignore", category=FutureWarning) 19 | 20 | 21 | tqdm.pandas() 22 | 23 | 24 | def path_leaf(path): 25 | """Get path's tail from a filepath. 26 | 27 | Parameters 28 | ---------- 29 | path: string 30 | Filepath 31 | 32 | Returns 33 | ------- 34 | tail: string 35 | Filename 36 | """ 37 | head, tail = ntpath.split(path) 38 | return tail or ntpath.basename(head) 39 | 40 | 41 | def requests_3_retries(): 42 | """Generates a requests session object that allows 3 retries. 43 | 44 | Returns 45 | ------- 46 | session: requests.Session 47 | requests session object that allows 3 retries for server-side 48 | errors. 49 | """ 50 | session = requests.Session() 51 | retry = Retry( 52 | total=3, 53 | backoff_factor=0.5, 54 | status_forcelist=[500, 502, 503, 504], 55 | ) 56 | adapter = HTTPAdapter(max_retries=retry) 57 | session.mount("http://", adapter) 58 | session.mount("https://", adapter) 59 | return session 60 | 61 | 62 | def scientific_name_to_taxid(name): 63 | """Converts a scientific name to its corresponding taxonomy ID. 64 | 65 | Parameters 66 | ---------- 67 | name: str 68 | Scientific name of interest. 69 | 70 | Returns 71 | ------- 72 | taxid: str 73 | Taxonomy Id of the Scientific name. 74 | 75 | Raises 76 | ------ 77 | IncorrectFieldException 78 | If the scientific name cannot be found. 79 | 80 | """ 81 | 82 | r = requests.get( 83 | "https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name/" + name, 84 | timeout=5, 85 | ) 86 | if r.status_code == 404: 87 | raise IncorrectFieldException(f"Unknown scientific name: {name}") 88 | r.raise_for_status() 89 | return r.json()[0]["taxId"] 90 | 91 | 92 | def unique(sequence): 93 | """Get unique elements from a list maintaining the order. 94 | 95 | Parameters 96 | ---------- 97 | input_list: list 98 | 99 | Returns 100 | ------- 101 | unique_list: list 102 | List with unique elements maintaining the order 103 | """ 104 | visited = set() 105 | return [x for x in sequence if not (x in visited or visited.add(x))] 106 | 107 | 108 | class TqdmUpTo(tqdm): 109 | """Alternative Class-based version of the above. 110 | Provides `update_to(n)` which uses `tqdm.update(delta_n)`. 111 | Inspired by [twine#242](https://github.com/pypa/twine/pull/242), 112 | [here](https://github.com/pypa/twine/commit/42e55e06). 113 | 114 | Credits: 115 | https://github.com/tqdm/tqdm/blob/69326b718905816bb827e0e66c5508c9c04bc06c/examples/tqdm_wget.py 116 | """ 117 | 118 | def update_to(self, b=1, bsize=1, tsize=None): 119 | """ 120 | b : int, optional 121 | Number of blocks transferred so far [default: 1]. 122 | bsize : int, optional 123 | Size of each block (in tqdm units) [default: 1]. 124 | tsize : int, optional 125 | Total size (in tqdm units). If [default: None] remains unchanged. 126 | """ 127 | if tsize is not None: 128 | self.total = tsize 129 | self.update(b * bsize - self.n) # will also set self.n = b * bsize 130 | 131 | 132 | def _extract_first_field(data): 133 | """Extract first field from a list of fields.""" 134 | return list(next(iter(zip(*data)))) 135 | 136 | 137 | def _find_aspera_keypath(aspera_dir=None): 138 | """Locate aspera key. 139 | 140 | Parameters 141 | ---------- 142 | aspera_dir: string 143 | Location to aspera directory (optional) 144 | 145 | Returns 146 | ------- 147 | aspera_keypath: string 148 | Location to aspera key 149 | """ 150 | if aspera_dir is None: 151 | aspera_dir = os.path.join(os.path.expanduser("~"), ".aspera") 152 | aspera_keypath = os.path.join( 153 | aspera_dir, "connect", "etc", "asperaweb_id_dsa.openssh" 154 | ) 155 | if os.path.isfile(aspera_keypath): 156 | return aspera_keypath 157 | 158 | 159 | def mkdir_p(path): 160 | """Python version mkdir -p 161 | 162 | Parameters 163 | ---------- 164 | path : string 165 | Path to directory to create 166 | """ 167 | if path: 168 | try: 169 | os.makedirs(path) 170 | except OSError as exc: # Python >2.5 171 | if exc.errno == errno.EEXIST and os.path.isdir(path): 172 | pass 173 | else: 174 | raise 175 | 176 | 177 | def order_dataframe(df, columns): 178 | """Order a dataframe 179 | 180 | Order a dataframe by moving the `columns` in the front 181 | 182 | Parameters 183 | ---------- 184 | df: Dataframe 185 | Dataframe 186 | columns: list 187 | List of columns that need to be put in front 188 | """ 189 | remaining_columns = [w for w in df.columns if w not in columns] 190 | df = df[columns + remaining_columns] 191 | return df 192 | 193 | 194 | def _get_url(url, download_to, show_progress=True): 195 | """Download anything at a given url. 196 | 197 | Parameters 198 | ---------- 199 | url: string 200 | http/https/ftp url 201 | download_to: string 202 | File location to write the downloaded file to 203 | show_progress: bool 204 | Set to True by default to print progress bar 205 | """ 206 | desc_file = "Downloading {}".format(url.split("/")[-1]) 207 | mkdir_p(os.path.dirname(download_to)) 208 | if show_progress: 209 | with TqdmUpTo( 210 | unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=desc_file 211 | ) as t: 212 | urllib_request.urlretrieve( 213 | url, download_to, reporthook=t.update_to, data=None 214 | ) 215 | else: 216 | urllib_request.urlretrieve(url, download_to) 217 | 218 | 219 | def run_command(command, verbose=False): 220 | """Run a shell command""" 221 | process = subprocess.Popen( 222 | shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT 223 | ) 224 | 225 | while True: 226 | output = process.stdout.readline().strip() 227 | output = output.decode("utf-8") 228 | if output == "" and process.poll() is not None: 229 | break 230 | if output: 231 | if verbose: 232 | print((str(output.strip()))) 233 | rc = process.poll() 234 | return rc 235 | 236 | 237 | def get_gzip_uncompressed_size(filepath): 238 | """Get uncompressed size of a .gz file 239 | 240 | Parameters 241 | ---------- 242 | filepath: string 243 | Path to input file 244 | 245 | Returns 246 | ------- 247 | filesize: int 248 | Uncompressed file size 249 | """ 250 | with gzip.open(filepath, "rb") as file_obj: 251 | return file_obj.seek(0, io.SEEK_END) 252 | 253 | 254 | def confirm(preceeding_text): 255 | """Confirm user input. 256 | 257 | Parameters 258 | ---------- 259 | preceeding_text: str 260 | Text to print 261 | 262 | Returns 263 | ------- 264 | response: bool 265 | """ 266 | print(os.linesep, flush=True) 267 | notification_str = "Please respond with 'y' or 'n'" 268 | while True: 269 | choice = input("{} [Y/n]: ".format(preceeding_text)).lower() 270 | if choice in ["yes", "y"] or not choice: 271 | return True 272 | if choice in ["no", "n"]: 273 | return False 274 | print(notification_str, flush=True) 275 | 276 | 277 | def copyfileobj(fsrc, fdst, bufsize=16384, filesize=None, desc=""): 278 | """Copy file object with a progress bar. 279 | 280 | Parameters 281 | ---------- 282 | fsrc: filehandle 283 | Input file handle 284 | fdst: filehandle 285 | Output file handle 286 | bufsize: int 287 | Length of output buffer 288 | filesize: int 289 | Input file file size 290 | desc: string 291 | Description for tqdm status 292 | """ 293 | with tqdm( 294 | total=filesize, 295 | unit="B", 296 | unit_scale=True, 297 | miniters=1, 298 | unit_divisor=1024, 299 | desc=desc, 300 | ) as pbar: 301 | while True: 302 | buf = fsrc.read(bufsize) 303 | if not buf: 304 | break 305 | fdst.write(buf) 306 | pbar.update(len(buf)) 307 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lxml>=4.6.3 2 | pandas>=1.3.2 3 | requests>=2.26.0 4 | requests-ftp>=0.3.1 5 | tqdm>=4.62.1 6 | xmltodict>=0.12.0 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 2.2.0 3 | commit = True 4 | tag = False 5 | parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 6 | serialize = 7 | {major}.{minor}.{patch}-{release}{build} 8 | {major}.{minor}.{patch} 9 | 10 | [bumpversion:part:release] 11 | optional_value = prod 12 | first_value = dev 13 | values = 14 | dev 15 | prod 16 | 17 | [bumpversion:part:build] 18 | 19 | [bumpversion:file:setup.py] 20 | search = version="{current_version}" 21 | replace = version="{new_version}" 22 | 23 | [bumpversion:file:pysradb/__init__.py] 24 | search = __version__ = "{current_version}" 25 | replace = __version__ = "{new_version}" 26 | 27 | [flake8] 28 | exclude = docs 29 | 30 | [aliases] 31 | test = pytest 32 | 33 | [tool:pytest] 34 | collect_ignore = ["setup.py"] 35 | -------------------------------------------------------------------------------- /tests/_test_geodb.py: -------------------------------------------------------------------------------- 1 | """Tests for geodb.py 2 | """ 3 | 4 | import pytest 5 | 6 | from pysradb import GEOdb 7 | 8 | """ 9 | 10 | @pytest.fixture(scope="module") 11 | def geodb_connection(conf_download_geodb_file): 12 | db_file = conf_download_geodb_file 13 | db = GEOdb(db_file) 14 | return db 15 | 16 | 17 | def test_all_row_counts(geodb_connection): 18 | assert geodb_connection.all_row_counts().loc["metaInfo", "count"] == 2 19 | 20 | 21 | def test_gse_metadata(geodb_connection): 22 | df = geodb_connection.gse_metadata("GSE114314") 23 | assert int(df["pubmed_id"][0]) == 29925996 24 | 25 | 26 | def test_gse_to_gsm(geodb_connection): 27 | df = geodb_connection.gse_to_gsm("GSE114314") 28 | assert df["gsm"][3] == "GSM3139412" 29 | 30 | 31 | def test_geo_convert(geodb_connection): 32 | df = geodb_connection.geo_convert("GSM3139409") 33 | assert df["to_acc"][0] == "GSE114314" 34 | 35 | 36 | def test_guess_srp_form_gse(geodb_connection): 37 | srp = geodb_connection.guess_srp_from_gse("GSE73136") 38 | assert srp == "SRP063852" 39 | """ 40 | -------------------------------------------------------------------------------- /tests/_test_pcli.py: -------------------------------------------------------------------------------- 1 | """Tests for cli.py 2 | """ 3 | 4 | import os 5 | import subprocess 6 | import sys 7 | from shlex import quote 8 | from shlex import split 9 | 10 | import pytest 11 | 12 | from pysradb import SRAdb 13 | 14 | 15 | def run(command): 16 | if sys.version_info.minor >= 7: 17 | result = subprocess.run(split(command), capture_output=True) 18 | else: 19 | result = subprocess.run(split(command), check=True, stdout=subprocess.PIPE) 20 | return str(result.stdout).strip() 21 | 22 | 23 | @pytest.fixture(scope="module") 24 | def sradb_connection(conf_download_sradb_file): 25 | db_file = conf_download_sradb_file 26 | db = SRAdb(db_file) 27 | return db 28 | 29 | 30 | def test_all_row_counts_sra(sradb_connection): 31 | assert sradb_connection.all_row_counts().loc["metaInfo", "count"] == 2 32 | 33 | 34 | @pytest.mark.xfail 35 | def test_download(): 36 | result = run( 37 | "pysradb download -y --db data/SRAmetadb.sqlite --out-dir srp_downloads -p SRP063852" 38 | ) 39 | assert "SRP063852" in result 40 | assert os.path.getsize("srp_downloads/SRP063852/SRX1254413/SRR2433794.sra") 41 | 42 | 43 | def test_sra_metadata(): 44 | result = run("pysradb metadata SRP098789 --db data/SRAmetadb.sqlite") 45 | assert "SRX2536403" in result 46 | 47 | 48 | def test_sra_metadata(): 49 | result = run( 50 | "pysradb metadata SRP098789 --db data/SRAmetadb.sqlite --detailed --expand" 51 | ) 52 | assert "treatment_time" in result 53 | 54 | 55 | def test_srp_to_srx(): 56 | result = run("pysradb srp-to-srx SRP098789 --db data/SRAmetadb.sqlite") 57 | assert "SRX2536403" in result 58 | 59 | 60 | def test_srp_assay(): 61 | result = run("pysradb metadata SRP098789 --db data/SRAmetadb.sqlite --assay") 62 | assert "RNA-Seq" in result 63 | 64 | 65 | def srr_to_srx(): 66 | result = run( 67 | "pysradb srr-to-srx --db data/SRAmetadb.sqlite SRR5227288 SRR649752 --desc" 68 | ) 69 | assert "3T3 cells" in result 70 | 71 | 72 | def srx_to_srr(): 73 | result = run( 74 | "pysradb srr-to-srx --db data/SRAmetadb.sqlite SRX217956 SRX2536403 --desc" 75 | ) 76 | assert "3T3 cells" in result 77 | 78 | 79 | def test_sra_metadata_detail(): 80 | result = run( 81 | "pysradb metadata --db data/SRAmetadb.sqlite SRP075720 --detailed --expand" 82 | ) 83 | assert "retina" in result 84 | 85 | 86 | def test_srp_to_gse(): 87 | result = run("pysradb srp-to-gse --db data/SRAmetadb.sqlite SRP075720") 88 | assert "GSE81903" in result 89 | 90 | 91 | def test_gsm_to_srp(): 92 | result = run("pysradb gsm-to-srp --db data/SRAmetadb.sqlite GSM2177186") 93 | assert "SRP075720" in result 94 | 95 | 96 | def test_gsm_to_gse(): 97 | result = run("pysradb gsm-to-gse --db data/SRAmetadb.sqlite GSM2177186") 98 | assert "GSE81903" in result 99 | 100 | 101 | def test_gsm_to_srr(): 102 | result = run( 103 | "pysradb gsm-to-srr --db data/SRAmetadb.sqlite GSM2177186 --detailed --desc --expand" 104 | ) 105 | assert "GSM2177186_r1" in result 106 | 107 | 108 | """ 109 | def test_assay_uniq(): 110 | result = subprocess.check_output( 111 | "pysradb metadata SRP000941 --db data/SRAmetadb.sqlite --assay | " 112 | + " tr -s {}".format(quote(" ")) 113 | + " | cut -f5 -d {}".format(quote(" ")) 114 | + " | sort | uniq -c", 115 | shell=True, 116 | ) 117 | assert "Bisulfite-Seq" in str(result) 118 | 119 | 120 | def test_pipe_download(): 121 | result = subprocess.check_output( 122 | "pysradb metadata SRP000941 --assay | " 123 | + " grep {}".format(quote("study\|RNA-Seq")) 124 | + " | head -2 | pysradb download --out-dir srp_downloads", 125 | shell=True, 126 | ) 127 | assert os.path.getsize("srp_downloads/SRP000941/SRX007165/SRR020287.sra") 128 | assert "following" in str(result) 129 | """ 130 | -------------------------------------------------------------------------------- /tests/_test_sradb.py: -------------------------------------------------------------------------------- 1 | """Tests for sradb.py 2 | """ 3 | 4 | import os 5 | from sqlite3 import OperationalError 6 | 7 | import pytest 8 | 9 | from pysradb import SRAdb 10 | from pysradb.filter_attrs import guess_cell_type 11 | from pysradb.filter_attrs import guess_strain_type 12 | from pysradb.filter_attrs import guess_tissue_type 13 | 14 | 15 | @pytest.fixture(scope="module") 16 | def sradb_connection(conf_download_sradb_file): 17 | db_file = conf_download_sradb_file 18 | db = SRAdb(db_file) 19 | return db 20 | 21 | 22 | def test_list_tables(sradb_connection): 23 | sra_tables = sradb_connection.list_tables() 24 | assert sra_tables == [ 25 | "metaInfo", 26 | "submission", 27 | "study", 28 | "sample", 29 | "experiment", 30 | "run", 31 | "sra", 32 | "sra_ft", 33 | "sra_ft_content", 34 | "sra_ft_segments", 35 | "sra_ft_segdir", 36 | "col_desc", 37 | "fastq", 38 | ] 39 | 40 | 41 | def test_list_fields(sradb_connection): 42 | fields = sradb_connection.list_fields("study") 43 | assert fields == [ 44 | "study_ID", 45 | "study_alias", 46 | "study_accession", 47 | "study_title", 48 | "study_type", 49 | "study_abstract", 50 | "broker_name", 51 | "center_name", 52 | "center_project_name", 53 | "study_description", 54 | "related_studies", 55 | "primary_study", 56 | "sra_link", 57 | "study_url_link", 58 | "xref_link", 59 | "study_entrez_link", 60 | "ddbj_link", 61 | "ena_link", 62 | "study_attribute", 63 | "submission_accession", 64 | "sradb_updated", 65 | ] 66 | 67 | 68 | def test_desc_table(sradb_connection): 69 | names = sorted(sradb_connection.desc_table("sra_ft").name.tolist()) 70 | assert names[:7] == [ 71 | "SRR_bamFile", 72 | "SRX_bamFile", 73 | "SRX_fastqFTP", 74 | "adapter_spec", 75 | "anonymized_name", 76 | "base_caller", 77 | "bases", 78 | ] 79 | 80 | 81 | def test_all_row_counts(sradb_connection): 82 | assert sradb_connection.all_row_counts().loc["metaInfo", "count"] == 2 83 | 84 | 85 | def test_all_row_counts2(sradb_connection): 86 | assert len(sradb_connection.all_row_counts()) == 13 87 | 88 | 89 | def test_sra_metadata(sradb_connection): 90 | df = sradb_connection.sra_metadata("SRP017942") 91 | assert df["experiment_accession"][0] == "SRX217027" 92 | 93 | 94 | def test_sra_metadata2(sradb_connection): 95 | df = sradb_connection.sra_metadata( 96 | "SRP017942", detailed=True, expand_sample_attributes=True 97 | ) 98 | assert "3xflag-gfp" in df["transfected_with"].tolist() 99 | 100 | 101 | def test_search(sradb_connection): 102 | df = sradb_connection.search_sra(search_str="breast cancer") 103 | assert len(df.index) 104 | 105 | 106 | def test_search2(sradb_connection): 107 | df = sradb_connection.search_sra( 108 | '"salivary microbiome" AND "diabetes mellitus"', detailed=True 109 | ) 110 | assert "SRP241848" in df["study_accession"].to_list() 111 | 112 | 113 | def test_search_by_expt_id(sradb_connection): 114 | df = sradb_connection.search_by_expt_id("SRX1254413") 115 | assert df.study_name.tolist()[0] == "GSE73136" 116 | 117 | 118 | def test_search_by_expt_id2(sradb_connection): 119 | srx_id = "SRX116363" 120 | df_expt = sradb_connection.search_by_expt_id(srx_id) 121 | sra_id = df_expt["submission_accession"].loc[0] 122 | df = sradb_connection.sra_metadata(sra_id) 123 | connected_srp = sradb_connection.srx_to_srp("SRX116363").iloc[0, 1] 124 | assert (srx_id in df["experiment_accession"].to_list()) and ( 125 | connected_srp == "SRP010374" 126 | ) 127 | 128 | 129 | # def test_download_fasp(sradb_connection): 130 | # df = sradb_connection.sra_metadata("SRP098789") 131 | # df = df[df.experiment_accession == "SRX2536403"] 132 | # sradb_connection.download(df=df, out_dir="data/", skip_confirmation=True) 133 | # assert os.path.isfile("data/SRP098789/SRX2536403/SRR5227288.sra") 134 | # assert os.path.getsize("data/SRP098789/SRX2536403/SRR5227288.sra") 135 | # os.remove("data/SRP098789/SRX2536403/SRR5227288.sra") 136 | 137 | 138 | @pytest.mark.xfail 139 | def test_download_ftp(sradb_connection): 140 | # This happens to fail because of ftp problems 141 | df = sradb_connection.sra_metadata("SRP098789") 142 | df = df[df.experiment_accession == "SRX2536404"] 143 | sradb_connection.download( 144 | df=df, protocol="ftp", out_dir="data/", skip_confirmation=True 145 | ) 146 | assert os.path.isfile("data/SRP098789/SRX2536404/SRR5227289.sra") 147 | assert os.path.getsize("data/SRP098789/SRX2536404/SRR5227289.sra") 148 | os.remove("data/SRP098789/SRX2536404/SRR5227289.sra") 149 | 150 | 151 | def test_tissue_type(sradb_connection): 152 | df = sradb_connection.sra_metadata("SRP016501", detailed=True) 153 | df = df[df.experiment_accession == "SRX196389"] 154 | cell_type = df["sample_attribute"].apply(lambda x: guess_cell_type(x)) 155 | tissue_type = df["sample_attribute"].apply(lambda x: guess_tissue_type(x)) 156 | assert cell_type.tolist() == ["chicken_brain"] 157 | assert tissue_type.tolist() == ["brain"] 158 | 159 | 160 | def test_strain_type(sradb_connection): 161 | df = sradb_connection.sra_metadata("SRP043036", detailed=True) 162 | df = df.sort_values(by="experiment_accession") 163 | strains = df["sample_attribute"].apply(lambda x: guess_strain_type(x)).tolist() 164 | assert strains == [ 165 | "by4741", 166 | "by4741", 167 | "by4741", 168 | "by4741", 169 | "by4741", 170 | "by4741", 171 | "by4741", 172 | "by4741", 173 | "s288c", 174 | "s288c", 175 | "s288c", 176 | "s288c", 177 | ] 178 | 179 | 180 | def test_srp_to_srx(sradb_connection): 181 | assert len(sradb_connection.srp_to_srx("SRP082570")) == 14 182 | 183 | 184 | def test_srp_to_srr(sradb_connection): 185 | df = sradb_connection.srp_to_srr("SRP091987") 186 | assert sorted(list(df["run_accession"])[:3]) == [ 187 | "SRR4447104", 188 | "SRR4447105", 189 | "SRR4447106", 190 | ] 191 | 192 | 193 | def test_srp_to_gse(sradb_connection): 194 | gse_id = sradb_connection.srp_to_gse("SRP050443").iloc[0, 1] 195 | df = sradb_connection.gse_to_gsm(gse_id) 196 | assert "GSM1557451" in df["experiment_alias"].to_list() 197 | 198 | 199 | def test_gsm_to_gse(sradb_connection): 200 | df = sradb_connection.gsm_to_gse(["GSM1020651", "GSM1020664", "GSM1020771"]) 201 | assert set(list(df["study_alias"])) == {"GSE41637"} 202 | 203 | 204 | def test_srs_to_gsm(sradb_connection): 205 | df = sradb_connection.srs_to_gsm("SRS1757470") 206 | assert "GSM2358940" == df.iloc[0, 1] 207 | 208 | 209 | @pytest.mark.xfail(raises=ValueError) 210 | def test_wrong_input_metadata(sradb_connection): 211 | df = sradb_connection.sra_metadata("should_throw_error") 212 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # contents of conftest.py 2 | import os 3 | 4 | import pytest 5 | 6 | from pysradb import download_geodb_file 7 | from pysradb import download_sradb_file 8 | 9 | 10 | @pytest.fixture(scope="session") 11 | def conf_download_sradb_file(): 12 | fn = os.path.join(os.getcwd(), "data", "SRAmetadb.sqlite") 13 | if os.path.isfile(fn): 14 | return fn 15 | download_sradb_file(download_dir=os.path.dirname(fn)) 16 | return fn 17 | 18 | 19 | @pytest.fixture(scope="session") 20 | def conf_download_geodb_file(): 21 | fn = os.path.join(os.getcwd(), "data", "GEOmetadb.sqlite") 22 | if os.path.isfile(fn): 23 | return fn 24 | download_geodb_file(download_dir=os.path.dirname(fn)) 25 | return fn 26 | -------------------------------------------------------------------------------- /tests/data/test_search/ena_search_test1.txt: -------------------------------------------------------------------------------- 1 | run_accession 2 | SRR492850 3 | SRR500270 4 | SRR609956 5 | SRR609957 6 | SRR609958 7 | SRR609959 8 | SRR609960 9 | SRR609961 10 | SRR609962 11 | SRR609963 12 | SRR609964 13 | SRR609965 14 | SRR609966 15 | SRR609967 16 | SRR609968 17 | SRR609969 18 | SRR609970 19 | SRR609971 20 | SRR609972 21 | SRR609973 22 | SRR609974 23 | SRR609975 24 | SRR609976 25 | SRR609977 26 | SRR609978 27 | SRR609979 28 | SRR609980 29 | SRR609981 30 | SRR609982 31 | SRR609983 32 | SRR609984 33 | SRR609985 34 | SRR609986 35 | SRR609987 36 | SRR609988 37 | SRR609989 38 | SRR609990 39 | SRR609991 40 | SRR609992 41 | SRR609993 42 | SRR609994 43 | SRR609995 44 | SRR609996 45 | SRR609997 46 | SRR609998 47 | SRR609999 48 | SRR610000 49 | SRR610001 50 | SRR610002 51 | SRR610003 52 | SRR610004 53 | SRR610005 54 | SRR610006 55 | SRR610007 56 | SRR610008 57 | SRR610009 58 | SRR610010 59 | SRR610011 60 | SRR610012 61 | SRR610013 62 | SRR610014 63 | SRR610015 64 | SRR610016 65 | SRR610017 66 | SRR610018 67 | SRR610019 68 | SRR610020 69 | SRR610021 70 | SRR610022 71 | SRR610023 72 | SRR610024 73 | SRR610025 74 | SRR610026 75 | SRR610027 76 | SRR610028 77 | SRR610029 78 | SRR610030 79 | -------------------------------------------------------------------------------- /tests/data/test_search/geo_search_test1.txt: -------------------------------------------------------------------------------- 1 | SRX8089313 2 | SRX8089314 3 | SRX8089315 4 | SRX8089316 5 | SRX8089317 6 | SRX8089318 7 | SRX8089319 8 | SRX8089320 9 | SRX8089286 10 | SRX8089275 11 | SRX8089276 12 | SRX8089277 13 | SRX8089278 14 | SRX8089279 15 | SRX8089280 16 | SRX8089281 17 | SRX8089282 18 | SRX8089283 19 | SRX8089284 20 | SRX8089285 21 | SRX8089321 22 | SRX8089287 23 | SRX8089288 24 | SRX8089289 25 | SRX8089290 26 | SRX8089291 27 | SRX8089292 28 | SRX8089293 29 | SRX8089294 30 | SRX8089295 31 | SRX8089296 32 | SRX8466645 33 | SRX8142119 34 | SRX8142120 35 | SRX8142121 36 | SRX8142122 37 | SRX8142123 38 | SRX8142124 39 | SRX8142125 40 | SRX8142126 41 | SRX8142127 42 | SRX8466643 43 | SRX8466644 44 | SRX8089344 45 | SRX8466646 46 | SRX8466647 47 | SRX8466648 48 | SRX8466649 49 | SRX8466650 50 | SRX8466651 51 | SRX8466652 52 | SRX8466653 53 | SRX8466654 54 | SRX8466655 55 | SRX8466656 56 | SRX8089333 57 | SRX8089322 58 | SRX8089323 59 | SRX8089324 60 | SRX8089325 61 | SRX8089326 62 | SRX8089327 63 | SRX8089328 64 | SRX8089329 65 | SRX8089330 66 | SRX8089331 67 | SRX8089332 68 | SRX8466765 69 | SRX8089334 70 | SRX8089335 71 | SRX8089336 72 | SRX8089337 73 | SRX8089338 74 | SRX8089339 75 | SRX8089340 76 | SRX8089341 77 | SRX8089342 78 | SRX8089343 -------------------------------------------------------------------------------- /tests/data/test_search/sra_search_test1.txt: -------------------------------------------------------------------------------- 1 | SRX137370 2 | SRX137371 -------------------------------------------------------------------------------- /tests/data/test_search/sra_test_2_verbosity_0.csv: -------------------------------------------------------------------------------- 1 | run_accession 2 | ERR4229796 3 | -------------------------------------------------------------------------------- /tests/data/test_search/sra_test_2_verbosity_1.csv: -------------------------------------------------------------------------------- 1 | run_accession,experiment_title 2 | ERR4229796,HiSeq X Ten paired end sequencing 3 | -------------------------------------------------------------------------------- /tests/data/test_search/sra_test_2_verbosity_2.csv: -------------------------------------------------------------------------------- 1 | study_accession,experiment_accession,experiment_title,sample_taxon_id,sample_scientific_name,experiment_library_strategy,experiment_library_source,experiment_library_selection,sample_accession,sample_alias,experiment_instrument_model,pool_member_spots,run_1_size,run_1_accession,run_1_total_spots,run_1_total_bases 2 | ERP113893,ERX4190585,HiSeq X Ten paired end sequencing,562,Escherichia coli,WGS,GENOMIC,RANDOM,ERS3331676,96c1f848-4c9d-11e9-8292-68b599768938,HiSeq X Ten,1150278,94976452,ERR4229796,1150278,347383956 3 | -------------------------------------------------------------------------------- /tests/data/test_search/sra_test_2_verbosity_3.csv: -------------------------------------------------------------------------------- 1 | study_accession,experiment_accession,experiment_title,sample_taxon_id,sample_scientific_name,experiment_library_strategy,experiment_library_source,experiment_library_selection,sample_accession,sample_alias,experiment_instrument_model,pool_member_spots,run_1_size,run_1_accession,run_1_total_spots,run_1_total_bases,experiment_alias,experiment_design_description,experiment_external_id,experiment_library_construction_protocol,experiment_library_name,experiment_platform,experiment_sample_descriptor_accession,library_layout,library_layout_nominal_length,library_layout_nominal_sdev,pool_external_id,pool_member_accession,pool_member_bases,pool_member_member_name,pool_member_organism,pool_member_sample_name,pool_member_sample_title,pool_member_tax_id,run_1_alias,run_1_assembly,run_1_base_A_count,run_1_base_C_count,run_1_base_G_count,run_1_base_N_count,run_1_base_T_count,run_1_cloudfile_1_filetype,run_1_cloudfile_1_location,run_1_cloudfile_1_provider,run_1_cloudfile_2_filetype,run_1_cloudfile_2_location,run_1_cloudfile_2_provider,run_1_cloudfile_3_filetype,run_1_cloudfile_3_location,run_1_cloudfile_3_provider,run_1_cloudfile_4_filetype,run_1_cloudfile_4_location,run_1_cloudfile_4_provider,run_1_cluster_name,run_1_database_1,run_1_is_public,run_1_load_done,run_1_published,run_1_srafile_1_alternative_1_access_type,run_1_srafile_1_alternative_1_free_egress,run_1_srafile_1_alternative_1_org,run_1_srafile_1_alternative_1_url,run_1_srafile_1_alternative_2_access_type,run_1_srafile_1_alternative_2_free_egress,run_1_srafile_1_alternative_2_org,run_1_srafile_1_alternative_2_url,run_1_srafile_1_cluster,run_1_srafile_1_date,run_1_srafile_1_filename,run_1_srafile_1_md5,run_1_srafile_1_semantic_name,run_1_srafile_1_size,run_1_srafile_1_sratoolkit,run_1_srafile_1_supertype,run_1_srafile_1_url,run_1_srafile_2_alternative_1_access_type,run_1_srafile_2_alternative_1_free_egress,run_1_srafile_2_alternative_1_org,run_1_srafile_2_alternative_1_url,run_1_srafile_2_alternative_2_access_type,run_1_srafile_2_alternative_2_free_egress,run_1_srafile_2_alternative_2_org,run_1_srafile_2_alternative_2_url,run_1_srafile_2_alternative_3_access_type,run_1_srafile_2_alternative_3_free_egress,run_1_srafile_2_alternative_3_org,run_1_srafile_2_alternative_3_url,run_1_srafile_2_cluster,run_1_srafile_2_date,run_1_srafile_2_filename,run_1_srafile_2_md5,run_1_srafile_2_semantic_name,run_1_srafile_2_size,run_1_srafile_2_sratoolkit,run_1_srafile_2_supertype,run_1_srafile_2_url,run_1_srafile_3_alternative_1_access_type,run_1_srafile_3_alternative_1_free_egress,run_1_srafile_3_alternative_1_org,run_1_srafile_3_alternative_1_url,run_1_srafile_3_cluster,run_1_srafile_3_date,run_1_srafile_3_filename,run_1_srafile_3_md5,run_1_srafile_3_semantic_name,run_1_srafile_3_size,run_1_srafile_3_sratoolkit,run_1_srafile_3_supertype,run_1_srafile_3_url,run_1_srafile_4_alternative_1_access_type,run_1_srafile_4_alternative_1_free_egress,run_1_srafile_4_alternative_1_org,run_1_srafile_4_alternative_1_url,run_1_srafile_4_cluster,run_1_srafile_4_date,run_1_srafile_4_filename,run_1_srafile_4_md5,run_1_srafile_4_semantic_name,run_1_srafile_4_size,run_1_srafile_4_sratoolkit,run_1_srafile_4_supertype,run_1_srafile_4_url,run_1_static_data_available,run_1_total_base_count,run_1_total_base_cs_native,sample_attributes_1_tag,sample_attributes_1_value,sample_attributes_2_tag,sample_attributes_2_value,sample_attributes_3_tag,sample_attributes_3_value,sample_attributes_4_tag,sample_attributes_4_value,sample_center_name,sample_common_name,sample_external_id_1,sample_external_id_1_namespace,sample_taxon_id,sample_title,study_alias,study_attributes_1_tag,study_attributes_1_value,study_attributes_2_tag,study_attributes_2_value,study_center_name,study_external_id_1,study_external_id_1_namespace,study_study_abstract,study_study_description,study_study_title,study_study_type_existing_study_type,submission_accession,submission_alias,submission_lab_name,submission_title 2 | ERP113893,ERX4190585,HiSeq X Ten paired end sequencing,562,Escherichia coli,WGS,GENOMIC,RANDOM,ERS3331676,96c1f848-4c9d-11e9-8292-68b599768938,HiSeq X Ten,1150278,94976452,ERR4229796,1150278,347383956,SC_EXP_29694_1#382,"Illumina sequencing of library DN539314J:H12, constructed from sample accession ERS3331676 for study accession ERP113893. This is part of an Illumina multiplexed sequencing run (29694_1). This submission includes reads tagged with the sequence TACCATTC.",SAMEA5529601,Standard,DN539314J:H12,ILLUMINA,ERS3331676,PAIRED,452,121,SAMEA5529601,ERS3331676,347383956,,Escherichia coli,96c1f848-4c9d-11e9-8292-68b599768938,SB277889442,562,SC_RUN_29694_1#382,GCF_000005845.1,85540613,88241122,88289114,100578,85212529,cram,gs.US,gs,cram,s3.us-east-1,s3,run,gs.US,gs,run,s3.us-east-1,s3,public,"
",true,true,2020-06-10 21:58:01,anonymous,worldwide,GCP,https://storage.googleapis.com/sra-pub-src-14/ERR4229796/29694_1_382.cram.1,anonymous,worldwide,AWS,https://sra-pub-src-14.s3.amazonaws.com/ERR4229796/29694_1_382.cram.1,public,2020-06-11 19:23:25,29694_1_382.cram,26e62fcae91b058b13be6b94214ae2e6,cram,100576829,0,Original,https://sra-pub-src-14.s3.amazonaws.com/ERR4229796/29694_1_382.cram.1,anonymous,worldwide,NCBI,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERR/ERR4229/ERR4229796,aws identity,s3.us-east-1,AWS,s3://sra-pub-run-2/ERR4229796/ERR4229796.1,gcp identity,gs.US,GCP,gs://sra-pub-run-4/ERR4229796/ERR4229796.1,public,2020-06-11 19:24:22,ERR4229796,b779862a8fe21d16152454f047fbd2c1,run,94990927,1,Primary ETL,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERR/ERR4229/ERR4229796,anonymous,worldwide,NCBI,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/ERR4229796.realign,public,2020-06-11 19:34:28,ERR4229796.realign,4b45e5e3e6ebe3fa112f308a4d092ee4,run.realign,19534748,1,Secondary ETL,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/ERR4229796.realign,anonymous,worldwide,NCBI,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/wgmlst_sig.tsv,public,2020-06-11 19:36:45,wgmlst_sig.tsv,1f83f3039c8d5b17e64cebbd875a38b4,wgmlst_sig,4253484,0,Secondary ETL,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/wgmlst_sig.tsv,1,347383956,false,SUBJECT_ID,SB277889442,ArrayExpress-SPECIES,E. coli,ENA-FIRST-PUBLIC,2020-06-09,ENA-LAST-UPDATE,2019-03-22,Wellcome Sanger Institute,E. coli,SAMEA5529601,BioSample,562,SB277889442,Maximizing_the_knowledge_gained_from_California_Senate_Bill_27-sc-5708,ENA-FIRST-PUBLIC,2020-06-02,ENA-LAST-UPDATE,2019-02-19,Wellcome Sanger Institute,PRJEB31347,BioProject,"Overuse of antibiotics contributes to antimicrobial resistance and is a growing threat to human health worldwide. In the U.S., approximately 70% of antibiotics are sold for use in livestock. On January 1, 2018, California enacted Senate Bill 27 (SB27), which will require a veterinarian's prescription for use of antimicrobial drugs and ban non-therapeutic antimicrobial uses for routine disease prevention and growth promotion in livestock. The project will quantify the effect of SB27 on E. coli, Campylobacter and Salmonella resistance rates from retail meat; estimate the proportion of human Campylobacter, Salmonella, and extraintestinal pathogenic E. coli infections caused by strains of food animal origin in California; characterize the effect of SB27 on antimicrobial susceptibility of Campylobacter, Salmonella, and extraintestinal E. coli infections caused by strains of food animal origin in California.","Overuse of antibiotics contributes to antimicrobial resistance and is a growing threat to human health worldwide. In the U.S., approximately 70% of antibiotics are sold for use in livestock. On January 1, 2018, California enacted Senate Bill 27 (SB27), which will require a veterinarian's prescription for use of antimicrobial drugs and ban non-therapeutic antimicrobial uses for routine disease prevention and growth promotion in livestock. The project will quantify the effect of SB27 on E. coli, Campylobacter and Salmonella resistance rates from retail meat; estimate the proportion of human Campylobacter, Salmonella, and extraintestinal pathogenic E. coli infections caused by strains of food animal origin in California; characterize the effect of SB27 on antimicrobial susceptibility of Campylobacter, Salmonella, and extraintestinal E. coli infections caused by strains of food animal origin in California. This data is part of a pre-publication release. For information on the proper use of pre-publication data shared by the Wellcome Trust Sanger Institute please see http://www.sanger.ac.uk/datasharing/",Maximizing_the_knowledge_gained_from_California_Senate_Bill_27,Whole Genome Sequencing,ERA2689537,ERP113893-sc-20200609-1-2020-06-09T15:09:21Z,European Nucleotide Archive,Submitted by Wellcome Sanger Institute on 09-JUN-2020 3 | -------------------------------------------------------------------------------- /tests/data/test_search/sra_test_ERS3331676.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | ERX4190585HiSeq X Ten paired end sequencingERP113893PRJEB31347Illumina sequencing of library DN539314J:H12, constructed from sample accession ERS3331676 for study accession ERP113893. This is part of an Illumina multiplexed sequencing run (29694_1). This submission includes reads tagged with the sequence TACCATTC.ERS3331676SAMEA5529601DN539314J:H12WGSGENOMICRANDOMStandardHiSeq X TenERA2689537Submitted by Wellcome Sanger Institute on 09-JUN-2020Wellcome Sanger InstituteERP113893PRJEB31347Maximizing_the_knowledge_gained_from_California_Senate_Bill_27Overuse of antibiotics contributes to antimicrobial resistance and is a growing threat to human health worldwide. In the U.S., approximately 70% of antibiotics are sold for use in livestock. On January 1, 2018, California enacted Senate Bill 27 (SB27), which will require a veterinarian's prescription for use of antimicrobial drugs and ban non-therapeutic antimicrobial uses for routine disease prevention and growth promotion in livestock. The project will quantify the effect of SB27 on E. coli, Campylobacter and Salmonella resistance rates from retail meat; estimate the proportion of human Campylobacter, Salmonella, and extraintestinal pathogenic E. coli infections caused by strains of food animal origin in California; characterize the effect of SB27 on antimicrobial susceptibility of Campylobacter, Salmonella, and extraintestinal E. coli infections caused by strains of food animal origin in California.Overuse of antibiotics contributes to antimicrobial resistance and is a growing threat to human health worldwide. In the U.S., approximately 70% of antibiotics are sold for use in livestock. On January 1, 2018, California enacted Senate Bill 27 (SB27), which will require a veterinarian's prescription for use of antimicrobial drugs and ban non-therapeutic antimicrobial uses for routine disease prevention and growth promotion in livestock. The project will quantify the effect of SB27 on E. coli, Campylobacter and Salmonella resistance rates from retail meat; estimate the proportion of human Campylobacter, Salmonella, and extraintestinal pathogenic E. coli infections caused by strains of food animal origin in California; characterize the effect of SB27 on antimicrobial susceptibility of Campylobacter, Salmonella, and extraintestinal E. coli infections caused by strains of food animal origin in California. This data is part of a pre-publication release. For information on the proper use of pre-publication data shared by the Wellcome Trust Sanger Institute please see http://www.sanger.ac.uk/datasharing/ENA-FIRST-PUBLIC2020-06-02ENA-LAST-UPDATE2019-02-19ERS3331676SAMEA5529601SB277889442562E. coliEscherichia coliSUBJECT_IDSB277889442ArrayExpress-SPECIESE. coliENA-FIRST-PUBLIC2020-06-09ENA-LAST-UPDATE2019-03-22ERS3331676SAMEA5529601ERR4229796HiSeq X Ten paired end sequencingERX4190585ENA-FIRST-PUBLIC2020-06-09ENA-LAST-UPDATE2020-06-09ERS3331676SAMEA5529601
4 | -------------------------------------------------------------------------------- /tests/data/test_search/sra_test_verbosity_0.csv: -------------------------------------------------------------------------------- 1 | run_accession 2 | SRR11217925 3 | SRR11217924 4 | SRR11217923 5 | SRR11217922 6 | SRR11217921 7 | SRR11217920 8 | SRR11217919 9 | SRR11217918 10 | SRR11217917 11 | SRR11217916 12 | SRR11217915 13 | SRR11217914 14 | SRR11186550 15 | SRR11186549 16 | SRR11186548 17 | SRR11186547 18 | SRR11186546 19 | SRR11186545 20 | SRR11186544 21 | SRR11186543 22 | SRR11186542 23 | SRR11186541 24 | SRR11186540 25 | SRR11186539 26 | SRR10398500 27 | SRR10398499 28 | SRR10398498 29 | SRR10398497 30 | SRR10398496 31 | SRR10398495 32 | SRR10398494 33 | SRR10398493 34 | SRR10398492 35 | SRR10398491 36 | SRR10398490 37 | SRR10398489 38 | SRR7241911 39 | SRR5026637 40 | SRR5026603 41 | SRR5026592 42 | SRR5026589 43 | SRR5026359 44 | SRR5026356 45 | SRR3219264 46 | SRR3219253 47 | SRR3219248 48 | SRR1804351 49 | SRR1804349 50 | SRR1804348 51 | SRR1804346 52 | SRR1804345 53 | SRR1804343 54 | SRR1804341 55 | SRR1804340 56 | SRR1635379 57 | SRR1635378 58 | SRR1635377 59 | SRR1635376 60 | SRR914339 61 | SRR914338 62 | SRR914337 63 | SRR914336 64 | SRR914335 65 | SRR914334 66 | SRR914333 67 | SRR914332 68 | SRR914331 69 | SRR914330 70 | SRR914329 71 | SRR914328 72 | SRR522897 73 | SRR522895 74 | SRR522883 75 | SRR522896 76 | SRR522872 77 | SRR522871 78 | SRR522892 79 | SRR522890 80 | SRR522881 81 | SRR522153 82 | SRR522152 83 | SRR522151 84 | SRR522882 85 | SRR522888 86 | SRR522011 87 | SRR522010 88 | SRR522009 89 | SRR1804352 90 | SRR1804350 91 | SRR1804347 92 | SRR1804344 93 | SRR1804342 94 | -------------------------------------------------------------------------------- /tests/data/test_search/sra_test_verbosity_1.csv: -------------------------------------------------------------------------------- 1 | run_accession,experiment_title 2 | SRR11217925,GSM4369051: rnaH27nsun3; Caenorhabditis elegans; RNA-Seq 3 | SRR11217924,GSM4369050: rnaH27nsun2; Caenorhabditis elegans; RNA-Seq 4 | SRR11217923,GSM4369049: rnaH27nsun1; Caenorhabditis elegans; RNA-Seq 5 | SRR11217922,GSM4369048: rnaH27WT3; Caenorhabditis elegans; RNA-Seq 6 | SRR11217921,GSM4369047: rnaH27WT2; Caenorhabditis elegans; RNA-Seq 7 | SRR11217920,GSM4369046: rnaH27WT1; Caenorhabditis elegans; RNA-Seq 8 | SRR11217919,GSM4369045: rnaL20nsun3; Caenorhabditis elegans; RNA-Seq 9 | SRR11217918,GSM4369044: rnaL20nsun2; Caenorhabditis elegans; RNA-Seq 10 | SRR11217917,GSM4369043: rnaL20nsun1; Caenorhabditis elegans; RNA-Seq 11 | SRR11217916,GSM4369042: rnaL20WT3; Caenorhabditis elegans; RNA-Seq 12 | SRR11217915,GSM4369041: rnaL20WT2; Caenorhabditis elegans; RNA-Seq 13 | SRR11217914,GSM4369040: rnaL20WT1; Caenorhabditis elegans; RNA-Seq 14 | SRR11186550,GSM4340680: pmk-1(km25)RPF Replicate 2; Caenorhabditis elegans; RNA-Seq 15 | SRR11186549,GSM4340679: pmk-1(km25) total mRNA Replicate 2; Caenorhabditis elegans; RNA-Seq 16 | SRR11186548,GSM4340678: pmk-1(km25)RPF Replicate 1; Caenorhabditis elegans; RNA-Seq 17 | SRR11186547,GSM4340677: pmk-1(km25) total mRNA Replicate 1; Caenorhabditis elegans; RNA-Seq 18 | SRR11186546,GSM4340676: ced-3(n717)RPF Replicate 2; Caenorhabditis elegans; RNA-Seq 19 | SRR11186545,GSM4340675: ced-3(n717) total mRNA Replicate 2; Caenorhabditis elegans; RNA-Seq 20 | SRR11186544,GSM4340674: ced-3(n717)RPF Replicate 1; Caenorhabditis elegans; RNA-Seq 21 | SRR11186543,GSM4340673: ced-3(n717) total mRNA Replicate 1; Caenorhabditis elegans; RNA-Seq 22 | SRR11186542,GSM4340672: WT RPF Replicate 2; Caenorhabditis elegans; RNA-Seq 23 | SRR11186541,GSM4340671: WT total mRNA Replicate 2; Caenorhabditis elegans; RNA-Seq 24 | SRR11186540,GSM4340670: WT RPF Replicate 1; Caenorhabditis elegans; RNA-Seq 25 | SRR11186539,GSM4340669: WT total mRNA Replicate 1; Caenorhabditis elegans; RNA-Seq 26 | SRR10398500,GSM4148088: Slee141_mRNAseqFP_meg34_3: meg-3meg-4 mRNAseq-3; Caenorhabditis elegans; RNA-Seq 27 | SRR10398499,GSM4148087: Slee139_mRNAseqFP_meg34_2: meg-3meg-4 mRNAseq-2; Caenorhabditis elegans; RNA-Seq 28 | SRR10398498,GSM4148086: Slee125_mRNAseqFP_meg34_1: meg-3meg-4 mRNAseq-1; Caenorhabditis elegans; RNA-Seq 29 | SRR10398497,GSM4148085: Slee142_mRNAseqFP_N2_3: wild type mRNAseq-3; Caenorhabditis elegans; RNA-Seq 30 | SRR10398496,GSM4148084: Slee138_mRNAseqFP_N2_2: wild type mRNAseq-2; Caenorhabditis elegans; RNA-Seq 31 | SRR10398495,GSM4148083: Slee113_mRNAseqFP_N2_1: wild type mRNAseq-1; Caenorhabditis elegans; RNA-Seq 32 | SRR10398494,GSM4148082: Slee135_meg34: meg-3meg-4 Riboseq-3; Caenorhabditis elegans; OTHER 33 | SRR10398493,GSM4148081: Slee133_meg34: meg-3meg-4 Riboseq-2; Caenorhabditis elegans; OTHER 34 | SRR10398492,GSM4148080: Slee123_meg34: meg-3meg-4 Riboseq-1; Caenorhabditis elegans; OTHER 35 | SRR10398491,GSM4148079: Slee136_N2: wild type Riboseq-3; Caenorhabditis elegans; OTHER 36 | SRR10398490,GSM4148078: Slee132_N2: wild type Riboseq-2; Caenorhabditis elegans; OTHER 37 | SRR10398489,GSM4148077: Slee111_N2: wild type Riboseq-1; Caenorhabditis elegans; OTHER 38 | SRR7241911,GSM3168388: C. elegans embryos ribosome profiling; Caenorhabditis elegans; RNA-Seq 39 | SRR5026637,Bayesian Prediction of RNA Translation from Ribosome Profiling 40 | SRR5026603,Bayesian Prediction of RNA Translation from Ribosome Profiling 41 | SRR5026592,Bayesian Prediction of RNA Translation from Ribosome Profiling 42 | SRR5026589,Bayesian Prediction of RNA Translation from Ribosome Profiling 43 | SRR5026359,Bayesian Prediction of RNA Translation from Ribosome Profiling 44 | SRR5026356,Bayesian Prediction of RNA Translation from Ribosome Profiling 45 | SRR3219264,unc-54(cc3389) Ribo-seq 46 | SRR3219253,unc-54(+) Ribo-seq 47 | SRR3219248,unc-54(+) Ribo-seq 48 | SRR1804351,GSM1611598: frac_26-34nt; Caenorhabditis elegans; OTHER 49 | SRR1804349,GSM1611597: frac_28-35nt; Caenorhabditis elegans; OTHER 50 | SRR1804348,GSM1611596: frac_28-30nt; Caenorhabditis elegans; OTHER 51 | SRR1804346,GSM1611595: frac_25-30nt; Caenorhabditis elegans; OTHER 52 | SRR1804345,GSM1611594: 04_N2_893_GCkit; Caenorhabditis elegans; OTHER 53 | SRR1804343,GSM1611593: 03_N2_893_SGkit; Caenorhabditis elegans; OTHER 54 | SRR1804341,GSM1611592: 02_N2_893_GCop; Caenorhabditis elegans; OTHER 55 | SRR1804340,GSM1611591: 01_N2_893_SGop; Caenorhabditis elegans; OTHER 56 | SRR1635379,GSM1534607: RNASeq_c14_2; Caenorhabditis elegans; RNA-Seq 57 | SRR1635378,GSM1534606: RNASeq_c14_1; Caenorhabditis elegans; RNA-Seq 58 | SRR1635377,GSM1534605: RNASeq_N2_2; Caenorhabditis elegans; RNA-Seq 59 | SRR1635376,GSM1534604: RNASeq_N2_1; Caenorhabditis elegans; RNA-Seq 60 | SRR914339,GSM1169554: Celeg-3-FED-RPF; Caenorhabditis elegans; OTHER 61 | SRR914338,GSM1169553: Celeg-2-FED-RPF; Caenorhabditis elegans; OTHER 62 | SRR914337,GSM1169552: Celeg-1-FED-RPF; Caenorhabditis elegans; OTHER 63 | SRR914336,GSM1169551: Celeg-3-ST-RPF; Caenorhabditis elegans; OTHER 64 | SRR914335,GSM1169550: Celeg-2-ST-RPF; Caenorhabditis elegans; OTHER 65 | SRR914334,GSM1169549: Celeg-1-ST-RPF; Caenorhabditis elegans; OTHER 66 | SRR914333,GSM1169548: Celeg-3-FED-mRNA; Caenorhabditis elegans; RNA-Seq 67 | SRR914332,GSM1169547: Celeg-2-FED-mRNA; Caenorhabditis elegans; RNA-Seq 68 | SRR914331,GSM1169546: Celeg-1-FED-mRNA; Caenorhabditis elegans; RNA-Seq 69 | SRR914330,GSM1169545: Celeg-3-ST-mRNA; Caenorhabditis elegans; RNA-Seq 70 | SRR914329,GSM1169544: Celeg-2-ST-mRNA; Caenorhabditis elegans; RNA-Seq 71 | SRR914328,GSM1169543: Celeg-1-ST-mRNA; Caenorhabditis elegans; RNA-Seq 72 | SRR522897,L1 ribosome footprints replicate 4 73 | SRR522895,L2 ribosome footprints replicate 2 (seq-replicate) 74 | SRR522883,L4 ribosome footprints replicate 1 75 | SRR522896,L1 ribosome footprints replicate 3 76 | SRR522872,L1 ribosome footprints replicate 2 77 | SRR522871,L1 ribosome footprints replicate 1 78 | SRR522892,L2 ribosome footprints replicate 3 79 | SRR522890,L2 ribosome footprints replicate 2 80 | SRR522881,L2 ribosome footprints replicate 1 81 | SRR522153,L4 total RNA-seq 82 | SRR522152,L1 total RNA-seq 83 | SRR522151,L4 mRNA-seq replicate 2 84 | SRR522882,L4 mRNA-seq replicate 1 85 | SRR522888,L1 mRNA-seq replicate 4 86 | SRR522011,L1 replicate 3 mRNA-seq 87 | SRR522010,L1 replicate 2 mRNA-seq 88 | SRR522009,L1 mRNA-seq replicate 1 89 | SRR1804352,GSM1611598: frac_26-34nt; Caenorhabditis elegans; OTHER 90 | SRR1804350,GSM1611597: frac_28-35nt; Caenorhabditis elegans; OTHER 91 | SRR1804347,GSM1611595: frac_25-30nt; Caenorhabditis elegans; OTHER 92 | SRR1804344,GSM1611593: 03_N2_893_SGkit; Caenorhabditis elegans; OTHER 93 | SRR1804342,GSM1611592: 02_N2_893_GCop; Caenorhabditis elegans; OTHER 94 | -------------------------------------------------------------------------------- /tests/data/test_search/sra_uids.txt: -------------------------------------------------------------------------------- 1 | 155791 2 | 155790 3 | -------------------------------------------------------------------------------- /tests/test_geoweb.py: -------------------------------------------------------------------------------- 1 | """Tests for GEOweb""" 2 | 3 | import os 4 | import time 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | from pysradb.geoweb import GEOweb 10 | 11 | 12 | @pytest.fixture(scope="module") 13 | def geoweb_connection(): 14 | db = GEOweb() 15 | time.sleep(2) 16 | return db 17 | 18 | 19 | def test_valid_download_links(geoweb_connection): 20 | """Test if all links for a project are scraped""" 21 | links, url = geoweb_connection.get_download_links("GSE161707") 22 | assert links == ["GSE161707_RAW.tar", "filelist.txt"] 23 | 24 | 25 | def test_invalid_download_links(geoweb_connection): 26 | """Test if invalid GEO ID raises the expected error""" 27 | with pytest.raises(KeyError): 28 | links, url = geoweb_connection.get_download_links("GSE1691709") 29 | 30 | 31 | def test_file_download(geoweb_connection): 32 | """Test if file actually gets downloaded""" 33 | geoweb_connection.download( 34 | links=["GSE161707_RAW.tar", "filelist.txt"], 35 | root_url="https://ftp.ncbi.nlm.nih.gov/geo/series/GSE161nnn/GSE161707/suppl/", 36 | gse="GSE161707", 37 | out_dir="geoweb_downloads", 38 | ) 39 | assert os.path.getsize("geoweb_downloads/GSE161707/GSE161707_RAW.tar") 40 | assert os.path.getsize("geoweb_downloads/GSE161707/GSE161707_filelist.txt") 41 | -------------------------------------------------------------------------------- /tests/test_sradb.py: -------------------------------------------------------------------------------- 1 | """Tests for sradb.py 2 | """ 3 | 4 | import os 5 | from sqlite3 import OperationalError 6 | 7 | import pytest 8 | 9 | from pysradb import SRAdb 10 | from pysradb.filter_attrs import guess_cell_type 11 | from pysradb.filter_attrs import guess_strain_type 12 | from pysradb.filter_attrs import guess_tissue_type 13 | 14 | 15 | def test_not_valid_file(): 16 | """Test to check for error if file is either not 17 | present or not a valid sqlite file""" 18 | path = "SRAmetadb.sqlite" 19 | try: 20 | db = SRAdb(path) 21 | except SystemExit: 22 | assert os.path.isfile(path) == False 23 | except OperationalError: 24 | assert True 25 | -------------------------------------------------------------------------------- /tests/test_sraweb.py: -------------------------------------------------------------------------------- 1 | """Tests for SRAweb""" 2 | 3 | import time 4 | 5 | import pandas as pd 6 | import pytest 7 | 8 | from pysradb.sraweb import SRAweb 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def sraweb_connection(): 13 | db = SRAweb() 14 | time.sleep(2) 15 | return db 16 | 17 | 18 | def test_sra_metadata(sraweb_connection): 19 | """Test if metadata has right number of entries""" 20 | df = sraweb_connection.sra_metadata("SRP016501") 21 | assert df.shape[0] == 134 22 | 23 | 24 | def test_sra_metadata_missing_orgname(sraweb_connection): 25 | """Test if metadata has right number of entries""" 26 | df = sraweb_connection.sra_metadata("ERP000171") 27 | # See: https://github.com/saketkc/pysradb/issues/46#issuecomment-657268760 28 | assert sum(pd.isna(df.organism_name.tolist())) > 0 29 | 30 | 31 | def test_sra_metadata_multiple(sraweb_connection): 32 | """Test if metadata has right number of entries""" 33 | df = sraweb_connection.sra_metadata(["SRP016501", "SRP096025", "SRP103009"]) 34 | assert list(sorted(df.study_accession.unique())) == [ 35 | "SRP016501", 36 | "SRP096025", 37 | "SRP103009", 38 | ] 39 | 40 | 41 | def test_sra_metadata_multiple_detailed(sraweb_connection): 42 | """Test if metadata has right number of entries""" 43 | df = sraweb_connection.sra_metadata(["SRP002605", "SRP098789"], detailed=True) 44 | columns = ["treatment time", "library type", "transfection", "time"] 45 | assert len(set(columns).intersection(set(df.columns))) == 4 46 | ftp_cols = [ 47 | "ena_fastq_http", 48 | "ena_fastq_http_1", 49 | "ena_fastq_http_2", 50 | "ena_fastq_ftp", 51 | "ena_fastq_ftp_1", 52 | "ena_fastq_ftp_2", 53 | ] 54 | assert len(set(ftp_cols).intersection(set(df.columns))) == 6 55 | 56 | 57 | def test_tissue_column(sraweb_connection): 58 | """Test if tissue column exists""" 59 | df = sraweb_connection.sra_metadata("SRP096025", detailed="True") 60 | assert list(df["tissue"]) == ["Kidney"] * 4 61 | 62 | 63 | def test_metadata_exp_accession(sraweb_connection): 64 | """Test if experiment_accession column is correct""" 65 | df = sraweb_connection.sra_metadata("SRP103009", detailed="True") 66 | assert "SRX2705123" in list(df["experiment_accession"]) 67 | 68 | 69 | def test_fetch_gds_results(sraweb_connection): 70 | """Test if fetch_gds_result returns correct values""" 71 | df = sraweb_connection.fetch_gds_results("GSE34438") 72 | assert df["accession"][1] == "GSM849112" 73 | 74 | 75 | def test_srp_to_gse(sraweb_connection): 76 | """Test if srp is converted to gse correctly""" 77 | df = sraweb_connection.srp_to_gse("SRP009836") 78 | assert df["study_alias"][0] == "GSE34438" 79 | 80 | 81 | def test_srp_to_srr(sraweb_connection): 82 | """Test if srp is converted to srr correctly""" 83 | df = sraweb_connection.srp_to_srr("SRP002605", detailed=True) 84 | assert df["run_accession"].tolist()[:5] == [ 85 | "SRR057511", 86 | "SRR057512", 87 | "SRR057513", 88 | "SRR057514", 89 | "SRR057515", 90 | ] 91 | 92 | 93 | def test_srp_to_srs(sraweb_connection): 94 | """Test if srp is converted to srs correctly""" 95 | df = sraweb_connection.srp_to_srs("SRP014542") 96 | assert sorted(list(df["sample_accession"])) == [ 97 | "SRS351513", 98 | "SRS351514", 99 | "SRS351515", 100 | "SRS351516", 101 | "SRS351517", 102 | "SRS351518", 103 | ] 104 | 105 | 106 | def test_srp_to_srx(sraweb_connection): 107 | """Test if srp is converted to srx correctly""" 108 | df = sraweb_connection.srp_to_srx("SRP044932") 109 | assert list(df["experiment_accession"]) == ["SRX663253", "SRX663254"] 110 | 111 | 112 | def test_gse_to_gsm(sraweb_connection): 113 | """Test if gse is converted to gsm correctly""" 114 | df = sraweb_connection.gse_to_gsm("GSE56924", detailed=False) 115 | assert df.shape[0] == 96 116 | 117 | 118 | def test_gse_to_gsm2(sraweb_connection): 119 | """Test for gse to gsm""" 120 | df = sraweb_connection.gse_to_gsm("GSE200028", detailed=False) 121 | assert df.shape[0] == 15 122 | 123 | 124 | def test_gse_to_gsm1(sraweb_connection): 125 | """Test if gse_to_gsm works without passing `detailed` parameter""" 126 | df = sraweb_connection.gse_to_gsm("GSE63858") 127 | assert list(sorted(df["experiment_alias"])) == ["GSM1558530", "GSM1558531"] 128 | 129 | 130 | def test_gse_to_srp(sraweb_connection): 131 | """Test if gse is converted to srp correctly""" 132 | df = sraweb_connection.gse_to_srp("GSE63858") 133 | assert df["study_accession"].tolist()[0] == "SRP050548" 134 | 135 | 136 | def test_gse_to_srp2(sraweb_connection): 137 | """Test if gse is converted to srp correctly""" 138 | df = sraweb_connection.gse_to_srp(["GSE168880", "GSE209835"]) 139 | assert df["study_accession"].tolist()[0] == "SRP310566" 140 | assert df["study_accession"].tolist()[1] == "SRP388275" 141 | 142 | 143 | def test_gsm_to_srp(sraweb_connection): 144 | """Test if gsm is converted to srp correctly""" 145 | df = sraweb_connection.gsm_to_srp("GSM1371490") 146 | assert df["study_accession"].tolist()[0] == "SRP041298" 147 | 148 | 149 | def test_gsm_to_gse(sraweb_connection): 150 | """Test if gsm is converted to gse correctly""" 151 | df = sraweb_connection.gsm_to_gse("GSM1371490") 152 | assert df["study_alias"].tolist()[0] == "GSE56924" 153 | 154 | 155 | def test_gsm_to_srr(sraweb_connection): 156 | """Test if gsm is converted to srr correctly""" 157 | df = sraweb_connection.gsm_to_srr("GSM1371489") 158 | assert df["run_accession"].tolist()[0] == "SRR1257271" 159 | 160 | 161 | def test_gsm_to_srs(sraweb_connection): 162 | """Test if gsm is converted to srs correctly""" 163 | df = sraweb_connection.gsm_to_srs("GSM1371469") 164 | assert df["sample_accession"].tolist()[0] == "SRS594838" 165 | 166 | 167 | def test_gsm_to_srx(sraweb_connection): 168 | """Test if gsm is converted to srx correctly""" 169 | df = sraweb_connection.gsm_to_srx("GSM1371454") 170 | assert list(df["experiment_accession"]) == ["SRX522468"] 171 | 172 | 173 | def test_srr_to_gsm(sraweb_connection): 174 | df = sraweb_connection.srr_to_gsm("SRR057515") 175 | assert df["experiment_alias"].tolist()[0] == "GSM546921" 176 | 177 | 178 | def test_srr_to_srp(sraweb_connection): 179 | """Test if srr is converted to srp correctly""" 180 | df = sraweb_connection.srr_to_srp("SRR057511", detailed=False) 181 | assert list(df["study_accession"]) == ["SRP002605"] 182 | 183 | 184 | def test_srr_to_srp1(sraweb_connection): 185 | """Test if srr_to_srp works without passing the `detailed` parameter""" 186 | df = sraweb_connection.srr_to_srp("SRR057515") 187 | assert list(df["study_accession"]) == ["SRP002605"] 188 | 189 | 190 | def test_srr_to_srs(sraweb_connection): 191 | """Test if srr is converted to srs correctly""" 192 | df = sraweb_connection.srr_to_srs("SRR057513") 193 | assert list(df["sample_accession"]) == ["SRS079386"] 194 | 195 | 196 | def test_srr_to_srx(sraweb_connection): 197 | """Test if srr is converted to srx correctly""" 198 | df = sraweb_connection.srr_to_srx("SRR057514") 199 | assert list(df["experiment_accession"]) == ["SRX021967"] 200 | 201 | 202 | def test_srs_to_gsm(sraweb_connection): 203 | """Test if srs is converted to gsm correctly""" 204 | df = sraweb_connection.srs_to_gsm("SRS079386") 205 | assert df["experiment_alias"][0] == "GSM546921" 206 | 207 | 208 | def test_srs_to_srx(sraweb_connection): 209 | """Test if srs is converted to srx correctly""" 210 | df = sraweb_connection.srs_to_srx("SRS594838") 211 | assert list(df["experiment_accession"]) == ["SRX522483"] 212 | 213 | 214 | def test_srx_to_gsm(sraweb_connection): 215 | """Test if srx is converted to gsm correctly""" 216 | df = sraweb_connection.srx_to_gsm("SRX663253") 217 | assert list(df["experiment_alias"]) == ["GSM1446832"] 218 | 219 | 220 | def test_srx_to_srp(sraweb_connection): 221 | """Test if srx is converted to srp correctly""" 222 | df = sraweb_connection.srx_to_srp("SRX663254") 223 | assert list(df["study_accession"]) == ["SRP044932"] 224 | 225 | 226 | def test_srx_to_srr(sraweb_connection): 227 | """Test if srx is converted to srr correctly""" 228 | df = sraweb_connection.srx_to_srr("SRX2705123") 229 | assert list(df["run_accession"]) == ["SRR5413172"] 230 | 231 | 232 | def test_srx_to_srr1(sraweb_connection): 233 | """Test if srx is converted to srr correctly, including multiple srrs""" 234 | df = sraweb_connection.srx_to_srr("SRX8998846") 235 | assert list(df["run_accession"]) == ["SRR12508064", "SRR12508065"] 236 | 237 | 238 | def test_srx_to_srs(sraweb_connection): 239 | """Test if srx is converted to srs correctly""" 240 | df = sraweb_connection.srx_to_srs("SRX663253") 241 | assert list(df["sample_accession"]) == ["SRS668126"] 242 | 243 | 244 | def test_xmlns_id(sraweb_connection): 245 | df = sraweb_connection.sra_metadata(["GSM1013144", "GSM2520660"]) 246 | assert list(df["library_layout"]) == ["PAIRED", "SINGLE"] 247 | 248 | 249 | def test_GCP_url(sraweb_connection): 250 | df = sraweb_connection.sra_metadata(["SRP002605"], detailed=True) 251 | assert df["gcp_url"].tolist()[-1].startswith("gs:") 252 | 253 | 254 | def test_GCP_url2(sraweb_connection): 255 | df = sraweb_connection.sra_metadata(["DRR138929"], detailed=True) 256 | assert df["gcp_url"].tolist()[-1].startswith("gs:") 257 | 258 | 259 | def test_gse_to_srp3(sraweb_connection): 260 | # https://github.com/saketkc/pysradb/issues/190 261 | df = sraweb_connection.gse_to_srp(["GSE89545"]) 262 | assert df["study_accession"].tolist()[0] == "SRP093251" 263 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """Tests for utils.py 2 | """ 3 | 4 | import pytest 5 | 6 | from pysradb.utils import * 7 | 8 | 9 | @pytest.fixture(scope="module") 10 | def invalid_name(): 11 | return "Red blood cells" 12 | 13 | 14 | @pytest.fixture(scope="module") 15 | def valid_name(): 16 | return "Homo sapiens" 17 | 18 | 19 | def invalid_scientific_name_to_taxid(invalid_name): 20 | with pytest.raises(IncorrectFieldException) as e: 21 | scientific_name_to_taxid(invalid_name) 22 | assert "Unknown scientific name" in str(e.value) 23 | 24 | 25 | def valid_scientific_name_to_taxid(valid_name): 26 | assert scientific_name_to_taxid(valid_name) == "9606" 27 | --------------------------------------------------------------------------------