├── .coveragerc
├── .editorconfig
├── .gitattributes
├── .github
    ├── FUNDING.yml
    ├── ISSUE_TEMPLATE.md
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   ├── c4gt_community.yml
    │   └── feature_request.md
    ├── c4gt_community.yml
    ├── dependabot.yml
    └── workflows
    │   ├── codeql-analysis.yml
    │   ├── publish.yml
    │   ├── pull_request.yml
    │   └── push.yml
├── .gitignore
├── AUTHORS.md
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── HISTORY.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docs
    ├── Makefile
    ├── _static
    │   ├── demo.svg
    │   ├── e1.png
    │   ├── e2.png
    │   ├── e3.png
    │   ├── pysradb_v3.png
    │   └── pysradb_v3.svg
    ├── authors.md
    ├── case_studies.md
    ├── cmdline.md
    ├── commands.md
    ├── conf.py
    ├── contributing.md
    ├── history.md
    ├── index.rst
    ├── installation.md
    ├── make.bat
    ├── modules.md
    ├── modules.rst
    ├── pysradb.rst
    ├── python-api-usage.md
    └── quickstart.md
├── notebooks
    ├── 01.Python-API_demo.ipynb
    ├── 02.Commandline_download.ipynb
    ├── 03.ParallelDownload.ipynb
    ├── 04.SRA_to_fastq_conda.ipynb
    ├── 05.Downloading_subsets_of_a_project.ipynb
    ├── 06.Download_BAMs.ipynb
    ├── 07.Multiple_SRPs.ipynb
    ├── 08.pysradb_ascp_multithreaded.ipynb
    ├── 09.Query_Search.ipynb
    └── README.md
├── pyproject.toml
├── pysradb
    ├── __init__.py
    ├── basedb.py
    ├── cli.py
    ├── download.py
    ├── exceptions.py
    ├── filter_attrs.py
    ├── geodb.py
    ├── geoweb.py
    ├── search.py
    ├── sradb.py
    ├── sraweb.py
    ├── taxid2name.py
    └── utils.py
├── requirements.txt
├── setup.cfg
└── tests
    ├── _test_geodb.py
    ├── _test_pcli.py
    ├── _test_sradb.py
    ├── conftest.py
    ├── data
        └── test_search
        │   ├── ena_search_test1.txt
        │   ├── ena_test_verbosity_0.csv
        │   ├── ena_test_verbosity_0.json
        │   ├── ena_test_verbosity_1.csv
        │   ├── ena_test_verbosity_1.json
        │   ├── ena_test_verbosity_2.csv
        │   ├── ena_test_verbosity_2.json
        │   ├── ena_test_verbosity_3.csv
        │   ├── ena_test_verbosity_3.json
        │   ├── geo_search_test1.txt
        │   ├── sra_search_test1.txt
        │   ├── sra_test.xml
        │   ├── sra_test_2_verbosity_0.csv
        │   ├── sra_test_2_verbosity_1.csv
        │   ├── sra_test_2_verbosity_2.csv
        │   ├── sra_test_2_verbosity_3.csv
        │   ├── sra_test_ERS3331676.xml
        │   ├── sra_test_verbosity_0.csv
        │   ├── sra_test_verbosity_1.csv
        │   ├── sra_test_verbosity_2.csv
        │   ├── sra_test_verbosity_3.csv
        │   └── sra_uids.txt
    ├── test_geoweb.py
    ├── test_search.py
    ├── test_sradb.py
    ├── test_sraweb.py
    └── test_utils.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 |     pysradb/filter_attrs.py
4 |     pysradb/geodb.py
5 |     pysradb/sradb.py
6 |     pysradb/taxid2name.py
7 |     pysradb/utils.py
8 | 
9 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # http://editorconfig.org
 2 | 
 3 | root = true
 4 | 
 5 | [*]
 6 | indent_style = space
 7 | indent_size = 4
 8 | trim_trailing_whitespace = true
 9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 | 
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 | 
17 | [LICENSE]
18 | insert_final_newline = false
19 | 
20 | [Makefile]
21 | indent_style = tab
22 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.rst linguist-documentation
2 | *.html linguist-documentation
3 | *.ipynb linguist-language=python
4 | 
5 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | github: [saketkc]
4 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | * pysradb version:
 2 | * Python version:
 3 | * Operating System:
 4 | 
 5 | ### Description
 6 | 
 7 | Describe what you were trying to get done.
 8 | Tell us what happened, what went wrong, and what you expected to happen.
 9 | 
10 | ### What I Did
11 | 
12 | ```
13 | Paste the command(s) you ran and the output.
14 | If there was a crash, please include the traceback here.
15 | ```
16 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: "[BUG]"
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | `pysradb <command> SRPxxx`
16 | 
17 | 
18 | **Desktop (please complete the following information):**
19 |  - OS: [e.g. Ubuntu 20.04]
20 |  - Python version [e.g. 3.8]
21 | 
22 | **Additional context**
23 | Add any other context about the problem here.
24 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/c4gt_community.yml:
--------------------------------------------------------------------------------
  1 | name: C4GT Community Template
  2 | description: Create a new Ticket for C4GT Community
  3 | title: "[C4GT Community]: "
  4 | labels: ["Please add the C4GT Community Label on all tickets that you will list. In addition to this, please add whichever labels best describe your project from this list: C4GT Community, C4GT Coding, C4GT Design, C4GT Mentorship, C4GT Bounty, C4GT Advisory"]
  5 | body:
  6 |   - type: textarea
  7 |     id: ticket-description
  8 |     validations:
  9 |       required: true
 10 |     attributes:
 11 |       label: Ticket Contents
 12 |       value: |
 13 |         ## Description
 14 |         [Provide a brief project description, outlining the need and measurable goals of the feature to be developed. Kindly, specify the number of users or scale of the product and feature, articulating its anticipated impact and intended use. Kindly also add the bounty amount you are willing to pay in case of this being a bounty ticket. For Design, Mentorship & Advisory tickets, we request you to make a copy of this google form (https://docs.google.com/forms/d/18EPdoqBUFS6lRoQQbRKKjTvzA711nNTdmU8jinKTXWs/edit) in your own drive and attach a link to it in the issue ticket for contributors to fill in. This way you will be able to see all the applications that come in and assign it to the selected contributor.]
 15 | 
 16 |   - type: textarea
 17 |     id: ticket-goals
 18 |     validations:
 19 |       required: true
 20 |     attributes:
 21 |       label: Goals
 22 |       description: List the goals of the feature.
 23 |       value: |
 24 |         ## Goals
 25 |         - [ ] [Goal 1]
 26 |         - [ ] [Goal 2]
 27 |         - [ ] [Goal 3]
 28 |         - [ ] [Goal 4]
 29 |         - [ ] [Goal 5]
 30 |         [Kindly state and link the installation guide/set-up steps (if any) in this section]
 31 | 
 32 |   - type: textarea
 33 |     id: ticket-expected-outcome
 34 |     validations:
 35 |       required: true
 36 |     attributes:
 37 |       label: Expected Outcome
 38 |       description: Describe in detail what the final product or result should look like and how it should behave.
 39 | 
 40 |   - type: textarea
 41 |     id: ticket-acceptance-criteria
 42 |     attributes:
 43 |       label: Acceptance Criteria
 44 |       description: List the acceptance criteria for this feature.
 45 | 
 46 |   - type: textarea
 47 |     id: ticket-implementation-details
 48 |     validations:
 49 |       required: true
 50 |     attributes:
 51 |       label: Implementation Details
 52 |       description: List any technical details about the proposed implementation, including any specific technologies that will be used.
 53 | 
 54 |   - type: textarea
 55 |     id: ticket-mockups
 56 |     attributes:
 57 |       label: Mockups/Wireframes
 58 |       description: Include links to any visual aids, mockups, wireframes, or diagrams that help illustrate what the final product should look like. This is not always necessary, but can be very helpful in many cases.
 59 | 
 60 |   - type: input
 61 |     id: ticket-product
 62 |     attributes:
 63 |       label: Product Name
 64 |       placeholder: Enter Product Name
 65 |     validations:
 66 |       required: true
 67 | 
 68 |   - type: dropdown
 69 |     id: ticket-organisation
 70 |     attributes:
 71 |       label: Organisation Name
 72 |       description: Enter Organisation Name
 73 |       multiple: false
 74 |       options:
 75 |         - C4GT
 76 |         - Dhiway
 77 |         - FIDE
 78 |         - SamagraX
 79 |         - ShikshaLokam
 80 |         - Tech4Dev
 81 |         - Tibil
 82 |     validations:
 83 |       required: true
 84 | 
 85 |   - type: input
 86 |     id: ticket-governance-domain
 87 |     attributes:
 88 |       label: Domain
 89 |       placeholder: Enter Area of Governance
 90 | 
 91 |   - type: dropdown
 92 |     id: ticket-technical-skills-required
 93 |     attributes:
 94 |       label: Tech Skills Needed
 95 |       description: Select the technologies needed for this ticket (use Ctrl or Command to select multiple)
 96 |       multiple: true
 97 |       options:
 98 |         - .NET
 99 |         - Agile
100 |         - Angular
101 |         - Artificial Intelligence
102 |         - ASP.NET
103 |         - AWS
104 |         - Babel
105 |         - Bootstrap
106 |         - C#
107 |         - Chart.js
108 |         - CI/CD
109 |         - Computer Vision
110 |         - CORS
111 |         - cURL
112 |         - Cypress
113 |         - D3.js
114 |         - Database
115 |         - Debugging
116 |         - Design
117 |         - DevOps
118 |         - Django
119 |         - Docker
120 |         - Electron
121 |         - ESLint
122 |         - Express.js
123 |         - Feature
124 |         - Flask
125 |         - Go
126 |         - GraphQL
127 |         - HTML
128 |         - Ionic
129 |         - Jest
130 |         - Java
131 |         - JavaScript
132 |         - Jenkins
133 |         - JWT
134 |         - Kubernetes
135 |         - Laravel
136 |         - Machine Learning
137 |         - Maintenance
138 |         - Markdown
139 |         - Material-UI
140 |         - Microservices
141 |         - MongoDB
142 |         - Mobile
143 |         - Mockups
144 |         - Mocha
145 |         - Natural Language Processing
146 |         - NestJS
147 |         - Node.js
148 |         - NUnit
149 |         - OAuth
150 |         - Performance Improvement
151 |         - Prettier
152 |         - Python
153 |         - Question
154 |         - React
155 |         - React Native
156 |         - Redux
157 |         - RESTful APIs
158 |         - Ruby
159 |         - Ruby on Rails
160 |         - Rust
161 |         - Scala
162 |         - Security
163 |         - Selenium
164 |         - SEO
165 |         - Serverless
166 |         - Solidity
167 |         - Spring Boot
168 |         - SQL
169 |         - Swagger
170 |         - Tailwind CSS
171 |         - Test
172 |         - Testing Library
173 |         - Three.js
174 |         - TypeScript
175 |         - UI/UX/Design
176 |         - Virtual Reality
177 |         - Vue.js
178 |         - WebSockets
179 |         - Webpack
180 |         - Other
181 |     validations:
182 |       required: true
183 | 
184 |   - type: textarea
185 |     id: ticketorg-mentors
186 |     attributes:
187 |       label: Organizational Mentor
188 |       description: Please tag relevant mentors for the ticket
189 |     validations:
190 |       required: true
191 | 
192 |   - type: textarea
193 |     id: ticketangel-mentors
194 |     attributes:
195 |       label: Angel Mentor
196 |       description: Please tag relevant mentors for the ticket
197 |     validations:
198 |       required: false
199 | 
200 |   - type: dropdown
201 |     id: ticket-complexity
202 |     attributes: 
203 |       label: Complexity
204 |       description: Choose the level that best describes the complexity of your ticket
205 |       multiple: false
206 |       options:
207 |         - Low
208 |         - Medium
209 |         - High
210 |     validations:
211 |       required: true
212 | 
213 |   - type: dropdown
214 |     id: ticket-category
215 |     attributes: 
216 |       label: Category
217 |       description: Choose the categories that best describe your ticket
218 |       multiple: true
219 |       options:
220 |         - API
221 |         - Analytics
222 |         - Accessibility
223 |         - Backend
224 |         - Breaking Change
225 |         - Beginner Friendly
226 |         - Configuration
227 |         - CI/CD
228 |         - Database
229 |         - Data Science
230 |         - Deprecation
231 |         - Documentation
232 |         - Deployment
233 |         - Frontend
234 |         - Internationalization
235 |         - Localization
236 |         - Machine Learning
237 |         - Maintenance
238 |         - Mobile
239 |         - Performance Improvement
240 |         - Question
241 |         - Refactoring
242 |         - Research
243 |         - Needs Reproduction
244 |         - SEO
245 |         - Security
246 |         - Testing
247 |         - Other
248 |     validations:
249 |       required: true
250 | 
251 |   
252 |   
253 | 
254 |   
255 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: "[ENH]"
 5 | labels: enhancement
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 


--------------------------------------------------------------------------------
/.github/c4gt_community.yml:
--------------------------------------------------------------------------------
  1 | name: C4GT Community Template
  2 | description: Create a new Ticket for C4GT Community
  3 | title: "[C4GT Community]: "
  4 | labels: ["Please add the C4GT Community Label on all tickets that you will list. In addition to this, please add whichever labels best describe your project from this list: C4GT Community, C4GT Coding, C4GT Design, C4GT Mentorship, C4GT Bounty, C4GT Advisory"]
  5 | body:
  6 |   - type: textarea
  7 |     id: ticket-description
  8 |     validations:
  9 |       required: true
 10 |     attributes:
 11 |       label: Ticket Contents
 12 |       value: |
 13 |         ## Description
 14 |         [Provide a brief project description, outlining the need and measurable goals of the feature to be developed. Kindly, specify the number of users or scale of the product and feature, articulating its anticipated impact and intended use. Kindly also add the bounty amount you are willing to pay in case of this being a bounty ticket. For Design, Mentorship & Advisory tickets, we request you to make a copy of this google form (https://docs.google.com/forms/d/18EPdoqBUFS6lRoQQbRKKjTvzA711nNTdmU8jinKTXWs/edit) in your own drive and attach a link to it in the issue ticket for contributors to fill in. This way you will be able to see all the applications that come in and assign it to the selected contributor.]
 15 | 
 16 |   - type: textarea
 17 |     id: ticket-goals
 18 |     validations:
 19 |       required: true
 20 |     attributes:
 21 |       label: Goals
 22 |       description: List the goals of the feature.
 23 |       value: |
 24 |         ## Goals
 25 |         - [ ] [Goal 1]
 26 |         - [ ] [Goal 2]
 27 |         - [ ] [Goal 3]
 28 |         - [ ] [Goal 4]
 29 |         - [ ] [Goal 5]
 30 |         [Kindly state and link the installation guide/set-up steps (if any) in this section]
 31 | 
 32 |   - type: textarea
 33 |     id: ticket-expected-outcome
 34 |     validations:
 35 |       required: true
 36 |     attributes:
 37 |       label: Expected Outcome
 38 |       description: Describe in detail what the final product or result should look like and how it should behave.
 39 | 
 40 |   - type: textarea
 41 |     id: ticket-acceptance-criteria
 42 |     attributes:
 43 |       label: Acceptance Criteria
 44 |       description: List the acceptance criteria for this feature.
 45 | 
 46 |   - type: textarea
 47 |     id: ticket-implementation-details
 48 |     validations:
 49 |       required: true
 50 |     attributes:
 51 |       label: Implementation Details
 52 |       description: List any technical details about the proposed implementation, including any specific technologies that will be used.
 53 | 
 54 |   - type: textarea
 55 |     id: ticket-mockups
 56 |     attributes:
 57 |       label: Mockups/Wireframes
 58 |       description: Include links to any visual aids, mockups, wireframes, or diagrams that help illustrate what the final product should look like. This is not always necessary, but can be very helpful in many cases.
 59 | 
 60 |   - type: input
 61 |     id: ticket-product
 62 |     attributes:
 63 |       label: Product Name
 64 |       placeholder: Enter Product Name
 65 |     validations:
 66 |       required: true
 67 | 
 68 |   - type: dropdown
 69 |     id: ticket-organisation
 70 |     attributes:
 71 |       label: Organisation Name
 72 |       description: Enter Organisation Name
 73 |       multiple: false
 74 |       options:
 75 |         - C4GT
 76 |         - Dhiway
 77 |         - FIDE
 78 |         - SamagraX
 79 |         - ShikshaLokam
 80 |         - Tech4Dev
 81 |         - Tibil
 82 |     validations:
 83 |       required: true
 84 | 
 85 |   - type: input
 86 |     id: ticket-governance-domain
 87 |     attributes:
 88 |       label: Domain
 89 |       placeholder: Enter Area of Governance
 90 | 
 91 |   - type: dropdown
 92 |     id: ticket-technical-skills-required
 93 |     attributes:
 94 |       label: Tech Skills Needed
 95 |       description: Select the technologies needed for this ticket (use Ctrl or Command to select multiple)
 96 |       multiple: true
 97 |       options:
 98 |         - .NET
 99 |         - Agile
100 |         - Angular
101 |         - Artificial Intelligence
102 |         - ASP.NET
103 |         - AWS
104 |         - Babel
105 |         - Bootstrap
106 |         - C#
107 |         - Chart.js
108 |         - CI/CD
109 |         - Computer Vision
110 |         - CORS
111 |         - cURL
112 |         - Cypress
113 |         - D3.js
114 |         - Database
115 |         - Debugging
116 |         - Design
117 |         - DevOps
118 |         - Django
119 |         - Docker
120 |         - Electron
121 |         - ESLint
122 |         - Express.js
123 |         - Feature
124 |         - Flask
125 |         - Go
126 |         - GraphQL
127 |         - HTML
128 |         - Ionic
129 |         - Jest
130 |         - Java
131 |         - JavaScript
132 |         - Jenkins
133 |         - JWT
134 |         - Kubernetes
135 |         - Laravel
136 |         - Machine Learning
137 |         - Maintenance
138 |         - Markdown
139 |         - Material-UI
140 |         - Microservices
141 |         - MongoDB
142 |         - Mobile
143 |         - Mockups
144 |         - Mocha
145 |         - Natural Language Processing
146 |         - NestJS
147 |         - Node.js
148 |         - NUnit
149 |         - OAuth
150 |         - Performance Improvement
151 |         - Prettier
152 |         - Python
153 |         - Question
154 |         - React
155 |         - React Native
156 |         - Redux
157 |         - RESTful APIs
158 |         - Ruby
159 |         - Ruby on Rails
160 |         - Rust
161 |         - Scala
162 |         - Security
163 |         - Selenium
164 |         - SEO
165 |         - Serverless
166 |         - Solidity
167 |         - Spring Boot
168 |         - SQL
169 |         - Swagger
170 |         - Tailwind CSS
171 |         - Test
172 |         - Testing Library
173 |         - Three.js
174 |         - TypeScript
175 |         - UI/UX/Design
176 |         - Virtual Reality
177 |         - Vue.js
178 |         - WebSockets
179 |         - Webpack
180 |         - Other
181 |     validations:
182 |       required: true
183 | 
184 |   - type: textarea
185 |     id: ticketorg-mentors
186 |     attributes:
187 |       label: Organizational Mentor
188 |       description: Please tag relevant mentors for the ticket
189 |     validations:
190 |       required: true
191 | 
192 |   - type: textarea
193 |     id: ticketangel-mentors
194 |     attributes:
195 |       label: Angel Mentor
196 |       description: Please tag relevant mentors for the ticket
197 |     validations:
198 |       required: false
199 | 
200 |   - type: dropdown
201 |     id: ticket-complexity
202 |     attributes: 
203 |       label: Complexity
204 |       description: Choose the level that best describes the complexity of your ticket
205 |       multiple: false
206 |       options:
207 |         - Low
208 |         - Medium
209 |         - High
210 |     validations:
211 |       required: true
212 | 
213 |   - type: dropdown
214 |     id: ticket-category
215 |     attributes: 
216 |       label: Category
217 |       description: Choose the categories that best describe your ticket
218 |       multiple: true
219 |       options:
220 |         - API
221 |         - Analytics
222 |         - Accessibility
223 |         - Backend
224 |         - Breaking Change
225 |         - Beginner Friendly
226 |         - Configuration
227 |         - CI/CD
228 |         - Database
229 |         - Data Science
230 |         - Deprecation
231 |         - Documentation
232 |         - Deployment
233 |         - Frontend
234 |         - Internationalization
235 |         - Localization
236 |         - Machine Learning
237 |         - Maintenance
238 |         - Mobile
239 |         - Performance Improvement
240 |         - Question
241 |         - Refactoring
242 |         - Research
243 |         - Needs Reproduction
244 |         - SEO
245 |         - Security
246 |         - Testing
247 |         - Other
248 |     validations:
249 |       required: true
250 | 
251 |   
252 |   
253 | 
254 |   
255 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 | 
 6 | version: 2
 7 | updates:
 8 |   - package-ecosystem: "pip" # See documentation for possible values
 9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "daily"
12 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ master ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ master ]
20 |   schedule:
21 |     - cron: '35 5 * * 1'
22 | 
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 | 
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: [ 'python' ]
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
37 |         # Learn more:
38 |         # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
39 | 
40 |     steps:
41 |     - name: Checkout repository
42 |       uses: actions/checkout@v2
43 | 
44 |     # Initializes the CodeQL tools for scanning.
45 |     - name: Initialize CodeQL
46 |       uses: github/codeql-action/init@v1
47 |       with:
48 |         languages: ${{ matrix.language }}
49 |         # If you wish to specify custom queries, you can do so here or in a config file.
50 |         # By default, queries listed here will override any specified in a config file.
51 |         # Prefix the list here with "+" to use these queries and those in the config file.
52 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
53 | 
54 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
55 |     # If this step fails, then you should remove it and run the build manually (see below)
56 |     - name: Autobuild
57 |       uses: github/codeql-action/autobuild@v1
58 | 
59 |     # ℹ️ Command-line programs to run using the OS shell.
60 |     # 📚 https://git.io/JvXDl
61 | 
62 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
63 |     #    and modify them (or add more) to build your code if your project
64 |     #    uses a compiled language
65 | 
66 |     #- run: |
67 |     #   make bootstrap
68 |     #   make release
69 | 
70 |     - name: Perform CodeQL Analysis
71 |       uses: github/codeql-action/analyze@v1
72 | 


--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: publish
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 | 
 7 | jobs:
 8 |   deploy:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - uses: actions/checkout@v2
12 |     - name: Set up Python
13 |       uses: actions/setup-python@v1
14 |       with:
15 |         python-version: '3.x'
16 |     - name: Install dependencies
17 |       run: |
18 |         python -m pip install --upgrade pip
19 |         pip install setuptools wheel twine
20 |     - name: Build and publish
21 |       env:
22 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
23 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
24 |       run: |
25 |         python setup.py sdist bdist_wheel
26 |         twine upload dist/*
27 | 


--------------------------------------------------------------------------------
/.github/workflows/pull_request.yml:
--------------------------------------------------------------------------------
 1 | name: pull_request
 2 | 
 3 | on: [pull_request]
 4 | 
 5 | jobs:
 6 |   build:
 7 | 
 8 |     runs-on: ubuntu-latest
 9 |     strategy:
10 |       matrix:
11 |         python-version: [3.9, '3.10', '3.11']
12 | 
13 |     steps:
14 |     - uses: actions/checkout@v2
15 |     - name: Set up Python ${{ matrix.python-version }}
16 |       uses: actions/setup-python@v1
17 |       with:
18 |         python-version: ${{ matrix.python-version }}
19 | 
20 |     - name: Install dependencies
21 |       run: |
22 |         python -m pip install --upgrade pip
23 |         pip install -U pip
24 |         pip install -r requirements.txt
25 |     - name: Lint with flake8
26 |       run: |
27 |         pip install -U pytest coverage pytest-cov codecov black flake8
28 |         # stop the build if there are Python syntax errors or undefined names
29 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
30 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
31 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
32 |         black --check .
33 |     - name: Test with pytest
34 |       run: |
35 |         pip install --editable .
36 |         pip install pytest
37 |         pytest
38 |         make coverage
39 |         codecov
40 |     - uses: ammaraskar/sphinx-action@master
41 |       with:
42 |         pre-build-command: "pip install -r requirements.txt && pip install . && pip install -U sphinx myst-parser && pip install sphinxcontrib-gtagjs ipython numpydoc sphinx-tabs sphinx_rtd_theme nbsphinx ipython pydata-sphinx-theme nbsphinx-link sphinx-panels"
43 |         docs-folder: "docs/"
44 | 


--------------------------------------------------------------------------------
/.github/workflows/push.yml:
--------------------------------------------------------------------------------
 1 | name: push
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 |   build:
 7 | 
 8 |     runs-on: ubuntu-latest
 9 |     strategy:
10 |       matrix:
11 |         python-version: [3.9, '3.10', '3.11']
12 | 
13 |     steps:
14 |     - uses: actions/checkout@v2
15 |     - name: Set up Python ${{ matrix.python-version }}
16 |       uses: actions/setup-python@v1
17 |       with:
18 |         python-version: ${{ matrix.python-version }}
19 | 
20 |     - name: Install dependencies
21 |       run: |
22 |         python -m pip install --upgrade pip
23 |         pip install -U pip
24 |         pip install -r requirements.txt
25 |     - name: Lint with flake8
26 |       run: |
27 |         pip install -U pytest coverage pytest-cov codecov black flake8
28 |         # stop the build if there are Python syntax errors or undefined names
29 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
30 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
31 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
32 |         black --check .
33 |     - name: Test with pytest
34 |       run: |
35 |         pip install --editable .
36 |         pip install pytest
37 |         pytest
38 |         make coverage
39 |         codecov
40 |     - uses: ammaraskar/sphinx-action@master
41 |       with:
42 |         pre-build-command: "pip install -r requirements.txt && pip install . && pip install sphinx myst-parser && pip install sphinxcontrib-gtagjs ipython numpydoc sphinx-tabs sphinx_rtd_theme nbsphinx ipython pydata-sphinx-theme nbsphinx-link sphinx-panels"
43 |         docs-folder: "docs/"
44 |     - name: Deploy
45 |       uses: peaceiris/actions-gh-pages@v3
46 |       with:
47 |         github_token: ${{ secrets.GITHUB_TOKEN }}
48 |         publish_dir: ./docs/_build/html/
49 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | .pytest_cache/
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | local_settings.py
 58 | db.sqlite3
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # dotenv
 86 | .env
 87 | 
 88 | # Environments
 89 | .env
 90 | .venv
 91 | env/
 92 | venv/
 93 | ENV/
 94 | env.bak/
 95 | venv.bak/
 96 | 
 97 | # Spyder project settings
 98 | .spyderproject
 99 | .spyproject
100 | 
101 | # Rope project settings
102 | .ropeproject
103 | 
104 | # mkdocs documentation
105 | /site
106 | 
107 | # mypy
108 | .mypy_cache/
109 | *.sqlite
110 | *.sqlite.gz
111 | 
112 | geoweb_downloads/


--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
 1 | # Credits
 2 | 
 3 | ## Contributors
 4 | 
 5 | -   [Boshen Yan](https://github.com/bscrow)
 6 | -   [Maarten van der Sande](https://github.com/Maarten-vd-Sande)
 7 | -   [Dibya Gautam](https://github.com/dibyaaaaax)
 8 | -   [Marius van den Beek](https://github.com/mvdbeek)
 9 | -   [Devang Thakkar](https://github.com/DevangThakkar)
10 | 
11 | ## Maintainer
12 | 
13 | -   Saket Choudhary \<<saketkc@gmail.com>\>
14 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite it as below."
 3 | authors:
 4 | - family-names: "Choudhary"
 5 |   given-names: "Saket"
 6 |   orcid: "https://orcid.org/0000-0001-5202-7633"
 7 | title: "pysradb"
 8 | version: 2.0.0
 9 | doi: 10.12688/f1000research.18676.1
10 | date-released: 2023-04-05
11 | url: "https://github.com/saketkc/pysradb"
12 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |  advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |  address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |  professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at saketkc@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing
  2 | 
  3 | Contributions are welcome, and they are greatly appreciated! Every
  4 | little bit helps, and credit will always be given.
  5 | 
  6 | You can contribute in many ways:
  7 | 
  8 | ## Types of Contributions
  9 | 
 10 | ### Report Bugs
 11 | 
 12 | Report bugs at <https://github.com/saketkc/pysradb/issues>.
 13 | 
 14 | If you are reporting a bug, please include:
 15 | 
 16 | -   Your operating system name and version.
 17 | -   Any details about your local setup that might be helpful in
 18 |     troubleshooting.
 19 | -   Detailed steps to reproduce the bug.
 20 | 
 21 | ### Fix Bugs
 22 | 
 23 | Look through the GitHub issues for bugs. Anything tagged with \"bug\"
 24 | and \"help wanted\" is open to whoever wants to implement it.
 25 | 
 26 | ### Implement Features
 27 | 
 28 | Look through the GitHub issues for features. Anything tagged with
 29 | \"enhancement\" and \"help wanted\" is open to whoever wants to
 30 | implement it.
 31 | 
 32 | ### Write Documentation
 33 | 
 34 | pysradb could always use more documentation, whether as part of the
 35 | official pysradb docs, in docstrings, or even on the web in blog posts,
 36 | articles, and such.
 37 | 
 38 | ### Submit Feedback
 39 | 
 40 | The best way to send feedback is to file an issue at
 41 | <https://github.com/saketkc/pysradb/issues>.
 42 | 
 43 | If you are proposing a feature:
 44 | 
 45 | -   Explain in detail how it would work.
 46 | -   Keep the scope as narrow as possible, to make it easier to
 47 |     implement.
 48 | -   Remember that this is a volunteer-driven project, and that
 49 |     contributions are welcome :)
 50 | 
 51 | ## Get Started!
 52 | 
 53 | Ready to contribute? Here\'s how to set up [pysradb]{.title-ref} for
 54 | local development.
 55 | 
 56 | 1.  Fork the [pysradb]{.title-ref} repo on GitHub.
 57 | 
 58 | 2.  Clone your fork locally:
 59 | 
 60 |     ``` shell
 61 |     $ git clone git@github.com:your_name_here/pysradb.git
 62 |     ```
 63 | 
 64 | 3.  Install your local copy into a virtualenv. Assuming you have
 65 |     virtualenvwrapper installed, this is how you set up your fork for
 66 |     local development (If python \--version is less than 3.0, run [\$
 67 |     mkvirtualenv pysradb \--python=py3]{.title-ref} instead):
 68 | 
 69 |     ``` shell
 70 |     $ mkvirtualenv pysradb
 71 |     $ cd pysradb/
 72 |     $ python setup.py develop
 73 |     ```
 74 | 
 75 | 4.  Create a branch for local development:
 76 | 
 77 |     ``` shell
 78 |     $ git checkout -b name-of-your-bugfix-or-feature
 79 |     ```
 80 | 
 81 |     Now you can make your changes locally.
 82 | 
 83 | 5.  When you\'re done making changes, check that your changes pass
 84 |     flake8 and the tests, including testing other Python versions with
 85 |     tox:
 86 | 
 87 |     ``` shell
 88 |     $ flake8 pysradb tests
 89 |     $ python setup.py test or py.test
 90 |     $ tox
 91 |     ```
 92 | 
 93 |     To get flake8 and tox, just pip install them into your virtualenv.
 94 | 
 95 | 6.  Commit your changes and push your branch to GitHub:
 96 | 
 97 |     ``` shell
 98 |     $ git add .
 99 |     $ git commit -m "Your detailed description of your changes."
100 |     $ git push origin name-of-your-bugfix-or-feature
101 |     ```
102 | 
103 | 7.  Submit a pull request through the GitHub website.
104 | 
105 | ## Pull Request Guidelines
106 | 
107 | Before you submit a pull request, check that it meets these guidelines:
108 | 
109 | 1.  The pull request should include tests.
110 | 2.  If the pull request adds functionality, the docs should be updated.
111 |     Put your new functionality into a function with a docstring, and add
112 |     the feature to the list in README.md.
113 | 3.  The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and
114 |     for PyPy. Check
115 |     <https://travis-ci.org/saketkc/pysradb/pull_requests> and make sure
116 |     that the tests pass for all supported Python versions.
117 | 
118 | ## Tips
119 | 
120 | To run a subset of tests:
121 | 
122 | ``` shell
123 | $ py.test tests.test_pysradb
124 | ```
125 | 
126 | ## Deploying
127 | 
128 | A reminder for the maintainers on how to deploy. Make sure all your
129 | changes are committed (including an entry in HISTORY.md). Then run:
130 | 
131 | ``` shell
132 | $ bumpversion patch # possible: major / minor / patch
133 | $ git push
134 | $ git push --tags
135 | ```
136 | 
137 | Travis will then deploy to PyPI if tests pass.
138 | 


--------------------------------------------------------------------------------
/HISTORY.md:
--------------------------------------------------------------------------------
  1 | # History
  2 | 
  3 | # UNRELEASED
  4 | - Fix for handling missing metadata keys [#223](https://github.com/saketkc/pysradb/pull/223). Thanks [@andrewdavidsmith](https://github.com/andrewdavidsmith)
  5 | 
  6 | # 2.2.2 (2024-10-03)
  7 | - Fix for handling ENA urls for paired end data
  8 | 
  9 | # 2.2.1 (2024-08-21)
 10 | - Fix for handling ENA urls
 11 | - Migrated to pyproject.toml
 12 | 
 13 | 
 14 | # 2.2.0 (2023-09-17)
 15 | 
 16 | - Add support for Biosamples and bioproject [#199](https://github.com/saketkc/pysradb/pull/198)
 17 | - Use retmode xml for Geo search [#200](https://github.com/saketkc/pysradb/pull/200)
 18 | - Documentation fixes
 19 | 
 20 | ## 2.1.0 (2023-05-16)
 21 | 
 22 | -   Fix for [gse-to-srp] returning unrequested GSEs [#186](https://github.com/saketkc/pysradb/issues/190)
 23 | -   Fix for [download] using [public_urls]
 24 | -   Fix for [gsm-to-srx] returning false positives [#165](https://github.com/saketkc/pysradb/issues/165)
 25 | -   Fix for delimiter not being consistent when metadata is printed on
 26 |     terminal [#147](https://github.com/saketkc/pysradb/issues/147)
 27 | -   ENA search is currently broken because of an API change
 28 | 
 29 | ## 2.0.2 (2023-04-09)
 30 | 
 31 | -   Fix for [gse-to-srp] to handle cases where a project is
 32 |     missing but SRXs are returned [#186](https://github.com/saketkc/pysradb/issues/186)
 33 | -   Fix gse-to-gsm [#187](https://github.com/saketkc/pysradb/issues/187)
 34 | 
 35 | ## 2.0.1 (2023-03-18)
 36 | 
 37 | -   Fix for [pysradb download] - using [public_url]
 38 | -   Fix for SRX -\> SRR and related conversions [#183](https://github.com/saketkc/pysradb/pull/183)
 39 | 
 40 | ## 2.0.0 (2023-02-23)
 41 | 
 42 | -   BREAKING change: Overhaul of how urls and associated metadata are
 43 |     returned (not backward compatible); all column names are lower cased
 44 |     by default
 45 | -   Fix extra space in \"organism_taxid\" column
 46 | -   Added support for Experiment attributes [#89](https://github.com/saketkc/pysradb/issues/89#issuecomment-1439319532)
 47 | 
 48 | ## 1.4.2 (06-17-2022)
 49 | 
 50 | -   Fix ENA fastq fetching [#163](https://github.com/saketkc/pysradb/issues/163)
 51 | 
 52 | ## 1.4.1 (06-04-2022)
 53 | 
 54 | -   Fix for fetching alternative URLs
 55 | 
 56 | ## 1.4.0 (06-04-2022)
 57 | 
 58 | -   Added ability to fetch alternative URLs (GCP/AWS) for metadata
 59 |     [#161](https://github.com/saketkc/pysradb/issues/161)
 60 | -   Fix for xmldict 0.13.0 no longer defaulting to OrderedDict [#159](https://github.com/saketkc/pysradb/pull/159)
 61 | -   Fix for missing experiment model and description in metadata [#160](https://github.com/saketkc/pysradb/issues/160)
 62 | 
 63 | ## 1.3.0 (02-18-2022)
 64 | 
 65 | -   Add [study_title] to [\--detailed] flag
 66 |     ([#152](https://github.com/saketkc/pysradb/issues/152))
 67 | -   Fix [KeyError] in [metadata] where some new
 68 |     IDs do not have any metadata
 69 |     ([#151](https://github.com/saketkc/pysradb/issues/151))
 70 | 
 71 | ## 1.2.0 (01-10-2022)
 72 | 
 73 | -   Do not exit if a query returns no hits ([#149](https://github.com/saketkc/pysradb/pull/149))
 74 | 
 75 | ## 1.1.0 (12-12-2021)
 76 | 
 77 | -   Fixed [gsm-to-gse] failure
 78 |     ([#128](https://github.com/saketkc/pysradb/pull/128))
 79 | -   Fixed case sensitivity bug for ENA search
 80 |     ([#144](https://github.com/saketkc/pysradb/pull/144))
 81 | -   Fixed publication date bug for search
 82 |     ([#146](https://github.com/saketkc/pysradb/pull/146))
 83 | -   Added support for downloading data from GEO [pysradb download -g
 84 |     GSE]
 85 |     ([#129](https://github.com/saketkc/pysradb/pull/129))
 86 | 
 87 | ## 1.0.1 (01-10-2021)
 88 | 
 89 | -   Dropped Python 3.6 since pandas 1.2 is not supported
 90 | 
 91 | ## 1.0.0 (01-09-2021)
 92 | 
 93 | -   Retired `metadb` and `SRAdb` based search through CLI - everything
 94 |     defaults to `SRAweb`
 95 | -   `SRAweb` now supports
 96 |     [search](https://saket-choudhary.me/pysradb/quickstart.html#search)
 97 | -   [N/A] is now replaced with [pd.NA]
 98 | -   Two new fields in \`\--detailed\`: [instrument_model]
 99 |     and [instrument_model_desc]
100 |     [#75](https://github.com/saketkc/pysradb/issues/75)
101 | -   Updated documentation
102 | 
103 | ## 0.11.1 (09-18-2020)
104 | 
105 | -   [library_layout] is now outputted in metadata #56
106 | -   [-detailed] unifies columns for ENA fastq links instead
107 |     of appending \_x/\_y #59
108 | -   bugfix for parsing namespace in xml outputs #65
109 | -   XML errors from NCBI are now handled more gracefully #69
110 | -   Documentation and dependency updates
111 | 
112 | ## 0.11.0 (09-04-2020)
113 | 
114 | -   [pysradb download] now supports multiple threads for
115 |     parallel downloads
116 | -   [pysradb download] also supports ultra fast downloads of
117 |     FASTQs from ENA using aspera-client
118 | 
119 | ## 0.10.3 (03-26-2020)
120 | 
121 | -   Added test cases for SRAweb
122 | -   API limit exceeding errors are automagically handled
123 | -   Bug fixes for GSE \<=\> SRR
124 | -   Bug fix for metadata - supports multiple SRPs
125 | 
126 | Contributors
127 | 
128 | -   Dibya Gautam
129 | -   Marius van den Beek
130 | 
131 | ## 0.10.2 (02-05-2020)
132 | 
133 | -   Bug fix: Handle API-rate limit exceeding =\> Retries
134 | -   Enhancement: \'Alternatives\' URLs are now part of
135 |     [\--detailed]
136 | 
137 | ## 0.10.1 (02-04-2020)
138 | 
139 | -   Bug fix: Handle Python3.6 for capture_output in subprocess.run
140 | 
141 | ## 0.10.0 (01-31-2020)
142 | 
143 | -   All the subcommands (srx-to-srr, srx-to-srs) will now print
144 |     additional columns where the first two columns represent the
145 |     relevant conversion
146 | -   Fixed a bug in fetching entries with a single efetch record
147 | 
148 | ## 0.9.9 (01-15-2020)
149 | 
150 | -   Major fix: some SRRs would go missing as the experiment dict was
151 |     being created only once per SRR (See #15)
152 | -   Features: More detailed metadata by default in the SRAweb mode
153 | -   See notebook: <https://colab.research.google.com/drive/1C60V->
154 | 
155 | ## 0.9.7 (01-20-2020)
156 | 
157 | -   Feature: instrument, run size and total spots are now printed in the
158 |     metadata by default (SRAweb mode only)
159 | -   Issue: Fixed an issue with srapath failing on SRP. srapath is now
160 |     run on individual SRRs.
161 | 
162 | ## 0.9.6 (07-20-2019)
163 | 
164 | -   Introduced [SRAweb] to perform queries over the web if
165 |     the SQLite is missing or does not contain the relevant record.
166 | 
167 | ## 0.9.0 (02-27-2019)
168 | 
169 | ### Others
170 | 
171 | -   This release completely changes the command line interface replacing
172 |     click with argparse ([#3](https://github.com/saketkc/pysradb/pull/3))
173 | -   Removed Python 2 compatible stale code
174 | 
175 | ## 0.8.0 (02-26-2019)
176 | 
177 | ### New methods/functionality
178 | 
179 | -   \`srr-to-gsm\`: convert SRR to GSM
180 | -   SRAmetadb.sqlite.gz file is deleted by default after extraction
181 | -   When SRAmetadb is not found a confirmation is sought before
182 |     downloading
183 | -   Confirmation option before SRA downloads
184 | 
185 | ### Bugfix
186 | 
187 | -   download() works with wget
188 | 
189 | ### Others
190 | 
191 | -   [\--out_dir] is now [out-dir]
192 | 
193 | ## 0.7.1 (02-18-2019)
194 | 
195 | Important: Python2 is no longer supported. Please consider moving to
196 | Python3.
197 | 
198 | ### Bugfix
199 | 
200 | -   Included docs in the index which were missed out in the previous
201 |     release
202 | 
203 | ## 0.7.0 (02-08-2019)
204 | 
205 | ### New methods/functionality
206 | 
207 | -   \`gsm-to-srr\`: convert GSM to SRR
208 | -   \`gsm-to-srx\`: convert GSM to SRX
209 | -   \`gsm-to-gse\`: convert GSM to GSE
210 | 
211 | ### Renamed methods
212 | 
213 | The following command line options have been renamed and the changes are
214 | not compatible with 0.6.0 release:
215 | 
216 | -   [sra-metadata] -\> [metadata].
217 | -   [sra-search] -\> [search].
218 | -   [srametadb] -\> [metadb].
219 | 
220 | ## 0.6.0 (12-25-2018)
221 | 
222 | ### Bugfix
223 | 
224 | -   Fixed bugs introduced in 0.5.0 with API changes where multiple
225 |     redundant columns were output in [sra-metadata]
226 | 
227 | ### New methods/functionality
228 | 
229 | -   [download] now allows piped inputs
230 | 
231 | ## 0.5.0 (12-24-2018)
232 | 
233 | ### New methods/functionality
234 | 
235 | -   Support for filtering by SRX Id for SRA downloads.
236 | -   \`srr_to_srx\`: Convert SRR to SRX/SRP
237 | -   \`srp_to_srx\`: Convert SRP to SRX
238 | -   Stripped down [sra-metadata] to give minimal information
239 | -   Added [\--assay], [\--desc],
240 |     [\--detailed] flag for [sra-metadata]
241 | -   Improved table printing on terminal
242 | 
243 | ## 0.4.2 (12-16-2018)
244 | 
245 | ### Bugfix
246 | 
247 | -   Fixed unicode error in tests for Python2
248 | 
249 | ## 0.4.0 (12-12-2018)
250 | 
251 | ### New methods/functionality
252 | 
253 | -   Added a new [BASEdb] class to handle common database
254 |     connections
255 | -   Initial support for GEOmetadb through GEOdb class
256 | -   Initial support for a command line interface:
257 |     -   download Download SRA project (SRPnnnn)
258 |     -   gse-metadata Fetch metadata for GEO ID (GSEnnnn)
259 |     -   gse-to-gsm Get GSM(s) for GSE
260 |     -   gsm-metadata Fetch metadata for GSM ID (GSMnnnn)
261 |     -   sra-metadata Fetch metadata for SRA project (SRPnnnn)
262 | -   Added three separate notebooks for SRAdb, GEOdb, CLI usage
263 | 
264 | ## 0.3.0 (12-05-2018)
265 | 
266 | ### New methods/functionality
267 | 
268 | -   [sample_attribute] and
269 |     [experiment_attribute] are now included by default in
270 |     the df returned by [sra_metadata()]
271 | -   [expand_sample_attribute_columns]: expand metadata dataframe based on
272 |     attributes in [sample_attribute] column
273 | -   New methods to guess cell/tissue/strain:
274 |     [guess_cell_type()]/[guess_tissue_type()]/[guess_strain_type()]
275 | -   Improved README and usage instructions
276 | 
277 | ## 0.2.2 (12-03-2018)
278 | 
279 | ### New methods/functionality
280 | 
281 | -   [search_sra()] allows full text search on SRA metadata.
282 | 
283 | ## 0.2.0 (12-03-2018)
284 | 
285 | ### Renamed methods
286 | 
287 | The following methods have been renamed and the changes are not
288 | compatible with 0.1.0 release:
289 | 
290 | -   [get_query()] -\> [query()].
291 | -   [sra_convert()] -\> [sra_metadata()].
292 | -   [get_table_counts()] -\> [all_row_counts()].
293 | 
294 | ### New methods/functionality
295 | 
296 | -   [download_sradb_file()] makes fetching [SRAmetadb.sqlite] file easy; wget is no longer required.
297 | -   [ftp] protocol is now supported besides [fasp] and hence [aspera-client] is now optional. However, we strongly recommend [aspera-client] for faster downloads.
298 | 
299 | ### Bug fixes
300 | 
301 | -   Silenced [SettingWithCopyWarning] by explicitly doing
302 |     operations on a copy of the dataframe instead of the original.
303 | 
304 | Besides these, all methods now follow a [numpydoc]
305 | compatible documentation.
306 | 
307 | ## 0.1.0 (12-01-2018)
308 | 
309 | -   First release on PyPI.
310 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2020-2023, Saket Choudhary
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include AUTHORS.md
 2 | include CONTRIBUTING.md
 3 | include HISTORY.md
 4 | include LICENSE
 5 | include README.md
 6 | include requirements.txt
 7 | 
 8 | recursive-include tests *
 9 | recursive-exclude * __pycache__
10 | recursive-exclude * *.py[co]
11 | recursive-exclude * *.sqlite
12 | recursive-exclude * *.sqlite.gz
13 | 
14 | recursive-include docs *.md conf.py Makefile make.bat *.jpg *.png *.gif *.rst
15 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: clean clean-test clean-pyc clean-build docs help
 2 | .DEFAULT_GOAL := help
 3 | 
 4 | define BROWSER_PYSCRIPT
 5 | import os, webbrowser, sys
 6 | 
 7 | try:
 8 | 	from urllib import pathname2url
 9 | except:
10 | 	from urllib.request import pathname2url
11 | 
12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
13 | endef
14 | export BROWSER_PYSCRIPT
15 | 
16 | define PRINT_HELP_PYSCRIPT
17 | import re, sys
18 | 
19 | for line in sys.stdin:
20 | 	match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
21 | 	if match:
22 | 		target, help = match.groups()
23 | 		print("%-20s %s" % (target, help))
24 | endef
25 | export PRINT_HELP_PYSCRIPT
26 | 
27 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
28 | 
29 | help:
30 | 	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
31 | 
32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
33 | 
34 | clean-build: ## remove build artifacts
35 | 	rm -fr build/
36 | 	rm -fr dist/
37 | 	rm -fr .eggs/
38 | 	find . -name '*.egg-info' -exec rm -fr {} +
39 | 	find . -name '*.egg' -exec rm -f {} +
40 | 
41 | clean-pyc: ## remove Python file artifacts
42 | 	find . -name '*.pyc' -exec rm -f {} +
43 | 	find . -name '*.pyo' -exec rm -f {} +
44 | 	find . -name '*~' -exec rm -f {} +
45 | 	find . -name '__pycache__' -exec rm -fr {} +
46 | 
47 | clean-test: ## remove test and coverage artifacts
48 | 	rm -fr .tox/
49 | 	rm -f .coverage
50 | 	rm -fr htmlcov/
51 | 	rm -fr .pytest_cache
52 | 
53 | lint: ## check style with flake8
54 | 	flake8 pysradb tests
55 | 
56 | test: ## run tests quickly with the default Python
57 | 	pytest -s -v tests
58 | 
59 | test-all: ## run tests on every Python version with tox
60 | 	tox
61 | 
62 | coverage: ## check code coverage quickly with the default Python
63 | 	coverage run --source pysradb -m pytest
64 | 	coverage report -m
65 | 	coverage html
66 | 
67 | docs: ## generate Sphinx HTML documentation, including API docs
68 | 	rm -f docs/pysradb.rst
69 | 	rm -f docs/modules.rst
70 | 	sphinx-apidoc -o docs/ pysradb
71 | 	$(MAKE) -C docs clean
72 | 	$(MAKE) -C docs html
73 | 
74 | servedocs: docs ## compile the docs watching for changes
75 | 	#watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
76 | 	watchmedo shell-command -p '*.md|*.rst' -c '$(MAKE) -C docs html' -R -D .
77 | 
78 | release: dist ## package and upload a release
79 | 	python -m build
80 | 	twine upload dist/*
81 | 
82 | dist: clean ## builds source and wheel package
83 | 	python -m build
84 | 	ls -l dist
85 | 
86 | install: clean ## install the package to the active Python's site-packages
87 | 	pip install -e .
88 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = python -msphinx
 7 | SPHINXPROJ    = pysradb
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/_static/e1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saketkc/pysradb/bc7d54827903477bd59b7ec02774e6c7f41b7c3d/docs/_static/e1.png


--------------------------------------------------------------------------------
/docs/_static/e2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saketkc/pysradb/bc7d54827903477bd59b7ec02774e6c7f41b7c3d/docs/_static/e2.png


--------------------------------------------------------------------------------
/docs/_static/e3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saketkc/pysradb/bc7d54827903477bd59b7ec02774e6c7f41b7c3d/docs/_static/e3.png


--------------------------------------------------------------------------------
/docs/_static/pysradb_v3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/saketkc/pysradb/bc7d54827903477bd59b7ec02774e6c7f41b7c3d/docs/_static/pysradb_v3.png


--------------------------------------------------------------------------------
/docs/_static/pysradb_v3.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 | <!-- Created with Inkscape (http://www.inkscape.org/) -->
  3 | 
  4 | <svg
  5 |    xmlns:dc="http://purl.org/dc/elements/1.1/"
  6 |    xmlns:cc="http://creativecommons.org/ns#"
  7 |    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  8 |    xmlns:svg="http://www.w3.org/2000/svg"
  9 |    xmlns="http://www.w3.org/2000/svg"
 10 |    xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
 11 |    xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
 12 |    width="139.03136mm"
 13 |    height="61.615105mm"
 14 |    viewBox="0 0 139.03136 61.615104"
 15 |    version="1.1"
 16 |    id="svg8971"
 17 |    inkscape:version="0.92.4 (33fec40, 2019-01-16)"
 18 |    sodipodi:docname="pysradb_v3.svg">
 19 |   <defs
 20 |      id="defs8965" />
 21 |   <sodipodi:namedview
 22 |      id="base"
 23 |      pagecolor="#ffffff"
 24 |      bordercolor="#666666"
 25 |      borderopacity="1.0"
 26 |      inkscape:pageopacity="0.0"
 27 |      inkscape:pageshadow="2"
 28 |      inkscape:zoom="1"
 29 |      inkscape:cx="487.69671"
 30 |      inkscape:cy="-12.379223"
 31 |      inkscape:document-units="mm"
 32 |      inkscape:current-layer="g2239"
 33 |      showgrid="true"
 34 |      inkscape:window-width="2556"
 35 |      inkscape:window-height="1321"
 36 |      inkscape:window-x="0"
 37 |      inkscape:window-y="68"
 38 |      inkscape:window-maximized="0"
 39 |      fit-margin-top="0"
 40 |      fit-margin-left="0"
 41 |      fit-margin-right="0"
 42 |      fit-margin-bottom="0">
 43 |     <inkscape:grid
 44 |        type="xygrid"
 45 |        id="grid9516"
 46 |        originx="-69.839258"
 47 |        originy="-192.01355"
 48 |        spacingx="0.13229167"
 49 |        spacingy="0.13229167" />
 50 |   </sodipodi:namedview>
 51 |   <metadata
 52 |      id="metadata8968">
 53 |     <rdf:RDF>
 54 |       <cc:Work
 55 |          rdf:about="">
 56 |         <dc:format>image/svg+xml</dc:format>
 57 |         <dc:type
 58 |            rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
 59 |         <dc:title></dc:title>
 60 |       </cc:Work>
 61 |     </rdf:RDF>
 62 |   </metadata>
 63 |   <g
 64 |      inkscape:label="Layer 1"
 65 |      inkscape:groupmode="layer"
 66 |      id="layer1"
 67 |      transform="translate(-69.839241,-43.371355)">
 68 |     <g
 69 |        id="g9650"
 70 |        style="stroke-width:3;stroke-miterlimit:4;stroke-dasharray:none">
 71 |       <path
 72 |          inkscape:connector-curvature="0"
 73 |          id="path9583"
 74 |          d="m 90.995501,52.365533 4.177866,17.092808 4.230458,-17.092808"
 75 |          style="fill:none;stroke:#f30000;stroke-width:2.484272;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
 76 |          sodipodi:nodetypes="ccc" />
 77 |       <path
 78 |          style="color:#000000;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:medium;line-height:normal;font-family:sans-serif;font-variant-ligatures:normal;font-variant-position:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-alternates:normal;font-feature-settings:normal;text-indent:0;text-align:start;text-decoration:none;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;letter-spacing:normal;word-spacing:normal;text-transform:none;writing-mode:lr-tb;direction:ltr;text-orientation:mixed;dominant-baseline:auto;baseline-shift:baseline;text-anchor:start;white-space:normal;shape-padding:0;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;vector-effect:none;fill:#f30000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:2.85143089;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape-rendering:auto;text-rendering:auto;enable-background:accumulate"
 79 |          d="m 93.890201,69.601343 v 8.524774 l 0.03687,1.521124 h 2.395673 l 0.03687,-1.521124 v -8.524774 z"
 80 |          id="path9585"
 81 |          inkscape:connector-curvature="0" />
 82 |     </g>
 83 |     <text
 84 |        xml:space="preserve"
 85 |        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:9.8777771px;line-height:1.25;font-family:Arial;-inkscape-font-specification:Arial;letter-spacing:0px;word-spacing:0px;fill:#808080;fill-opacity:1;stroke:none;stroke-width:0.26458332"
 86 |        x="66.955696"
 87 |        y="73.770111"
 88 |        id="text9642"><tspan
 89 |          sodipodi:role="line"
 90 |          id="tspan9640"
 91 |          x="66.955696"
 92 |          y="73.770111"
 93 |          style="font-size:43.7444458px;fill:#808080;fill-opacity:1;stroke-width:0.26458332">p</tspan></text>
 94 |     <text
 95 |        xml:space="preserve"
 96 |        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:9.8777771px;line-height:1.25;font-family:Arial;-inkscape-font-specification:Arial;letter-spacing:0px;word-spacing:0px;fill:#2a7fff;fill-opacity:1;stroke:none;stroke-width:0.26458332"
 97 |        x="99.650085"
 98 |        y="74.937042"
 99 |        id="text9646"><tspan
100 |          sodipodi:role="line"
101 |          id="tspan9644"
102 |          x="99.650085"
103 |          y="74.937042"
104 |          style="font-size:44.09722137px;fill:#808080;fill-opacity:1;stroke-width:0.26458332">sradb</tspan></text>
105 |     <g
106 |        transform="matrix(0.08268229,0,0,-0.08268229,88.74046,130.1457)"
107 |        id="g4477"
108 |        style="stroke:#f30000;stroke-opacity:1">
109 |       <g
110 |          id="g2239"
111 |          transform="rotate(-28.612149,100.23321,552.96869)">
112 |         <path
113 |            style="fill:#808080;fill-opacity:0;fill-rule:nonzero;stroke:#808080;stroke-width:6.91653633;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
114 |            d="m 52.137217,596.23393 c -1.257506,-0.66887 -2.547053,-1.41787 -3.870428,-2.24456 -36.898693,-23.73063 -7.267903,-44.7545 -2.618223,-149.51201 2.51375,-56.63489 93.682314,-29.58554 110.665484,-56.27335 26.2152,-21.57019 36.97598,-33.67034 66.51928,-27.92893 13.06625,0.68408 25.0378,3.52269 39.34457,7.20109 l 21.16686,11.54637 8.61592,4.69993 c 5.39477,2.9428 15.24532,5.65337 26.24387,9.8662 2.74955,1.05332 5.67248,2.27079 8.6331,3.7304 8.8819,4.37855 18.09161,10.97286 23.84765,22.13046 12.87933,24.96548 12.51333,52.16656 7.01993,77.5931 -5.49337,25.42649 -15.92592,50.13443 -25.57291,73.81919 -9.64701,23.6847 -18.46717,46.35145 -21.83865,65.81111 -3.37151,19.45964 -1.89295,34.45593 7.36374,48.29073 12.79834,19.1281 30.19784,30.1921 52.96644,37.63631 22.76863,7.44423 50.58323,10.53194 80.38973,12.42654 59.61295,3.78915 127.2971,1.92666 181.74539,33.95364 41.39144,24.34685 70.41163,60.95683 97.93896,95.96348 27.52728,35.00668 53.64942,68.4816 86.4409,90.103 52.83364,34.83639 146.23874,72.03117 225.40677,99.49707 613.2218,204.6732 0.1268,74.5571 -246.21338,-67.94113 -39.84301,-26.27091 -67.99929,-63.52318 -95.34279,-98.2961 -27.34347,-34.7729 -53.79117,-66.98634 -87.39106,-86.7501 -41.38307,-24.34195 -103.39557,-24.89934 -164.9818,-28.81393 -30.79314,-1.95732 -61.45039,-4.97263 -89.73447,-14.22016 -28.2841,-9.24746 -54.52195,-25.46376 -72.63687,-52.5379 -15.69861,-23.46292 -17.55641,-50.56392 -13.19032,-75.76411 4.36606,-25.2002 14.36595,-49.75901 24.08427,-73.61891 9.71829,-23.85985 19.19207,-47.01345 23.62662,-67.53894 4.43453,-20.52549 4.02273,-37.36764 -3.67244,-52.28417 -1.1983,-2.3228 -4.38374,-4.81783 -12.40445,-7.89004 -8.02066,-3.07219 -19.23125,-5.66269 -30.82654,-11.98784 l -8.61591,-4.69992 c -34.36305,-11.57378 -36.14791,-9.32602 -77.01039,21.89467 -6.78784,26.99243 28.10133,59.17444 19.179,80.99905 -2.2501,5.5039 -2.52886,12.59893 -14.62114,17.30353 -89.75151,34.91858 -101.657469,84.54117 -140.656904,63.83658 z"
115 |            id="path4194"
116 |            inkscape:connector-curvature="0"
117 |            sodipodi:nodetypes="ccscccccccsscccssscsccssscccsscssccsscc" />
118 |         <path
119 |            style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#808080;stroke-width:9.44881916;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
120 |            d="m 69.937005,511.39792 c 0,4.4479 -3.639254,8.08714 -8.087157,8.08714 -4.44789,0 -8.087137,-3.63924 -8.087137,-8.08714 0,-4.44791 3.639247,-8.08714 8.087137,-8.08714 4.447903,0 8.087157,3.63923 8.087157,8.08714 z"
121 |            id="path4196"
122 |            inkscape:connector-curvature="0" />
123 |         <path
124 |            style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#808080;stroke-width:9.44881916;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
125 |            d="m 124.8046,541.3278 c 0,4.44787 -3.23482,8.08713 -8.08713,8.08713 -4.4479,0 -8.08716,-3.63926 -8.08716,-8.08713 0,-4.4481 3.63926,-8.08735 8.08716,-8.08735 4.85231,0 8.08713,3.63925 8.08713,8.08735 z"
126 |            id="path4198"
127 |            inkscape:connector-curvature="0" />
128 |       </g>
129 |     </g>
130 |   </g>
131 | </svg>
132 | 


--------------------------------------------------------------------------------
/docs/authors.md:
--------------------------------------------------------------------------------
 1 | # Credits
 2 | 
 3 | ## Contributors
 4 | 
 5 | -   [Boshen Yan](https://github.com/bscrow)
 6 | -   [Maarten van der Sande](https://github.com/Maarten-vd-Sande)
 7 | -   [Dibya Gautam](https://github.com/dibyaaaaax)
 8 | -   [Marius van den Beek](https://github.com/mvdbeek)
 9 | -   [Devang Thakkar](https://github.com/DevangThakkar)
10 | 
11 | ## Maintainer
12 | 
13 | -   Saket Choudhary \<<saketkc@gmail.com>\>
14 | 


--------------------------------------------------------------------------------
/docs/cmdline.md:
--------------------------------------------------------------------------------
  1 | # CLI 
  2 | 
  3 |     $ pysradb
  4 |     usage: pysradb [-h] [--version] [--citation]
  5 |                    {metadb,metadata,download,search,gse-to-gsm,gse-to-srp,gsm-to-gse,gsm-to-srp,gsm-to-srr,gsm-to-srs,gsm-to-srx,srp-to-gse,srp-to-srr,srp-to-srs,srp-to-srx,srr-to-gsm,srr-to-srp,srr-to-srs,srr-to-srx,srs-to-gsm,srs-to-srx,srx-to-srp,srx-to-srr,srx-to-srs}
  6 |                    ...
  7 | 
  8 |     pysradb: Query NGS metadata and data from NCBI Sequence Read Archive.
  9 |     Citation: 10.12688/f1000research.18676.1
 10 | 
 11 |     optional arguments:
 12 |       -h, --help            show this help message and exit
 13 |       --version             show program's version number and exit
 14 |       --citation            how to cite
 15 | 
 16 |     subcommands:
 17 |       {metadb,metadata,download,search,gse-to-gsm,gse-to-srp,gsm-to-gse,gsm-to-srp,gsm-to-srr,gsm-to-srs,gsm-to-srx,srp-to-gse,srp-to-srr,srp-to-srs,srp-to-srx,srr-to-gsm,srr-to-srp,srr-to-srs,srr-to-srx,srs-to-gsm,srs-to-srx,srx-to-srp,srx-to-srr,srx-to-srs}
 18 |         metadata            Fetch metadata for SRA project (SRPnnnn)
 19 |         download            Download SRA project (SRPnnnn)
 20 |         search              Search SRA/ENA for matching text
 21 |         gse-to-gsm          Get GSM for a GSE
 22 |         gse-to-srp          Get SRP for a GSE
 23 |         gsm-to-gse          Get GSE for a GSM
 24 |         gsm-to-srp          Get SRP for a GSM
 25 |         gsm-to-srr          Get SRR for a GSM
 26 |         gsm-to-srs          Get SRS for a GSM
 27 |         gsm-to-srx          Get SRX for a GSM
 28 |         srp-to-gse          Get GSE for a SRP
 29 |         srp-to-srr          Get SRR for a SRP
 30 |         srp-to-srs          Get SRS for a SRP
 31 |         srp-to-srx          Get SRX for a SRP
 32 |         srr-to-gsm          Get GSM for a SRR
 33 |         srr-to-srp          Get SRP for a SRR
 34 |         srr-to-srs          Get SRS for a SRR
 35 |         srr-to-srx          Get SRX for a SRR
 36 |         srs-to-gsm          Get GSM for a SRS
 37 |         srs-to-srx          Get SRX for a SRS
 38 |         srx-to-srp          Get SRP for a SRX
 39 |         srx-to-srr          Get SRR for a SRX
 40 |         srx-to-srs          Get SRS for a SRX
 41 | 
 42 | ## Getting metadata for a SRA project (SRP)
 43 | 
 44 | The most basic information associated with any SRA project is its list
 45 | of experiments and run accessions.
 46 | 
 47 |     $ pysradb metadata SRP098789
 48 | 
 49 |      study_accession experiment_accession sample_accession run_accession
 50 |      SRP098789       SRX2536403           SRS1956353       SRR5227288
 51 |      SRP098789       SRX2536404           SRS1956354       SRR5227289
 52 |      SRP098789       SRX2536405           SRS1956355       SRR5227290
 53 |      SRP098789       SRX2536406           SRS1956356       SRR5227291
 54 |      SRP098789       SRX2536407           SRS1956357       SRR5227292
 55 |      SRP098789       SRX2536408           SRS1956358       SRR5227293
 56 |      SRP098789       SRX2536409           SRS1956359       SRR5227294
 57 | 
 58 | Listing SRX and SRRs for a SRP is often not useful. We might want to
 59 | take a quick look at the metadata associated with the samples:
 60 | 
 61 |     $ pysradb metadata SRP098789 --detailed
 62 | 
 63 |      study_accession experiment_accession sample_accession run_accession sample_attribute
 64 |      SRP098789       SRX2536403           SRS1956353       SRR5227288    source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq
 65 |      SRP098789       SRX2536404           SRS1956354       SRR5227289    source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq
 66 |      SRP098789       SRX2536405           SRS1956355       SRR5227290    source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq
 67 |      SRP098789       SRX2536406           SRS1956356       SRR5227291    source_name: Huh7_0.3 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq
 68 |      SRP098789       SRX2536407           SRS1956357       SRR5227292    source_name: Huh7_0.3 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq
 69 |      SRP098789       SRX2536408           SRS1956358       SRR5227293    source_name: Huh7_0.3 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq
 70 | 
 71 | The example here came from a Ribosome profiling study and consists of a
 72 | collection of both Ribo-seq and RNA-seq samples. We can filter out only
 73 | the RNA-seq samples:
 74 | 
 75 |     $ pysradb metadata SRP098789 --detailed | grep -E 'study|RNA-Seq'
 76 | 
 77 |     SRP098789       SRX2536422           SRR5227307    RNA-Seq          SINGLE -
 78 |     SRP098789       SRX2536424           SRR5227309    RNA-Seq          SINGLE -
 79 |     SRP098789       SRX2536426           SRR5227311    RNA-Seq          SINGLE -
 80 |     SRP098789       SRX2536428           SRR5227313    RNA-Seq          SINGLE -
 81 | 
 82 | A more complicated example will consist of multiple assays. For example
 83 | `SRP000941`:
 84 | 
 85 |     $ pysradb metadata SRP000941 --detailed  | tr -s '  ' | cut -f5 -d ' ' | sort | uniq -c
 86 |     999 Bisulfite-Seq
 87 |     768 ChIP-Seq
 88 |       1 library_strategy
 89 |     121 OTHER
 90 |     353 RNA-Seq
 91 |      28 WGS
 92 | 
 93 | ## Experiment accessions for a project (SRP =\> SRX)
 94 | 
 95 | A frequently encountered task involves getting all the experiments (SRX)
 96 | for a particular study accession (SRP). Consider project `SRP048759`:
 97 | 
 98 |     $ pysradb srp-to-srx SRP048759
 99 | 
100 | ## Sample accessions for a project (SRP =\> SRS)
101 | 
102 | Each experiment involves one or multiple biological samples (SRS), that
103 | are put through different experiments (SRX).
104 | 
105 |     $ pysradb srp-to-srs SRP048759
106 | 
107 |     study_accession sample_accession
108 |     SRP048759       SRS718878
109 |     SRP048759       SRS718879
110 |     SRP048759       SRS718880
111 |     SRP048759       SRS718881
112 |     SRP048759       SRS718882
113 |     SRP048759       SRS718883
114 |     SRP048759       SRS718884
115 |     SRP048759       SRS718885
116 |     SRP048759       SRS718886
117 | 
118 | This is very limited information. It can again be detailed out using the
119 | `--detailed` flag:
120 | 
121 |     $ pysradb srp-to-srs --detailed SRP048759
122 | 
123 |     study_accession sample_accession        experiment_accession    run_accession   study_alias     sample_alias    experiment_alias        run_alias
124 |     SRP048759       SRS718878       SRX729552       SRR1608490      GSE62190        GSM1521543      GSM1521543      GSM1521543_r1
125 |     SRP048759       SRS718878       SRX729552       SRR1608491      GSE62190        GSM1521543      GSM1521543      GSM1521543_r2
126 |     SRP048759       SRS718878       SRX729552       SRR1608492      GSE62190        GSM1521543      GSM1521543      GSM1521543_r3
127 |     SRP048759       SRS718878       SRX729552       SRR1608493      GSE62190        GSM1521543      GSM1521543      GSM1521543_r4
128 |     SRP048759       SRS718879       SRX729553       SRR1608494      GSE62190        GSM1521544      GSM1521544      GSM1521544_r1
129 |     SRP048759       SRS718879       SRX729553       SRR1608495      GSE62190        GSM1521544      GSM1521544      GSM1521544_r2
130 | 
131 | ## Run accessions for experiments (SRX =\> SRR)
132 | 
133 | Another frequently encountered task involves fetching the run accessions
134 | (SRR) for a particular experiment (SRX). Consider experiments
135 | `SRX217956` and `SRX2536403`. We want to be able
136 | to resolve the run accessions for these experiments:
137 | 
138 |     $ pysradb srx-to-srr SRX217956  SRX2536403 --detailed
139 | 
140 |     experiment_accession run_accession study_accession sample_attribute
141 |     SRX217956            SRR649752     SRP017942       source_name: 3T3 cells || treatment: control || cell line: 3T3 cells || assay type: Riboseq
142 |     SRX2536403           SRR5227288    SRP098789       source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq
143 | 
144 | ## Experiment accessions for runs (SRR =\> SRX)
145 | 
146 | For fetching experiment accessions (SRX) for one or multiple run
147 | accessions (SRR):
148 | 
149 |     $ pysradb srr-to-srx SRR5227288 SRR649752 --detailed
150 |     run_accession study_accession experiment_accession sample_attribute
151 |     SRR649752     SRP017942       SRX217956            source_name: 3T3 cells || treatment: control || cell line: 3T3 cells || assay type: Riboseq
152 |     SRR5227288    SRP098789       SRX2536403           source_name: Huh7_1.5 µM PF-067446846_10 min_ribo-seq || cell line: Huh7 || treatment time: 10 min || library type: ribo-seq
153 | 
154 | ## Downloading entire project
155 | 
156 |     $ pysradb metadata --detailed SRP098789 | pysradb download
157 | 
158 | ## GEO accessions for studies (SRP =\> GSE)
159 | 
160 |     $ pysradb srp-to-gse SRP090415
161 | 
162 |     study_accession study_alias
163 |     SRP090415       GSE87328
164 | 
165 | But not all SRPs will have an associated GEO id (GSE):
166 | 
167 |     $ pysradb srp-to-gse SRP029589
168 | 
169 |     study_accession study_alias
170 |     SRP029589       PRJNA218051
171 | 
172 | ## SRA accessions for GEO studies (GSE =\> SRP)
173 | 
174 |     $ pysradb gse-to-srp GSE87328
175 | 
176 |     study_alias study_accession
177 |     GSE87328    SRP090415
178 | 
179 | Please see
180 | [quickstart](https://www.saket-choudhary.me/pysradb/quickstart.html#the-full-list-of-possible-pysradb-operations)
181 | for all possible operations available through `pysradb`.
182 | 


--------------------------------------------------------------------------------
/docs/commands.md:
--------------------------------------------------------------------------------
 1 | # API Documentation 
 2 | 
 3 | ::: {.toctree maxdepth="2"}
 4 | commands/download commands/metadata commands/search commands/gse-to-gsm
 5 | commands/gse-to-srp commands/srp-to-gse commands/srp-to-srr
 6 | commands/srp-to-srs commands/srp-to-srx commands/srr-to-srs
 7 | commands/srr-to-srx commands/srs-to-srx commands/srx-to-srr
 8 | commands/srx-to-srs
 9 | :::
10 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # pysradb documentation build configuration file, created by
  5 | # sphinx-quickstart on Fri Jun  9 13:47:02 2017.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another
 17 | # directory, add these directories to sys.path here. If the directory is
 18 | # relative to the documentation root, use os.path.abspath to make it
 19 | # absolute, like shown here.
 20 | #
 21 | import os
 22 | import sys
 23 | 
 24 | # import guzzle_sphinx_theme
 25 | import pysradb
 26 | 
 27 | autodoc_mock_imports = ["xmltodict", "numpy", "pandas", "requests", "tqdm"]
 28 | 
 29 | sys.path.insert(0, os.path.abspath(".."))
 30 | 
 31 | 
 32 | # -- General configuration ---------------------------------------------
 33 | 
 34 | # If your documentation needs a minimal Sphinx version, state it here.
 35 | #
 36 | # needs_sphinx = '1.0'
 37 | 
 38 | # Add any Sphinx extension module names here, as strings. They can be
 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 40 | extensions = [
 41 |     "IPython.sphinxext.ipython_directive",
 42 |     "IPython.sphinxext.ipython_console_highlighting",
 43 |     "sphinx.ext.mathjax",
 44 |     "sphinx.ext.autodoc",
 45 |     "sphinx.ext.autosummary",
 46 |     "sphinx.ext.doctest",
 47 |     "sphinx.ext.viewcode",
 48 |     "sphinx.ext.inheritance_diagram",
 49 |     "numpydoc",
 50 |     "sphinx_tabs.tabs",
 51 |     "sphinx_panels",
 52 |     "sphinxcontrib.gtagjs",
 53 |     "myst_parser",
 54 | ]
 55 | gtagjs_ids = [
 56 |     "G-CKQZFCEENZ",
 57 | ]
 58 | 
 59 | panels_add_bootstrap_css = False
 60 | 
 61 | # Add any paths that contain templates here, relative to this directory.
 62 | templates_path = ["_templates"]
 63 | 
 64 | # The suffix(es) of source filenames.
 65 | # You can specify multiple suffix as a list of string:
 66 | #
 67 | source_suffix = [".rst", ".md"]
 68 | # source_suffix = ".md"
 69 | 
 70 | # The master toctree document.
 71 | master_doc = "index"
 72 | 
 73 | # General information about the project.
 74 | project = "pysradb"
 75 | copyright = "2023, Saket Choudhary"
 76 | author = "Saket Choudhary"
 77 | # The version info for the project you're documenting, acts as replacement
 78 | # for |version| and |release|, also used in various other places throughout
 79 | # the built documents.
 80 | #
 81 | # The short X.Y version.
 82 | version = pysradb.__version__
 83 | # The full version, including alpha/beta/rc tags.
 84 | release = pysradb.__version__
 85 | 
 86 | # The language for content autogenerated by Sphinx. Refer to documentation
 87 | # for a list of supported languages.
 88 | #
 89 | # This is also used if you do content translation via gettext catalogs.
 90 | # Usually you set "language" from the command line for these cases.
 91 | language = None
 92 | 
 93 | # List of patterns, relative to source directory, that match files and
 94 | # directories to ignore when looking for source files.
 95 | # These patterns also affect html_static_path and html_extra_path
 96 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 97 | 
 98 | # The name of the Pygments (syntax highlighting) style to use.
 99 | pygments_style = "sphinx"
100 | 
101 | # If true, `todo` and `todoList` produce output, else they produce nothing.
102 | todo_include_todos = False
103 | 
104 | 
105 | # -- Options for HTML output -------------------------------------------
106 | 
107 | # The theme to use for HTML and HTML Help pages.  See the documentation for
108 | # a list of builtin themes.
109 | #
110 | html_theme = "pydata_sphinx_theme"
111 | 
112 | # Theme options are theme-specific and customize the look and feel of a
113 | # theme further.  For a list of options available for each theme, see the
114 | # documentation.
115 | #
116 | # html_theme_options = {}
117 | 
118 | # Add any paths that contain custom static files (such as style sheets) here,
119 | # relative to this directory. They are copied after the builtin static files,
120 | # so a file named "default.css" will overwrite the builtin "default.css".
121 | html_static_path = ["_static"]
122 | 
123 | 
124 | # -- Options for HTMLHelp output ---------------------------------------
125 | 
126 | # Output file base name for HTML help builder.
127 | htmlhelp_basename = "pysradbdoc"
128 | 
129 | 
130 | # -- Options for LaTeX output ------------------------------------------
131 | 
132 | latex_elements = {
133 |     # The paper size ('letterpaper' or 'a4paper').
134 |     #
135 |     # 'papersize': 'letterpaper',
136 |     # The font size ('10pt', '11pt' or '12pt').
137 |     #
138 |     # 'pointsize': '10pt',
139 |     # Additional stuff for the LaTeX preamble.
140 |     #
141 |     # 'preamble': '',
142 |     # Latex figure (float) alignment
143 |     #
144 |     # 'figure_align': 'htbp',
145 | }
146 | 
147 | # Grouping the document tree into LaTeX files. List of tuples
148 | # (source start file, target name, title, author, documentclass
149 | # [howto, manual, or own class]).
150 | latex_documents = [
151 |     (master_doc, "pysradb.tex", "pysradb Documentation", "Saket Choudhary", "manual")
152 | ]
153 | 
154 | 
155 | # -- Options for manual page output ------------------------------------
156 | 
157 | # One entry per manual page. List of tuples
158 | # (source start file, name, description, authors, manual section).
159 | man_pages = [(master_doc, "pysradb", "pysradb Documentation", [author], 1)]
160 | 
161 | 
162 | # -- Options for Texinfo output ----------------------------------------
163 | 
164 | # Grouping the document tree into Texinfo files. List of tuples
165 | # (source start file, target name, title, author,
166 | #  dir menu entry, description, category)
167 | texinfo_documents = [
168 |     (
169 |         master_doc,
170 |         "pysradb",
171 |         "pysradb Documentation",
172 |         author,
173 |         "pysradb",
174 |         "One line description of project.",
175 |         "Miscellaneous",
176 |     )
177 | ]
178 | 
179 | 
180 | numpydoc_show_class_members = False
181 | 
182 | 
183 | ##html_theme_path = guzzle_sphinx_theme.html_theme_path()
184 | ##html_theme = "guzzle_sphinx_theme"
185 | ##
186 | ### Register the theme as an extension to generate a sitemap.xml
187 | ##extensions.append("guzzle_sphinx_theme")
188 | ##
189 | ### Guzzle theme options (see theme.conf for more information)
190 | ##html_theme_options = {
191 | ##    # Set the name of the project to appear in the sidebar
192 | ##    "project_nav_name": "pysradb"
193 | ##}
194 | 
195 | scv_greatest_tag = True
196 | scv_show_banner = True
197 | 
198 | html_logo = "_static/pysradb_v3.png"
199 | html_theme_options = {
200 |     "github_url": "https://github.com/saketkc/pysradb",
201 |     "google_analytics_id": "G-CKQZFCEENZ",
202 |     #    "navbar_align": "left",
203 | }
204 | 


--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
  1 | # Contributing
  2 | 
  3 | Contributions are welcome, and they are greatly appreciated! Every
  4 | little bit helps, and credit will always be given.
  5 | 
  6 | You can contribute in many ways:
  7 | 
  8 | ## Types of Contributions
  9 | 
 10 | ### Report Bugs
 11 | 
 12 | Report bugs at <https://github.com/saketkc/pysradb/issues>.
 13 | 
 14 | If you are reporting a bug, please include:
 15 | 
 16 | -   Your operating system name and version.
 17 | -   Any details about your local setup that might be helpful in
 18 |     troubleshooting.
 19 | -   Detailed steps to reproduce the bug.
 20 | 
 21 | ### Fix Bugs
 22 | 
 23 | Look through the GitHub issues for bugs. Anything tagged with \"bug\"
 24 | and \"help wanted\" is open to whoever wants to implement it.
 25 | 
 26 | ### Implement Features
 27 | 
 28 | Look through the GitHub issues for features. Anything tagged with
 29 | \"enhancement\" and \"help wanted\" is open to whoever wants to
 30 | implement it.
 31 | 
 32 | ### Write Documentation
 33 | 
 34 | pysradb could always use more documentation, whether as part of the
 35 | official pysradb docs, in docstrings, or even on the web in blog posts,
 36 | articles, and such.
 37 | 
 38 | ### Submit Feedback
 39 | 
 40 | The best way to send feedback is to file an issue at
 41 | <https://github.com/saketkc/pysradb/issues>.
 42 | 
 43 | If you are proposing a feature:
 44 | 
 45 | -   Explain in detail how it would work.
 46 | -   Keep the scope as narrow as possible, to make it easier to
 47 |     implement.
 48 | -   Remember that this is a volunteer-driven project, and that
 49 |     contributions are welcome :)
 50 | 
 51 | ## Get Started!
 52 | 
 53 | Ready to contribute? Here's how to set up `pysradb` for
 54 | local development.
 55 | 
 56 | 1.  Fork the `pysradb` repo on GitHub.
 57 | 
 58 | 2.  Clone your fork locally:
 59 | 
 60 |     ``` shell
 61 |     $ git clone git@github.com:your_name_here/pysradb.git
 62 |     ```
 63 | 
 64 | 3.  Install your local copy into a virtualenv. Assuming you have
 65 |     virtualenvwrapper installed, this is how you set up your fork for
 66 |     local development (if `python --version` is less than 3.0, run
 67 |     `mkvirtualenv pysradb --python=py3` instead):
 68 | 
 69 |     ``` shell
 70 |     $ mkvirtualenv pysradb
 71 |     $ cd pysradb/
 72 |     $ python setup.py develop
 73 |     ```
 74 | 
 75 | 4.  Create a branch for local development:
 76 | 
 77 |     ``` shell
 78 |     $ git checkout -b name-of-your-bugfix-or-feature
 79 |     ```
 80 | 
 81 |     Now you can make your changes locally.
 82 | 
 83 | 5.  When you\'re done making changes, check that your changes pass
 84 |     flake8 and the tests, including testing other Python versions with
 85 |     tox:
 86 | 
 87 |     ``` shell
 88 |     $ flake8 pysradb tests
 89 |     $ py.test  # or: python setup.py test
 90 |     $ tox
 91 |     ```
 92 | 
 93 |     To get flake8 and tox, just pip install them into your virtualenv.
 94 | 
 95 | 6.  Commit your changes and push your branch to GitHub:
 96 | 
 97 |     ``` shell
 98 |     $ git add .
 99 |     $ git commit -m "Your detailed description of your changes."
100 |     $ git push origin name-of-your-bugfix-or-feature
101 |     ```
102 | 
103 | 7.  Submit a pull request through the GitHub website.
104 | 
105 | ## Pull Request Guidelines
106 | 
107 | Before you submit a pull request, check that it meets these guidelines:
108 | 
109 | 1.  The pull request should include tests.
110 | 2.  If the pull request adds functionality, the docs should be updated.
111 |     Put your new functionality into a function with a docstring, and add
112 |     the feature to the list in README.md.
113 | 3.  The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and
114 |     for PyPy. Check
115 |     <https://travis-ci.org/saketkc/pysradb/pull_requests> and make sure
116 |     that the tests pass for all supported Python versions.
117 | 
118 | ## Tips
119 | 
120 | To run a subset of tests:
121 | 
122 | ``` shell
123 | $ py.test tests.test_pysradb
124 | ```
125 | 
126 | ## Deploying
127 | 
128 | A reminder for the maintainers on how to deploy. Make sure all your
129 | changes are committed (including an entry in HISTORY.md). Then run:
130 | 
131 | ``` shell
132 | $ bumpversion patch # possible: major / minor / patch
133 | $ git push
134 | $ git push --tags
135 | ```
136 | 
137 | Travis will then deploy to PyPI if tests pass.
138 | 


--------------------------------------------------------------------------------
/docs/history.md:
--------------------------------------------------------------------------------
  1 | # History
  2 | 
  3 | ## 2.1.0 (2023-05-16)
  4 | 
  5 | -   Fix for `gse-to-srp` returning unrequested GSEs
  6 |     ([#190](https://github.com/saketkc/pysradb/issues/190))
  7 | -   Fix for `download` using `public_urls`
  8 | -   Fix for `gsm-to-srx` returning false positives
  9 |     ([#165](https://github.com/saketkc/pysradb/issues/165))
 10 | -   Fix for delimiter not being consistent when metadata is printed on
 11 |     terminal
 12 |     ([#147](https://github.com/saketkc/pysradb/issues/147))
 13 | -   ENA search is currently broken because of an API change
 14 | 
 15 | ## 2.0.2 (2023-04-09)
 16 | 
 17 | -   Fix for [gse-to-srp]{.title-ref} to handle cases where a project is
 18 |     missing but SRXs are returned ([#186
 19 |     \<https://github.com/saketkc/pysradb/issues/186\>]{.title-ref})
 20 | -   Fix gse-to-gsm ([#187
 21 |     \<https://github.com/saketkc/pysradb/issues/187\>]{.title-ref})
 22 | 
 23 | ## 2.0.1 (2023-03-18)
 24 | 
 25 | -   Fix for [pysradb download]{.title-ref} - using
 26 |     [public_url]{.title-ref}
 27 | -   Fix for SRX -\> SRR and related conversions ([#183
 28 |     \<https://github.com/saketkc/pysradb/pull/183\>]{.title-ref})
 29 | 
 30 | ## 2.0.0 (2023-02-23)
 31 | 
 32 | -   BREAKING change: Overhaul of how urls and associated metadata are
 33 |     returned (not backward compatible); all column names are lower cased
 34 |     by default
 35 | -   Fix extra space in \"organism_taxid\" column
 36 | -   Added support for Experiment attributes ([#89
 37 |     \<https://github.com/saketkc/pysradb/issues/89#issuecomment-1439319532\>]{.title-ref})
 38 | 
 39 | ## 1.4.2 (06-17-2022)
 40 | 
 41 | -   Fix ENA fastq fetching ([#163
 42 |     \<https://github.com/saketkc/pysradb/issues/163\>]{.title-ref})
 43 | 
 44 | ## 1.4.1 (06-04-2022)
 45 | 
 46 | -   Fix for fetching alternative URLs
 47 | 
 48 | ## 1.4.0 (06-04-2022)
 49 | 
 50 | -   Added ability to fetch alternative URLs (GCP/AWS) for metadata
 51 |     ([#161
 52 |     \<https://github.com/saketkc/pysradb/issues/161\>]{.title-ref})
 53 | -   Fix for xmldict 0.13.0 no longer defaulting to OrderedDict ([#159
 54 |     \<https://github.com/saketkc/pysradb/pull/159\>]{.title-ref})
 55 | -   Fix for missing experiment model and description in metadata ([#160
 56 |     \<https://github.com/saketkc/pysradb/issues/160\>]{.title-ref})
 57 | 
 58 | ## 1.3.0 (02-18-2022)
 59 | 
 60 | -   Add [study_title]{.title-ref} to [\--detailed]{.title-ref} flag
 61 |     ([#152](https://github.com/saketkc/pysradb/issues/152))
 62 | -   Fix [KeyError]{.title-ref} in [metadata]{.title-ref} where some new
 63 |     IDs do not have any metadata
 64 |     ([#151](https://github.com/saketkc/pysradb/issues/151))
 65 | 
 66 | ## 1.2.0 (01-10-2022)
 67 | 
 68 | -   Do not exit if a query returns no hits ([#149
 69 |     \<https://github.com/saketkc/pysradb/pull/149\>]{.title-ref})
 70 | 
 71 | ## 1.1.0 (12-12-2021)
 72 | 
 73 | -   Fixed [gsm-to-gse]{.title-ref} failure
 74 |     ([#128](https://github.com/saketkc/pysradb/pull/128))
 75 | -   Fixed case sensitivity bug for ENA search
 76 |     ([#144](https://github.com/saketkc/pysradb/pull/144))
 77 | -   Fixed publication date bug for search
 78 |     ([#146](https://github.com/saketkc/pysradb/pull/146))
 79 | -   Added support for downloading data from GEO [pysradb download -g
 80 |     \<GSE\>]{.title-ref}
 81 |     ([#129](https://github.com/saketkc/pysradb/pull/129))
 82 | 
 83 | ## 1.0.1 (01-10-2021)
 84 | 
 85 | -   Dropped Python 3.6 since pandas 1.2 is not supported
 86 | 
 87 | ## 1.0.0 (01-09-2021)
 88 | 
 89 | -   Retired `metadb` and `SRAdb` based search through CLI - everything
 90 |     defaults to `SRAweb`
 91 | -   `SRAweb` now supports
 92 |     [search](https://saket-choudhary.me/pysradb/quickstart.html#search)
 93 | -   [N/A]{.title-ref} is now replaced with [pd.NA]{.title-ref}
 94 | -   Two new fields in [\--detailed]{.title-ref}: [instrument_model]{.title-ref}
 95 |     and [instrument_model_desc]{.title-ref}
 96 |     [#75](https://github.com/saketkc/pysradb/issues/75)
 97 | -   Updated documentation
 98 | 
 99 | ## 0.11.1 (09-18-2020)
100 | 
101 | -   [library_layout]{.title-ref} is now outputted in metadata #56
102 | -   [-detailed]{.title-ref} unifies columns for ENA fastq links instead
103 |     of appending \_x/\_y #59
104 | -   bugfix for parsing namespace in xml outputs #65
105 | -   XML errors from NCBI are now handled more gracefully #69
106 | -   Documentation and dependency updates
107 | 
108 | ## 0.11.0 (09-04-2020)
109 | 
110 | -   [pysradb download]{.title-ref} now supports multiple threads for
111 |     parallel downloads
112 | -   [pysradb download]{.title-ref} also supports ultra fast downloads of
113 |     FASTQs from ENA using aspera-client
114 | 
115 | ## 0.10.3 (03-26-2020)
116 | 
117 | -   Added test cases for SRAweb
118 | -   API limit exceeding errors are automagically handled
119 | -   Bug fixes for GSE \<=\> SRR
120 | -   Bug fix for metadata - supports multiple SRPs
121 | 
122 | Contributors
123 | 
124 | -   Dibya Gautam
125 | -   Marius van den Beek
126 | 
127 | ## 0.10.2 (02-05-2020)
128 | 
129 | -   Bug fix: Handle API-rate limit exceeding =\> Retries
130 | -   Enhancement: \'Alternatives\' URLs are now part of
131 |     [\--detailed]{.title-ref}
132 | 
133 | ## 0.10.1 (02-04-2020)
134 | 
135 | -   Bug fix: Handle Python3.6 for capture_output in subprocess.run
136 | 
137 | ## 0.10.0 (01-31-2020)
138 | 
139 | -   All the subcommands (srx-to-srr, srx-to-srs) will now print
140 |     additional columns where the first two columns represent the
141 |     relevant conversion
142 | -   Fixed a bug when fetching entries with a single efetch record
143 | 
144 | ## 0.9.9 (01-15-2020)
145 | 
146 | -   Major fix: some SRRs would go missing as the experiment dict was
147 |     being created only once per SRR (See #15)
148 | -   Features: More detailed metadata by default in the SRAweb mode
149 | -   See notebook: <https://colab.research.google.com/drive/1C60V->
150 | 
151 | ## 0.9.7 (01-20-2020)
152 | 
153 | -   Feature: instrument, run size and total spots are now printed in the
154 |     metadata by default (SRAweb mode only)
155 | -   Issue: Fixed an issue with srapath failing on SRP. srapath is now
156 |     run on individual SRRs.
157 | 
158 | ## 0.9.6 (07-20-2019)
159 | 
160 | -   Introduced [SRAweb]{.title-ref} to perform queries over the web if
161 |     the SQLite is missing or does not contain the relevant record.
162 | 
163 | ## 0.9.0 (02-27-2019)
164 | 
165 | ### Others
166 | 
167 | -   This release completely changes the command line interface replacing
168 |     click with argparse (<https://github.com/saketkc/pysradb/pull/3>)
169 | -   Removed Python 2 compatible stale code
170 | 
171 | ## 0.8.0 (02-26-2019)
172 | 
173 | ### New methods/functionality
174 | 
175 | -   `srr-to-gsm`: convert SRR to GSM
176 | -   SRAmetadb.sqlite.gz file is deleted by default after extraction
177 | -   When SRAmetadb is not found, confirmation is sought before
178 |     downloading
179 | -   Confirmation option before SRA downloads
180 | 
181 | ### Bugfix
182 | 
183 | -   download() works with wget
184 | 
185 | ### Others
186 | 
187 | -   [\--out_dir]{.title-ref} is now [out-dir]{.title-ref}
188 | 
189 | ## 0.7.1 (02-18-2019)
190 | 
191 | Important: Python2 is no longer supported. Please consider moving to
192 | Python3.
193 | 
194 | ### Bugfix
195 | 
196 | -   Included docs in the index which were missed out in the previous
197 |     release
198 | 
199 | ## 0.7.0 (02-08-2019)
200 | 
201 | ### New methods/functionality
202 | 
203 | -   `gsm-to-srr`: convert GSM to SRR
204 | -   `gsm-to-srx`: convert GSM to SRX
205 | -   `gsm-to-gse`: convert GSM to GSE
206 | 
207 | ### Renamed methods
208 | 
209 | The following command line options have been renamed and the changes are
210 | not compatible with 0.6.0 release:
211 | 
212 | -   [sra-metadata]{.title-ref} -\> [metadata]{.title-ref}.
213 | -   [sra-search]{.title-ref} -\> [search]{.title-ref}.
214 | -   [srametadb]{.title-ref} -\> [metadb]{.title-ref}.
215 | 
216 | ## 0.6.0 (12-25-2018)
217 | 
218 | ### Bugfix
219 | 
220 | -   Fixed bugs introduced in 0.5.0 with API changes where multiple
221 |     redundant columns were output in [sra-metadata]{.title-ref}
222 | 
223 | ### New methods/functionality
224 | 
225 | -   [download]{.title-ref} now allows piped inputs
226 | 
227 | ## 0.5.0 (12-24-2018)
228 | 
229 | ### New methods/functionality
230 | 
231 | -   Support for filtering by SRX Id for SRA downloads.
232 | -   `srr_to_srx`: Convert SRR to SRX/SRP
233 | -   `srp_to_srx`: Convert SRP to SRX
234 | -   Stripped down [sra-metadata]{.title-ref} to give minimal information
235 | -   Added [\--assay]{.title-ref}, [\--desc]{.title-ref},
236 |     [\--detailed]{.title-ref} flag for [sra-metadata]{.title-ref}
237 | -   Improved table printing on terminal
238 | 
239 | ## 0.4.2 (12-16-2018)
240 | 
241 | ### Bugfix
242 | 
243 | -   Fixed unicode error in tests for Python2
244 | 
245 | ## 0.4.0 (12-12-2018)
246 | 
247 | ### New methods/functionality
248 | 
249 | -   Added a new [BASEdb]{.title-ref} class to handle common database
250 |     connections
251 | -   Initial support for GEOmetadb through GEOdb class
252 | -   Initial support for a command line interface:
253 |     -   download Download SRA project (SRPnnnn)
254 |     -   gse-metadata Fetch metadata for GEO ID (GSEnnnn)
255 |     -   gse-to-gsm Get GSM(s) for GSE
256 |     -   gsm-metadata Fetch metadata for GSM ID (GSMnnnn)
257 |     -   sra-metadata Fetch metadata for SRA project (SRPnnnn)
258 | -   Added three separate notebooks for SRAdb, GEOdb, CLI usage
259 | 
260 | ## 0.3.0 (12-05-2018)
261 | 
262 | ### New methods/functionality
263 | 
264 | -   [sample_attribute]{.title-ref} and
265 |     [experiment_attribute]{.title-ref} are now included by default in
266 |     the df returned by [sra_metadata()]{.title-ref}
267 | -   [expand_sample_attribute_columns]{.title-ref}: expand metadata dataframe based on
268 |     attributes in the [sample_attribute]{.title-ref} column
269 | -   New methods to guess cell/tissue/strain:
270 |     [guess_cell_type()]{.title-ref}/[guess_tissue_type()]{.title-ref}/[guess_strain_type()]{.title-ref}
271 | -   Improved README and usage instructions
272 | 
273 | ## 0.2.2 (12-03-2018)
274 | 
275 | ### New methods/functionality
276 | 
277 | -   [search_sra()]{.title-ref} allows full text search on SRA metadata.
278 | 
279 | ## 0.2.0 (12-03-2018)
280 | 
281 | ### Renamed methods
282 | 
283 | The following methods have been renamed and the changes are not
284 | compatible with 0.1.0 release:
285 | 
286 | -   [get_query()]{.title-ref} -\> [query()]{.title-ref}.
287 | -   [sra_convert()]{.title-ref} -\> [sra_metadata()]{.title-ref}.
288 | -   [get_table_counts()]{.title-ref} -\> [all_row_counts()]{.title-ref}.
289 | 
290 | ### New methods/functionality
291 | 
292 | -   [download_sradb_file()]{.title-ref} makes fetching
293 |     [SRAmetadb.sqlite]{.title-ref} file easy; wget is no longer
294 |     required.
295 | -   [ftp]{.title-ref} protocol is now supported besides
296 |     [fasp]{.title-ref} and hence [aspera-client]{.title-ref} is now
297 |     optional. We however, strongly recommend [aspera-client]{.title-ref}
298 |     for faster downloads.
299 | 
300 | ### Bug fixes
301 | 
302 | -   Silenced [SettingWithCopyWarning]{.title-ref} by explicitly doing
303 |     operations on a copy of the dataframe instead of the original.
304 | 
305 | Besides these, all methods now follow a [numpydoc]{.title-ref}
306 | compatible documentation.
307 | 
308 | ## 0.1.0 (12-01-2018)
309 | 
310 | -   First release on PyPI.
311 | 


--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | ## Stable release
 4 | 
 5 | To install pysradb, run this command in your terminal:
 6 | 
 7 | ``` console
 8 | $ pip install pysradb
 9 | ```
10 | 
11 | This is the preferred method to install pysradb, as it will always
12 | install the most recent stable release.
13 | 
14 | If you don\'t have [pip](https://pip.pypa.io) installed, this [Python
15 | installation
16 | guide](http://docs.python-guide.org/en/latest/starting/installation/)
17 | can guide you through the process.
18 | 
19 | Alternatively, you may use conda:
20 | 
21 | ``` bash
22 | conda install -c bioconda pysradb
23 | ```
24 | 
25 | This step will install all the dependencies except aspera-client (which
26 | is not required, but highly recommended). If you have an existing
27 | environment with a lot of pre-installed packages, conda might be
28 | [slow](https://github.com/bioconda/bioconda-recipes/issues/13774).
29 | Please consider creating a new environment for `pysradb`:
30 | 
31 | ``` bash
32 | conda create -c bioconda -n pysradb python=3 pysradb
33 | ```
34 | 
35 | ## From sources
36 | 
37 | The source files for pysradb can be downloaded from the [Github
38 | repo](https://github.com/saketkc/pysradb).
39 | 
40 | You can either clone the public repository:
41 | 
42 | ``` console
43 | $ git clone https://github.com/saketkc/pysradb
44 | ```
45 | 
46 | Or download the
47 | [tarball](https://github.com/saketkc/pysradb/tarball/master):
48 | 
49 | ``` console
50 | $ curl  -OL https://github.com/saketkc/pysradb/tarball/master
51 | ```
52 | 
53 | Once you have a copy of the source, you can install it with:
54 | 
55 | ``` console
56 | $ pip install .
57 | ```
58 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=python -msphinx
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=pysradb
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | 	echo.then set the SPHINXBUILD environment variable to point to the full
21 | 	echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | 	echo.Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 


--------------------------------------------------------------------------------
/docs/modules.md:
--------------------------------------------------------------------------------
1 | # pysradb
2 | 
3 | ::: {.toctree maxdepth="4"}
4 | pysradb
5 | :::
6 | 


--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
1 | pysradb
2 | =======
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    pysradb
8 | 


--------------------------------------------------------------------------------
/docs/pysradb.rst:
--------------------------------------------------------------------------------
  1 | pysradb package
  2 | ===============
  3 | 
  4 | Submodules
  5 | ----------
  6 | 
  7 | pysradb.basedb module
  8 | ---------------------
  9 | 
 10 | .. automodule:: pysradb.basedb
 11 |    :members:
 12 |    :undoc-members:
 13 |    :show-inheritance:
 14 | 
 15 | pysradb.cli module
 16 | ------------------
 17 | 
 18 | .. automodule:: pysradb.cli
 19 |    :members:
 20 |    :undoc-members:
 21 |    :show-inheritance:
 22 | 
 23 | pysradb.download module
 24 | -----------------------
 25 | 
 26 | .. automodule:: pysradb.download
 27 |    :members:
 28 |    :undoc-members:
 29 |    :show-inheritance:
 30 | 
 31 | pysradb.exceptions module
 32 | -------------------------
 33 | 
 34 | .. automodule:: pysradb.exceptions
 35 |    :members:
 36 |    :undoc-members:
 37 |    :show-inheritance:
 38 | 
 39 | pysradb.filter\_attrs module
 40 | ----------------------------
 41 | 
 42 | .. automodule:: pysradb.filter_attrs
 43 |    :members:
 44 |    :undoc-members:
 45 |    :show-inheritance:
 46 | 
 47 | pysradb.geodb module
 48 | --------------------
 49 | 
 50 | .. automodule:: pysradb.geodb
 51 |    :members:
 52 |    :undoc-members:
 53 |    :show-inheritance:
 54 | 
 55 | pysradb.geoweb module
 56 | ---------------------
 57 | 
 58 | .. automodule:: pysradb.geoweb
 59 |    :members:
 60 |    :undoc-members:
 61 |    :show-inheritance:
 62 | 
 63 | pysradb.search module
 64 | ---------------------
 65 | 
 66 | .. automodule:: pysradb.search
 67 |    :members:
 68 |    :undoc-members:
 69 |    :show-inheritance:
 70 | 
 71 | pysradb.sradb module
 72 | --------------------
 73 | 
 74 | .. automodule:: pysradb.sradb
 75 |    :members:
 76 |    :undoc-members:
 77 |    :show-inheritance:
 78 | 
 79 | pysradb.sraweb module
 80 | ---------------------
 81 | 
 82 | .. automodule:: pysradb.sraweb
 83 |    :members:
 84 |    :undoc-members:
 85 |    :show-inheritance:
 86 | 
 87 | pysradb.taxid2name module
 88 | -------------------------
 89 | 
 90 | .. automodule:: pysradb.taxid2name
 91 |    :members:
 92 |    :undoc-members:
 93 |    :show-inheritance:
 94 | 
 95 | pysradb.utils module
 96 | --------------------
 97 | 
 98 | .. automodule:: pysradb.utils
 99 |    :members:
100 |    :undoc-members:
101 |    :show-inheritance:
102 | 
103 | Module contents
104 | ---------------
105 | 
106 | .. automodule:: pysradb
107 |    :members:
108 |    :undoc-members:
109 |    :show-inheritance:
110 | 


--------------------------------------------------------------------------------
/docs/python-api-usage.md:
--------------------------------------------------------------------------------
  1 | # Python API 
  2 | 
  3 | ## Use Case 1: Fetch the metadata table (SRA-runtable)
  4 | 
  5 | The simplest use case of [pysradb]{.title-ref} is when you know the SRA
  6 | project ID (SRP) and would simply want to fetch the metadata associated
  7 | with it. This is generally reflected in the
  8 | [SraRunTable.txt]{.title-ref} that you get from NCBI\'s website. See an
  9 | [example](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP098789) of a
 10 | SraRunTable.
 11 | 
 12 | ``` python
 13 | from pysradb import SRAweb
 14 | db = SRAweb()
 15 | df = db.sra_metadata('SRP098789')
 16 | df.head()
 17 | ```
 18 | 
 19 |     ===============  ====================  ======================================================================  =============  ========  =================  ==============  ================  ==============  ============  ==========  ========  ============  ===============
 20 |     study_accession  experiment_accession                             experiment_title                             run_accession  taxon_id  library_selection  library_layout  library_strategy  library_source  library_name    bases      spots    adapter_spec  avg_read_length
 21 |     ===============  ====================  ======================================================================  =============  ========  =================  ==============  ================  ==============  ============  ==========  ========  ============  ===============
 22 |     SRP098789        SRX2536403            GSM2475997: 1.5 µM PF-067446846, 10 min, rep 1; Homo sapiens; OTHER      SRR5227288         9606  other              SINGLE -        OTHER             TRANSCRIPTOMIC                2104142750  42082855                             50
 23 |     SRP098789        SRX2536404            GSM2475998: 1.5 µM PF-067446846, 10 min, rep 2; Homo sapiens; OTHER      SRR5227289         9606  other              SINGLE -        OTHER             TRANSCRIPTOMIC                2082873050  41657461                             50
 24 |     SRP098789        SRX2536405            GSM2475999: 1.5 µM PF-067446846, 10 min, rep 3; Homo sapiens; OTHER      SRR5227290         9606  other              SINGLE -        OTHER             TRANSCRIPTOMIC                2023148650  40462973                             50
 25 |     SRP098789        SRX2536406            GSM2476000: 0.3 µM PF-067446846, 10 min, rep 1; Homo sapiens; OTHER      SRR5227291         9606  other              SINGLE -        OTHER             TRANSCRIPTOMIC                2057165950  41143319                             50
 26 |     SRP098789        SRX2536407            GSM2476001: 0.3 µM PF-067446846, 10 min, rep 2; Homo sapiens; OTHER      SRR5227292         9606  other              SINGLE -        OTHER             TRANSCRIPTOMIC                3027621850  60552437                             50
 27 |     ===============  ====================  ======================================================================  =============  ========  =================  ==============  ================  ==============  ============  ==========  ========  ============  ===============
 28 | 
 29 | The metadata is returned as a [pandas]{.title-ref} dataframe and hence
 30 | allows you to perform all regular select/query operations available
 31 | through [pandas]{.title-ref}.
 32 | 
 33 | ## Use Case 2: Downloading an entire project arranged experiment wise
 34 | 
 35 | Once you have fetched the metadata and made sure this is the project
 36 | you were looking for, you would want to download everything at once.
 37 | NCBI follows this hierarchy: [SRP =\> SRX =\> SRR]{.title-ref}. Each
 38 | [SRP]{.title-ref} (project) has multiple [SRX]{.title-ref} (experiments)
 39 | and each [SRX]{.title-ref} in turn has multiple [SRR]{.title-ref} (runs)
 40 | inside it. We want to mimic this hierarchy in our downloads. The reason
 41 | to do that is simple: in most cases you care about [SRX]{.title-ref} the
 42 | most, and would want to \"merge\" your SRRs in one way or the other.
 43 | Having this hierarchy ensures your downstream code can handle such
 44 | cases easily, without worrying about which runs (SRR) need to be merged.
 45 | 
 46 | We strongly recommend installing [aspera-client]{.title-ref} which uses
 47 | UDP and is [designed to be faster](http://www.skullbox.net/tcpudp.php).
 48 | 
 49 | ``` python
 50 | from pysradb import SRAweb
 51 | db = SRAweb()
 52 | df = db.sra_metadata('SRP017942')
 53 | db.download(df)
 54 | ```
 55 | 
 56 | ## Use Case 3: Downloading a subset of experiments
 57 | 
 58 | Often, you need to process only a smaller set of samples from a project
 59 | (SRP). Consider this project which has data spanning four assays.
 60 | 
 61 | ``` python
 62 | df = db.sra_metadata('SRP000941')
 63 | print(df.library_strategy.unique())
 64 | ['ChIP-Seq' 'Bisulfite-Seq' 'RNA-Seq' 'WGS' 'OTHER']
 65 | ```
 66 | 
 67 | But, you might be only interested in analyzing the [RNA-seq]{.title-ref}
 68 | samples and would just want to download that subset. This is simple
 69 | using [pysradb]{.title-ref} since the metadata can be subset just as you
 70 | would subset a dataframe in pandas.
 71 | 
 72 | ``` python
 73 | df_rna = df[df.library_strategy == 'RNA-Seq']
 74 | db.download(df=df_rna, out_dir='/pysradb_downloads')
 75 | ```
 76 | 
 77 | ## Use Case 4: Getting cell-type/treatment information from sample_attributes
 78 | 
 79 | Cell type/tissue information is usually hidden in the
 80 | [sample_attributes]{.title-ref} column, which can be expanded:
 81 | 
 82 | ``` python
 83 | from pysradb.filter_attrs import expand_sample_attribute_columns
 84 | df = db.sra_metadata('SRP017942')
 85 | expand_sample_attribute_columns(df).head()
 86 | ```
 87 | 
 88 | <table>
 89 | <thead>
 90 | <tr class="header">
 91 | <th>study_accession</th>
 92 | <th>experiment_accession</th>
 93 | <th>experiment_title</th>
 94 | <th>experiment_attribute</th>
 95 | <th>sample_attribute</th>
 96 | <th>run_accession</th>
 97 | <th>taxon_id</th>
 98 | <th>library_selection</th>
 99 | <th>library_layout</th>
100 | <th>library_strategy</th>
101 | <th>library_source</th>
102 | <th>library_name</th>
103 | <th>bases</th>
104 | <th>spots</th>
105 | <th>adapter_spec</th>
106 | <th>avg_read_length</th>
107 | <th>assay_type</th>
108 | <th>cell_line</th>
109 | <th>source_name</th>
110 | <th>transfected_with</th>
111 | <th>treatment</th>
112 | </tr>
113 | </thead>
114 | <tbody>
115 | <tr class="odd">
116 | <td><p>SRP017942 SRP017942 SRP017942 SRP017942 SRP017942</p></td>
117 | <td><p>SRX217028 SRX217029 SRX217030 SRX217031 SRX217956</p></td>
118 | <td><p>GSM1063575: 293T_GFP; Homo sapiens; RNA-Seq GSM1063576:
119 | 293T_GFP_2hrs_severe_Heat_Shock; Homo sapiens; RNA-Seq GSM1063577:
120 | 293T_Hspa1a; Homo sapiens; RNA-Seq GSM1063578:
121 | 293T_Hspa1a_2hrs_severe_Heat_Shock; Homo sapiens; RNA-Seq GSM794854:
122 | 3T3-Control-Riboseq; Mus musculus; RNA-Seq</p></td>
123 | <td><p>GEO Accession: GSM1063575 GEO Accession: GSM1063576 GEO
124 | Accession: GSM1063577 GEO Accession: GSM1063578 GEO Accession:
125 | GSM794854</p></td>
126 | <td><p>source_name: 293T cells || cell line: 293T cells || transfected
127 | with: 3XFLAG-GFP || assay type: Riboseq source_name: 293T cells || cell
128 | line: 293T cells || transfected with: 3XFLAG-GFP || treatment: severe
129 | heat shock (44C 2 hours) || assay type: Riboseq source_name: 293T cells
130 | || cell line: 293T cells || transfected with: 3XFLAG-Hspa1a || assay
131 | type: Riboseq source_name: 293T cells || cell line: 293T cells ||
132 | transfected with: 3XFLAG-Hspa1a || treatment: severe heat shock (44C 2
133 | hours) || assay type: Riboseq source_name: 3T3 cells || treatment:
134 | control || cell line: 3T3 cells || assay type: Riboseq</p></td>
135 | <td><p>SRR648667 SRR648668 SRR648669 SRR648670 SRR649752</p></td>
136 | <td><blockquote>
137 | <p>9606 9606 9606 9606 10090</p>
138 | </blockquote></td>
139 | <td><p>other other other other cDNA</p></td>
140 | <td><p>SINGLE -SINGLE -SINGLE -SINGLE -SINGLE -</p></td>
141 | <td><p>RNA-Seq RNA-Seq RNA-Seq RNA-Seq RNA-Seq</p></td>
142 | <td><p>TRANSCRIPTOMIC TRANSCRIPTOMIC TRANSCRIPTOMIC TRANSCRIPTOMIC
143 | TRANSCRIPTOMIC</p></td>
144 | <td></td>
145 | <td><p>1806641316 3436984836 3330909216 3622123512 594945396</p></td>
146 | <td><blockquote>
147 | <p>50184481 95471801 92525256</p>
148 | </blockquote>
149 | <dl>
150 | <dt>100614542</dt>
151 | <dd>
152 | <p>16526261</p>
153 | </dd>
154 | </dl></td>
155 | <td></td>
156 | <td><blockquote>
157 | <p>36 36 36 36 36</p>
158 | </blockquote></td>
159 | <td><p>riboseq riboseq riboseq riboseq riboseq</p></td>
160 | <td><p>293t cells 293t cells 293t cells 293t cells 3t3 cells</p></td>
161 | <td><p>293t cells 293t cells 293t cells 293t cells 3t3 cells</p></td>
162 | <td><p>3xflag-gfp 3xflag-gfp 3xflag-hspa1a 3xflag-hspa1a NaN</p></td>
163 | <td><p>NaN severe heat shock (44c 2 hours) NaN severe heat shock (44c 2
164 | hours) control</p></td>
165 | </tr>
166 | </tbody>
167 | </table>
168 | 
169 | ## Use Case 5: Searching for datasets
170 | 
171 | Another common operation that we do on SRA is search, i.e. plain text search.
172 | 
173 | If you want to look up all projects where [ribosome
174 | profiling]{.title-ref} appears somewhere in the description:
175 | 
176 | ``` python
177 | df = db.search_sra(search_str='"ribosome profiling"')
178 | df.head()
179 | ```
180 | 
181 | <table>
182 | <thead>
183 | <tr class="header">
184 | <th>study_accession</th>
185 | <th>experiment_accession</th>
186 | <th>experiment_title</th>
187 | <th>run_accession</th>
188 | <th>taxon_id</th>
189 | <th>library_selection</th>
190 | <th>library_layout</th>
191 | <th>library_strategy</th>
192 | <th>library_source</th>
193 | <th>library_name</th>
194 | <th>bases</th>
195 | <th>spots</th>
196 | </tr>
197 | </thead>
198 | <tbody>
199 | <tr class="odd">
200 | <td>DRP003075</td>
201 | <td>DRX019536</td>
202 | <td>Illumina Genome Analyzer IIx sequencing of SAMD00018584</td>
203 | <td>DRR021383</td>
204 | <td><blockquote>
205 | <p>83333</p>
206 | </blockquote></td>
207 | <td>other</td>
208 | <td>SINGLE -</td>
209 | <td>OTHER</td>
210 | <td>TRANSCRIPTOMIC</td>
211 | <td>GAII05_3</td>
212 | <td><blockquote>
213 | <p>978776480</p>
214 | </blockquote></td>
215 | <td>12234706</td>
216 | </tr>
217 | <tr class="even">
218 | <td>DRP003075</td>
219 | <td>DRX019537</td>
220 | <td>Illumina Genome Analyzer IIx sequencing of SAMD00018585</td>
221 | <td>DRR021384</td>
222 | <td><blockquote>
223 | <p>83333</p>
224 | </blockquote></td>
225 | <td>other</td>
226 | <td>SINGLE -</td>
227 | <td>OTHER</td>
228 | <td>TRANSCRIPTOMIC</td>
229 | <td>GAII05_4</td>
230 | <td><blockquote>
231 | <p>894201680</p>
232 | </blockquote></td>
233 | <td>11177521</td>
234 | </tr>
235 | <tr class="odd">
236 | <td>DRP003075</td>
237 | <td>DRX019538</td>
238 | <td>Illumina Genome Analyzer IIx sequencing of SAMD00018586</td>
239 | <td>DRR021385</td>
240 | <td><blockquote>
241 | <p>83333</p>
242 | </blockquote></td>
243 | <td>other</td>
244 | <td>SINGLE -</td>
245 | <td>OTHER</td>
246 | <td>TRANSCRIPTOMIC</td>
247 | <td>GAII05_5</td>
248 | <td><blockquote>
249 | <p>931536720</p>
250 | </blockquote></td>
251 | <td>11644209</td>
252 | </tr>
253 | <tr class="even">
254 | <td>DRP003075</td>
255 | <td>DRX019540</td>
256 | <td>Illumina Genome Analyzer IIx sequencing of SAMD00018588</td>
257 | <td>DRR021387</td>
258 | <td><blockquote>
259 | <p>83333</p>
260 | </blockquote></td>
261 | <td>other</td>
262 | <td>SINGLE -</td>
263 | <td>OTHER</td>
264 | <td>TRANSCRIPTOMIC</td>
265 | <td>GAII07_4</td>
266 | <td>2759398700</td>
267 | <td>27593987</td>
268 | </tr>
269 | <tr class="odd">
270 | <td>DRP003075</td>
271 | <td>DRX019541</td>
272 | <td>Illumina Genome Analyzer IIx sequencing of SAMD00018589</td>
273 | <td>DRR021388</td>
274 | <td><blockquote>
275 | <p>83333</p>
276 | </blockquote></td>
277 | <td>other</td>
278 | <td>SINGLE -</td>
279 | <td>OTHER</td>
280 | <td>TRANSCRIPTOMIC</td>
281 | <td>GAII07_5</td>
282 | <td>2386196500</td>
283 | <td>23861965</td>
284 | </tr>
285 | </tbody>
286 | </table>
287 | 
288 | Again, the results are available as a [pandas]{.title-ref} dataframe and
289 | hence you can perform all subset operations post your query. Your query
290 | doesn\'t need to be exact.
291 | 


--------------------------------------------------------------------------------
/notebooks/07.Multiple_SRPs.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "nbformat": 4,
  3 |  "nbformat_minor": 0,
  4 |  "metadata": {
  5 |   "colab": {
  6 |    "name": "07.Multiple_SRPs",
  7 |    "provenance": [],
  8 |    "mount_file_id": "1pNeuZJjjHliYFk582kGNRpGJ1Fa2h9cn",
  9 |    "authorship_tag": "ABX9TyMmcvA8kJPyf4bhs59mCISs",
 10 |    "include_colab_link": true
 11 |   },
 12 |   "kernelspec": {
 13 |    "name": "python3",
 14 |    "display_name": "Python 3"
 15 |   }
 16 |  },
 17 |  "cells": [
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "metadata": {
 21 |     "id": "view-in-github",
 22 |     "colab_type": "text"
 23 |    },
 24 |    "source": [
 25 |     "<a href=\"https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/07.Multiple_SRPs.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "metadata": {
 31 |     "id": "kPEeKLC38WGX",
 32 |     "colab_type": "code",
 33 |     "colab": {
 34 |      "base_uri": "https://localhost:8080/",
 35 |      "height": 401
 36 |     },
 37 |     "outputId": "45055cff-1f41-4b1d-f67f-0a4dc37a8333"
 38 |    },
 39 |    "source": [
 40 |     "pip install git+https://github.com/saketkc/pysradb.git"
 41 |    ],
 42 |    "execution_count": null,
 43 |    "outputs": [
 44 |     {
 45 |      "output_type": "stream",
 46 |      "text": [
 47 |       "Collecting git+https://github.com/saketkc/pysradb.git\n",
 48 |       "  Cloning https://github.com/saketkc/pysradb.git to /tmp/pip-req-build-hmpqrmad\n",
 49 |       "  Running command git clone -q https://github.com/saketkc/pysradb.git /tmp/pip-req-build-hmpqrmad\n",
 50 |       "Requirement already satisfied (use --upgrade to upgrade): pysradb==0.10.5.dev0 from git+https://github.com/saketkc/pysradb.git in /usr/local/lib/python3.6/dist-packages\n",
 51 |       "Requirement already satisfied: pandas==1.0.1 in /usr/local/lib/python3.6/dist-packages (from pysradb==0.10.5.dev0) (1.0.1)\n",
 52 |       "Requirement already satisfied: requests==2.23.0 in /usr/local/lib/python3.6/dist-packages (from pysradb==0.10.5.dev0) (2.23.0)\n",
 53 |       "Requirement already satisfied: tqdm==4.43.0 in /usr/local/lib/python3.6/dist-packages (from pysradb==0.10.5.dev0) (4.43.0)\n",
 54 |       "Requirement already satisfied: xmltodict==0.12.0 in /usr/local/lib/python3.6/dist-packages (from pysradb==0.10.5.dev0) (0.12.0)\n",
 55 |       "Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas==1.0.1->pysradb==0.10.5.dev0) (2.8.1)\n",
 56 |       "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas==1.0.1->pysradb==0.10.5.dev0) (1.18.5)\n",
 57 |       "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas==1.0.1->pysradb==0.10.5.dev0) (2018.9)\n",
 58 |       "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests==2.23.0->pysradb==0.10.5.dev0) (2020.6.20)\n",
 59 |       "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests==2.23.0->pysradb==0.10.5.dev0) (2.9)\n",
 60 |       "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests==2.23.0->pysradb==0.10.5.dev0) (1.24.3)\n",
 61 |       "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests==2.23.0->pysradb==0.10.5.dev0) (3.0.4)\n",
 62 |       "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas==1.0.1->pysradb==0.10.5.dev0) (1.12.0)\n",
 63 |       "Building wheels for collected packages: pysradb\n",
 64 |       "  Building wheel for pysradb (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
 65 |       "  Created wheel for pysradb: filename=pysradb-0.10.5.dev0-cp36-none-any.whl size=148723 sha256=394afd5781d90d04f37bd5eb7c26ff245792e4f7a4140f6bd1796fe0b5b724be\n",
 66 |       "  Stored in directory: /tmp/pip-ephem-wheel-cache-y1mkqxnq/wheels/d5/24/42/81dccabc3a4aac9757e23b7175ad7270090a4b3c203cd4fc8f\n",
 67 |       "Successfully built pysradb\n"
 68 |      ],
 69 |      "name": "stdout"
 70 |     }
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "metadata": {
 76 |     "id": "n-lUysUE8edh",
 77 |     "colab_type": "code",
 78 |     "colab": {}
 79 |    },
 80 |    "source": [
 81 |     "SRP_list = \"\"\"ERP009675\n",
 82 |     "ERP007116\n",
 83 |     "ERP007115\n",
 84 |     "ERP004563\n",
 85 |     "ERP005660\n",
 86 |     "ERP001266\n",
 87 |     "ERP002072\n",
 88 |     "ERP001882\n",
 89 |     "ERP004883\n",
 90 |     "ERP004508\n",
 91 |     "ERP004393\n",
 92 |     "ERP005409\n",
 93 |     "ERP001464\n",
 94 |     "ERP004042\n",
 95 |     "ERP004375\n",
 96 |     "ERP003293\n",
 97 |     "ERP004689\n",
 98 |     "ERP001094\n",
 99 |     "ERP003728\n",
100 |     "ERP000730\n",
101 |     "ERP000411\n",
102 |     "ERP000319\n",
103 |     "SRP041183\n",
104 |     "SRP011912\n",
105 |     "SRP058392\n",
106 |     "SRP044705\n",
107 |     "SRP036841\n",
108 |     "SRP050120\n",
109 |     "ERP004116\n",
110 |     "SRP030662\n",
111 |     "SRP047217\n",
112 |     "SRP045505\n",
113 |     "ERP001556\n",
114 |     "ERP000546\n",
115 |     "SRP045252\n",
116 |     "SRP044714\n",
117 |     "DRP000524\n",
118 |     "SRP044907\n",
119 |     "SRP019970\n",
120 |     "SRP044131\n",
121 |     "SRP044042\n",
122 |     "SRP043602\n",
123 |     "SRP043523\n",
124 |     "SRP014570\n",
125 |     "SRP043067\n",
126 |     "SRP042370\n",
127 |     "SRP042360\n",
128 |     "SRP042159\n",
129 |     "SRP042085\n",
130 |     "SRP042053\n",
131 |     "SRP041992\n",
132 |     "SRP041738\n",
133 |     "SRP041679\n",
134 |     "SRP041669\n",
135 |     "SRP041622\n",
136 |     "SRP041182\n",
137 |     "SRP041377\n",
138 |     "SRP021009\n",
139 |     "SRP041216\n",
140 |     "SRP041129\n",
141 |     "SRP041119\n",
142 |     "SRP041044\n",
143 |     "SRP040761\n",
144 |     "SRP040479\n",
145 |     "SRP040072\n",
146 |     "SRP040070\n",
147 |     "SRP040121\n",
148 |     "SRP040044\n",
149 |     "SRP039841\n",
150 |     "SRP039779\n",
151 |     "SRP039717\n",
152 |     "SRP039699\n",
153 |     "SRP039672\n",
154 |     "SRP039661\n",
155 |     "SRP039646\n",
156 |     "SRP039634\n",
157 |     "SRP035451\n",
158 |     "SRP039551\n",
159 |     "SRP039478\n",
160 |     "SRP039440\n",
161 |     "SRP039448\n",
162 |     "SRP030474\n",
163 |     "SRP034507\n",
164 |     "SRP038004\n",
165 |     "SRP037780\n",
166 |     "SRP037583\n",
167 |     "SRP036068\n",
168 |     "SRP036637\n",
169 |     "SRP036632\n",
170 |     "SRP035368\n",
171 |     "SRP035278\n",
172 |     "SRP034930\n",
173 |     "SRP034844\n",
174 |     "ERP004159\n",
175 |     "SRP017087\n",
176 |     "SRP034444\n",
177 |     "SRP033229\n",
178 |     "SRP033198\n",
179 |     "SRP033021\n",
180 |     "ERP000964\n",
181 |     "ERP002429\n",
182 |     "SRP032928\n",
183 |     "SRP032833\n",
184 |     "SRP032792\n",
185 |     "SRP032766\n",
186 |     "ERP003855\n",
187 |     "ERP000904\n",
188 |     "SRP028229\n",
189 |     "SRP026361\n",
190 |     "SRP023111\n",
191 |     "SRP021139\n",
192 |     "SRP013319\n",
193 |     "SRP020006\n",
194 |     "SRP019994\n",
195 |     "SRP019500\n",
196 |     "SRP019241\n",
197 |     "SRP018672\n",
198 |     "SRP018358\n",
199 |     "SRP016875\n",
200 |     "SRP016501\n",
201 |     "SRP015460\n",
202 |     "SRP015370\n",
203 |     "SRP015135\n",
204 |     "SRP014437\n",
205 |     "SRP012378\n",
206 |     "SRP012018\n",
207 |     "SRP010103\n",
208 |     "SRP007831\n",
209 |     "SRP007412\n",
210 |     "SRP007400\n",
211 |     "SRP002090\n",
212 |     "SRP029153\n",
213 |     "SRP029445\n",
214 |     "SRP029427\n",
215 |     "SRP029380\n",
216 |     "SRP029330\n",
217 |     "SRP029333\n",
218 |     "ERP000606\n",
219 |     "ERP000415\n",
220 |     "ERP001977\n",
221 |     "ERP000373\n",
222 |     "SRP029172\n",
223 |     "SRP028766\n",
224 |     "ERP000668\n",
225 |     "ERP003627\n",
226 |     "SRP017364\n",
227 |     "SRP025757\n",
228 |     "SRP021189\n",
229 |     "SRP011154\n",
230 |     "SRP014574\n",
231 |     "SRP017935\n",
232 |     "SRP016889\n",
233 |     "SRP002016\n",
234 |     "SRP018826\n",
235 |     "SRP018753\n",
236 |     "SRP009821\n",
237 |     "SRP012925\n",
238 |     "SRP012850\n",
239 |     "SRP009870\n",
240 |     "SRP007799\n",
241 |     "SRP006748\n",
242 |     "SRP000373\"\"\"\n",
243 |     "SRP_list = SRP_list.split(\"\\n\")"
244 |    ],
245 |    "execution_count": null,
246 |    "outputs": []
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "metadata": {
251 |     "id": "1CWmd5rF8yny",
252 |     "colab_type": "code",
253 |     "colab": {
254 |      "base_uri": "https://localhost:8080/",
255 |      "height": 72
256 |     },
257 |     "outputId": "519c91be-e3b9-48a3-fd47-fbe415bd0b2a"
258 |    },
259 |    "source": [
260 |     "import sys\n",
261 |     "import time\n",
262 |     "\n",
263 |     "from pysradb import SRAweb\n",
264 |     "\n",
265 |     "db = SRAweb()\n",
266 |     "\n",
267 |     "for srp in SRP_list:\n",
268 |     "    try:\n",
269 |     "        df = db.sra_metadata(srp)\n",
270 |     "        df.to_csv(\"{}.tsv\".format(srp), sep=\"\\t\", index=False)\n",
271 |     "    except:\n",
272 |     "        sys.stderr.write(\"Error with {}\\n\".format(srp))\n",
273 |     "        time.sleep(0.5)\n",
274 |     "    time.sleep(0.5)"
275 |    ],
276 |    "execution_count": null,
277 |    "outputs": [
278 |     {
279 |      "output_type": "stream",
280 |      "text": [
281 |       "/usr/local/lib/python3.6/dist-packages/pysradb/utils.py:13: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
282 |       "  from tqdm.autonotebook import tqdm\n"
283 |      ],
284 |      "name": "stderr"
285 |     }
286 |    ]
287 |   }
288 |  ]
289 | }


--------------------------------------------------------------------------------
/notebooks/README.md:
--------------------------------------------------------------------------------
 1 | # Notebooks demonstrating functionalities of pysradb
 2 | 
 3 | 1. [Python API](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/01.Python-API_demo.ipynb)
 4 | 2. [Downloading datasets from SRA - command line](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/02.Commandline_download.ipynb)
 5 | 3. [Download multiple datasets in parallel - Python API](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/03.ParallelDownload.ipynb)
 6 | 4. [Converting SRA-to-fastq - command line (requires conda)](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/04.SRA_to_fastq_conda.ipynb)
 7 | 5. [Downloading subsets of a project - Python API](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/05.Downloading_subsets_of_a_project.ipynb)
 8 | 6. [Download BAMs](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/06.Download_BAMs.ipynb)
 9 | 7. [Metadata for multiple SRPs](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/07.Multiple_SRPs.ipynb)
10 | 8. [Multithreaded fastq downloads using Aspera Client](https://colab.research.google.com/github/saketkc/pysradb/blob/master/notebooks/08.pysradb_ascp_multithreaded.ipynb)
11 | 
12 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "pysradb"
 7 | dynamic = ["version"]
 8 | description = "A Python package for interacting with SRAdb and downloading datasets from SRA/ENA/GEO"
 9 | readme = "README.md"
10 | license = {file = "LICENSE"}
11 | requires-python = ">=3.9"
12 | authors = [
13 |     { name = "Saket Choudhary", email = "saketkc@gmail.com" },
14 | ]
15 | keywords = [
16 |     "pysradb",
17 | ]
18 | classifiers = [
19 |     "Development Status :: 5 - Production/Stable",
20 |     "Intended Audience :: Healthcare Industry",
21 |     "Intended Audience :: Science/Research",
22 |     "License :: OSI Approved :: BSD License",
23 |     "Natural Language :: English",
24 |     "Programming Language :: Python :: 3",
25 |     "Programming Language :: Python :: 3.9",
26 |     "Programming Language :: Python :: 3.10",
27 |     "Programming Language :: Python :: 3.11",
28 |     "Topic :: Scientific/Engineering :: Bio-Informatics",
29 | ]
30 | dependencies = [
31 |     "lxml>=4.6.3",
32 |     "pandas>=1.3.2",
33 |     "requests-ftp>=0.3.1",
34 |     "requests>=2.26.0",
35 |     "tqdm>=4.62.1",
36 |     "xmltodict>=0.12.0",
37 | ]
38 | 
39 | [project.scripts]
40 | pysradb = "pysradb.cli:parse_args"
41 | 
42 | [project.urls]
43 | Homepage = "https://saket-choudhary.me/pysradb"
44 | 
45 | [tool.distutils.bdist_wheel]
46 | universal = true
47 | 
48 | [tool.hatch.version]
49 | path = "pysradb/__init__.py"
50 | 
51 | [tool.hatch.build.targets.sdist]
52 | include = [
53 |     "/pysradb",
54 | ]
55 | 


--------------------------------------------------------------------------------
/pysradb/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """Top-level package for pysradb."""
 3 | 
 4 | __author__ = """Saket Choudhary"""
 5 | __email__ = "saketkc@gmail.com"
 6 | __version__ = "2.2.2"
 7 | 
 8 | from .filter_attrs import expand_sample_attribute_columns
 9 | from .geodb import GEOdb
10 | from .geodb import download_geodb_file
11 | from .sradb import SRAdb
12 | from .sradb import download_sradb_file
13 | from .sraweb import SRAweb
14 | 


--------------------------------------------------------------------------------
/pysradb/basedb.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sqlite3
  3 | import sys
  4 | import warnings
  5 | 
  6 | import pandas as pd
  7 | 
  8 | from .utils import _extract_first_field
  9 | 
 10 | warnings.simplefilter(action="ignore", category=FutureWarning)
 11 | 
 12 | 
 13 | class BASEdb(object):
 14 |     def __init__(self, sqlite_file):
 15 |         """Initialize SRAdb.
 16 | 
 17 |         Parameters
 18 |         ----------
 19 | 
 20 |         sqlite_file: string
 21 |                      Path to unzipped SRAmetadb.sqlite file
 22 | 
 23 | 
 24 |         """
 25 |         self.sqlite_file = sqlite_file
 26 |         self.open()
 27 |         self.cursor = self.db.cursor()
 28 | 
 29 |     def open(self):
 30 |         """Open sqlite connection."""
 31 |         # Originally sqlite3.connect(self.sqlite_file)
 32 |         self.db = sqlite3.connect("file:{}?mode=ro".format(self.sqlite_file), uri=True)
 33 |         self.db.text_factory = str
 34 | 
 35 |     def close(self):
 36 |         """Close sqlite connection."""
 37 |         self.db.close()
 38 | 
 39 |     def list_tables(self):
 40 |         """List all tables in the sqlite file.
 41 | 
 42 |         Returns
 43 |         -------
 44 |         table_list: list
 45 |                     List of all table names
 46 |         """
 47 |         results = self.cursor.execute(
 48 |             'SELECT name FROM sqlite_master WHERE type="table";'
 49 |         ).fetchall()
 50 |         return _extract_first_field(results)
 51 | 
 52 |     def list_fields(self, table):
 53 |         """List all fields in a given table.
 54 | 
 55 |         Parameters
 56 |         ----------
 57 |         table: string
 58 |                Table name.
 59 |                See `list_tables` for getting all table names
 60 | 
 61 |         Returns
 62 |         -------
 63 |         field_list: list
 64 |                     A list of field names for the table
 65 |         """
 66 |         results = self.cursor.execute("SELECT * FROM {}".format(table))
 67 |         return _extract_first_field(results.description)
 68 | 
 69 |     def desc_table(self, table):
 70 |         """Describe all fields in a table.
 71 | 
 72 |         Parameters
 73 |         ----------
 74 |         table: string
 75 |                Table name.
 76 |                See `list_tables` for getting all table names
 77 | 
 78 |         Returns
 79 |         -------
 80 |         table_desc: DataFrame
 81 |                     A DataFrame with field name and its
 82 |                     schema description
 83 |         """
 84 |         results = self.cursor.execute(
 85 |             'PRAGMA table_info("{}")'.format(table)
 86 |         ).fetchall()
 87 |         columns = ["cid", "name", "dtype", "notnull", "dflt_value", "pk"]
 88 |         data = []
 89 |         for result in results:
 90 |             data.append(list([str(x) for x in result]))
 91 |         table_desc = pd.DataFrame(data, columns=columns)
 92 |         return table_desc
 93 | 
 94 |     def query(self, sql_query):
 95 |         """Run SQL query.
 96 | 
 97 |         Parameters
 98 |         ----------
 99 |         sql_query: string
100 |                    SQL query string
101 | 
102 |         Returns
103 |         -------
104 |         results: DataFrame
105 |                  Query results formatted as dataframe
106 | 
107 |         """
108 |         results = self.cursor.execute(sql_query).fetchall()
109 |         column_names = list([x[0] for x in self.cursor.description])
110 |         results = [dict(list(zip(column_names, result))) for result in results]
111 |         df = pd.DataFrame(results)
112 |         if not results:
113 |             # sys.stderr.write("Found no matching results for query: {}".format(sql_query))
114 |             sys.stderr.write("Found no matching results for query.\n")
115 |         return df
116 | 
117 |     def get_row_count(self, table):
118 |         """Get row counts for a table.
119 | 
120 |         Parameters
121 |         ----------
122 |         table: string
123 |                Table name.
124 |                See `list_tables` for getting all table names
125 | 
126 |         Returns
127 |         -------
128 |         row_count: int
129 |                    Number of rows in table
130 |         """
131 |         return self.cursor.execute(
132 |             "SELECT max(rowid) FROM {}".format(table)
133 |         ).fetchone()[0]
134 | 
135 |     def all_row_counts(self):
136 |         """Get row counts of all tables in the db file.
137 | 
138 |         Returns
139 |         -------
140 |         row_counts: DataFrame
141 |                     A dataframe with table names and corresponding
142 |                     row count.
143 | 
144 |         """
145 |         tables = self.list_tables()
146 |         results = dict([(table, self.get_row_count(table)) for table in tables])
147 |         return pd.DataFrame.from_dict(results, orient="index", columns=["count"])
148 | 


--------------------------------------------------------------------------------
/pysradb/download.py:
--------------------------------------------------------------------------------
  1 | """Utility function to download data"""
  2 | 
  3 | import hashlib
  4 | import math
  5 | import os
  6 | import shutil
  7 | import sys
  8 | import warnings
  9 | 
 10 | import numpy as np
 11 | import requests
 12 | import requests_ftp
 13 | from tqdm.autonotebook import tqdm
 14 | 
 15 | from .utils import requests_3_retries
 16 | 
# Import-time side effect: let requests_ftp patch requests' session
# (presumably what makes the ftp:// URLs used in this module work — confirm
# against the requests_ftp documentation).
requests_ftp.monkeypatch_session()
# Import-time side effect: FutureWarning is ignored from here on.
warnings.simplefilter(action="ignore", category=FutureWarning)
import pandas as pd

# Import-time side effect: tqdm's pandas integration hook.
tqdm.pandas()
 22 | 
 23 | 
 24 | def millify(n):
 25 |     """Convert integer to human readable format.
 26 | 
 27 |     Parameters
 28 |     ----------
 29 |     n : int
 30 | 
 31 |     Returns
 32 |     -------
 33 |     millidx : str
 34 |               Formatted integer
 35 |     """
 36 |     millnames = ["", " KB", " MB", " GB", " TB"]
 37 |     # Source: http://stackoverflow.com/a/3155023/756986
 38 |     n = float(n)
 39 |     millidx = max(
 40 |         0,
 41 |         min(
 42 |             len(millnames) - 1, int(math.floor(0 if n == 0 else math.log10(abs(n)) / 3))
 43 |         ),
 44 |     )
 45 | 
 46 |     return "{:.1f}{}".format(n / 10 ** (3 * millidx), millnames[millidx])
 47 | 
 48 | 
 49 | def get_file_size(row, url_col):
 50 |     """Get size of file to be downloaded.
 51 | 
 52 |     Parameters
 53 |     ----------
 54 |     row: pd.DataFrame row
 55 | 
 56 |     url_col: str
 57 |         url_column
 58 | 
 59 |     Returns
 60 |     -------
 61 |     content_length: int
 62 |     """
 63 |     if row[url_col] is not None:
 64 |         url = row[url_col]
 65 |     else:
 66 |         url = row.download_url
 67 |     if url is pd.NA:
 68 |         return 0
 69 |     if not isinstance(url, str):
 70 |         return 0
 71 |     if url.startswith("ftp."):
 72 |         url = "ftp://" + url
 73 |     try:
 74 |         r = requests_3_retries().head(url)
 75 |         size = int(r.headers["content-length"])
 76 |         r.raise_for_status()
 77 |     except requests.exceptions.Timeout:
 78 |         sys.exit(f"Connection to {url} has timed out. Please retry.")
 79 |     except requests.exceptions.HTTPError:
 80 |         print(
 81 |             f"The download URL:  {url}  is likely invalid.\n"
 82 |             f"Removing {row.run_accession} from the download list\n",
 83 |             flush=True,
 84 |         )
 85 |         return np.NaN
 86 |     except KeyError:
 87 |         print("Key error for: " + url, flush=True)
 88 |         return 0
 89 |     return size
 90 | 
 91 | 
 92 | def md5_validate_file(file_path, md5_hash):
 93 |     """Check file containt against an MD5.
 94 | 
 95 |     Parameters
 96 |     ----------
 97 |     file_path: string
 98 |                Path to file
 99 |     md5_hash: string
100 |              Expected md5 hash
101 | 
102 |     Returns
103 |     -------
104 |     valid: bool
105 |            True if expected and observed md5 match
106 |     """
107 |     observed_md5 = hashlib.md5()
108 |     with open(file_path, "rb") as f:
109 |         while True:
110 |             # read 1MB
111 |             chunk = f.read(1000 * 1000)
112 |             if not chunk:
113 |                 break
114 |             observed_md5.update(chunk)
115 |     return observed_md5.hexdigest() == md5_hash
116 | 
117 | 
def download_file(
    url,
    file_path,
    md5_hash=None,
    timeout=10,
    block_size=1024 * 1024,
    show_progress=False,
):
    """Resumable download.
    Expect the server to support byte ranges.

    Data is written to ``file_path + ".part"`` and moved to ``file_path``
    only once its size matches the server-reported Content-Length (and
    the MD5, when given). A leftover ``.part`` file is continued from
    its current size.

    Parameters
    ----------
    url: string
         URL
    file_path: string
               Local file path to store the downloaded file
    md5_hash: string
              Expected MD5 string of downloaded file
    timeout: int
             Seconds to wait before terminating request
    block_size: int
                Chunk of bytes to read (default: 1024 * 1024 = 1MB)
    show_progress: bool
                   Show progress bar

    Raises
    ------
    Exception
        If the Content-Length could not be obtained from the server, or
        the completed file does not match ``md5_hash``.
    """
    # A non-empty target means the download already completed earlier.
    if os.path.exists(file_path) and os.path.getsize(file_path):
        return
    owns_session = url.startswith("ftp.")
    if owns_session:
        url = "ftp://" + url
        session = requests.Session()
    else:
        session = requests
    tmp_file_path = file_path + ".part"
    first_byte = os.path.getsize(tmp_file_path) if os.path.exists(tmp_file_path) else 0
    file_mode = "ab" if first_byte else "wb"
    file_size = -1
    pbar = None
    try:
        file_size = int(
            session.head(url, timeout=timeout).headers["Content-length"]
        )
        # NOTE(review): resuming assumes the server honours this Range
        # header; the response status is not checked — confirm for FTP.
        headers = {"Range": "bytes=%s-" % first_byte}
        r = session.get(url, headers=headers, stream=True, timeout=timeout)
        if show_progress:
            desc = "Downloading {}".format(url.split("/")[-1])
            pbar = tqdm(
                total=file_size,
                initial=first_byte,
                unit="B",
                unit_scale=True,
                desc=desc,
            )
        with open(tmp_file_path, file_mode) as f:
            for chunk in r.iter_content(chunk_size=block_size):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    if pbar is not None:
                        # The last chunk is usually shorter than block_size.
                        pbar.update(len(chunk))
    except IOError as e:
        # Best effort: the partial file is kept so a later call can resume.
        sys.stderr.write("IO Error - {}\n".format(e))
    finally:
        if pbar is not None:
            pbar.close()
        if owns_session:
            session.close()
        if file_size == -1:
            raise Exception("Error getting Content-Length from server: %s" % url)
        # The .part file may not exist if the request failed before any
        # data was written, so check before asking for its size.
        if (
            os.path.exists(tmp_file_path)
            and os.path.getsize(tmp_file_path) == file_size
        ):
            # if there's a hash value, validate the file
            if md5_hash and not md5_validate_file(tmp_file_path, md5_hash):
                raise Exception("Error validating the file against its MD5 hash")
            # Move the temp file to desired location
            shutil.move(tmp_file_path, file_path)
187 | 


--------------------------------------------------------------------------------
/pysradb/exceptions.py:
--------------------------------------------------------------------------------
 1 | """This file contains custom Exceptions for pysradb
 2 | """
 3 | 
 4 | 
 5 | class MissingQueryException(Exception):
 6 |     """Exception raised when the user did not supply any query fields.
 7 | 
 8 |     Attributes:
 9 |         message: string
10 |             Error message for this Exception
11 | 
12 |     """
13 | 
14 |     def __init__(self):
15 |         self.message = (
16 |             "No valid query has been supplied. \n"
17 |             "A query must be supplied to one of the following fields:\n"
18 |             "[--query, --accession, --organism, --layout, --mbases, --publication-date,"
19 |             " --platform, --selection, --source, --strategy, --title]"
20 |         )
21 |         super().__init__(self.message)
22 | 
23 | 
class IncorrectFieldException(Exception):
    """Raised when a flag is given an incorrect input by the user."""
28 | 


--------------------------------------------------------------------------------
/pysradb/filter_attrs.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import warnings
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | 
  8 | def _get_sample_attr_keys(sample_attribute):
  9 |     if sample_attribute is None:
 10 |         return None, None
 11 |     sample_attribute_splitted = sample_attribute.split("||")
 12 |     split_by_colon = [
 13 |         str(attr).strip().split(": ") for attr in sample_attribute_splitted
 14 |     ]
 15 | 
 16 |     # Iterate once more to consider first one as the key
 17 |     # and remaining as the value
 18 |     # This is because of bad annotations like in this example
 19 |     # Example: isolate: not applicable || organism: Mus musculus || cell_line: 17-Cl1 ||\
 20 |     # infect: MHV-A59 || time point: 5: hour || compound: cycloheximide ||\
 21 |     # sequencing protocol: RiboSeq || biological repeat: long read sequencing
 22 |     # Notice the `time: 5: hour`
 23 |     # sample_attribute: investigation type: metagenome || project name: Landsort Depth 20090415 transect ||
 24 |     # sequencing method: 454 || collection date: 2009-04-15 || ammonium: 8.7: Ã‚ÂµM || chlorophyll: 0: Ã‚Âµg/L ||
 25 |     # dissolved oxygen: -1.33: Ã‚Âµmol/kg || nitrate: 0.02: Ã‚ÂµM || nitrogen: 0: Ã‚ÂµM ||
 26 |     # environmental package: water || geographic location (latitude): 58.6: DD ||
 27 |     # geographic location (longitude): 18.2: DD || geographic location (country and/or sea,region): Baltic Sea ||
 28 |     # environment (biome): 00002150 || environment (feature): 00002150 || environment (material): 00002150 ||
 29 |     # depth: 400: m || Phosphate:  || Total phosphorous:  || Silicon:
 30 |     # Handle empty cases as above
 31 |     split_by_colon = [attr for attr in split_by_colon if len(attr) >= 2]
 32 | 
 33 |     for index, element in enumerate(split_by_colon):
 34 |         if len(element) > 2:
 35 |             key = element[0].strip()
 36 |             value = ":".join(element[1:]).strip()
 37 |             split_by_colon[index] = [key, value]
 38 | 
 39 |     try:
 40 |         sample_attribute_dict = dict(split_by_colon)
 41 |     except ValueError:
 42 |         print("This is most likely a bug, please report it upstream.")
 43 |         print(("sample_attribute: {}".format(sample_attribute)))
 44 |         raise
 45 |     sample_attribute_keys = list(
 46 |         map(
 47 |             lambda x: re.sub(r"\s+", " ", x.strip().replace(" ", "_").lower()),
 48 |             list(sample_attribute_dict.keys()),
 49 |         )
 50 |     )
 51 |     sample_attribute_values = list(
 52 |         map(
 53 |             lambda x: re.sub(r"\s+", " ", x.strip().lower().strip().replace(",", "__")),
 54 |             list(sample_attribute_dict.values()),
 55 |         )
 56 |     )
 57 |     return sample_attribute_keys, sample_attribute_values
 58 | 
 59 | 
 60 | def expand_sample_attribute_columns(metadata_df):
 61 |     """Expand sample attribute columns to individual columns.
 62 | 
 63 |     Since the sample_attribute column content can be different
 64 |     for differnt rows even if coming from the same project (SRP),
 65 |     we explicitly iterate through the rows to first determine
 66 |     what additional columns need to be created.
 67 | 
 68 | 
 69 |     Parameters
 70 |     ----------
 71 |     metadata_df: DataFrame
 72 |                  Dataframe as obtained from sra_metadata
 73 |                  or equivalent
 74 | 
 75 |     Returns
 76 |     -------
 77 |     expanded_df: DataFrame
 78 |                  Dataframe with additionals columns pertaining
 79 |                  to sample_attribute appended
 80 |     """
 81 |     additional_columns = []
 82 |     metadata_df = metadata_df.copy()
 83 |     for idx, row in metadata_df.iterrows():
 84 |         sample_attribute = row["sample_attribute"]
 85 |         if not sample_attribute:
 86 |             continue
 87 |         sample_attribute = sample_attribute.strip()
 88 |         sample_attribute_keys, _ = _get_sample_attr_keys(sample_attribute)
 89 |         if sample_attribute_keys:
 90 |             additional_columns += sample_attribute_keys
 91 |     additional_columns = list(sorted(set(additional_columns)))
 92 |     # if any of the additional column already exists
 93 |     # call the additional column  as *_expanded
 94 |     additional_columns = list(
 95 |         map(
 96 |             lambda x: x if x not in metadata_df.columns.tolist() else x + "_expanded",
 97 |             additional_columns,
 98 |         )
 99 |     )
100 |     additional_columns = list(sorted(additional_columns))
101 |     empty_df = pd.DataFrame(columns=additional_columns)
102 |     metadata_df_expanded = pd.concat([metadata_df, empty_df], axis=1)
103 |     for idx, row in metadata_df_expanded.iterrows():
104 |         sample_attribute = row["sample_attribute"]
105 |         sample_attribute_keys, sample_attribute_values = _get_sample_attr_keys(
106 |             sample_attribute
107 |         )
108 |         if sample_attribute_keys:
109 |             sample_attribute_keys = list(
110 |                 map(
111 |                     lambda x: (
112 |                         x if x not in metadata_df.columns.tolist() else x + "_expanded"
113 |                     ),
114 |                     sample_attribute_keys,
115 |                 )
116 |             )
117 |         metadata_df_expanded.loc[idx, sample_attribute_keys] = sample_attribute_values
118 |     if np.nan in metadata_df_expanded.columns.tolist():
119 |         del metadata_df_expanded[np.nan]
120 |     return metadata_df_expanded
121 | 
122 | 
def guess_cell_type(sample_attribute):
    """Guess possible cell line from sample_attribute data.

    The first word following one of the known labels is returned in
    lower case. When several labels are present, ``cell_type`` wins,
    then ``source_name``, ``cell-line``, ``cell_line`` and ``cell line``.

    Parameters
    ----------
    sample_attribute: string
                      sample_attribute string as in the metadata column

    Returns
    -------
    cell_type: string
               Possible cell type of sample.
               Returns None (and emits a UserWarning) if no match found.
    """
    sample_attribute = str(sample_attribute)
    labels = ("cell_type", "source_name", "cell-line", "cell_line", "cell line")
    for label in labels:
        # Capture the value with a group instead of str.lstrip(): lstrip
        # removes a *set of characters*, which also ate leading letters of
        # the value (e.g. "cell line: liver" used to yield "ver").
        match = re.search(re.escape(label) + r": (\w+)", sample_attribute)
        if match is not None:
            return match.group(1).lower()
    warnings.warn(
        "Couldn't parse {} for cell line".format(sample_attribute), UserWarning
    )
    return None
161 | 
162 | 
def guess_tissue_type(sample_attribute):
    """Guess tissue type from sample_attribute data.

    Looks for ``tissue: `` followed by a single word in the attribute
    string.

    Parameters
    ----------
    sample_attribute: string
                      sample_attribute string as in the metadata column

    Returns
    -------
    tissue_type: string
               Possible tissue type of sample (lower-cased).
               Returns None if no match found, in which case a
               UserWarning is emitted.
    """
    sample_attribute = str(sample_attribute)
    # The key may be present without a word after it (e.g. "tissue: -");
    # in that case there is no match and we fall through to the warning
    # instead of failing on ``None.group``.
    match = re.search(r"tissue: (\w+)", sample_attribute)
    if match is None:
        warnings.warn(
            "Couldn't parse {} for tissue".format(sample_attribute), UserWarning
        )
        return None
    return match.group(1).lower()
187 | 
188 | 
def guess_strain_type(sample_attribute):
    """Guess strain type from sample_attribute data.

    Looks for ``strain: `` followed by a single word in the attribute
    string.

    Parameters
    ----------
    sample_attribute: string
                      sample_attribute string as in the metadata column

    Returns
    -------
    strain_type: string
                 Possible strain of sample (lower-cased).
                 Returns None if no match found, in which case a
                 UserWarning is emitted.
    """
    sample_attribute = str(sample_attribute)
    # The key may be present without a word after it (e.g. "strain: -");
    # in that case there is no match and we fall through to the warning
    # instead of failing on ``None.group``.
    match = re.search(r"strain: (\w+)", sample_attribute)
    if match is None:
        warnings.warn(
            "Couldn't parse {} for strain".format(sample_attribute), UserWarning
        )
        return None
    return match.group(1).lower()
213 | 


--------------------------------------------------------------------------------
/pysradb/geodb.py:
--------------------------------------------------------------------------------
  1 | """Methods to interact with SRA"""
  2 | 
  3 | # This is now defunct and will be removed in a future release.
  4 | 
  5 | import gzip
  6 | import os
  7 | import re
  8 | import sys
  9 | 
 10 | from .basedb import BASEdb
 11 | from .utils import _get_url
 12 | from .utils import copyfileobj
 13 | from .utils import get_gzip_uncompressed_size
 14 | 
 15 | PY3 = True
 16 | if sys.version_info[0] < 3:
 17 |     PY3 = False
 18 | 
 19 | GEOmetadb_URL = "http://starbuck1.s3.amazonaws.com/sradb/GEOmetadb.sqlite.gz"
 20 | 
 21 | 
 22 | def download_geodb_file(download_dir=os.getcwd(), overwrite=True):
 23 |     """Download GEOmetadb.sqlite file.
 24 | 
 25 |     Parameters
 26 |     ----------
 27 |     download_dir: string
 28 |                   Directory to download SRAmetadb.sqlite
 29 |     overwrite: bool
 30 |                overwrite existing file(s).
 31 |                Set to True by default.
 32 | 
 33 |     """
 34 |     download_location = os.path.join(download_dir, "GEOmetadb.sqlite.gz")
 35 |     download_location_unzip = download_location.rstrip(".gz")
 36 | 
 37 |     if os.path.isfile(download_location) and overwrite is False:
 38 |         raise RuntimeError(
 39 |             "{} already exists! Set `overwrite=True` to redownload.".forma(
 40 |                 download_location
 41 |             )
 42 |         )
 43 |     if os.path.isfile(download_location_unzip) and overwrite is False:
 44 |         raise RuntimeError(
 45 |             "{} already exists! Set `overwrite=True` to redownload.".format(
 46 |                 download_location_unzip
 47 |             )
 48 |         )
 49 | 
 50 |     try:
 51 |         _get_url(GEOmetadb_URL, download_location)
 52 |     except Exception as e:
 53 |         raise RuntimeError(
 54 |             "Could not use {}.\nException: {}.\n".format(GEOmetadb_URL, e)
 55 |         )
 56 |     print("Extracting {} ...".format(download_location))
 57 |     filesize = get_gzip_uncompressed_size(download_location)
 58 |     with gzip.open(download_location, "rb") as fh_in:
 59 |         with open(download_location_unzip, "wb") as fh_out:
 60 |             copyfileobj(
 61 |                 fh_in,
 62 |                 fh_out,
 63 |                 filesize=filesize,
 64 |                 desc="Extracting {}".format("GEOmetadb.sqlite.gz"),
 65 |             )
 66 |     print("Done!")
 67 |     db = GEOdb(download_location_unzip)
 68 |     metadata = db.query("SELECT * FROM metaInfo")
 69 |     db.close()
 70 |     print("Metadata associated with {}:".format(download_location_unzip))
 71 |     print(metadata)
 72 | 
 73 | 
 74 | class GEOdb(BASEdb):
 75 |     def __init__(self, sqlite_file):
 76 |         """Initialize SRAdb.
 77 | 
 78 |         Parameters
 79 |         ----------
 80 | 
 81 |         sqlite_file: string
 82 |                      Path to unzipped SRAmetadb.sqlite file
 83 | 
 84 | 
 85 |         """
 86 |         super(GEOdb, self).__init__(sqlite_file)
 87 |         self._db_type = "GEO"
 88 |         self.valid_in_type = ["GSE", "GPL", "GSM", "GDS"]
 89 | 
 90 |     def gse_metadata(self, gse):
 91 |         """Get metadata for GSE ID.
 92 | 
 93 |         Parameters
 94 |         ----------
 95 |         gse: string
 96 |              GSE ID
 97 | 
 98 |         Returns
 99 |         -------
100 |         metadata_df: DataFrame
101 |                      A dataframe with relevant fields
102 |         """
103 |         return self.query("SELECT * from gse WHERE gse='{}';".format(gse))
104 | 
105 |     def gsm_metadata(self, gsm):
106 |         """Get metadata for GSM ID.
107 | 
108 |         Parameters
109 |         ----------
110 |         gsm: string
111 |              GSM ID
112 | 
113 |         Returns
114 |         -------
115 |         metadata_df: DataFrame
116 |                      A dataframe with relevant fields
117 |         """
118 |         return self.query("SELECT * from gsm WHERE gsm='{}';".format(gsm))
119 | 
120 |     def geo_convert(self, from_acc):
121 |         """Convert one GEO accession to other.
122 | 
123 |         Parameters
124 |         ----------
125 |         from_acc: string
126 |                   GPL/GSE/GSM accession ID
127 | 
128 |         Returns
129 |         -------
130 |         mapping_df: DataFrame
131 |                     A dataframe with relevant mappings
132 |         """
133 |         return self.query(
134 |             "SELECT * FROM geoConvert WHERE from_acc='{}';".format(from_acc)
135 |         )
136 | 
137 |     def gse_to_gsm(self, gse):
138 |         """Fetch GSMs for a GSE.
139 | 
140 |         Parameters
141 |         ----------
142 |         gse: string
143 |              GSE ID
144 | 
145 |         Returns
146 |         -------
147 |         mapping_df: DataFrame
148 |                     A dataframe with relevant mappings
149 |         """
150 |         return self.query("SELECT * FROM gse_gsm WHERE gse='{}'".format(gse))
151 | 
152 |     def gsm_to_gse(self, gsm):
153 |         """Fetch GSE for a GSM.
154 | 
155 |         Parameters
156 |         ----------
157 |         gsm: string
158 |              GSM ID
159 | 
160 |         Returns
161 |         -------
162 |         mapping_df: DataFrame
163 |                     A dataframe with relevant mappings
164 |         """
165 |         mapping_df = self.query("SELECT * FROM gse_gsm WHERE gsm='{}'".format(gsm))
166 |         return mapping_df.loc[:, ["gsm", "gse"]]
167 | 
168 |     def guess_srp_from_gse(self, gse):
169 |         """Convert GSE to SRP id.
170 | 
171 |         Parameters
172 |         ----------
173 |         gse: string
174 |              GSE ID
175 | 
176 |         Returns
177 |         -------
178 |         srp: string
179 |              SRP ID
180 |         """
181 |         results = self.query('SELECT * FROM gse WHERE gse = "' + gse + '"')
182 |         if results.shape[0] == 1:
183 |             supp_file = results["supplementary_file"][0]
184 |             if supp_file:
185 |                 splitted = supp_file.split(";")
186 |                 if len(splitted):
187 |                     match = re.findall("SRP.*", splitted[-1])
188 |                     if len(match):
189 |                         srp = match[0].split("/")[-1]
190 |                         return srp
191 |         return None
192 | 


--------------------------------------------------------------------------------
/pysradb/geoweb.py:
--------------------------------------------------------------------------------
  1 | """Utilities to interact with GEO online"""
  2 | 
  3 | import gzip
  4 | import os
  5 | import re
  6 | import requests
  7 | import sys
  8 | from lxml import html
  9 | 
 10 | from .download import download_file
 11 | from .geodb import GEOdb
 12 | from .utils import _get_url
 13 | from .utils import copyfileobj
 14 | from .utils import get_gzip_uncompressed_size
 15 | 
 16 | PY3 = True
 17 | if sys.version_info[0] < 3:
 18 |     PY3 = False
 19 | 
 20 | 
 21 | class GEOweb(GEOdb):
 22 |     def __init__(self):
 23 |         """Initialize GEOweb without any database."""
 24 | 
 25 |     def get_download_links(self, gse):
 26 |         """Obtain all links from the GEO FTP page.
 27 | 
 28 |         Parameters
 29 |         ----------
 30 |         gse: string
 31 |              GSE ID
 32 | 
 33 |         Returns
 34 |         -------
 35 |         links: list
 36 |                List of all valid downloadable links present for a GEO ID
 37 |         """
 38 |         prefix = gse[:-3]
 39 |         url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{prefix}nnn/{gse}/suppl/"
 40 |         link_objects = html.fromstring(requests.get(url).content).xpath("//a")
 41 |         links = [i.attrib["href"] for i in link_objects]
 42 |         # remove vulnerability link
 43 |         links = [
 44 |             link
 45 |             for link in links
 46 |             if link != "https://www.hhs.gov/vulnerability-disclosure-policy/index.html"
 47 |         ]
 48 |         # Check if returned results are a valid page - a link to the
 49 |         # home page only exists where the GSE ID dow not exist
 50 |         if "/" in links:
 51 |             raise KeyError(f"The provided GEO ID {gse} does not exist.")
 52 | 
 53 |         # The list of links for a valid GSE ID also contains a link to
 54 |         # the parent directory - we do not want that
 55 |         links = [i for i in links if "geo/series/" not in i]
 56 | 
 57 |         # The links are relative, we need absolute links to download
 58 |         links = [i for i in links]
 59 | 
 60 |         return links, url
 61 | 
 62 |     def download(self, links, root_url, gse, verbose=False, out_dir=None):
 63 |         """Download GEO files.
 64 | 
 65 |         Parameters
 66 |         ----------
 67 |         links: list
 68 |                List of all links valid downloadable present for a GEO ID
 69 |         root_url: string
 70 |                   url for root directory for a GEO ID
 71 |         gse: string
 72 |              GEO ID
 73 |         verbose: bool
 74 |                  Print file list
 75 |         out_dir: string
 76 |                  Directory location for download
 77 |         """
 78 |         if out_dir is None:
 79 |             out_dir = os.path.join(os.getcwd(), "pysradb_downloads")
 80 | 
 81 |         # store output in a separate directory
 82 |         out_dir = os.path.join(out_dir, gse)
 83 |         os.makedirs(out_dir, exist_ok=True)
 84 | 
 85 |         # Display files to be downloaded
 86 |         print("\nThe following files will be downloaded: \n")
 87 |         for link in links:
 88 |             print(link)
 89 |         print(os.linesep)
 90 |         # Check if we can access list of files in the tar file
 91 |         tar_list = [i for i in links if ".tar" in i]
 92 |         if "filelist.txt" in links:
 93 |             tar_file = tar_list[0]
 94 |             if verbose:
 95 |                 print(f"\nThe tar file {tar_file} contains the following files:\n")
 96 |                 file_list_contents = requests.get(
 97 |                     root_url + "filelist.txt"
 98 |                 ).content.decode("utf-8")
 99 |                 print(file_list_contents)
100 | 
101 |         # Download files
102 |         for link in links:
103 |             # add a prefix to distinguish filelist.txt from different downloads
104 |             prefix = ""
105 |             if link == "filelist.txt":
106 |                 prefix = gse + "_"
107 |             geo_path = os.path.join(out_dir, prefix + link)
108 |             download_file(
109 |                 root_url.lstrip("https://") + link, geo_path, show_progress=True
110 |             )
111 | 


--------------------------------------------------------------------------------
/pysradb/utils.py:
--------------------------------------------------------------------------------
  1 | import errno
  2 | import gzip
  3 | import io
  4 | import ntpath
  5 | import os
  6 | import shlex
  7 | import subprocess
  8 | import urllib.request as urllib_request
  9 | import warnings
 10 | 
 11 | import requests
 12 | from requests.adapters import HTTPAdapter
 13 | from requests.packages.urllib3.util.retry import Retry
 14 | from tqdm.autonotebook import tqdm
 15 | 
 16 | from .exceptions import IncorrectFieldException
 17 | 
# Install a filter that ignores every FutureWarning from here on.
warnings.simplefilter(action="ignore", category=FutureWarning)


# Runs at import time: registers tqdm's pandas integration.
tqdm.pandas()
 22 | 
 23 | 
 24 | def path_leaf(path):
 25 |     """Get path's tail from a filepath.
 26 | 
 27 |     Parameters
 28 |     ----------
 29 |     path: string
 30 |           Filepath
 31 | 
 32 |     Returns
 33 |     -------
 34 |     tail: string
 35 |           Filename
 36 |     """
 37 |     head, tail = ntpath.split(path)
 38 |     return tail or ntpath.basename(head)
 39 | 
 40 | 
 41 | def requests_3_retries():
 42 |     """Generates a requests session object that allows 3 retries.
 43 | 
 44 |     Returns
 45 |     -------
 46 |     session: requests.Session
 47 |         requests session object that allows 3 retries for server-side
 48 |         errors.
 49 |     """
 50 |     session = requests.Session()
 51 |     retry = Retry(
 52 |         total=3,
 53 |         backoff_factor=0.5,
 54 |         status_forcelist=[500, 502, 503, 504],
 55 |     )
 56 |     adapter = HTTPAdapter(max_retries=retry)
 57 |     session.mount("http://", adapter)
 58 |     session.mount("https://", adapter)
 59 |     return session
 60 | 
 61 | 
 62 | def scientific_name_to_taxid(name):
 63 |     """Converts a scientific name to its corresponding taxonomy ID.
 64 | 
 65 |     Parameters
 66 |     ----------
 67 |     name: str
 68 |         Scientific name of interest.
 69 | 
 70 |     Returns
 71 |     -------
 72 |     taxid: str
 73 |         Taxonomy Id of the Scientific name.
 74 | 
 75 |     Raises
 76 |     ------
 77 |     IncorrectFieldException
 78 |         If the scientific name cannot be found.
 79 | 
 80 |     """
 81 | 
 82 |     r = requests.get(
 83 |         "https://www.ebi.ac.uk/ena/data/taxonomy/v1/taxon/scientific-name/" + name,
 84 |         timeout=5,
 85 |     )
 86 |     if r.status_code == 404:
 87 |         raise IncorrectFieldException(f"Unknown scientific name: {name}")
 88 |     r.raise_for_status()
 89 |     return r.json()[0]["taxId"]
 90 | 
 91 | 
 92 | def unique(sequence):
 93 |     """Get unique elements from a list maintaining the order.
 94 | 
 95 |     Parameters
 96 |     ----------
 97 |     input_list: list
 98 | 
 99 |     Returns
100 |     -------
101 |     unique_list: list
102 |                  List with unique elements maintaining the order
103 |     """
104 |     visited = set()
105 |     return [x for x in sequence if not (x in visited or visited.add(x))]
106 | 
107 | 
class TqdmUpTo(tqdm):
    """tqdm subclass usable as a ``urlretrieve`` report hook.

    Provides `update_to(n)` which uses `tqdm.update(delta_n)`.
    Inspired by [twine#242](https://github.com/pypa/twine/pull/242),
    [here](https://github.com/pypa/twine/commit/42e55e06).

    Credits:
    https://github.com/tqdm/tqdm/blob/69326b718905816bb827e0e66c5508c9c04bc06c/examples/tqdm_wget.py
    """

    def update_to(self, b=1, bsize=1, tsize=None):
        """
        b  : int, optional
            Number of blocks transferred so far [default: 1].
        bsize  : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize  : int, optional
            Total size (in tqdm units). If [default: None] remains unchanged.
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # update() takes a delta, so subtract what is already counted.
        self.update(transferred - self.n)
130 | 
131 | 
def _extract_first_field(data):
    """Return the first element of every record in `data` as a list."""
    # Transposing the records makes the first column the first item yielded.
    columns = zip(*data)
    first_column = next(columns)
    return list(first_column)
135 | 
136 | 
def _find_aspera_keypath(aspera_dir=None):
    """Locate the aspera key file.

    Parameters
    ----------
    aspera_dir: string
                Location to aspera directory (optional); defaults to
                ``~/.aspera``

    Returns
    -------
    aspera_keypath: string
                    Location to aspera key, or None if the file is absent
    """
    if aspera_dir is None:
        home = os.path.expanduser("~")
        aspera_dir = os.path.join(home, ".aspera")
    candidate = os.path.join(
        aspera_dir, "connect", "etc", "asperaweb_id_dsa.openssh"
    )
    if not os.path.isfile(candidate):
        return None
    return candidate
157 | 
158 | 
def mkdir_p(path):
    """Create a directory and its parents, like ``mkdir -p``.

    An empty path is a no-op, and an already existing directory is not an
    error.

    Parameters
    ----------
    path : string
           Path to directory to create
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError as exc:
        already_a_directory = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise
175 | 
176 | 
def order_dataframe(df, columns):
    """Reorder a dataframe so that `columns` come first.

    The remaining columns follow in their existing order.

    Parameters
    ----------
    df: Dataframe
        Dataframe
    columns: list
             List of columns that need to be put in front

    Returns
    -------
    df: Dataframe
        Dataframe with the reordered columns
    """
    trailing = [name for name in df.columns if name not in columns]
    return df[columns + trailing]
192 | 
193 | 
def _get_url(url, download_to, show_progress=True):
    """Retrieve a url into a local file, creating parent directories.

    Parameters
    ----------
    url: string
         http/https/ftp url
    download_to: string
                 File location to write the downloaded file to
    show_progress: bool
                   Set to True by default to print progress bar
    """
    progress_label = "Downloading {}".format(url.split("/")[-1])
    mkdir_p(os.path.dirname(download_to))
    if not show_progress:
        urllib_request.urlretrieve(url, download_to)
        return
    with TqdmUpTo(
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
        miniters=1,
        desc=progress_label,
    ) as progress:
        # urlretrieve calls the hook with (blocks, block size, total size).
        urllib_request.urlretrieve(
            url, download_to, reporthook=progress.update_to, data=None
        )
217 | 
218 | 
def run_command(command, verbose=False):
    """Run a command and return its exit code.

    The command string is tokenised with ``shlex.split`` and executed
    without a shell; stderr is merged into stdout.

    Parameters
    ----------
    command: string
             Command line to execute
    verbose: bool
             Print every non-empty output line (stripped) as it arrives

    Returns
    -------
    rc: int
        Exit code of the process
    """
    # The context manager closes the pipe and waits for the process.
    with subprocess.Popen(
        shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT
    ) as process:
        # Iterate until EOF rather than stopping at the first empty line
        # seen after the process exited, which could drop buffered output.
        for raw_line in process.stdout:
            line = raw_line.decode("utf-8").strip()
            if line and verbose:
                print(line)
    return process.returncode
235 | 
236 | 
def get_gzip_uncompressed_size(filepath):
    """Return the size of a .gz file's decompressed contents.

    The size is obtained by seeking to the end of the decompressed stream.

    Parameters
    ----------
    filepath: string
              Path to input file

    Returns
    -------
    filesize: int
              Uncompressed file size
    """
    with gzip.open(filepath, "rb") as stream:
        end_offset = stream.seek(0, io.SEEK_END)
    return end_offset
252 | 
253 | 
def confirm(preceeding_text):
    """Ask a yes/no question until a valid answer is given.

    An empty answer counts as yes.

    Parameters
    ----------
    preceeding_text: str
                     Text to print

    Returns
    -------
    response: bool
              True for "y"/"yes"/empty, False for "n"/"no"
              (case-insensitive)
    """
    print(os.linesep, flush=True)
    prompt = "{} [Y/n]: ".format(preceeding_text)
    while True:
        answer = input(prompt).lower()
        if answer in ("", "y", "yes"):
            return True
        if answer in ("n", "no"):
            return False
        print("Please respond with 'y' or 'n'", flush=True)
275 | 
276 | 
def copyfileobj(fsrc, fdst, bufsize=16384, filesize=None, desc=""):
    """Copy one file object into another while showing a progress bar.

    Parameters
    ----------
    fsrc: filehandle
          Input file handle
    fdst: filehandle
          Output file handle
    bufsize: int
             Number of bytes requested per read
    filesize: int
              Total passed to the progress bar
    desc: string
          Description for tqdm status
    """
    progress = tqdm(
        total=filesize,
        unit="B",
        unit_scale=True,
        miniters=1,
        unit_divisor=1024,
        desc=desc,
    )
    with progress as pbar:
        chunk = fsrc.read(bufsize)
        # An empty read marks the end of the input.
        while chunk:
            fdst.write(chunk)
            pbar.update(len(chunk))
            chunk = fsrc.read(bufsize)
307 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | lxml>=4.6.3
2 | pandas>=1.3.2
3 | requests>=2.26.0
4 | requests-ftp>=0.3.1
5 | tqdm>=4.62.1
6 | xmltodict>=0.12.0
7 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 2.2.0
 3 | commit = True
 4 | tag = False
 5 | parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 6 | serialize = 
 7 | 	{major}.{minor}.{patch}-{release}{build}
 8 | 	{major}.{minor}.{patch}
 9 | 
10 | [bumpversion:part:release]
11 | optional_value = prod
12 | first_value = dev
13 | values = 
14 | 	dev
15 | 	prod
16 | 
17 | [bumpversion:part:build]
18 | 
19 | [bumpversion:file:setup.py]
20 | search = version="{current_version}"
21 | replace = version="{new_version}"
22 | 
23 | [bumpversion:file:pysradb/__init__.py]
24 | search = __version__ = "{current_version}"
25 | replace = __version__ = "{new_version}"
26 | 
27 | [flake8]
28 | exclude = docs
29 | 
30 | [aliases]
31 | test = pytest
32 | 
33 | [tool:pytest]
34 | collect_ignore = ["setup.py"]
35 | 


--------------------------------------------------------------------------------
/tests/_test_geodb.py:
--------------------------------------------------------------------------------
 1 | """Tests for geodb.py
 2 | """
 3 | 
 4 | import pytest
 5 | 
 6 | from pysradb import GEOdb
 7 | 
 8 | """
 9 | 
10 | @pytest.fixture(scope="module")
11 | def geodb_connection(conf_download_geodb_file):
12 |     db_file = conf_download_geodb_file
13 |     db = GEOdb(db_file)
14 |     return db
15 | 
16 | 
17 | def test_all_row_counts(geodb_connection):
18 |     assert geodb_connection.all_row_counts().loc["metaInfo", "count"] == 2
19 | 
20 | 
21 | def test_gse_metadata(geodb_connection):
22 |     df = geodb_connection.gse_metadata("GSE114314")
23 |     assert int(df["pubmed_id"][0]) == 29925996
24 | 
25 | 
26 | def test_gse_to_gsm(geodb_connection):
27 |     df = geodb_connection.gse_to_gsm("GSE114314")
28 |     assert df["gsm"][3] == "GSM3139412"
29 | 
30 | 
31 | def test_geo_convert(geodb_connection):
32 |     df = geodb_connection.geo_convert("GSM3139409")
33 |     assert df["to_acc"][0] == "GSE114314"
34 | 
35 | 
36 | def test_guess_srp_form_gse(geodb_connection):
37 |     srp = geodb_connection.guess_srp_from_gse("GSE73136")
38 |     assert srp == "SRP063852"
39 | """
40 | 


--------------------------------------------------------------------------------
/tests/_test_pcli.py:
--------------------------------------------------------------------------------
  1 | """Tests for cli.py
  2 | """
  3 | 
  4 | import os
  5 | import subprocess
  6 | import sys
  7 | from shlex import quote
  8 | from shlex import split
  9 | 
 10 | import pytest
 11 | 
 12 | from pysradb import SRAdb
 13 | 
 14 | 
 15 | def run(command):
 16 |     if sys.version_info.minor >= 7:
 17 |         result = subprocess.run(split(command), capture_output=True)
 18 |     else:
 19 |         result = subprocess.run(split(command), check=True, stdout=subprocess.PIPE)
 20 |     return str(result.stdout).strip()
 21 | 
 22 | 
 23 | @pytest.fixture(scope="module")
 24 | def sradb_connection(conf_download_sradb_file):
 25 |     db_file = conf_download_sradb_file
 26 |     db = SRAdb(db_file)
 27 |     return db
 28 | 
 29 | 
 30 | def test_all_row_counts_sra(sradb_connection):
 31 |     assert sradb_connection.all_row_counts().loc["metaInfo", "count"] == 2
 32 | 
 33 | 
 34 | @pytest.mark.xfail
 35 | def test_download():
 36 |     result = run(
 37 |         "pysradb download -y --db data/SRAmetadb.sqlite --out-dir srp_downloads -p SRP063852"
 38 |     )
 39 |     assert "SRP063852" in result
 40 |     assert os.path.getsize("srp_downloads/SRP063852/SRX1254413/SRR2433794.sra")
 41 | 
 42 | 
 43 | def test_sra_metadata():
 44 |     result = run("pysradb metadata SRP098789 --db data/SRAmetadb.sqlite")
 45 |     assert "SRX2536403" in result
 46 | 
 47 | 
 48 | def test_sra_metadata():
 49 |     result = run(
 50 |         "pysradb metadata SRP098789 --db data/SRAmetadb.sqlite --detailed --expand"
 51 |     )
 52 |     assert "treatment_time" in result
 53 | 
 54 | 
 55 | def test_srp_to_srx():
 56 |     result = run("pysradb srp-to-srx SRP098789 --db data/SRAmetadb.sqlite")
 57 |     assert "SRX2536403" in result
 58 | 
 59 | 
 60 | def test_srp_assay():
 61 |     result = run("pysradb metadata SRP098789 --db data/SRAmetadb.sqlite --assay")
 62 |     assert "RNA-Seq" in result
 63 | 
 64 | 
 65 | def srr_to_srx():
 66 |     result = run(
 67 |         "pysradb srr-to-srx --db data/SRAmetadb.sqlite SRR5227288 SRR649752 --desc"
 68 |     )
 69 |     assert "3T3 cells" in result
 70 | 
 71 | 
 72 | def srx_to_srr():
 73 |     result = run(
 74 |         "pysradb srr-to-srx --db data/SRAmetadb.sqlite SRX217956 SRX2536403 --desc"
 75 |     )
 76 |     assert "3T3 cells" in result
 77 | 
 78 | 
 79 | def test_sra_metadata_detail():
 80 |     result = run(
 81 |         "pysradb metadata --db data/SRAmetadb.sqlite SRP075720 --detailed --expand"
 82 |     )
 83 |     assert "retina" in result
 84 | 
 85 | 
 86 | def test_srp_to_gse():
 87 |     result = run("pysradb srp-to-gse --db data/SRAmetadb.sqlite SRP075720")
 88 |     assert "GSE81903" in result
 89 | 
 90 | 
 91 | def test_gsm_to_srp():
 92 |     result = run("pysradb gsm-to-srp --db data/SRAmetadb.sqlite GSM2177186")
 93 |     assert "SRP075720" in result
 94 | 
 95 | 
 96 | def test_gsm_to_gse():
 97 |     result = run("pysradb gsm-to-gse --db data/SRAmetadb.sqlite GSM2177186")
 98 |     assert "GSE81903" in result
 99 | 
100 | 
def test_gsm_to_srr():
    """Detailed `pysradb gsm-to-srr` output should mention GSM2177186_r1."""
    cli_output = run(
        "pysradb gsm-to-srr --db data/SRAmetadb.sqlite GSM2177186 --detailed --desc --expand"
    )
    assert "GSM2177186_r1" in cli_output
106 | 
107 | 
108 | """
109 | def test_assay_uniq():
110 |     result = subprocess.check_output(
111 |         "pysradb metadata SRP000941 --db data/SRAmetadb.sqlite --assay  | "
112 |         + " tr -s {}".format(quote("  "))
113 |         + " | cut -f5 -d {}".format(quote(" "))
114 |         + " | sort | uniq -c",
115 |         shell=True,
116 |     )
117 |     assert "Bisulfite-Seq" in str(result)
118 | 
119 | 
120 | def test_pipe_download():
121 |     result = subprocess.check_output(
122 |         "pysradb metadata SRP000941 --assay | "
123 |         + " grep {}".format(quote("study\|RNA-Seq"))
124 |         + " | head -2 | pysradb download --out-dir srp_downloads",
125 |         shell=True,
126 |     )
127 |     assert os.path.getsize("srp_downloads/SRP000941/SRX007165/SRR020287.sra")
128 |     assert "following" in str(result)
129 | """
130 | 


--------------------------------------------------------------------------------
/tests/_test_sradb.py:
--------------------------------------------------------------------------------
  1 | """Tests for sradb.py
  2 | """
  3 | 
  4 | import os
  5 | from sqlite3 import OperationalError
  6 | 
  7 | import pytest
  8 | 
  9 | from pysradb import SRAdb
 10 | from pysradb.filter_attrs import guess_cell_type
 11 | from pysradb.filter_attrs import guess_strain_type
 12 | from pysradb.filter_attrs import guess_tissue_type
 13 | 
 14 | 
 15 | @pytest.fixture(scope="module")
 16 | def sradb_connection(conf_download_sradb_file):
 17 |     db_file = conf_download_sradb_file
 18 |     db = SRAdb(db_file)
 19 |     return db
 20 | 
 21 | 
 22 | def test_list_tables(sradb_connection):
 23 |     sra_tables = sradb_connection.list_tables()
 24 |     assert sra_tables == [
 25 |         "metaInfo",
 26 |         "submission",
 27 |         "study",
 28 |         "sample",
 29 |         "experiment",
 30 |         "run",
 31 |         "sra",
 32 |         "sra_ft",
 33 |         "sra_ft_content",
 34 |         "sra_ft_segments",
 35 |         "sra_ft_segdir",
 36 |         "col_desc",
 37 |         "fastq",
 38 |     ]
 39 | 
 40 | 
 41 | def test_list_fields(sradb_connection):
 42 |     fields = sradb_connection.list_fields("study")
 43 |     assert fields == [
 44 |         "study_ID",
 45 |         "study_alias",
 46 |         "study_accession",
 47 |         "study_title",
 48 |         "study_type",
 49 |         "study_abstract",
 50 |         "broker_name",
 51 |         "center_name",
 52 |         "center_project_name",
 53 |         "study_description",
 54 |         "related_studies",
 55 |         "primary_study",
 56 |         "sra_link",
 57 |         "study_url_link",
 58 |         "xref_link",
 59 |         "study_entrez_link",
 60 |         "ddbj_link",
 61 |         "ena_link",
 62 |         "study_attribute",
 63 |         "submission_accession",
 64 |         "sradb_updated",
 65 |     ]
 66 | 
 67 | 
 68 | def test_desc_table(sradb_connection):
 69 |     names = sorted(sradb_connection.desc_table("sra_ft").name.tolist())
 70 |     assert names[:7] == [
 71 |         "SRR_bamFile",
 72 |         "SRX_bamFile",
 73 |         "SRX_fastqFTP",
 74 |         "adapter_spec",
 75 |         "anonymized_name",
 76 |         "base_caller",
 77 |         "bases",
 78 |     ]
 79 | 
 80 | 
 81 | def test_all_row_counts(sradb_connection):
 82 |     assert sradb_connection.all_row_counts().loc["metaInfo", "count"] == 2
 83 | 
 84 | 
 85 | def test_all_row_counts2(sradb_connection):
 86 |     assert len(sradb_connection.all_row_counts()) == 13
 87 | 
 88 | 
 89 | def test_sra_metadata(sradb_connection):
 90 |     df = sradb_connection.sra_metadata("SRP017942")
 91 |     assert df["experiment_accession"][0] == "SRX217027"
 92 | 
 93 | 
 94 | def test_sra_metadata2(sradb_connection):
 95 |     df = sradb_connection.sra_metadata(
 96 |         "SRP017942", detailed=True, expand_sample_attributes=True
 97 |     )
 98 |     assert "3xflag-gfp" in df["transfected_with"].tolist()
 99 | 
100 | 
def test_search(sradb_connection):
    """Searching for "breast cancer" should return at least one row."""
    hits = sradb_connection.search_sra(search_str="breast cancer")
    row_count = len(hits.index)
    assert row_count
104 | 
105 | 
def test_search2(sradb_connection):
    """A detailed search with a quoted AND query should include study SRP241848."""
    query = '"salivary microbiome" AND "diabetes mellitus"'
    hits = sradb_connection.search_sra(query, detailed=True)
    study_accessions = hits["study_accession"].to_list()
    assert "SRP241848" in study_accessions
111 | 
112 | 
def test_search_by_expt_id(sradb_connection):
    """The first ``study_name`` returned for SRX1254413 should be GSE73136."""
    experiment = sradb_connection.search_by_expt_id("SRX1254413")
    study_names = experiment.study_name.tolist()
    assert study_names[0] == "GSE73136"
116 | 
117 | 
def test_search_by_expt_id2(sradb_connection):
    """Round-trip SRX116363 through its submission and check its linked SRP."""
    srx_id = "SRX116363"
    # Look up the experiment, then fetch metadata for its submission accession.
    submission_id = sradb_connection.search_by_expt_id(srx_id)[
        "submission_accession"
    ].loc[0]
    submission_metadata = sradb_connection.sra_metadata(submission_id)
    connected_srp = sradb_connection.srx_to_srp(srx_id).iloc[0, 1]
    # The experiment must appear in its own submission's metadata ...
    assert srx_id in submission_metadata["experiment_accession"].to_list()
    # ... and srx_to_srp must report the expected study in column 1 of row 0.
    assert connected_srp == "SRP010374"
127 | 
128 | 
129 | # def test_download_fasp(sradb_connection):
130 | #    df = sradb_connection.sra_metadata("SRP098789")
131 | #    df = df[df.experiment_accession == "SRX2536403"]
132 | #    sradb_connection.download(df=df, out_dir="data/", skip_confirmation=True)
133 | #    assert os.path.isfile("data/SRP098789/SRX2536403/SRR5227288.sra")
134 | #    assert os.path.getsize("data/SRP098789/SRX2536403/SRR5227288.sra")
135 | #    os.remove("data/SRP098789/SRX2536403/SRR5227288.sra")
136 | 
137 | 
@pytest.mark.xfail(reason="This happens to fail because of ftp problems")
def test_download_ftp(sradb_connection):
    """Download experiment SRX2536404 over FTP and check a non-empty file exists.

    The downloaded file is removed afterwards, whether or not the checks pass.
    """
    sra_path = "data/SRP098789/SRX2536404/SRR5227289.sra"
    df = sradb_connection.sra_metadata("SRP098789")
    df = df[df.experiment_accession == "SRX2536404"]
    try:
        sradb_connection.download(
            df=df, protocol="ftp", out_dir="data/", skip_confirmation=True
        )
        assert os.path.isfile(sra_path)
        # getsize() returns 0 for an empty file, which fails the assertion.
        assert os.path.getsize(sra_path)
    finally:
        # Clean up even when the download or an assertion fails, so a file
        # written by this test does not linger in the data directory. The
        # guard avoids masking the real failure with FileNotFoundError.
        if os.path.isfile(sra_path):
            os.remove(sra_path)
149 | 
150 | 
def test_tissue_type(sradb_connection):
    """Check the guessed cell and tissue types for experiment SRX196389."""
    metadata = sradb_connection.sra_metadata("SRP016501", detailed=True)
    attributes = metadata[metadata.experiment_accession == "SRX196389"][
        "sample_attribute"
    ]
    guessed_cells = attributes.apply(guess_cell_type).tolist()
    guessed_tissues = attributes.apply(guess_tissue_type).tolist()
    assert guessed_cells == ["chicken_brain"]
    assert guessed_tissues == ["brain"]
158 | 
159 | 
def test_strain_type(sradb_connection):
    """Check guessed strains for SRP043036, ordered by experiment accession."""
    metadata = sradb_connection.sra_metadata("SRP043036", detailed=True)
    ordered = metadata.sort_values(by="experiment_accession")
    strains = ordered["sample_attribute"].apply(guess_strain_type).tolist()
    # Eight by4741 entries followed by four s288c entries.
    expected = ["by4741"] * 8 + ["s288c"] * 4
    assert strains == expected
178 | 
179 | 
def test_srp_to_srx(sradb_connection):
    """``srp_to_srx("SRP082570")`` should have a length of 14."""
    experiments = sradb_connection.srp_to_srx("SRP082570")
    assert len(experiments) == 14
182 | 
183 | 
def test_srp_to_srr(sradb_connection):
    """The first three run accessions for SRP091987, once sorted, are fixed."""
    runs = sradb_connection.srp_to_srr("SRP091987")
    # Take the first three in returned order, then sort just those three.
    leading_runs = list(runs["run_accession"])[:3]
    expected = ["SRR4447104", "SRR4447105", "SRR4447106"]
    assert sorted(leading_runs) == expected
191 | 
192 | 
def test_srp_to_gse(sradb_connection):
    """Feed the ``srp_to_gse`` result for SRP050443 to ``gse_to_gsm``; expect GSM1557451."""
    srp_to_gse_result = sradb_connection.srp_to_gse("SRP050443")
    gse_id = srp_to_gse_result.iloc[0, 1]
    samples = sradb_connection.gse_to_gsm(gse_id)
    experiment_aliases = samples["experiment_alias"].to_list()
    assert "GSM1557451" in experiment_aliases
197 | 
198 | 
def test_gsm_to_gse(sradb_connection):
    """All three GSM identifiers should yield the single study alias GSE41637."""
    gsm_ids = ["GSM1020651", "GSM1020664", "GSM1020771"]
    studies = sradb_connection.gsm_to_gse(gsm_ids)
    unique_aliases = set(studies["study_alias"])
    assert unique_aliases == {"GSE41637"}
202 | 
203 | 
def test_srs_to_gsm(sradb_connection):
    """Row 0, column 1 of ``srs_to_gsm("SRS1757470")`` should be GSM2358940."""
    mapping = sradb_connection.srs_to_gsm("SRS1757470")
    first_gsm = mapping.iloc[0, 1]
    assert first_gsm == "GSM2358940"
207 | 
208 | 
def test_wrong_input_metadata(sradb_connection):
    """``sra_metadata("should_throw_error")`` is expected to raise ``ValueError``."""
    # pytest.raises fails the test when no ValueError is raised. The previous
    # non-strict xfail(raises=ValueError) marker only reported XPASS in that
    # case, so the check could never fail.
    with pytest.raises(ValueError):
        sradb_connection.sra_metadata("should_throw_error")
212 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | # contents of conftest.py
 2 | import os
 3 | 
 4 | import pytest
 5 | 
 6 | from pysradb import download_geodb_file
 7 | from pysradb import download_sradb_file
 8 | 
 9 | 
@pytest.fixture(scope="session")
def conf_download_sradb_file():
    """Return the path ``<cwd>/data/SRAmetadb.sqlite``.

    If no file exists at that path, ``download_sradb_file`` is called with the
    containing directory first.
    """
    sqlite_path = os.path.join(os.getcwd(), "data", "SRAmetadb.sqlite")
    if not os.path.isfile(sqlite_path):
        download_sradb_file(download_dir=os.path.dirname(sqlite_path))
    return sqlite_path
17 | 
18 | 
@pytest.fixture(scope="session")
def conf_download_geodb_file():
    """Return the path ``<cwd>/data/GEOmetadb.sqlite``.

    If no file exists at that path, ``download_geodb_file`` is called with the
    containing directory first.
    """
    sqlite_path = os.path.join(os.getcwd(), "data", "GEOmetadb.sqlite")
    if not os.path.isfile(sqlite_path):
        download_geodb_file(download_dir=os.path.dirname(sqlite_path))
    return sqlite_path
26 | 


--------------------------------------------------------------------------------
/tests/data/test_search/ena_search_test1.txt:
--------------------------------------------------------------------------------
 1 | run_accession
 2 | SRR492850
 3 | SRR500270
 4 | SRR609956
 5 | SRR609957
 6 | SRR609958
 7 | SRR609959
 8 | SRR609960
 9 | SRR609961
10 | SRR609962
11 | SRR609963
12 | SRR609964
13 | SRR609965
14 | SRR609966
15 | SRR609967
16 | SRR609968
17 | SRR609969
18 | SRR609970
19 | SRR609971
20 | SRR609972
21 | SRR609973
22 | SRR609974
23 | SRR609975
24 | SRR609976
25 | SRR609977
26 | SRR609978
27 | SRR609979
28 | SRR609980
29 | SRR609981
30 | SRR609982
31 | SRR609983
32 | SRR609984
33 | SRR609985
34 | SRR609986
35 | SRR609987
36 | SRR609988
37 | SRR609989
38 | SRR609990
39 | SRR609991
40 | SRR609992
41 | SRR609993
42 | SRR609994
43 | SRR609995
44 | SRR609996
45 | SRR609997
46 | SRR609998
47 | SRR609999
48 | SRR610000
49 | SRR610001
50 | SRR610002
51 | SRR610003
52 | SRR610004
53 | SRR610005
54 | SRR610006
55 | SRR610007
56 | SRR610008
57 | SRR610009
58 | SRR610010
59 | SRR610011
60 | SRR610012
61 | SRR610013
62 | SRR610014
63 | SRR610015
64 | SRR610016
65 | SRR610017
66 | SRR610018
67 | SRR610019
68 | SRR610020
69 | SRR610021
70 | SRR610022
71 | SRR610023
72 | SRR610024
73 | SRR610025
74 | SRR610026
75 | SRR610027
76 | SRR610028
77 | SRR610029
78 | SRR610030
79 | 


--------------------------------------------------------------------------------
/tests/data/test_search/geo_search_test1.txt:
--------------------------------------------------------------------------------
 1 | SRX8089313
 2 | SRX8089314
 3 | SRX8089315
 4 | SRX8089316
 5 | SRX8089317
 6 | SRX8089318
 7 | SRX8089319
 8 | SRX8089320
 9 | SRX8089286
10 | SRX8089275
11 | SRX8089276
12 | SRX8089277
13 | SRX8089278
14 | SRX8089279
15 | SRX8089280
16 | SRX8089281
17 | SRX8089282
18 | SRX8089283
19 | SRX8089284
20 | SRX8089285
21 | SRX8089321
22 | SRX8089287
23 | SRX8089288
24 | SRX8089289
25 | SRX8089290
26 | SRX8089291
27 | SRX8089292
28 | SRX8089293
29 | SRX8089294
30 | SRX8089295
31 | SRX8089296
32 | SRX8466645
33 | SRX8142119
34 | SRX8142120
35 | SRX8142121
36 | SRX8142122
37 | SRX8142123
38 | SRX8142124
39 | SRX8142125
40 | SRX8142126
41 | SRX8142127
42 | SRX8466643
43 | SRX8466644
44 | SRX8089344
45 | SRX8466646
46 | SRX8466647
47 | SRX8466648
48 | SRX8466649
49 | SRX8466650
50 | SRX8466651
51 | SRX8466652
52 | SRX8466653
53 | SRX8466654
54 | SRX8466655
55 | SRX8466656
56 | SRX8089333
57 | SRX8089322
58 | SRX8089323
59 | SRX8089324
60 | SRX8089325
61 | SRX8089326
62 | SRX8089327
63 | SRX8089328
64 | SRX8089329
65 | SRX8089330
66 | SRX8089331
67 | SRX8089332
68 | SRX8466765
69 | SRX8089334
70 | SRX8089335
71 | SRX8089336
72 | SRX8089337
73 | SRX8089338
74 | SRX8089339
75 | SRX8089340
76 | SRX8089341
77 | SRX8089342
78 | SRX8089343


--------------------------------------------------------------------------------
/tests/data/test_search/sra_search_test1.txt:
--------------------------------------------------------------------------------
1 | SRX137370
2 | SRX137371


--------------------------------------------------------------------------------
/tests/data/test_search/sra_test_2_verbosity_0.csv:
--------------------------------------------------------------------------------
1 | run_accession
2 | ERR4229796
3 | 


--------------------------------------------------------------------------------
/tests/data/test_search/sra_test_2_verbosity_1.csv:
--------------------------------------------------------------------------------
1 | run_accession,experiment_title
2 | ERR4229796,HiSeq X Ten paired end sequencing
3 | 


--------------------------------------------------------------------------------
/tests/data/test_search/sra_test_2_verbosity_2.csv:
--------------------------------------------------------------------------------
1 | study_accession,experiment_accession,experiment_title,sample_taxon_id,sample_scientific_name,experiment_library_strategy,experiment_library_source,experiment_library_selection,sample_accession,sample_alias,experiment_instrument_model,pool_member_spots,run_1_size,run_1_accession,run_1_total_spots,run_1_total_bases
2 | ERP113893,ERX4190585,HiSeq X Ten paired end sequencing,562,Escherichia coli,WGS,GENOMIC,RANDOM,ERS3331676,96c1f848-4c9d-11e9-8292-68b599768938,HiSeq X Ten,1150278,94976452,ERR4229796,1150278,347383956
3 | 


--------------------------------------------------------------------------------
/tests/data/test_search/sra_test_2_verbosity_3.csv:
--------------------------------------------------------------------------------
1 | study_accession,experiment_accession,experiment_title,sample_taxon_id,sample_scientific_name,experiment_library_strategy,experiment_library_source,experiment_library_selection,sample_accession,sample_alias,experiment_instrument_model,pool_member_spots,run_1_size,run_1_accession,run_1_total_spots,run_1_total_bases,experiment_alias,experiment_design_description,experiment_external_id,experiment_library_construction_protocol,experiment_library_name,experiment_platform,experiment_sample_descriptor_accession,library_layout,library_layout_nominal_length,library_layout_nominal_sdev,pool_external_id,pool_member_accession,pool_member_bases,pool_member_member_name,pool_member_organism,pool_member_sample_name,pool_member_sample_title,pool_member_tax_id,run_1_alias,run_1_assembly,run_1_base_A_count,run_1_base_C_count,run_1_base_G_count,run_1_base_N_count,run_1_base_T_count,run_1_cloudfile_1_filetype,run_1_cloudfile_1_location,run_1_cloudfile_1_provider,run_1_cloudfile_2_filetype,run_1_cloudfile_2_location,run_1_cloudfile_2_provider,run_1_cloudfile_3_filetype,run_1_cloudfile_3_location,run_1_cloudfile_3_provider,run_1_cloudfile_4_filetype,run_1_cloudfile_4_location,run_1_cloudfile_4_provider,run_1_cluster_name,run_1_database_1,run_1_is_public,run_1_load_done,run_1_published,run_1_srafile_1_alternative_1_access_type,run_1_srafile_1_alternative_1_free_egress,run_1_srafile_1_alternative_1_org,run_1_srafile_1_alternative_1_url,run_1_srafile_1_alternative_2_access_type,run_1_srafile_1_alternative_2_free_egress,run_1_srafile_1_alternative_2_org,run_1_srafile_1_alternative_2_url,run_1_srafile_1_cluster,run_1_srafile_1_date,run_1_srafile_1_filename,run_1_srafile_1_md5,run_1_srafile_1_semantic_name,run_1_srafile_1_size,run_1_srafile_1_sratoolkit,run_1_srafile_1_supertype,run_1_srafile_1_url,run_1_srafile_2_alternative_1_access_type,run_1_srafile_2_alternative_1_free_egress,run_1_srafile_2_alternative_1_org,run_1_srafile_2_alternative_1_url,run_1_srafile_2_alternative_2_access_type,ru
n_1_srafile_2_alternative_2_free_egress,run_1_srafile_2_alternative_2_org,run_1_srafile_2_alternative_2_url,run_1_srafile_2_alternative_3_access_type,run_1_srafile_2_alternative_3_free_egress,run_1_srafile_2_alternative_3_org,run_1_srafile_2_alternative_3_url,run_1_srafile_2_cluster,run_1_srafile_2_date,run_1_srafile_2_filename,run_1_srafile_2_md5,run_1_srafile_2_semantic_name,run_1_srafile_2_size,run_1_srafile_2_sratoolkit,run_1_srafile_2_supertype,run_1_srafile_2_url,run_1_srafile_3_alternative_1_access_type,run_1_srafile_3_alternative_1_free_egress,run_1_srafile_3_alternative_1_org,run_1_srafile_3_alternative_1_url,run_1_srafile_3_cluster,run_1_srafile_3_date,run_1_srafile_3_filename,run_1_srafile_3_md5,run_1_srafile_3_semantic_name,run_1_srafile_3_size,run_1_srafile_3_sratoolkit,run_1_srafile_3_supertype,run_1_srafile_3_url,run_1_srafile_4_alternative_1_access_type,run_1_srafile_4_alternative_1_free_egress,run_1_srafile_4_alternative_1_org,run_1_srafile_4_alternative_1_url,run_1_srafile_4_cluster,run_1_srafile_4_date,run_1_srafile_4_filename,run_1_srafile_4_md5,run_1_srafile_4_semantic_name,run_1_srafile_4_size,run_1_srafile_4_sratoolkit,run_1_srafile_4_supertype,run_1_srafile_4_url,run_1_static_data_available,run_1_total_base_count,run_1_total_base_cs_native,sample_attributes_1_tag,sample_attributes_1_value,sample_attributes_2_tag,sample_attributes_2_value,sample_attributes_3_tag,sample_attributes_3_value,sample_attributes_4_tag,sample_attributes_4_value,sample_center_name,sample_common_name,sample_external_id_1,sample_external_id_1_namespace,sample_taxon_id,sample_title,study_alias,study_attributes_1_tag,study_attributes_1_value,study_attributes_2_tag,study_attributes_2_value,study_center_name,study_external_id_1,study_external_id_1_namespace,study_study_abstract,study_study_description,study_study_title,study_study_type_existing_study_type,submission_accession,submission_alias,submission_lab_name,submission_title
2 | ERP113893,ERX4190585,HiSeq X Ten paired end sequencing,562,Escherichia coli,WGS,GENOMIC,RANDOM,ERS3331676,96c1f848-4c9d-11e9-8292-68b599768938,HiSeq X Ten,1150278,94976452,ERR4229796,1150278,347383956,SC_EXP_29694_1#382,"Illumina sequencing of library DN539314J:H12, constructed from sample accession ERS3331676 for study accession ERP113893.  This is part of an Illumina multiplexed sequencing run (29694_1).  This submission includes reads tagged with the sequence TACCATTC.",SAMEA5529601,Standard,DN539314J:H12,ILLUMINA,ERS3331676,PAIRED,452,121,SAMEA5529601,ERS3331676,347383956,<NA>,Escherichia coli,96c1f848-4c9d-11e9-8292-68b599768938,SB277889442,562,SC_RUN_29694_1#382,GCF_000005845.1,85540613,88241122,88289114,100578,85212529,cram,gs.US,gs,cram,s3.us-east-1,s3,run,gs.US,gs,run,s3.us-east-1,s3,public,"<Database><Table name=""PRIMARY_ALIGNMENT""><Statistics source=""meta""><Rows count=""1949912"" /><Elements count=""294436712"" /></Statistics></Table><Table name=""REFERENCE""><Statistics source=""meta""><Rows count=""928"" /><Elements count=""4639675"" /></Statistics></Table><Table name=""SECONDARY_ALIGNMENT""><Statistics source=""meta""><Rows count=""6149"" /><Elements count=""928499"" /></Statistics></Table><Table name=""SEQUENCE""><Statistics source=""meta""><Rows count=""1150278"" /><Elements count=""347383956"" /></Statistics></Table></Database>",true,true,2020-06-10 21:58:01,anonymous,worldwide,GCP,https://storage.googleapis.com/sra-pub-src-14/ERR4229796/29694_1_382.cram.1,anonymous,worldwide,AWS,https://sra-pub-src-14.s3.amazonaws.com/ERR4229796/29694_1_382.cram.1,public,2020-06-11 19:23:25,29694_1_382.cram,26e62fcae91b058b13be6b94214ae2e6,cram,100576829,0,Original,https://sra-pub-src-14.s3.amazonaws.com/ERR4229796/29694_1_382.cram.1,anonymous,worldwide,NCBI,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERR/ERR4229/ERR4229796,aws identity,s3.us-east-1,AWS,s3://sra-pub-run-2/ERR4229796/ERR4229796.1,gcp 
identity,gs.US,GCP,gs://sra-pub-run-4/ERR4229796/ERR4229796.1,public,2020-06-11 19:24:22,ERR4229796,b779862a8fe21d16152454f047fbd2c1,run,94990927,1,Primary ETL,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERR/ERR4229/ERR4229796,anonymous,worldwide,NCBI,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/ERR4229796.realign,public,2020-06-11 19:34:28,ERR4229796.realign,4b45e5e3e6ebe3fa112f308a4d092ee4,run.realign,19534748,1,Secondary ETL,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/ERR4229796.realign,anonymous,worldwide,NCBI,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/wgmlst_sig.tsv,public,2020-06-11 19:36:45,wgmlst_sig.tsv,1f83f3039c8d5b17e64cebbd875a38b4,wgmlst_sig,4253484,0,Secondary ETL,https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/wgmlst_sig.tsv,1,347383956,false,SUBJECT_ID,SB277889442,ArrayExpress-SPECIES,E. coli,ENA-FIRST-PUBLIC,2020-06-09,ENA-LAST-UPDATE,2019-03-22,Wellcome Sanger Institute,E. coli,SAMEA5529601,BioSample,562,SB277889442,Maximizing_the_knowledge_gained_from_California_Senate_Bill_27-sc-5708,ENA-FIRST-PUBLIC,2020-06-02,ENA-LAST-UPDATE,2019-02-19,Wellcome Sanger Institute,PRJEB31347,BioProject,"Overuse of antibiotics contributes to antimicrobial resistance and is a growing threat to human health worldwide. In the U.S., approximately 70% of antibiotics are sold for use in livestock. On January 1, 2018, California enacted Senate Bill 27 (SB27), which will require a veterinarian's prescription for use of antimicrobial drugs and ban non-therapeutic antimicrobial uses for routine disease prevention and growth promotion in livestock. The project will quantify the effect of SB27 on E. coli, Campylobacter and Salmonella resistance rates from retail meat; estimate the proportion of human Campylobacter, Salmonella, and extraintestinal pathogenic E. 
coli infections caused by strains of food animal origin in California; characterize the effect of SB27 on antimicrobial susceptibility of Campylobacter, Salmonella, and extraintestinal E. coli infections caused by strains of food animal origin in California.","Overuse of antibiotics contributes to antimicrobial resistance and is a growing threat to human health worldwide. In the U.S., approximately 70% of antibiotics are sold for use in livestock. On January 1, 2018, California enacted Senate Bill 27 (SB27), which will require a veterinarian's prescription for use of antimicrobial drugs and ban non-therapeutic antimicrobial uses for routine disease prevention and growth promotion in livestock. The project will quantify the effect of SB27 on E. coli, Campylobacter and Salmonella resistance rates from retail meat; estimate the proportion of human Campylobacter, Salmonella, and extraintestinal pathogenic E. coli infections caused by strains of food animal origin in California; characterize the effect of SB27 on antimicrobial susceptibility of Campylobacter, Salmonella, and extraintestinal E. coli infections caused by strains of food animal origin in California. This data is part of a pre-publication release. For information on the proper use of pre-publication data shared by the Wellcome Trust Sanger Institute please see http://www.sanger.ac.uk/datasharing/",Maximizing_the_knowledge_gained_from_California_Senate_Bill_27,Whole Genome Sequencing,ERA2689537,ERP113893-sc-20200609-1-2020-06-09T15:09:21Z,European Nucleotide Archive,Submitted by Wellcome Sanger Institute on 09-JUN-2020
3 | 


--------------------------------------------------------------------------------
/tests/data/test_search/sra_test_ERS3331676.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" ?>
2 | <EXPERIMENT_PACKAGE_SET>
3 | <EXPERIMENT_PACKAGE><EXPERIMENT accession="ERX4190585" alias="SC_EXP_29694_1#382"><IDENTIFIERS><PRIMARY_ID>ERX4190585</PRIMARY_ID></IDENTIFIERS><TITLE>HiSeq X Ten paired end sequencing</TITLE><STUDY_REF accession="ERP113893"><IDENTIFIERS><PRIMARY_ID>ERP113893</PRIMARY_ID><EXTERNAL_ID namespace="BioProject">PRJEB31347</EXTERNAL_ID></IDENTIFIERS></STUDY_REF><DESIGN><DESIGN_DESCRIPTION>Illumina sequencing of library DN539314J:H12, constructed from sample accession ERS3331676 for study accession ERP113893.  This is part of an Illumina multiplexed sequencing run (29694_1).  This submission includes reads tagged with the sequence TACCATTC.</DESIGN_DESCRIPTION><SAMPLE_DESCRIPTOR accession="ERS3331676"><IDENTIFIERS><PRIMARY_ID>ERS3331676</PRIMARY_ID><EXTERNAL_ID namespace="BioSample">SAMEA5529601</EXTERNAL_ID></IDENTIFIERS></SAMPLE_DESCRIPTOR><LIBRARY_DESCRIPTOR><LIBRARY_NAME>DN539314J:H12</LIBRARY_NAME><LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY><LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT><PAIRED NOMINAL_LENGTH="452" NOMINAL_SDEV="121"/></LIBRARY_LAYOUT><LIBRARY_CONSTRUCTION_PROTOCOL>Standard</LIBRARY_CONSTRUCTION_PROTOCOL></LIBRARY_DESCRIPTOR></DESIGN><PLATFORM><ILLUMINA><INSTRUMENT_MODEL>HiSeq X Ten</INSTRUMENT_MODEL></ILLUMINA></PLATFORM></EXPERIMENT><SUBMISSION accession="ERA2689537" alias="ERP113893-sc-20200609-1-2020-06-09T15:09:21Z" lab_name="European Nucleotide Archive"><IDENTIFIERS><PRIMARY_ID>ERA2689537</PRIMARY_ID></IDENTIFIERS><TITLE>Submitted by Wellcome Sanger Institute on 09-JUN-2020</TITLE></SUBMISSION><Organization type="center"><Name abbr="Wellcome Sanger Institute">Wellcome Sanger Institute</Name></Organization><STUDY accession="ERP113893" alias="Maximizing_the_knowledge_gained_from_California_Senate_Bill_27-sc-5708" center_name="Wellcome Sanger Institute"><IDENTIFIERS><PRIMARY_ID>ERP113893</PRIMARY_ID><EXTERNAL_ID 
namespace="BioProject">PRJEB31347</EXTERNAL_ID></IDENTIFIERS><DESCRIPTOR><STUDY_TITLE>Maximizing_the_knowledge_gained_from_California_Senate_Bill_27</STUDY_TITLE><STUDY_TYPE existing_study_type="Whole Genome Sequencing"/><STUDY_ABSTRACT>Overuse of antibiotics contributes to antimicrobial resistance and is a growing threat to human health worldwide. In the U.S., approximately 70% of antibiotics are sold for use in livestock. On January 1, 2018, California enacted Senate Bill 27 (SB27), which will require a veterinarian's prescription for use of antimicrobial drugs and ban non-therapeutic antimicrobial uses for routine disease prevention and growth promotion in livestock. The project will quantify the effect of SB27 on E. coli, Campylobacter and Salmonella resistance rates from retail meat; estimate the proportion of human Campylobacter, Salmonella, and extraintestinal pathogenic E. coli infections caused by strains of food animal origin in California; characterize the effect of SB27 on antimicrobial susceptibility of Campylobacter, Salmonella, and extraintestinal E. coli infections caused by strains of food animal origin in California.</STUDY_ABSTRACT><STUDY_DESCRIPTION>Overuse of antibiotics contributes to antimicrobial resistance and is a growing threat to human health worldwide. In the U.S., approximately 70% of antibiotics are sold for use in livestock. On January 1, 2018, California enacted Senate Bill 27 (SB27), which will require a veterinarian's prescription for use of antimicrobial drugs and ban non-therapeutic antimicrobial uses for routine disease prevention and growth promotion in livestock. The project will quantify the effect of SB27 on E. coli, Campylobacter and Salmonella resistance rates from retail meat; estimate the proportion of human Campylobacter, Salmonella, and extraintestinal pathogenic E. 
coli infections caused by strains of food animal origin in California; characterize the effect of SB27 on antimicrobial susceptibility of Campylobacter, Salmonella, and extraintestinal E. coli infections caused by strains of food animal origin in California. This data is part of a pre-publication release. For information on the proper use of pre-publication data shared by the Wellcome Trust Sanger Institute please see http://www.sanger.ac.uk/datasharing/</STUDY_DESCRIPTION></DESCRIPTOR><STUDY_ATTRIBUTES><STUDY_ATTRIBUTE><TAG>ENA-FIRST-PUBLIC</TAG><VALUE>2020-06-02</VALUE></STUDY_ATTRIBUTE><STUDY_ATTRIBUTE><TAG>ENA-LAST-UPDATE</TAG><VALUE>2019-02-19</VALUE></STUDY_ATTRIBUTE></STUDY_ATTRIBUTES></STUDY><SAMPLE accession="ERS3331676" alias="96c1f848-4c9d-11e9-8292-68b599768938" center_name="Wellcome Sanger Institute"><IDENTIFIERS><PRIMARY_ID>ERS3331676</PRIMARY_ID><EXTERNAL_ID namespace="BioSample">SAMEA5529601</EXTERNAL_ID></IDENTIFIERS><TITLE>SB277889442</TITLE><SAMPLE_NAME><TAXON_ID>562</TAXON_ID><COMMON_NAME>E. coli</COMMON_NAME><SCIENTIFIC_NAME>Escherichia coli</SCIENTIFIC_NAME></SAMPLE_NAME><SAMPLE_ATTRIBUTES><SAMPLE_ATTRIBUTE><TAG>SUBJECT_ID</TAG><VALUE>SB277889442</VALUE></SAMPLE_ATTRIBUTE><SAMPLE_ATTRIBUTE><TAG>ArrayExpress-SPECIES</TAG><VALUE>E. 
coli</VALUE></SAMPLE_ATTRIBUTE><SAMPLE_ATTRIBUTE><TAG>ENA-FIRST-PUBLIC</TAG><VALUE>2020-06-09</VALUE></SAMPLE_ATTRIBUTE><SAMPLE_ATTRIBUTE><TAG>ENA-LAST-UPDATE</TAG><VALUE>2019-03-22</VALUE></SAMPLE_ATTRIBUTE></SAMPLE_ATTRIBUTES></SAMPLE><Pool><Member member_name="" accession="ERS3331676" sample_name="96c1f848-4c9d-11e9-8292-68b599768938" sample_title="SB277889442" spots="1150278" bases="347383956" tax_id="562" organism="Escherichia coli"><IDENTIFIERS><PRIMARY_ID>ERS3331676</PRIMARY_ID><EXTERNAL_ID namespace="BioSample">SAMEA5529601</EXTERNAL_ID></IDENTIFIERS></Member></Pool><RUN_SET><RUN accession="ERR4229796" alias="SC_RUN_29694_1#382" total_spots="1150278" total_bases="347383956" size="94976452" load_done="true" published="2020-06-10 21:58:01" is_public="true" cluster_name="public" static_data_available="1" assembly="GCF_000005845.1"><IDENTIFIERS><PRIMARY_ID>ERR4229796</PRIMARY_ID></IDENTIFIERS><TITLE>HiSeq X Ten paired end sequencing</TITLE><EXPERIMENT_REF accession="ERX4190585"><IDENTIFIERS><PRIMARY_ID>ERX4190585</PRIMARY_ID></IDENTIFIERS></EXPERIMENT_REF><RUN_ATTRIBUTES><RUN_ATTRIBUTE><TAG>ENA-FIRST-PUBLIC</TAG><VALUE>2020-06-09</VALUE></RUN_ATTRIBUTE><RUN_ATTRIBUTE><TAG>ENA-LAST-UPDATE</TAG><VALUE>2020-06-09</VALUE></RUN_ATTRIBUTE></RUN_ATTRIBUTES><Pool><Member member_name="" accession="ERS3331676" sample_name="96c1f848-4c9d-11e9-8292-68b599768938" sample_title="SB277889442" spots="1150278" bases="347383956" tax_id="562" organism="Escherichia coli"><IDENTIFIERS><PRIMARY_ID>ERS3331676</PRIMARY_ID><EXTERNAL_ID namespace="BioSample">SAMEA5529601</EXTERNAL_ID></IDENTIFIERS></Member></Pool><AlignInfo path="/netmnt/traces04/era18/ERR/ERR4229/ERR4229796" cnt="1"><Alignment name="E-coli-K12.fa" seqid="NC_000913.2" gi="49175990"/></AlignInfo><SRAFiles><SRAFile cluster="public" filename="29694_1_382.cram" url="https://sra-pub-src-14.s3.amazonaws.com/ERR4229796/29694_1_382.cram.1" size="100576829" date="2020-06-11 19:23:25" md5="26e62fcae91b058b13be6b94214ae2e6" 
semantic_name="cram" supertype="Original" sratoolkit="0"><Alternatives url="https://storage.googleapis.com/sra-pub-src-14/ERR4229796/29694_1_382.cram.1" free_egress="worldwide" access_type="anonymous" org="GCP"/><Alternatives url="https://sra-pub-src-14.s3.amazonaws.com/ERR4229796/29694_1_382.cram.1" free_egress="worldwide" access_type="anonymous" org="AWS"/></SRAFile><SRAFile cluster="public" filename="ERR4229796" url="https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERR/ERR4229/ERR4229796" size="94990927" date="2020-06-11 19:24:22" md5="b779862a8fe21d16152454f047fbd2c1" semantic_name="run" supertype="Primary ETL" sratoolkit="1"><Alternatives url="https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERR/ERR4229/ERR4229796" free_egress="worldwide" access_type="anonymous" org="NCBI"/><Alternatives url="s3://sra-pub-run-2/ERR4229796/ERR4229796.1" free_egress="s3.us-east-1" access_type="aws identity" org="AWS"/><Alternatives url="gs://sra-pub-run-4/ERR4229796/ERR4229796.1" free_egress="gs.US" access_type="gcp identity" org="GCP"/></SRAFile><SRAFile cluster="public" filename="ERR4229796.realign" url="https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/ERR4229796.realign" size="19534748" date="2020-06-11 19:34:28" md5="4b45e5e3e6ebe3fa112f308a4d092ee4" semantic_name="run.realign" supertype="Secondary ETL" sratoolkit="1"><Alternatives url="https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/ERR4229796.realign" free_egress="worldwide" access_type="anonymous" org="NCBI"/></SRAFile><SRAFile cluster="public" filename="wgmlst_sig.tsv" url="https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/wgmlst_sig.tsv" size="4253484" date="2020-06-11 19:36:45" md5="1f83f3039c8d5b17e64cebbd875a38b4" semantic_name="wgmlst_sig" supertype="Secondary ETL" sratoolkit="0"><Alternatives url="https://sra-download.ncbi.nlm.nih.gov/traces/era18/ERZ/004229/ERR4229796/wgmlst_sig.tsv" free_egress="worldwide" access_type="anonymous" 
org="NCBI"/></SRAFile></SRAFiles><CloudFiles><CloudFile filetype="cram" provider="gs" location="gs.US"/><CloudFile filetype="cram" provider="s3" location="s3.us-east-1"/><CloudFile filetype="run" provider="gs" location="gs.US"/><CloudFile filetype="run" provider="s3" location="s3.us-east-1"/></CloudFiles><Statistics nreads="2" nspots="1150278"><Read index="0" count="1150278" average="151" stdev="0"/><Read index="1" count="1150278" average="151" stdev="0"/></Statistics><Databases><Database><Table name="PRIMARY_ALIGNMENT"><Statistics source="meta"><Rows count="1949912"/><Elements count="294436712"/></Statistics></Table><Table name="REFERENCE"><Statistics source="meta"><Rows count="928"/><Elements count="4639675"/></Statistics></Table><Table name="SECONDARY_ALIGNMENT"><Statistics source="meta"><Rows count="6149"/><Elements count="928499"/></Statistics></Table><Table name="SEQUENCE"><Statistics source="meta"><Rows count="1150278"/><Elements count="347383956"/></Statistics></Table></Database></Databases><Bases cs_native="false" count="347383956"><Base value="A" count="85540613"/><Base value="C" count="88241122"/><Base value="G" count="88289114"/><Base value="T" count="85212529"/><Base value="N" count="100578"/></Bases></RUN></RUN_SET></EXPERIMENT_PACKAGE></EXPERIMENT_PACKAGE_SET>
4 | 


--------------------------------------------------------------------------------
/tests/data/test_search/sra_test_verbosity_0.csv:
--------------------------------------------------------------------------------
 1 | run_accession
 2 | SRR11217925
 3 | SRR11217924
 4 | SRR11217923
 5 | SRR11217922
 6 | SRR11217921
 7 | SRR11217920
 8 | SRR11217919
 9 | SRR11217918
10 | SRR11217917
11 | SRR11217916
12 | SRR11217915
13 | SRR11217914
14 | SRR11186550
15 | SRR11186549
16 | SRR11186548
17 | SRR11186547
18 | SRR11186546
19 | SRR11186545
20 | SRR11186544
21 | SRR11186543
22 | SRR11186542
23 | SRR11186541
24 | SRR11186540
25 | SRR11186539
26 | SRR10398500
27 | SRR10398499
28 | SRR10398498
29 | SRR10398497
30 | SRR10398496
31 | SRR10398495
32 | SRR10398494
33 | SRR10398493
34 | SRR10398492
35 | SRR10398491
36 | SRR10398490
37 | SRR10398489
38 | SRR7241911
39 | SRR5026637
40 | SRR5026603
41 | SRR5026592
42 | SRR5026589
43 | SRR5026359
44 | SRR5026356
45 | SRR3219264
46 | SRR3219253
47 | SRR3219248
48 | SRR1804351
49 | SRR1804349
50 | SRR1804348
51 | SRR1804346
52 | SRR1804345
53 | SRR1804343
54 | SRR1804341
55 | SRR1804340
56 | SRR1635379
57 | SRR1635378
58 | SRR1635377
59 | SRR1635376
60 | SRR914339
61 | SRR914338
62 | SRR914337
63 | SRR914336
64 | SRR914335
65 | SRR914334
66 | SRR914333
67 | SRR914332
68 | SRR914331
69 | SRR914330
70 | SRR914329
71 | SRR914328
72 | SRR522897
73 | SRR522895
74 | SRR522883
75 | SRR522896
76 | SRR522872
77 | SRR522871
78 | SRR522892
79 | SRR522890
80 | SRR522881
81 | SRR522153
82 | SRR522152
83 | SRR522151
84 | SRR522882
85 | SRR522888
86 | SRR522011
87 | SRR522010
88 | SRR522009
89 | SRR1804352
90 | SRR1804350
91 | SRR1804347
92 | SRR1804344
93 | SRR1804342
94 | 


--------------------------------------------------------------------------------
/tests/data/test_search/sra_test_verbosity_1.csv:
--------------------------------------------------------------------------------
 1 | run_accession,experiment_title
 2 | SRR11217925,GSM4369051: rnaH27nsun3; Caenorhabditis elegans; RNA-Seq
 3 | SRR11217924,GSM4369050: rnaH27nsun2; Caenorhabditis elegans; RNA-Seq
 4 | SRR11217923,GSM4369049: rnaH27nsun1; Caenorhabditis elegans; RNA-Seq
 5 | SRR11217922,GSM4369048: rnaH27WT3; Caenorhabditis elegans; RNA-Seq
 6 | SRR11217921,GSM4369047: rnaH27WT2; Caenorhabditis elegans; RNA-Seq
 7 | SRR11217920,GSM4369046: rnaH27WT1; Caenorhabditis elegans; RNA-Seq
 8 | SRR11217919,GSM4369045: rnaL20nsun3; Caenorhabditis elegans; RNA-Seq
 9 | SRR11217918,GSM4369044: rnaL20nsun2; Caenorhabditis elegans; RNA-Seq
10 | SRR11217917,GSM4369043: rnaL20nsun1; Caenorhabditis elegans; RNA-Seq
11 | SRR11217916,GSM4369042: rnaL20WT3; Caenorhabditis elegans; RNA-Seq
12 | SRR11217915,GSM4369041: rnaL20WT2; Caenorhabditis elegans; RNA-Seq
13 | SRR11217914,GSM4369040: rnaL20WT1; Caenorhabditis elegans; RNA-Seq
14 | SRR11186550,GSM4340680: pmk-1(km25)RPF Replicate 2; Caenorhabditis elegans; RNA-Seq
15 | SRR11186549,GSM4340679: pmk-1(km25) total mRNA Replicate 2; Caenorhabditis elegans; RNA-Seq
16 | SRR11186548,GSM4340678: pmk-1(km25)RPF Replicate 1; Caenorhabditis elegans; RNA-Seq
17 | SRR11186547,GSM4340677: pmk-1(km25) total mRNA Replicate 1; Caenorhabditis elegans; RNA-Seq
18 | SRR11186546,GSM4340676: ced-3(n717)RPF Replicate 2; Caenorhabditis elegans; RNA-Seq
19 | SRR11186545,GSM4340675: ced-3(n717) total mRNA Replicate 2; Caenorhabditis elegans; RNA-Seq
20 | SRR11186544,GSM4340674: ced-3(n717)RPF Replicate 1; Caenorhabditis elegans; RNA-Seq
21 | SRR11186543,GSM4340673: ced-3(n717) total mRNA Replicate 1; Caenorhabditis elegans; RNA-Seq
22 | SRR11186542,GSM4340672: WT RPF Replicate 2; Caenorhabditis elegans; RNA-Seq
23 | SRR11186541,GSM4340671: WT total mRNA Replicate 2; Caenorhabditis elegans; RNA-Seq
24 | SRR11186540,GSM4340670: WT RPF Replicate 1; Caenorhabditis elegans; RNA-Seq
25 | SRR11186539,GSM4340669: WT total mRNA Replicate 1; Caenorhabditis elegans; RNA-Seq
26 | SRR10398500,GSM4148088: Slee141_mRNAseqFP_meg34_3: meg-3meg-4 mRNAseq-3; Caenorhabditis elegans; RNA-Seq
27 | SRR10398499,GSM4148087: Slee139_mRNAseqFP_meg34_2: meg-3meg-4 mRNAseq-2; Caenorhabditis elegans; RNA-Seq
28 | SRR10398498,GSM4148086: Slee125_mRNAseqFP_meg34_1: meg-3meg-4 mRNAseq-1; Caenorhabditis elegans; RNA-Seq
29 | SRR10398497,GSM4148085: Slee142_mRNAseqFP_N2_3: wild type mRNAseq-3; Caenorhabditis elegans; RNA-Seq
30 | SRR10398496,GSM4148084: Slee138_mRNAseqFP_N2_2: wild type mRNAseq-2; Caenorhabditis elegans; RNA-Seq
31 | SRR10398495,GSM4148083: Slee113_mRNAseqFP_N2_1: wild type mRNAseq-1; Caenorhabditis elegans; RNA-Seq
32 | SRR10398494,GSM4148082: Slee135_meg34: meg-3meg-4 Riboseq-3; Caenorhabditis elegans; OTHER
33 | SRR10398493,GSM4148081: Slee133_meg34: meg-3meg-4 Riboseq-2; Caenorhabditis elegans; OTHER
34 | SRR10398492,GSM4148080: Slee123_meg34: meg-3meg-4 Riboseq-1; Caenorhabditis elegans; OTHER
35 | SRR10398491,GSM4148079: Slee136_N2: wild type Riboseq-3; Caenorhabditis elegans; OTHER
36 | SRR10398490,GSM4148078: Slee132_N2: wild type Riboseq-2; Caenorhabditis elegans; OTHER
37 | SRR10398489,GSM4148077: Slee111_N2: wild type Riboseq-1; Caenorhabditis elegans; OTHER
38 | SRR7241911,GSM3168388: C. elegans embryos ribosome profiling; Caenorhabditis elegans; RNA-Seq
39 | SRR5026637,Bayesian Prediction of RNA Translation from Ribosome Profiling
40 | SRR5026603,Bayesian Prediction of RNA Translation from Ribosome Profiling
41 | SRR5026592,Bayesian Prediction of RNA Translation from Ribosome Profiling
42 | SRR5026589,Bayesian Prediction of RNA Translation from Ribosome Profiling
43 | SRR5026359,Bayesian Prediction of RNA Translation from Ribosome Profiling
44 | SRR5026356,Bayesian Prediction of RNA Translation from Ribosome Profiling
45 | SRR3219264,unc-54(cc3389) Ribo-seq
46 | SRR3219253,unc-54(+) Ribo-seq
47 | SRR3219248,unc-54(+) Ribo-seq
48 | SRR1804351,GSM1611598: frac_26-34nt; Caenorhabditis elegans; OTHER
49 | SRR1804349,GSM1611597: frac_28-35nt; Caenorhabditis elegans; OTHER
50 | SRR1804348,GSM1611596: frac_28-30nt; Caenorhabditis elegans; OTHER
51 | SRR1804346,GSM1611595: frac_25-30nt; Caenorhabditis elegans; OTHER
52 | SRR1804345,GSM1611594: 04_N2_893_GCkit; Caenorhabditis elegans; OTHER
53 | SRR1804343,GSM1611593: 03_N2_893_SGkit; Caenorhabditis elegans; OTHER
54 | SRR1804341,GSM1611592: 02_N2_893_GCop; Caenorhabditis elegans; OTHER
55 | SRR1804340,GSM1611591: 01_N2_893_SGop; Caenorhabditis elegans; OTHER
56 | SRR1635379,GSM1534607: RNASeq_c14_2; Caenorhabditis elegans; RNA-Seq
57 | SRR1635378,GSM1534606: RNASeq_c14_1; Caenorhabditis elegans; RNA-Seq
58 | SRR1635377,GSM1534605: RNASeq_N2_2; Caenorhabditis elegans; RNA-Seq
59 | SRR1635376,GSM1534604: RNASeq_N2_1; Caenorhabditis elegans; RNA-Seq
60 | SRR914339,GSM1169554: Celeg-3-FED-RPF; Caenorhabditis elegans; OTHER
61 | SRR914338,GSM1169553: Celeg-2-FED-RPF; Caenorhabditis elegans; OTHER
62 | SRR914337,GSM1169552: Celeg-1-FED-RPF; Caenorhabditis elegans; OTHER
63 | SRR914336,GSM1169551: Celeg-3-ST-RPF; Caenorhabditis elegans; OTHER
64 | SRR914335,GSM1169550: Celeg-2-ST-RPF; Caenorhabditis elegans; OTHER
65 | SRR914334,GSM1169549: Celeg-1-ST-RPF; Caenorhabditis elegans; OTHER
66 | SRR914333,GSM1169548: Celeg-3-FED-mRNA; Caenorhabditis elegans; RNA-Seq
67 | SRR914332,GSM1169547: Celeg-2-FED-mRNA; Caenorhabditis elegans; RNA-Seq
68 | SRR914331,GSM1169546: Celeg-1-FED-mRNA; Caenorhabditis elegans; RNA-Seq
69 | SRR914330,GSM1169545: Celeg-3-ST-mRNA; Caenorhabditis elegans; RNA-Seq
70 | SRR914329,GSM1169544: Celeg-2-ST-mRNA; Caenorhabditis elegans; RNA-Seq
71 | SRR914328,GSM1169543: Celeg-1-ST-mRNA; Caenorhabditis elegans; RNA-Seq
72 | SRR522897,L1 ribosome footprints replicate 4
73 | SRR522895,L2 ribosome footprints replicate 2 (seq-replicate)
74 | SRR522883,L4 ribosome footprints replicate 1
75 | SRR522896,L1 ribosome footprints replicate 3
76 | SRR522872,L1 ribosome footprints replicate 2
77 | SRR522871,L1 ribosome footprints replicate 1
78 | SRR522892,L2 ribosome footprints replicate 3
79 | SRR522890,L2 ribosome footprints replicate 2
80 | SRR522881,L2 ribosome footprints replicate 1
81 | SRR522153,L4 total RNA-seq
82 | SRR522152,L1 total RNA-seq
83 | SRR522151,L4 mRNA-seq replicate 2
84 | SRR522882,L4 mRNA-seq replicate 1
85 | SRR522888,L1 mRNA-seq replicate 4
86 | SRR522011,L1 replicate 3 mRNA-seq
87 | SRR522010,L1 replicate 2 mRNA-seq
88 | SRR522009,L1 mRNA-seq replicate 1
89 | SRR1804352,GSM1611598: frac_26-34nt; Caenorhabditis elegans; OTHER
90 | SRR1804350,GSM1611597: frac_28-35nt; Caenorhabditis elegans; OTHER
91 | SRR1804347,GSM1611595: frac_25-30nt; Caenorhabditis elegans; OTHER
92 | SRR1804344,GSM1611593: 03_N2_893_SGkit; Caenorhabditis elegans; OTHER
93 | SRR1804342,GSM1611592: 02_N2_893_GCop; Caenorhabditis elegans; OTHER
94 | 


--------------------------------------------------------------------------------
/tests/data/test_search/sra_uids.txt:
--------------------------------------------------------------------------------
1 | 155791
2 | 155790
3 | 


--------------------------------------------------------------------------------
/tests/test_geoweb.py:
--------------------------------------------------------------------------------
 1 | """Tests for GEOweb"""
 2 | 
 3 | import os
 4 | import time
 5 | 
 6 | import pandas as pd
 7 | import pytest
 8 | 
 9 | from pysradb.geoweb import GEOweb
10 | 
11 | 
12 | @pytest.fixture(scope="module")
13 | def geoweb_connection():
14 |     db = GEOweb()
15 |     time.sleep(2)
16 |     return db
17 | 
18 | 
19 | def test_valid_download_links(geoweb_connection):
20 |     """Test if all links for a project are scraped"""
21 |     links, url = geoweb_connection.get_download_links("GSE161707")
22 |     assert links == ["GSE161707_RAW.tar", "filelist.txt"]
23 | 
24 | 
25 | def test_invalid_download_links(geoweb_connection):
26 |     """Test if invalid GEO ID raises the expected error"""
27 |     with pytest.raises(KeyError):
28 |         links, url = geoweb_connection.get_download_links("GSE1691709")
29 | 
30 | 
31 | def test_file_download(geoweb_connection):
32 |     """Test if file actually gets downloaded"""
33 |     geoweb_connection.download(
34 |         links=["GSE161707_RAW.tar", "filelist.txt"],
35 |         root_url="https://ftp.ncbi.nlm.nih.gov/geo/series/GSE161nnn/GSE161707/suppl/",
36 |         gse="GSE161707",
37 |         out_dir="geoweb_downloads",
38 |     )
39 |     assert os.path.getsize("geoweb_downloads/GSE161707/GSE161707_RAW.tar")
40 |     assert os.path.getsize("geoweb_downloads/GSE161707/GSE161707_filelist.txt")
41 | 


--------------------------------------------------------------------------------
/tests/test_sradb.py:
--------------------------------------------------------------------------------
 1 | """Tests for sradb.py
 2 | """
 3 | 
 4 | import os
 5 | from sqlite3 import OperationalError
 6 | 
 7 | import pytest
 8 | 
 9 | from pysradb import SRAdb
10 | from pysradb.filter_attrs import guess_cell_type
11 | from pysradb.filter_attrs import guess_strain_type
12 | from pysradb.filter_attrs import guess_tissue_type
13 | 
14 | 
15 | def test_not_valid_file():
16 |     """Test to check for error if file is either not
17 |     present or not a valid sqlite file"""
18 |     path = "SRAmetadb.sqlite"
19 |     try:
20 |         db = SRAdb(path)
21 |     except SystemExit:
22 |         assert os.path.isfile(path) == False
23 |     except OperationalError:
24 |         assert True
25 | 


--------------------------------------------------------------------------------
/tests/test_sraweb.py:
--------------------------------------------------------------------------------
  1 | """Tests for SRAweb"""
  2 | 
  3 | import time
  4 | 
  5 | import pandas as pd
  6 | import pytest
  7 | 
  8 | from pysradb.sraweb import SRAweb
  9 | 
 10 | 
 11 | @pytest.fixture(scope="module")
 12 | def sraweb_connection():
 13 |     db = SRAweb()
 14 |     time.sleep(2)
 15 |     return db
 16 | 
 17 | 
 18 | def test_sra_metadata(sraweb_connection):
 19 |     """Test if metadata has right number of entries"""
 20 |     df = sraweb_connection.sra_metadata("SRP016501")
 21 |     assert df.shape[0] == 134
 22 | 
 23 | 
 24 | def test_sra_metadata_missing_orgname(sraweb_connection):
 25 |     """Test if metadata has right number of entries"""
 26 |     df = sraweb_connection.sra_metadata("ERP000171")
 27 |     # See: https://github.com/saketkc/pysradb/issues/46#issuecomment-657268760
 28 |     assert sum(pd.isna(df.organism_name.tolist())) > 0
 29 | 
 30 | 
 31 | def test_sra_metadata_multiple(sraweb_connection):
 32 |     """Test if metadata has right number of entries"""
 33 |     df = sraweb_connection.sra_metadata(["SRP016501", "SRP096025", "SRP103009"])
 34 |     assert list(sorted(df.study_accession.unique())) == [
 35 |         "SRP016501",
 36 |         "SRP096025",
 37 |         "SRP103009",
 38 |     ]
 39 | 
 40 | 
 41 | def test_sra_metadata_multiple_detailed(sraweb_connection):
 42 |     """Test if metadata has right number of entries"""
 43 |     df = sraweb_connection.sra_metadata(["SRP002605", "SRP098789"], detailed=True)
 44 |     columns = ["treatment time", "library type", "transfection", "time"]
 45 |     assert len(set(columns).intersection(set(df.columns))) == 4
 46 |     ftp_cols = [
 47 |         "ena_fastq_http",
 48 |         "ena_fastq_http_1",
 49 |         "ena_fastq_http_2",
 50 |         "ena_fastq_ftp",
 51 |         "ena_fastq_ftp_1",
 52 |         "ena_fastq_ftp_2",
 53 |     ]
 54 |     assert len(set(ftp_cols).intersection(set(df.columns))) == 6
 55 | 
 56 | 
 57 | def test_tissue_column(sraweb_connection):
 58 |     """Test if tissue column exists"""
 59 |     df = sraweb_connection.sra_metadata("SRP096025", detailed="True")
 60 |     assert list(df["tissue"]) == ["Kidney"] * 4
 61 | 
 62 | 
 63 | def test_metadata_exp_accession(sraweb_connection):
 64 |     """Test if experiment_accession column is correct"""
 65 |     df = sraweb_connection.sra_metadata("SRP103009", detailed="True")
 66 |     assert "SRX2705123" in list(df["experiment_accession"])
 67 | 
 68 | 
 69 | def test_fetch_gds_results(sraweb_connection):
 70 |     """Test if fetch_gds_result returns correct values"""
 71 |     df = sraweb_connection.fetch_gds_results("GSE34438")
 72 |     assert df["accession"][1] == "GSM849112"
 73 | 
 74 | 
 75 | def test_srp_to_gse(sraweb_connection):
 76 |     """Test if srp is converted to gse correctly"""
 77 |     df = sraweb_connection.srp_to_gse("SRP009836")
 78 |     assert df["study_alias"][0] == "GSE34438"
 79 | 
 80 | 
 81 | def test_srp_to_srr(sraweb_connection):
 82 |     """Test if srp is converted to srr correctly"""
 83 |     df = sraweb_connection.srp_to_srr("SRP002605", detailed=True)
 84 |     assert df["run_accession"].tolist()[:5] == [
 85 |         "SRR057511",
 86 |         "SRR057512",
 87 |         "SRR057513",
 88 |         "SRR057514",
 89 |         "SRR057515",
 90 |     ]
 91 | 
 92 | 
 93 | def test_srp_to_srs(sraweb_connection):
 94 |     """Test if srp is converted to srs correctly"""
 95 |     df = sraweb_connection.srp_to_srs("SRP014542")
 96 |     assert sorted(list(df["sample_accession"])) == [
 97 |         "SRS351513",
 98 |         "SRS351514",
 99 |         "SRS351515",
100 |         "SRS351516",
101 |         "SRS351517",
102 |         "SRS351518",
103 |     ]
104 | 
105 | 
106 | def test_srp_to_srx(sraweb_connection):
107 |     """Test if srp is converted to srx correctly"""
108 |     df = sraweb_connection.srp_to_srx("SRP044932")
109 |     assert list(df["experiment_accession"]) == ["SRX663253", "SRX663254"]
110 | 
111 | 
112 | def test_gse_to_gsm(sraweb_connection):
113 |     """Test if gse is converted to gsm correctly"""
114 |     df = sraweb_connection.gse_to_gsm("GSE56924", detailed=False)
115 |     assert df.shape[0] == 96
116 | 
117 | 
118 | def test_gse_to_gsm2(sraweb_connection):
119 |     """Test for gse to gsm"""
120 |     df = sraweb_connection.gse_to_gsm("GSE200028", detailed=False)
121 |     assert df.shape[0] == 15
122 | 
123 | 
124 | def test_gse_to_gsm1(sraweb_connection):
125 |     """Test if gse_to_gsm works without passing `detailed` parameter"""
126 |     df = sraweb_connection.gse_to_gsm("GSE63858")
127 |     assert list(sorted(df["experiment_alias"])) == ["GSM1558530", "GSM1558531"]
128 | 
129 | 
130 | def test_gse_to_srp(sraweb_connection):
131 |     """Test if gse is converted to srp correctly"""
132 |     df = sraweb_connection.gse_to_srp("GSE63858")
133 |     assert df["study_accession"].tolist()[0] == "SRP050548"
134 | 
135 | 
136 | def test_gse_to_srp2(sraweb_connection):
137 |     """Test if gse is converted to srp correctly"""
138 |     df = sraweb_connection.gse_to_srp(["GSE168880", "GSE209835"])
139 |     assert df["study_accession"].tolist()[0] == "SRP310566"
140 |     assert df["study_accession"].tolist()[1] == "SRP388275"
141 | 
142 | 
143 | def test_gsm_to_srp(sraweb_connection):
144 |     """Test if gsm is converted to srp correctly"""
145 |     df = sraweb_connection.gsm_to_srp("GSM1371490")
146 |     assert df["study_accession"].tolist()[0] == "SRP041298"
147 | 
148 | 
149 | def test_gsm_to_gse(sraweb_connection):
150 |     """Test if gsm is converted to gse correctly"""
151 |     df = sraweb_connection.gsm_to_gse("GSM1371490")
152 |     assert df["study_alias"].tolist()[0] == "GSE56924"
153 | 
154 | 
155 | def test_gsm_to_srr(sraweb_connection):
156 |     """Test if gsm is converted to srr correctly"""
157 |     df = sraweb_connection.gsm_to_srr("GSM1371489")
158 |     assert df["run_accession"].tolist()[0] == "SRR1257271"
159 | 
160 | 
161 | def test_gsm_to_srs(sraweb_connection):
162 |     """Test if gsm is converted to srs correctly"""
163 |     df = sraweb_connection.gsm_to_srs("GSM1371469")
164 |     assert df["sample_accession"].tolist()[0] == "SRS594838"
165 | 
166 | 
167 | def test_gsm_to_srx(sraweb_connection):
168 |     """Test if gsm is converted to srx correctly"""
169 |     df = sraweb_connection.gsm_to_srx("GSM1371454")
170 |     assert list(df["experiment_accession"]) == ["SRX522468"]
171 | 
172 | 
173 | def test_srr_to_gsm(sraweb_connection):
174 |     df = sraweb_connection.srr_to_gsm("SRR057515")
175 |     assert df["experiment_alias"].tolist()[0] == "GSM546921"
176 | 
177 | 
178 | def test_srr_to_srp(sraweb_connection):
179 |     """Test if srr is converted to srp correctly"""
180 |     df = sraweb_connection.srr_to_srp("SRR057511", detailed=False)
181 |     assert list(df["study_accession"]) == ["SRP002605"]
182 | 
183 | 
184 | def test_srr_to_srp1(sraweb_connection):
185 |     """Test if srr_to_srp works without passing the `detailed` parameter"""
186 |     df = sraweb_connection.srr_to_srp("SRR057515")
187 |     assert list(df["study_accession"]) == ["SRP002605"]
188 | 
189 | 
190 | def test_srr_to_srs(sraweb_connection):
191 |     """Test if srr is converted to srs correctly"""
192 |     df = sraweb_connection.srr_to_srs("SRR057513")
193 |     assert list(df["sample_accession"]) == ["SRS079386"]
194 | 
195 | 
196 | def test_srr_to_srx(sraweb_connection):
197 |     """Test if srr is converted to srx correctly"""
198 |     df = sraweb_connection.srr_to_srx("SRR057514")
199 |     assert list(df["experiment_accession"]) == ["SRX021967"]
200 | 
201 | 
202 | def test_srs_to_gsm(sraweb_connection):
203 |     """Test if srs is converted to gsm correctly"""
204 |     df = sraweb_connection.srs_to_gsm("SRS079386")
205 |     assert df["experiment_alias"][0] == "GSM546921"
206 | 
207 | 
208 | def test_srs_to_srx(sraweb_connection):
209 |     """Test if srs is converted to srx correctly"""
210 |     df = sraweb_connection.srs_to_srx("SRS594838")
211 |     assert list(df["experiment_accession"]) == ["SRX522483"]
212 | 
213 | 
214 | def test_srx_to_gsm(sraweb_connection):
215 |     """Test if srx is converted to gsm correctly"""
216 |     df = sraweb_connection.srx_to_gsm("SRX663253")
217 |     assert list(df["experiment_alias"]) == ["GSM1446832"]
218 | 
219 | 
220 | def test_srx_to_srp(sraweb_connection):
221 |     """Test if srx is converted to srp correctly"""
222 |     df = sraweb_connection.srx_to_srp("SRX663254")
223 |     assert list(df["study_accession"]) == ["SRP044932"]
224 | 
225 | 
226 | def test_srx_to_srr(sraweb_connection):
227 |     """Test if srx is converted to srr correctly"""
228 |     df = sraweb_connection.srx_to_srr("SRX2705123")
229 |     assert list(df["run_accession"]) == ["SRR5413172"]
230 | 
231 | 
232 | def test_srx_to_srr1(sraweb_connection):
233 |     """Test if srx is converted to srr correctly, including multiple srrs"""
234 |     df = sraweb_connection.srx_to_srr("SRX8998846")
235 |     assert list(df["run_accession"]) == ["SRR12508064", "SRR12508065"]
236 | 
237 | 
238 | def test_srx_to_srs(sraweb_connection):
239 |     """Test if srx is converted to srs correctly"""
240 |     df = sraweb_connection.srx_to_srs("SRX663253")
241 |     assert list(df["sample_accession"]) == ["SRS668126"]
242 | 
243 | 
244 | def test_xmlns_id(sraweb_connection):
245 |     df = sraweb_connection.sra_metadata(["GSM1013144", "GSM2520660"])
246 |     assert list(df["library_layout"]) == ["PAIRED", "SINGLE"]
247 | 
248 | 
249 | def test_GCP_url(sraweb_connection):
250 |     df = sraweb_connection.sra_metadata(["SRP002605"], detailed=True)
251 |     assert df["gcp_url"].tolist()[-1].startswith("gs:")
252 | 
253 | 
254 | def test_GCP_url2(sraweb_connection):
255 |     df = sraweb_connection.sra_metadata(["DRR138929"], detailed=True)
256 |     assert df["gcp_url"].tolist()[-1].startswith("gs:")
257 | 
258 | 
259 | def test_gse_to_srp3(sraweb_connection):
260 |     # https://github.com/saketkc/pysradb/issues/190
261 |     df = sraweb_connection.gse_to_srp(["GSE89545"])
262 |     assert df["study_accession"].tolist()[0] == "SRP093251"
263 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | """Tests for utils.py
 2 | """
 3 | 
 4 | import pytest
 5 | 
 6 | from pysradb.utils import *
 7 | 
 8 | 
 9 | @pytest.fixture(scope="module")
10 | def invalid_name():
11 |     return "Red blood cells"
12 | 
13 | 
14 | @pytest.fixture(scope="module")
15 | def valid_name():
16 |     return "Homo sapiens"
17 | 
18 | 
19 | def invalid_scientific_name_to_taxid(invalid_name):
20 |     with pytest.raises(IncorrectFieldException) as e:
21 |         scientific_name_to_taxid(invalid_name)
22 |     assert "Unknown scientific name" in str(e.value)
23 | 
24 | 
25 | def valid_scientific_name_to_taxid(valid_name):
26 |     assert scientific_name_to_taxid(valid_name) == "9606"
27 | 


--------------------------------------------------------------------------------