├── .coveragerc ├── .gitattributes ├── .github ├── CONTRIBUTING.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ └── python.yml ├── .gitignore ├── .readthedocs.yaml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.rst ├── Doxyfile.in ├── MANIFEST.in ├── Makefile ├── README.md ├── benchmarks ├── faGen.py ├── fqGen.py ├── fqToFaConvert.py ├── mysql │ ├── create.py │ ├── mdbConstants.py │ ├── mydb.py │ ├── mysqlCreateTimeit.py │ ├── mysqlTimeit.py │ └── mysql_login.txt ├── pgres │ ├── create.py │ ├── drop.py │ ├── pdbConstants.py │ ├── pgdb.py │ ├── pgresCreateTimeit.py │ ├── pgresTimeit.py │ └── pgres_login.txt ├── screedCreateTimeit.py ├── screedTimeit.py └── screedTimeit1M.py ├── bigtests └── __init__.py ├── doc ├── CODE_OF_CONDUCT.rst ├── LICENSE.rst ├── Makefile ├── _static │ ├── labibi.css │ └── labibi.js ├── conf.py ├── dev │ ├── coding-guidelines-and-review.rst │ ├── index.rst │ ├── parsers.rst │ └── release-checklist.rst ├── example.rst ├── index.rst ├── release-notes │ ├── RELEASE-0.5.rst │ ├── RELEASE-0.8.md │ ├── RELEASE-0.8.rst │ ├── RELEASE-0.9.md │ ├── RELEASE-0.9.rst │ └── index.rst ├── run-doctests.py ├── screed.rst └── user │ └── known-issues.rst ├── legacy ├── ChangeLog └── jenkins-build.sh ├── pyproject.toml ├── pytest.ini ├── screed ├── DBConstants.py ├── __init__.py ├── __main__.py ├── conversion.py ├── createscreed.py ├── dna.py ├── dump_fasta.py ├── dump_fastq.py ├── fasta.py ├── fastq.py ├── hava.py ├── openscreed.py ├── pygr_api.py ├── screedRecord.py ├── seqparse.py ├── tests │ ├── __init__.py │ ├── havaGen.py │ ├── screed_tst_utils.py │ ├── test-data │ │ ├── empty.fa │ │ ├── test-whitespace.fa │ │ ├── test.fa │ │ ├── test.fa.bz2 │ │ ├── test.fa.gz │ │ ├── test.fa.zip │ │ ├── test.fastq │ │ ├── test.fastq.bz2 │ │ ├── test.fastq.gz │ │ └── test.hava │ ├── test_attriberror.py │ ├── test_convert.py │ ├── test_db.py │ ├── test_dictionary.py │ ├── test_dna.py │ ├── test_fasta.py │ ├── test_fasta_recover.py │ ├── test_fastq.py │ ├── 
test_fastq_recover.py │ ├── test_hava_methods.py │ ├── test_open.py │ ├── test_open_cm.py │ ├── test_pygr_api.py │ ├── test_record.py │ ├── test_shell.py │ └── test_streaming.py └── utils.py ├── setup.cfg ├── setup.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = screed/tests/* 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | screed/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | See [our development docs](https://screed.readthedocs.io/en/latest/dev/). 2 | 3 | Be sure to copy and paste the [checklist](https://screed.readthedocs.io/en/latest/dev/coding-guidelines-and-review.html#checklist) in the Pull-Request comment 4 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dib-lab/screed/f290cadb01112510a7aa8f3942238ac617b6d4da/.github/PULL_REQUEST_TEMPLATE.md -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | open-pull-requests-limit: 10 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: daily 12 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Python tests 2 | 3 | 
on: 4 | push: 5 | branches: [latest] 6 | pull_request: 7 | branches: [latest] 8 | schedule: 9 | - cron: "0 0 7 * *" # monthly 10 | 11 | jobs: 12 | test: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest, macos-latest] 17 | py: ["3.10", 3.9, 3.8] 18 | fail-fast: false 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Set up Python ${{ matrix.py }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.py }} 29 | 30 | - name: Get pip cache dir 31 | id: pip-cache 32 | run: | 33 | echo "::set-output name=dir::$(pip cache dir)" 34 | 35 | - name: pip cache 36 | uses: actions/cache@v4 37 | with: 38 | path: ${{ steps.pip-cache.outputs.dir }} 39 | key: ${{ runner.os }}-pip-v2-${{ hashFiles('**/setup.py') }} 40 | restore-keys: | 41 | ${{ runner.os }}-pip-v2 42 | 43 | - name: Install dependencies 44 | run: | 45 | python -m pip install --upgrade pip 46 | pip install tox tox-gh-actions 47 | 48 | - name: tox cache 49 | uses: actions/cache@v4 50 | with: 51 | path: .tox/ 52 | key: ${{ runner.os }}-tox-v2-${{ hashFiles('**/setup.py') }} 53 | restore-keys: | 54 | ${{ runner.os }}-tox-v2 55 | 56 | - name: Test with tox 57 | run: tox 58 | env: 59 | PYTHONDEVMODE: 1 60 | 61 | - name: Upload Python coverage to codecov 62 | uses: codecov/codecov-action@v3 63 | with: 64 | flags: python 65 | fail_ci_if_error: true 66 | files: coverage.xml 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | *_screed 4 | *.fa 5 | *.fastq 6 | build 7 | screed.egg-info 8 | dist 9 | screed/tests/fa_to_fq 10 | screed/tests/fq_to_fa 11 | Doxyfile 12 | .coverage 13 | coverage.xml 14 | diff-cover.html 15 | doc/doxygen/ 16 | env/ 17 | htmlcov/ 18 | nosetests.xml 19 | pylint_report.txt 20 | .*.swp 21 | MANIFEST 22 | doc/_build/* 23 | screed/version.py 24 | .eggs 25 | 
.tox 26 | .cache 27 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.10" 8 | 9 | # Build documentation in the doc/ directory with Sphinx 10 | sphinx: 11 | configuration: doc/conf.py 12 | 13 | python: 14 | install: 15 | - method: pip 16 | path: . 17 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to the screed project will be documented in this file. 3 | See [keepachangelog](http://keepachangelog.com/) for more info. 4 | 5 | The screed Python and command-line APIs adhere to 6 | [Semantic Versioning](http://semver.org/). 7 | 8 | ## [Unreleased] 9 | ### Added 10 | - screed CLI, with database creation and conversion commands. 11 | - screed.make_db, a simplified way of creating DB using the Python API. 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.rst: -------------------------------------------------------------------------------- 1 | Contributor Code of Conduct 2 | =========================== 3 | 4 | As contributors and maintainers of this project, we pledge to respect 5 | all people who contribute through reporting issues, posting feature 6 | requests, updating documentation, submitting pull requests or patches, 7 | and other activities. 8 | 9 | We are committed to making participation in this project a 10 | harassment-free experience for everyone, regardless of level of 11 | experience, gender, gender identity and expression, sexual orientation, 12 | disability, personal appearance, body size, race, age, or religion. 
13 | 14 | Examples of unacceptable behavior by participants include the use of 15 | sexual language or imagery, derogatory comments or personal attacks, 16 | trolling, public or private harassment, insults, or other unprofessional 17 | conduct. 18 | 19 | Project maintainers have the right and responsibility to remove, edit, 20 | or reject comments, commits, code, wiki edits, issues, and other 21 | contributions that are not aligned to this Code of Conduct. Project 22 | maintainers or contributors who do not follow the Code of Conduct may be 23 | removed from the project team. 24 | 25 | Instances of abusive, harassing, or otherwise unacceptable behavior may 26 | be reported by emailing `khmer-project@idyll.org 27 | <mailto:khmer-project@idyll.org>`__ which only goes to C. Titus Brown and 28 | Michael R. Crusoe. To report an issue involving either of them please email 29 | `Judi Brown Clarke, Ph.D. `__ the Diversity Director 30 | at the BEACON Center for the Study of Evolution in Action, an NSF Center for 31 | Science and Technology. 
32 | 33 | This Code of Conduct is adapted from the `Contributor 34 | Covenant <http://contributor-covenant.org>`__, version 1.0.0, available at 35 | http://contributor-covenant.org/version/1/0/0/ 36 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ChangeLog Makefile README.md 2 | include MANIFEST.in Doxyfile.in 3 | include TODO doc/LICENSE.rst 4 | include screed/version.py 5 | graft screed/tests/test-data 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # make pep8 to check for basic Python code compliance 2 | # make pylint to check Python code for enhanced compliance including naming 3 | # and documentation 4 | # make coverage-report to check coverage of the python scripts by the tests 5 | 6 | PYSOURCES=$(wildcard screed/*.py) 7 | TESTSOURCES=$(wildcard screed/tests/*.py) 8 | SOURCES=$(PYSOURCES) setup.py 9 | 10 | VERSION=$(shell git describe --tags --dirty | sed s/v//) 11 | all: 12 | ./setup.py build 13 | 14 | install: FORCE 15 | ./setup.py build install 16 | 17 | install-dependencies: FORCE 18 | pip install -e .[all] 19 | 20 | develop: FORCE 21 | ./setup.py develop 22 | 23 | dist: dist/screed-$(VERSION).tar.gz 24 | 25 | dist/screed-$(VERSION).tar.gz: $(SOURCES) 26 | ./setup.py sdist 27 | 28 | clean: FORCE 29 | ./setup.py clean --all || true 30 | rm -rf build/ 31 | rm -rf coverage-debug .coverage coverage.xml 32 | rm -rf doc/_build 33 | rm -rf .eggs/ *.egg-info/ .cache/ __pycache__/ *.pyc */*.pyc */*/*.pyc 34 | 35 | pep8: $(PYSOURCES) $(TESTSOURCES) 36 | pycodestyle --exclude=_version.py setup.py screed/ 37 | 38 | pylint: FORCE 39 | pylint --msg-template="{path}:{line}: [{msg_id}({symbol}), {obj}] {msg}" \ 40 | setup.py screed || true 41 | 42 | doc: FORCE 43 | cd doc && make html 44 | 45 | test: FORCE 46 | pytest 47 | 48 | 
FORCE: 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # screed -- short read sequence utils in Python. 2 | 3 | [![Documentation](https://readthedocs.org/projects/screed/badge/?version=latest)](http://screed.readthedocs.io/en/latest/) 4 | PyPI 5 | License: 3-Clause BSD 6 | ![Python tests](https://github.com/dib-lab/screed/workflows/Python%20tests/badge.svg) 7 | [![Debian Stable Badge](https://badges.debian.net/badges/debian/stable/python3-screed/version.svg)](https://packages.debian.org/stable/python3-screed) 8 | [![Debian Testing Badge](https://badges.debian.net/badges/debian/testing/python3-screed/version.svg)](https://packages.debian.org/testing/python3-screed) 9 | 10 | The official repository for screed is: 11 | 12 | https://github.com/dib-lab/screed 13 | 14 | See http://readthedocs.org/docs/screed/en/latest/ for docs. 15 | 16 | Issues are tracked at https://github.com/dib-lab/khmer/issues. 17 | -------------------------------------------------------------------------------- /benchmarks/faGen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2016, The Regents of the University of California. 

import sys, os
import random

# (lower, upper) bounds handed to genSeq for every generated record; the
# upper bound is exclusive because genSeq draws the length with randrange.
seqLength = (8000, 12000)

# Number of bases written per line of a generated sequence.
_LINE_WIDTH = 80


class collectionOFiles(object):
    """
    Writes the same stream of FASTA records to several files at once.

    File number i is named "<baseName>_<i>" and receives at most
    totalSize // 2**i records, so each file holds roughly half as many
    records as the previous one.
    """
    def __init__(self, baseName, divisions, totalSize):
        self.baseName = baseName
        self.divisions = divisions
        self.totalSize = totalSize

        # filename -> (open file handle, record limit, records written so far)
        self.fileHandles = {}
        for i in range(0, divisions):
            filename = self.baseName + "_%d" % i
            # Text mode: writeRecord() writes str, which a binary-mode handle
            # rejects on Python 3.
            fh = open(filename, "w")
            divisor = 2 ** i

            # Floor division keeps the limit an integer on Python 3, matching
            # what "/" produced for ints on Python 2.
            self.fileHandles[filename] = (fh, self.totalSize // divisor, 0)

    def writeRecord(self, name, description, sequence):
        """
        Appends one record to every file that is still open and closes (and
        forgets) each file that has reached its record limit.
        """
        toRemove = []
        # Iterate over a snapshot so entries can be rebound inside the loop.
        for filename, (handle, limit, count) in list(self.fileHandles.items()):
            handle.write("%s %s\n%s\n" % (name, description, sequence))
            count += 1
            if count >= limit:
                handle.close()
                toRemove.append(filename)
            else:
                self.fileHandles[filename] = (handle, limit, count)

        for filename in toRemove:
            self.fileHandles.pop(filename)

    def finished(self):
        """
        Returns True once every file has reached its limit and been closed.
        """
        return len(self.fileHandles) == 0


def genSeq(min, max):
    """
    Generates a random DNA sequence with min <= length < max bases,
    wrapped with a newline after every 80 bases. The result never starts a
    line with a lone base and never ends with a newline.
    """
    choices = ['A', 'T', 'C', 'G']
    length = random.randrange(min, max)
    bases = [random.choice(choices) for _ in range(length)]
    # Wrap by slicing: testing "i % 80 == 0" inside the generation loop would
    # break the line right after the very first base.
    return "\n".join("".join(bases[start:start + _LINE_WIDTH])
                     for start in range(0, length, _LINE_WIDTH))


def createFastaFiles(filename, size, divisions):
    """
    Writes `divisions` FASTA files named "<filename>_<i>"; file i receives
    size // 2**i records. Records are generated until every file is full.
    """
    cof = collectionOFiles(filename, divisions, size)
    counter = 0
    description = "cdna:Genscan chromosome:PPYG2:6_qbl_hap2_random:95622:98297:1"
    while not cof.finished():
        name = ">GENSCAN00%d" % counter
        sequence = genSeq(seqLength[0], seqLength[1])
        cof.writeRecord(name, description, sequence)
        counter += 1


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: %s <filename> <size> <divisions>" % sys.argv[0])
        sys.exit(1)

    filename = sys.argv[1]
    size = int(sys.argv[2])
    divisions = int(sys.argv[3])

createFastaFiles(filename, size, divisions) 75 | -------------------------------------------------------------------------------- /benchmarks/fqGen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2016, The Regents of the University of California. 3 | 4 | import sys, os 5 | import random 6 | 7 | seqLength = 37 8 | 9 | class collectionOFiles(object): 10 | def __init__(self, baseName, divisions, totalSize): 11 | self.baseName = baseName 12 | self.divisions = divisions 13 | self.totalSize = totalSize 14 | 15 | self.fileHandles = {} 16 | for i in range(0, divisions): 17 | filename = self.baseName + "_%d" % i 18 | fh = open(filename, "wb") 19 | divisor = 2 ** i 20 | 21 | self.fileHandles[filename]= (fh, self.totalSize/divisor, 0) 22 | 23 | def writeRecord(self, name, sequence, quality): 24 | toRemove = [] 25 | for filename in self.fileHandles: 26 | file, limit, count = self.fileHandles[filename] 27 | file.write("%s\n%s\n+\n%s\n" % (name, sequence, quality)) 28 | count += 1 29 | if count >= limit: 30 | file.close() 31 | toRemove.append(filename) 32 | else: 33 | self.fileHandles[filename] = (file, limit, count) 34 | 35 | for fh in toRemove: 36 | self.fileHandles.pop(fh) 37 | 38 | def finished(self): 39 | return len(self.fileHandles) == 0 40 | 41 | 42 | def genSeq(length): 43 | """ 44 | Generates a sequence with length characters 45 | """ 46 | choices = ['A','T','C','G'] 47 | result = [] 48 | for i in range(0, length): 49 | result.append(random.choice(choices)) 50 | return "".join(result) 51 | 52 | def genAcc(length): 53 | """ 54 | Generates a quality with length characters 55 | """ 56 | choices = ['A','1','7','3','.',';','*','<'] 57 | result = [] 58 | for i in range(0, length): 59 | result.append(random.choice(choices)) 60 | return "".join(result) 61 | 62 | def createFastqFiles(filename, size, divisions): 63 | cof = collectionOFiles(filename, divisions, size) 64 | counter = 0 65 | while(not 
cof.finished()): 66 | name = "@HWI-EAS_4_PE-F%d" % counter 67 | sequence = genSeq(seqLength) 68 | quality = genAcc(seqLength) 69 | cof.writeRecord(name, sequence, quality) 70 | counter += 1 71 | return 72 | 73 | if __name__ == '__main__': 74 | if len(sys.argv) != 4: 75 | print "Usage: " 76 | exit(1) 77 | 78 | filename = sys.argv[1] 79 | size = int(sys.argv[2]) 80 | divisions = int(sys.argv[3]) 81 | 82 | createFastqFiles(filename, size, divisions) 83 | -------------------------------------------------------------------------------- /benchmarks/fqToFaConvert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2016, The Regents of the University of California. 3 | import sys 4 | import os 5 | 6 | class fastaModel(object): 7 | """ 8 | Contains methods for writing data to a file in the fasta format 9 | """ 10 | def __init__(self, fileHandle): 11 | self.fileHandle = fileHandle 12 | self.currSeq = "" 13 | 14 | def writeName(self, name): 15 | """ 16 | Writes the given name to the fileHandle in the fasta format 17 | """ 18 | self.fileHandle.write(">%s " % name.strip()) 19 | 20 | def writeDescription(self, description): 21 | """ 22 | Writes the given description and the stored sequence to the file 23 | """ 24 | self.fileHandle.write("%s\n%s\n" % (description.strip(), self.currSeq)) 25 | 26 | def writeSequence(self, sequence): 27 | """ 28 | Stores the given sequence until a call to writeDescription is made 29 | so that the description and sequence will be stored in the correct 30 | fasta order 31 | """ 32 | self.currSeq = sequence.strip() 33 | 34 | def convertFastqToFasta(inputFilename, outputFilename): 35 | """ 36 | Converts the given fastq file (inputFilename) to an equilivalent fasta file 37 | (outputFilename). The fastq's quality information is converted to a fasta's 38 | 'description' field. 
Sequence and name fields are left alone 39 | """ 40 | 41 | inputFile = open(inputFilename, "rb") 42 | outputFile = open(outputFilename, "wb") 43 | 44 | model = fastaModel(outputFile) 45 | 46 | for line in inputFile: 47 | if line.startswith("@"): # Line is a name 48 | model.writeName(line[1:]) 49 | elif line.startswith('+'): # Next line is the quality 50 | quality = inputFile.next() 51 | model.writeDescription(quality) 52 | else: # Line is the sequence 53 | model.writeSequence(line) 54 | 55 | outputFile.close() 56 | 57 | if __name__ == '__main__': 58 | if len(sys.argv) != 3: 59 | print "Usage: " 60 | exit(1) 61 | 62 | inputFilename = sys.argv[1] 63 | outputFilename = sys.argv[2] 64 | 65 | if not os.path.isfile(inputFilename): 66 | print "Error: %s doesn't exist" % inputFilename 67 | exit(2) 68 | 69 | convertFastqToFasta(inputFilename, outputFilename) 70 | -------------------------------------------------------------------------------- /benchmarks/mysql/create.py: -------------------------------------------------------------------------------- 1 | import os 2 | import MySQLdb 3 | import mdbConstants 4 | 5 | def create_db(fields, rcrditer): 6 | """ 7 | Populates the mysql database with records from the record iter 8 | """ 9 | conn = MySQLdb.connect(db=mdbConstants._DBNAME, user=mdbConstants._USER) 10 | 11 | cur = conn.cursor() 12 | 13 | # Create the admin table 14 | cur.execute('CREATE TABLE %s (ID int NOT NULL auto_increment, '\ 15 | 'FIELDNAME TEXT, PRIMARY KEY(ID))' % mdbConstants._SCREEDADMIN) 16 | 17 | for attribute in fields: 18 | cur.execute("INSERT INTO %s (FIELDNAME) VALUES ('%s')" % \ 19 | (mdbConstants._SCREEDADMIN, attribute)) 20 | 21 | # Setup the dictionary table creation field substring 22 | otherFields = fields[1:] 23 | createsub = ['%s TEXT' % field for field in otherFields] 24 | createsub.insert(0, '%s VARCHAR(100)' % fields[0]) 25 | createsub = ','.join(createsub) 26 | 27 | # Create the dictionary table 28 | cur.execute('CREATE TABLE %s (%s int NOT 
NULL auto_increment, %s, PRIMARY KEY(%s))' % 29 | (mdbConstants._DICT_TABLE, mdbConstants._PRIMARY_KEY, 30 | createsub, 31 | mdbConstants._PRIMARY_KEY)) 32 | 33 | # Attribute to index 34 | queryby = fields[0] 35 | 36 | # Make the index on the 'queryby' attribute 37 | cur.execute('CREATE UNIQUE INDEX %sidx ON %s(%s)' % 38 | (queryby, mdbConstants._DICT_TABLE, queryby)) 39 | 40 | # Setup the 'perc' pgres substring 41 | perc = ', '.join(['%s' for i in range(len(fields))]) 42 | 43 | # Setup the sql substring for inserting data into db 44 | fieldsub = ','.join(fields) 45 | 46 | # Pull data from rcrditer and store in database 47 | for record in rcrditer: 48 | data = tuple([record[key] for key in fields]) 49 | cur.execute('INSERT INTO %s (%s) VALUES (%s)' %\ 50 | (mdbConstants._DICT_TABLE, fieldsub, perc), 51 | data) 52 | 53 | conn.commit() 54 | cur.close() 55 | conn.close() 56 | 57 | def droptables(): 58 | """ 59 | Drops tables in db 60 | """ 61 | conn = MySQLdb.connect(db=mdbConstants._DBNAME, user=mdbConstants._USER) 62 | 63 | cur = conn.cursor() 64 | 65 | try: 66 | cur.execute('DROP TABLE %s;' % mdbConstants._DICT_TABLE) 67 | except: 68 | pass 69 | try: 70 | cur.execute('DROP TABLE %s;' % mdbConstants._SCREEDADMIN) 71 | except: 72 | pass 73 | 74 | conn.commit() 75 | cur.close() 76 | conn.close() 77 | -------------------------------------------------------------------------------- /benchmarks/mysql/mdbConstants.py: -------------------------------------------------------------------------------- 1 | _SCREEDADMIN = 'SCREEDADMIN' 2 | _DICT_TABLE = 'DICTIONARY_TABLE' 3 | _PRIMARY_KEY = 'id' 4 | _DBNAME = 'sdb' 5 | _USER = 'alex' 6 | -------------------------------------------------------------------------------- /benchmarks/mysql/mydb.py: -------------------------------------------------------------------------------- 1 | import mdbConstants 2 | import MySQLdb 3 | import UserDict 4 | import types 5 | 6 | class _mdb_record_dict(UserDict.DictMixin): 7 | """ 8 | Simple 
dict-like record interface with bag behavior. 9 | """ 10 | def __init__(self, *args, **kwargs): 11 | self.d = dict(*args, **kwargs) 12 | 13 | def __getitem__(self, name): 14 | return self.d[name] 15 | 16 | def __setitem__(self, name, value): 17 | self.d[name] = value 18 | 19 | def __getattr__(self, name): 20 | try: 21 | return self.d[name] 22 | except KeyError: 23 | raise AttributeError, name 24 | 25 | def keys(self): 26 | return self.d.keys() 27 | 28 | class mydb(object): 29 | def __init__(self): 30 | self._conn = MySQLdb.connect(db=mdbConstants._DBNAME, user=mdbConstants._USER) 31 | 32 | cur = self._conn.cursor() 33 | cur.execute('SELECT id, fieldname FROM %s' % mdbConstants._SCREEDADMIN) 34 | self._adm = dict(cur.fetchall()) 35 | keys = self._adm.keys() 36 | keys.sort() 37 | 38 | self._fields = self._adm.values() 39 | self._fields.insert(0, mdbConstants._PRIMARY_KEY.lower()) 40 | self._fieldStr = ",".join(self._fields) 41 | 42 | self._queryBy = self._adm[keys[0]] 43 | 44 | def close(self): 45 | """ 46 | Closes the database handles 47 | """ 48 | self._conn.close() 49 | 50 | def loadRecordByIndex(self, idx): 51 | """ 52 | Loads a record from the database by index 53 | """ 54 | 55 | def loadRecordByName(self, key): 56 | """ 57 | As above, by name 58 | """ 59 | cursor = self._conn.cursor() 60 | query = "SELECT %s FROM %s WHERE %s='%s'" % (self._queryBy, 61 | mdbConstants._DICT_TABLE, 62 | self._queryBy, 63 | key) 64 | cursor.execute(query) 65 | if type(cursor.fetchone()) == types.NoneType: 66 | raise KeyError("Key %s not found" % key) 67 | 68 | query = "SELECT %s FROM %s WHERE %s='%s'" % (self._fieldStr, 69 | mdbConstants._DICT_TABLE, 70 | self._queryBy, 71 | key) 72 | cursor.execute(query) 73 | return _mdb_record_dict(zip(self._fields, cursor.fetchone())) 74 | 75 | def keys(self): 76 | """ 77 | Returns a list of keys in database 78 | """ 79 | cursor = self._conn.cursor() 80 | query = "SELECT %s FROM %s" % (self._queryBy, 81 | mdbConstants._DICT_TABLE) 82 | 
cursor.execute(query) 83 | return [elem for elem, in cursor] 84 | -------------------------------------------------------------------------------- /benchmarks/mysql/mysqlCreateTimeit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import timeit 5 | 6 | if __name__ == '__main__': 7 | if len(sys.argv) != 3: 8 | print "Usage: %s " % sys.argv[0] 9 | exit(1) 10 | 11 | filename = sys.argv[1] 12 | fafq = sys.argv[2] 13 | 14 | fqrunStatement = """ 15 | create.create_db(FASTQFIELDTYPES, iterfunc) 16 | theFile.close() 17 | """ 18 | 19 | fqsetupStatement = """ 20 | import os, sys 21 | import create 22 | thisdir = sys.path[0] 23 | libdir = os.path.abspath(os.path.join(thisdir, '..', '..', 'screed')) 24 | sys.path.insert(0, libdir) 25 | from fastq import fqiter 26 | create.droptables() 27 | FASTQFIELDTYPES = ('name', 'annotations', 'sequence', 'quality') 28 | theFile = open('%s', 'rb') 29 | iterfunc = fqiter(theFile) 30 | """ % filename 31 | 32 | farunStatement = """ 33 | create.create_db(FASTAFIELDTYPES, iterfunc) 34 | theFile.close() 35 | """ 36 | 37 | fasetupStatement = """ 38 | import os, sys 39 | import create 40 | thisdir = sys.path[0] 41 | libdir = os.path.abspath(os.path.join(thisdir, '..', '..', 'screed')) 42 | sys.path.insert(0, libdir) 43 | from fasta import faiter 44 | create.droptables() 45 | FASTAFIELDTYPES = ('name', 'description', 'sequence') 46 | theFile = open('%s', 'rb') 47 | iterfunc = faiter(theFile) 48 | """ % filename 49 | 50 | t = None 51 | if fafq == 'fasta': 52 | t = timeit.Timer(farunStatement, fasetupStatement) 53 | elif fafq == 'fastq': 54 | t = timeit.Timer(fqrunStatement, fqsetupStatement) 55 | else: 56 | raise ValueError("Invalid db type specified: %s" % fafq) 57 | 58 | print "[MYSQL CREATE]%s:" % filename 59 | print t.repeat(2, 1) 60 | -------------------------------------------------------------------------------- /benchmarks/mysql/mysqlTimeit.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import timeit 4 | import sys 5 | 6 | if __name__ == '__main__': 7 | runStatement = """ 8 | for i in xrange(0, 100000): 9 | entry = db.loadRecordByName(random.choice(keys)) 10 | """ 11 | 12 | setupStatement = """ 13 | import os, sys 14 | import random 15 | import mydb 16 | db = mydb.mydb() 17 | keys = db.keys() 18 | """ 19 | 20 | t = timeit.Timer(runStatement, setupStatement) 21 | 22 | print "[MYSQL TIMEIT]" 23 | print t.repeat(2, 1) 24 | -------------------------------------------------------------------------------- /benchmarks/mysql/mysql_login.txt: -------------------------------------------------------------------------------- 1 | dbname: sdb 2 | user: alex 3 | -------------------------------------------------------------------------------- /benchmarks/pgres/create.py: -------------------------------------------------------------------------------- 1 | import os 2 | import psycopg2 3 | import pdbConstants 4 | 5 | def create_db(fields, rcrditer): 6 | """ 7 | Populates the pgres database with records from the record iter 8 | """ 9 | 10 | conn = psycopg2.connect('dbname=%s user=%s' % (pdbConstants._DBNAME, 11 | pdbConstants._USER)) 12 | cur = conn.cursor() 13 | 14 | # Create the admin table 15 | cur.execute('CREATE TABLE %s (ID serial PRIMARY KEY, '\ 16 | 'FIELDNAME TEXT)' % pdbConstants._SCREEDADMIN) 17 | 18 | for attribute in fields: 19 | cur.execute("INSERT INTO %s (FIELDNAME) VALUES ('%s')" % \ 20 | (pdbConstants._SCREEDADMIN, attribute)) 21 | 22 | # Setup the dictionary table creation field substring 23 | createsub = ','.join(['%s TEXT' % field for field in fields]) 24 | 25 | # Create the dictionary table 26 | cur.execute('CREATE TABLE %s (%s serial PRIMARY KEY, %s)' % 27 | (pdbConstants._DICT_TABLE, pdbConstants._PRIMARY_KEY, 28 | createsub)) 29 | 30 | # Attribute to index 31 | queryby = fields[0] 32 | 33 | # Make the index on the 'queryby' 
attribute 34 | cur.execute('CREATE UNIQUE INDEX %sidx ON %s(%s)' % 35 | (queryby, pdbConstants._DICT_TABLE, queryby)) 36 | 37 | # Setup the 'perc' pgres substring 38 | perc = ', '.join(['%s' for i in range(len(fields))]) 39 | 40 | # Setup the sql substring for inserting data into db 41 | fieldsub = ','.join(fields) 42 | 43 | # Pull data from rcrditer and store in database 44 | for record in rcrditer: 45 | data = tuple([record[key] for key in fields]) 46 | cur.execute('INSERT INTO %s (%s) VALUES (%s)' %\ 47 | (pdbConstants._DICT_TABLE, fieldsub, perc), 48 | data) 49 | 50 | conn.commit() 51 | cur.close() 52 | conn.close() 53 | 54 | def droptables(): 55 | """ 56 | Drops tables in db 57 | """ 58 | conn = psycopg2.connect('dbname=%s user=%s' % (pdbConstants._DBNAME, 59 | pdbConstants._USER)) 60 | cur = conn.cursor() 61 | 62 | try: 63 | cur.execute('DROP TABLE %s;' % pdbConstants._DICT_TABLE) 64 | except: 65 | pass 66 | try: 67 | cur.execute('DROP TABLE %s;' % pdbConstants._SCREEDADMIN) 68 | except: 69 | pass 70 | 71 | conn.commit() 72 | cur.close() 73 | conn.close() 74 | -------------------------------------------------------------------------------- /benchmarks/pgres/drop.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from create import droptables 4 | 5 | if __name__ == '__main__': 6 | droptables() 7 | -------------------------------------------------------------------------------- /benchmarks/pgres/pdbConstants.py: -------------------------------------------------------------------------------- 1 | _SCREEDADMIN = 'SCREEDADMIN' 2 | _DICT_TABLE = 'DICTIONARY_TABLE' 3 | _PRIMARY_KEY = 'id' 4 | _DBNAME = 'sdb' 5 | _USER = 'alex' 6 | -------------------------------------------------------------------------------- /benchmarks/pgres/pgdb.py: -------------------------------------------------------------------------------- 1 | import pdbConstants 2 | import psycopg2 3 | import UserDict 4 | import types 5 | 
class _pdb_record_dict(UserDict.DictMixin):
    """
    Simple dict-like record interface with bag behavior.

    Values live in the plain dict ``self.d`` and can be read either by key
    (``record['name']``) or as attributes (``record.name``).
    """
    def __init__(self, *args, **kwargs):
        # Accepts anything the dict() constructor accepts.
        self.d = dict(*args, **kwargs)

    def __getitem__(self, name):
        return self.d[name]

    def __setitem__(self, name, value):
        self.d[name] = value

    def __getattr__(self, name):
        # __getattr__ only runs when normal attribute lookup fails. If 'd'
        # itself is missing (instance created without __init__), reading
        # self.d below would re-enter __getattr__ forever, so fail fast.
        if name == 'd':
            raise AttributeError(name)
        try:
            return self.d[name]
        except KeyError:
            raise AttributeError(name)

    def keys(self):
        return self.d.keys()


class pgdb(object):
    """
    Read-only accessor for the PostgreSQL benchmark database.

    On construction it connects with the settings in ``pdbConstants`` and
    reads the admin table to learn the field names. Records are returned as
    ``_pdb_record_dict`` objects keyed by field name.
    """
    def __init__(self):
        self._conn = psycopg2.connect('dbname=%s user=%s' %
                                      (pdbConstants._DBNAME,
                                       pdbConstants._USER))
        try:
            cur = self._conn.cursor()
            try:
                cur.execute('SELECT id, fieldname FROM %s' %
                            pdbConstants._SCREEDADMIN)
                # Maps admin-table id -> field name
                self._adm = dict(cur.fetchall())
            finally:
                cur.close()

            ids = sorted(self._adm)

            # Field names in admin-id order, preceded by the primary key
            # column; the same list drives both the SELECT column list and
            # the keys of the returned records.
            self._fields = [self._adm[i] for i in ids]
            self._fields.insert(0, pdbConstants._PRIMARY_KEY.lower())
            self._fieldStr = ",".join(self._fields)

            # The field registered under the lowest admin id is the one
            # used for look-ups by name.
            self._queryBy = self._adm[ids[0]]
        except Exception:
            # Do not leak the connection if initialisation fails.
            self._conn.close()
            raise

    def close(self):
        """
        Closes the database handles
        """
        self._conn.close()

    def _loadRecord(self, column, value):
        """
        Fetches the single record whose `column` equals `value`.

        The value is passed as a bound query parameter, so keys containing
        quotes or other SQL metacharacters are handled safely. Raises
        KeyError if no row matches.
        """
        # Identifiers come from constants / the admin table; '%%s' leaves a
        # literal '%s' placeholder for the driver to bind `value` to.
        query = "SELECT %s FROM %s WHERE %s=%%s" % (self._fieldStr,
                                                    pdbConstants._DICT_TABLE,
                                                    column)
        cursor = self._conn.cursor()
        try:
            cursor.execute(query, (value,))
            row = cursor.fetchone()
        finally:
            cursor.close()

        if row is None:
            raise KeyError("Key %s not found" % (value,))
        return _pdb_record_dict(zip(self._fields, row))

    def loadRecordByIndex(self, idx):
        """
        Loads a record from the database by index, i.e. by the value of its
        primary key column. Raises KeyError if there is no such record.

        NOTE(review): this method was previously an empty stub returning
        None; confirm the id numbering used when the table is populated.
        """
        return self._loadRecord(pdbConstants._PRIMARY_KEY.lower(), idx)

    def loadRecordByName(self, key):
        """
        As above, by name: looks the record up through the query-by field.
        Raises KeyError if the key is not present.
        """
        return self._loadRecord(self._queryBy, key)

    def keys(self):
        """
        Returns a list of keys in database
        """
        cursor = self._conn.cursor()
        try:
            query = "SELECT %s FROM %s" % (self._queryBy,
                                           pdbConstants._DICT_TABLE)
            cursor.execute(query)
            # Each row is a 1-tuple; unpack it to the bare key value.
            return [elem for elem, in cursor]
        finally:
            cursor.close()
if __name__ == '__main__':
    # Untimed preparation: open the PostgreSQL-backed database and pull the
    # complete key list that the timed loop samples from.
    setupStatement = """
import os, sys
import random
import pgdb
db = pgdb.pgdb()
keys = db.keys()
"""

    # Timed work: 100000 by-name look-ups of randomly chosen keys.
    runStatement = """
for i in xrange(0, 100000):
    entry = db.loadRecordByName(random.choice(keys))
"""

    timer = timeit.Timer(stmt=runStatement, setup=setupStatement)

    # Header first, then the list of the two single-run timings.
    sys.stdout.write("[PGRES RUN]\n")
    sys.stdout.write("%s\n" % (timer.repeat(repeat=2, number=1),))
if __name__ == '__main__':
    # Exactly two arguments are required: the sequence file to index and
    # its format.
    if len(sys.argv) != 3:
        print("Usage: %s <filename> <fasta|fastq>" % sys.argv[0])
        sys.exit(1)

    filename = sys.argv[1]
    fafq = sys.argv[2]

    if fafq not in ('fasta', 'fastq'):
        raise ValueError("Invalid db type specified: %s" % fafq)

    # The fasta and fastq benchmarks differ only in which parser module is
    # used (fasta.fasta_iter / fastq.fastq_iter and the matching
    # FieldTypes), so both statements are built from one template.
    substitutions = {'module': fafq, 'filename': filename}

    # Timed statement: build the screed database, then close the input.
    runStatement = """
createscreed.create_db(filename, %(module)s.FieldTypes, iterfunc)
theFile.close()
""" % substitutions

    # Untimed setup: make the screed sources importable, open the input
    # file and create the record iterator. %(filename)r embeds the path as
    # a properly escaped string literal, so quotes or backslashes in the
    # path cannot break the generated code.
    setupStatement = """
import os, sys
thisdir = sys.path[0]
libdir = os.path.abspath(os.path.join(thisdir, '..', 'screed'))
sys.path.insert(0, libdir)
import createscreed
import %(module)s
filename = %(filename)r
theFile = open(filename, 'rb')
iterfunc = %(module)s.%(module)s_iter(theFile)
""" % substitutions

    t = timeit.Timer(runStatement, setupStatement)

    print("[SCREED CREATE]%s:" % filename)
    print(t.repeat(2, 1))
if __name__ == '__main__':
    # Exactly one argument is required: the screed database to query.
    if len(sys.argv) != 2:
        print("Usage: %s <screedfile>" % sys.argv[0])
        sys.exit(1)

    screedFile = sys.argv[1]
    if not os.path.isfile(screedFile):
        print("No such file: %s" % screedFile)
        sys.exit(1)

    # Timed statement: 100000 random look-ups, forcing each sequence to a
    # string. range() keeps the loop runnable on Python 2 and 3.
    runStatement = """
for i in range(0, 100000):
    entry = str(db[random.choice(keys)].sequence)
"""

    # Untimed setup: import screed from the parent directory, open the
    # database and materialise its keys. %r embeds the path as an escaped
    # string literal; list() guarantees random.choice gets a sequence.
    setupStatement = """
import os, sys
import random
thisdir = sys.path[0]
libdir = os.path.abspath(os.path.join(thisdir, '..'))
sys.path.insert(0, libdir)
import screed
db = screed.openscreed.ScreedDB(%r)
keys = list(db.keys())
""" % screedFile

    t = timeit.Timer(runStatement, setupStatement)

    print("[SCREED RUN]%s:" % screedFile)
    print(t.repeat(2, 1))
if __name__ == '__main__':
    # Exactly one argument is required: the screed database to query.
    if len(sys.argv) != 2:
        print("Usage: %s <screedfile>" % sys.argv[0])
        sys.exit(1)

    screedFile = sys.argv[1]
    if not os.path.isfile(screedFile):
        print("No such file: %s" % screedFile)
        sys.exit(1)

    # Timed statement: 100000 random look-ups, forcing each sequence to a
    # string. range() keeps the loop runnable on Python 2 and 3.
    runStatement = """
for i in range(0, 100000):
    entry = str(db[random.choice(keys)].sequence)
"""

    # Untimed setup: import screed from the parent directory, open the
    # database and sample at most the first 1000000 keys. islice stops at
    # exactly one million; the earlier `i > 1000000` test let one extra key
    # through. %r embeds the path as an escaped string literal.
    setupStatement = """
import os, sys
import itertools
import random
thisdir = sys.path[0]
libdir = os.path.abspath(os.path.join(thisdir, '..'))
sys.path.insert(0, libdir)
import screed
db = screed.openscreed.ScreedDB(%r)
keys = list(itertools.islice(db.iterkeys(), 1000000))
""" % screedFile

    t = timeit.Timer(runStatement, setupStatement)

    print("[SCREED RUN]%s:" % screedFile)
    print(t.repeat(2, 1))
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

# Every target below is a command, not a file; listing them all here keeps
# a stray file or directory of the same name from suppressing the build.
.PHONY: all help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info gettext changes linkcheck doctest

all: html

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

clean:
	-rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/screed.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/screed.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/screed"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/screed"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

# Use $(MAKE) rather than a bare `make` so that flags such as -j and -n,
# and the make binary actually in use, are passed on to the sub-make.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."
// Remember in localStorage whether the "editor-trap" help box is collapsed,
// so that its state carries over from one page load to the next.
// No attempt is made to support IE gracefully.
(function($) {

    $(document).ready(function() {
        var TOGGLED_CLASS = "toggled";
        var STORAGE_KEY = "toggled";
        var trap = $("#editor-trap");

        // Clicking the box flips the class and records the new state as
        // one of the strings "toggled" / "not-toggled".
        trap.click(function() {
            trap.toggleClass(TOGGLED_CLASS);

            var state;
            if (trap.hasClass(TOGGLED_CLASS)) {
                state = "toggled";
            } else {
                state = "not-toggled";
            }
            window.localStorage.setItem(STORAGE_KEY, state);
        });

        // Restore the saved state. Strings are stored rather than booleans
        // because localStorage does not necessarily support boolean values:
        // http://stackoverflow.com/questions/3263161/cannot-set-boolean-values-in-localstorage
        // With nothing stored yet (first visit) the box also starts toggled.
        var saved = window.localStorage.getItem(STORAGE_KEY);
        if (!saved || saved == "toggled") {
            trap.addClass(TOGGLED_CLASS);
        }
    });

})(jQuery);
36 | source_suffix = '.rst' 37 | 38 | # The encoding of source files. 39 | # source_encoding = 'utf-8-sig' 40 | 41 | # The master toctree document. 42 | master_doc = 'index' 43 | 44 | # General information about the project. 45 | project = u'screed' 46 | copyright = u'2012-2015, Michigan State University' 47 | 48 | # The version info for the project you're documenting, acts as replacement for 49 | # |version| and |release|, also used in various other places throughout the 50 | # built documents. 51 | # 52 | 53 | # The full version, including alpha/beta/rc tags. 54 | 55 | sys.path.insert(0, '.') 56 | 57 | import screed 58 | release = screed.VERSION 59 | 60 | # The short X.Y version. 61 | version = '.'.join(release.split('.')[:2]) 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # language = None 66 | 67 | # There are two options for replacing |today|: either, you set today to some 68 | # non-false value, then it is used: 69 | # today = '' 70 | 71 | # Else, today_fmt is used as the format for a strftime call. 72 | # today_fmt = '%B %d, %Y' 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | exclude_patterns = ['_build'] 77 | 78 | # The reST default role (used for this markup: `text`) to use for all 79 | # documents. 80 | # default_role = None 81 | 82 | # If true, '()' will be appended to :func: etc. cross-reference text. 83 | # add_function_parentheses = True 84 | 85 | # If true, the current module name will be prepended to all description 86 | # unit titles (such as .. function::). 87 | # add_module_names = True 88 | 89 | # If true, sectionauthor and moduleauthor directives will be shown in the 90 | # output. They are ignored by default. 91 | # show_authors = False 92 | 93 | # The name of the Pygments (syntax highlighting) style to use. 
94 | pygments_style = 'sphinx' 95 | 96 | # A list of ignored prefixes for module index sorting. 97 | # modindex_common_prefix = [] 98 | 99 | 100 | # -- Options for HTML output -------------------------------------------------- 101 | 102 | # The theme to use for HTML and HTML Help pages. See the documentation for 103 | # a list of builtin themes. 104 | html_theme = 'default' 105 | 106 | # Theme options are theme-specific and customize the look and feel of a theme 107 | # further. For a list of options available for each theme, see the 108 | # documentation. 109 | # html_theme_options = {} 110 | 111 | # Add any paths that contain custom themes here, relative to this directory. 112 | # html_theme_path = [] 113 | 114 | # The name for this set of Sphinx documents. If None, it defaults to 115 | # " v documentation". 116 | # html_title = None 117 | 118 | # A shorter title for the navigation bar. Default is the same as html_title. 119 | # html_short_title = None 120 | 121 | # The name of an image file (relative to this directory) to place at the top 122 | # of the sidebar. 123 | # html_logo = None 124 | 125 | # The name of an image file (within the static path) to use as favicon of the 126 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 127 | # pixels large. 128 | # html_favicon = None 129 | 130 | # Add any paths that contain custom static files (such as style sheets) here, 131 | # relative to this directory. They are copied after the builtin static files, 132 | # so a file named "default.css" will overwrite the builtin "default.css". 133 | html_static_path = ['_static'] 134 | html_style = 'labibi.css' 135 | 136 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 137 | # using the given strftime format. 138 | # html_last_updated_fmt = '%b %d, %Y' 139 | 140 | # If true, SmartyPants will be used to convert quotes and dashes to 141 | # typographically correct entities. 
142 | # html_use_smartypants = True 143 | 144 | # Custom sidebar templates, maps document names to template names. 145 | # html_sidebars = {} 146 | 147 | # Additional templates that should be rendered to pages, maps page names to 148 | # template names. 149 | # html_additional_pages = {} 150 | 151 | # If false, no module index is generated. 152 | # html_domain_indices = True 153 | 154 | # If false, no index is generated. 155 | # html_use_index = True 156 | 157 | # If true, the index is split into individual pages for each letter. 158 | # html_split_index = False 159 | 160 | # If true, links to the reST sources are added to the pages. 161 | # html_show_sourcelink = True 162 | 163 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 164 | # html_show_sphinx = True 165 | 166 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 167 | # html_show_copyright = True 168 | 169 | # If true, an OpenSearch description file will be output, and all pages will 170 | # contain a tag referring to it. The value of this option must be the 171 | # base URL from which the finished HTML is served. 172 | # html_use_opensearch = '' 173 | 174 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 175 | # html_file_suffix = None 176 | 177 | # Output file base name for HTML help builder. 178 | htmlhelp_basename = 'screeddoc' 179 | 180 | 181 | html_context = { 182 | "google_analytics_id": 'UA-51731094-1', 183 | "disqus_shortname": 'screed-docs', 184 | # "github_base_account": 'dib-lab', 185 | "github_project": 'screed', 186 | } 187 | 188 | # -- Options for LaTeX output ------------------------------------------------- 189 | 190 | latex_elements = { 191 | # The paper size ('letterpaper' or 'a4paper'). 192 | # 'papersize': 'letterpaper', 193 | 194 | # The font size ('10pt', '11pt' or '12pt'). 195 | # 'pointsize': '10pt', 196 | 197 | # Additional stuff for the LaTeX preamble. 
198 | # 'preamble': '', 199 | } 200 | 201 | # Grouping the document tree into LaTeX files. List of tuples 202 | # (source start file, target name, title, author, documentclass 203 | # [howto/manual]). 204 | latex_documents = [ 205 | ('index', 'screed.tex', u'screed Documentation', 206 | u'Alex Nolley and Titus Brown', 'manual'), 207 | ] 208 | 209 | # The name of an image file (relative to this directory) to place at the top of 210 | # the title page. 211 | # latex_logo = None 212 | 213 | # For "manual" documents, if this is true, then toplevel headings are parts, 214 | # not chapters. 215 | # latex_use_parts = False 216 | 217 | # If true, show page references after internal links. 218 | # latex_show_pagerefs = False 219 | 220 | # If true, show URL addresses after external links. 221 | # latex_show_urls = False 222 | 223 | # Documents to append as an appendix to all manuals. 224 | # latex_appendices = [] 225 | 226 | # If false, no module index is generated. 227 | # latex_domain_indices = True 228 | 229 | 230 | # -- Options for manual page output ------------------------------------------- 231 | 232 | # One entry per manual page. List of tuples 233 | # (source start file, name, description, authors, manual section). 234 | man_pages = [ 235 | ('index', 'screed', u'screed Documentation', 236 | [u'Alex Nolley and Titus Brown'], 1) 237 | ] 238 | 239 | # If true, show URL addresses after external links. 240 | # man_show_urls = False 241 | 242 | 243 | # -- Options for Texinfo output ----------------------------------------------- 244 | 245 | # Grouping the document tree into Texinfo files. List of tuples 246 | # (source start file, target name, title, author, 247 | # dir menu entry, description, category) 248 | texinfo_documents = [ 249 | ('index', 'screed', u'screed Documentation', u'Alex Nolley and Titus Brown', 250 | 'screed', 'One line description of project.', 'Miscellaneous'), 251 | ] 252 | 253 | # Documents to append as an appendix to all manuals. 
254 | # texinfo_appendices = [] 255 | 256 | # If false, no module index is generated. 257 | # texinfo_domain_indices = True 258 | 259 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 260 | # texinfo_show_urls = 'footnote' 261 | -------------------------------------------------------------------------------- /doc/dev/coding-guidelines-and-review.rst: -------------------------------------------------------------------------------- 1 | .. vim: set filetype=rst 2 | 3 | Coding guidelines and code review checklist 4 | =========================================== 5 | 6 | This document is for anyone who wants to contribute code to the screed 7 | project, and describes our coding standards and code review checklist. 8 | 9 | ---- 10 | 11 | Coding standards 12 | ---------------- 13 | 14 | All plain-text files should have line widths of 80 characters or less unless 15 | that is not supported for the particular file format. 16 | 17 | Vim users can set the indentation with:: 18 | 19 | set expandtab 20 | set shiftwidth=4 21 | set softtabstop=4 22 | 23 | We are a pure Python project and `PEP 8 `__ is our 24 | standard. The ``pep8`` and ``autopep8`` Makefile targets are helpful. 25 | 26 | Code and documentation must have their spelling checked. Vim users can 27 | run:: 28 | 29 | :setlocal spell spelllang=en_us 30 | 31 | Use ``]s`` and ``[s`` to navigate between misspellings and ``z=`` to suggest a 32 | correctly spelled word. ``zg`` will add a word as a good word. 33 | 34 | GNU ``aspell`` can also be used to check the spelling in a single file:: 35 | 36 | aspell check --mode $filename 37 | 38 | Code Review 39 | ----------- 40 | 41 | Please read `11 Best Practices for Peer Code Review 42 | `__. 43 | 44 | See also `Code reviews: the lab meeting for code 45 | `__ and 46 | `the PyCogent coding guidelines 47 | `__. 48 | 49 | Checklist 50 | --------- 51 | 52 | Copy and paste the following into a pull request comment when it is 53 | ready for review:: 54 | 55 | - [ ] Is it mergeable? 
56 | - [ ] `make test` Did it pass the tests? 57 | - [ ] `make clean diff-cover` If it introduces new functionality, is it tested? 58 | - [ ] `make format diff_pylint_report doc` Is it well formatted? 59 | - [ ] Is it documented in the `ChangeLog`? 60 | http://en.wikipedia.org/wiki/Changelog#Format 61 | - [ ] Was a spellchecker run on the source code and documentation after 62 | changes were made? 63 | 64 | **Note** that after you submit the comment you can check and uncheck 65 | the individual boxes on the formatted comment; no need to put x or y 66 | in the middle. 67 | -------------------------------------------------------------------------------- /doc/dev/index.rst: -------------------------------------------------------------------------------- 1 | The screed developer documentation 2 | ================================== 3 | 4 | This section of the documentation is for people who are contributing 5 | (or would like to contribute to) the screed project codebase, either by 6 | contributing code or by helping improve the documentation. 7 | 8 | Please note that this project is released with a :doc:`../CODE_OF_CONDUCT`. 9 | By participating in the development of this project you agree to abide by its 10 | terms. 11 | 12 | Contents: 13 | 14 | .. toctree:: 15 | :maxdepth: 1 16 | 17 | parsers 18 | coding-guidelines-and-review 19 | release-checklist 20 | -------------------------------------------------------------------------------- /doc/dev/parsers.rst: -------------------------------------------------------------------------------- 1 | Writing Custom Sequence Parsers 2 | =============================== 3 | 4 | screed is built to be adaptable to new kinds of file sequence formats. 5 | Included with screed are parsers for handling FASTA and FASTQ sequence 6 | file types, though if you need screed to work with a new format, all 7 | you need to do is write a new parser. 8 | 9 | Field Roles 10 | ----------- 11 | 12 | Each field in a screed database is assigned a role. 
These roles 13 | describe what kind of information is stored in their field. Right now 14 | there are only 4 different roles in a screed database: the text role, 15 | the sliceable role, the indexed key role and the primary key role. All 16 | roles are defined in the file: screed/DBConstants.py 17 | 18 | The text role (DBConstants._STANDARD_TEXT) is the role most fields in 19 | a database will have. This role tells screed that the associated field 20 | is storing standard textual data. Nothing special. 21 | 22 | The sliceable role (DBConstants._SLICEABLE_TEXT) is a role that can be 23 | assigned to long sequence fields. screed's default FASTA parser 24 | defines the 'sequence' field with the sliceable role. When screed 25 | retrieves a field that has the sliceable role, it builds a special 26 | data structure that supports slicing into the text. 27 | 28 | The indexed key role (DBConstants._INDEXED_TEXT_KEY) is associated 29 | with exactly one of the fields in a screed database. In screed's FASTA 30 | and FASTQ parsers, this role is fulfilled by the 'name' field. This 31 | field is required because it is the field screed tells sqlite to index 32 | when creating the database and it is the field used for name look-ups 33 | when querying a screed database. 34 | 35 | The primary key role (DBConstants._PRIMARY_KEY_ROLE) is a role 36 | automatically associated with the 'id' field in each database. This 37 | field is always created with each screed database and always holds 38 | this role. You as a user of screed won't need to worry about this one. 39 | 40 | General Parsing Function Format 41 | ------------------------------- 42 | 43 | create_db is the function central to the creation of screed 44 | databases. This function accepts a file path, a tuple of field names 45 | and roles, and an iterator function. 
The file path describes where the 46 | screed database should go, the tuple contains the names of fields and 47 | their associated roles and the iterator function yields records in a 48 | dictionary format. 49 | 50 | This sub-section describes general steps for preparing and using 51 | screed with a custom sequence parser. Though they don't have to be, 52 | future sequence parsers should be located in the seqparse.py file for 53 | convenience. These steps will be described in the context of working 54 | from the Python shell. 55 | 56 | First import the create_db function:: 57 | 58 | >>> from screed import create_db 59 | 60 | The create_db function handles the formatting of screed databases and 61 | provides a simple interface for storing sequence data. 62 | 63 | Next the database fields and roles must be specified. The fields tell 64 | screed the names and order of the data fields inside each record. For instance, 65 | let's say our new sequence has types 'name', 'bar', and 'baz', all text. The 66 | tuple will be:: 67 | 68 | >>> fields = (('name', DBConstants._INDEXED_TEXT_KEY), 69 | ('bar', DBConstants._STANDARD_TEXT), 70 | ('baz', DBConstants._STANDARD_TEXT)) 71 | 72 | Notice how 'name' is given the indexed key role and bar and baz are 73 | given text roles? If, for instance, you know 'baz' fields can be very long 74 | and you want to be able to retrieve slices of them, you could specify 75 | fields as:: 76 | 77 | >>> fields = (('name', DBConstants._INDEXED_TEXT_KEY), 78 | ('bar', DBConstants._STANDARD_TEXT), 79 | ('baz', DBConstants._SLICEABLE_TEXT)) 80 | 81 | All screed databases come with an 'id' field, which is a sequential 82 | numbering order starting at 0 for the first record, 1 for the second, and 83 | so on. The names and number of the other fields are arbitrary with one 84 | restriction: one and only one of the fields must fulfill the indexed key role.
85 | 86 | Next, you need to set up an iterator function that will return records in 87 | a dictionary format. Have a look at the 'fastq_iter', 'fasta_iter', or 88 | 'hava_iter' functions in the screed/fastq.py, screed/fasta.py, and 89 | screed/hava.py files, respectively, for examples on how to write one of these. 90 | If you don't know what an iterator function is, the documentation on the 91 | Python website gives a good description: 92 | http://docs.python.org/library/stdtypes.html#iterator-types. 93 | 94 | Once the iterator function is written, it needs to be instantiated. In the 95 | context of the built-in parsing functions, this means opening a file and 96 | passing the file handle to the iterator function:: 97 | 98 | >>> seqfile = open('path_to_seq_file', 'rb') 99 | >>> iter_instance = myiter(seqfile) 100 | 101 | Assuming that your iterator function is called 'myiter', this sets up an 102 | instance of it ready to use with create_db. 103 | 104 | Now the screed database is created with one command:: 105 | 106 | >>> create_db('path_to_screed_db', fields, iter_instance) 107 | 108 | This saves the screed database at 'path_to_screed_db'. If instead you 109 | want the screed database created in the same directory and with a 110 | similar file name as the sequence file, it's OK to do this:: 111 | 112 | >>> create_db('path_to_seq_file', fields, iter_instance) 113 | 114 | create_db will just append '_screed' to the end of the file name and make 115 | a screed database at that file path so the original file won't be 116 | overwritten. 117 | 118 | When you're done the sequence file should be closed:: 119 | 120 | >>> seqfile.close() 121 | 122 | Using the Built-in Sequence Iterator Functions 123 | ---------------------------------------------- 124 | 125 | This section shows how to use the 'fastq_iter' and 'fasta_iter' functions 126 | for returning records from a sequence file.
127 | 128 | These functions both take a file handle as the only argument and then return 129 | a dictionary for each record in the file containing names of fields and 130 | associated data. These functions are primarily used in conjunction with 131 | the create_db() function, but they can be useful by themselves. 132 | 133 | First, import the necessary module and open a text file containing sequences. 134 | For this example, the 'fastq_iter' function will be used:: 135 | 136 | >>> import screed.fastq 137 | >>> seqfile = open('path_to_seqfile', 'rb') 138 | 139 | Now, the 'fastq_iter' can be instantiated and iterated over:: 140 | 141 | >>> fq_instance = screed.fastq.fastq_iter(seqfile) 142 | >>> for record in fq_instance: 143 | ... print(record.name) 144 | 145 | That will print the name of every sequence in the file. If instead you want 146 | to accumulate the sequences:: 147 | 148 | >>> sequences = [] 149 | >>> for record in fq_instance: 150 | ... sequences.append(record.sequence) 151 | 152 | These iterators are the core of screed's sequence modularity. If there is 153 | a new sequence format you want screed to work with, all it needs is its 154 | own iterator. 155 | 156 | Error checking in parsing methods 157 | --------------------------------- 158 | 159 | The existing FASTA/FASTQ parsing functions contain some error 160 | checking, such as making sure the file can be opened and checking 161 | correct data is being read. Though screed doesn't enforce this, it is 162 | strongly recommended to include error checking code in your parser. To 163 | remain non-specific to one file sequence type or another, the 164 | underlying screed library can't contain error checking code of this 165 | kind. If errors are not detected by the parsing function, they will be 166 | silently included into the database being built and could cause 167 | problems much later when trying to read from the database.
168 | -------------------------------------------------------------------------------- /doc/dev/release-checklist.rst: -------------------------------------------------------------------------------- 1 | .. vim: set filetype=rst 2 | 3 | ===================== 4 | Release Documentation 5 | ===================== 6 | 7 | 8 | Introduction 9 | ============ 10 | 11 | This is the release documentation for releasing a new version of screed. This 12 | document is meant for screed release managers. Michael R. Crusoe and C. Titus 13 | Brown have released screed in the past. Jake Fenton is the first to release 14 | screed using this checklist. 15 | 16 | Getting Started 17 | =============== 18 | 19 | #. Create and activate an empty Python environment:: 20 | 21 | mamba create -n screed-rc -y python=3.10 pip make setuptools_scm 22 | conda activate screed-rc 23 | python -m pip install -U pip 24 | python -m pip install -U virtualenv wheel tox-setuptools-version build 25 | 26 | #. Start with a clean checkout:: 27 | 28 | cd $(mktemp -d) 29 | git clone git@github.com:dib-lab/screed.git 30 | cd screed 31 | 32 | #. Set the new version number and release candidate:: 33 | 34 | new_version=1.1.0 35 | rc=rc1 36 | 37 | Tag the release candidate with the new version prefixed by the letter 'v':: 38 | 39 | git tag v${new_version}-${rc} 40 | git push --tags git@github.com:dib-lab/screed.git 41 | 42 | #. Test the release candidate:: 43 | 44 | cd .. 
45 | virtualenv testenv1 46 | virtualenv testenv2 47 | virtualenv testenv3 48 | virtualenv testenv4 49 | 50 | # first we test the tag 51 | cd testenv1 52 | source bin/activate 53 | git clone --depth 1 --branch v${new_version}-${rc} \ 54 | https://github.com/dib-lab/screed.git 55 | cd screed 56 | make install-dependencies 57 | make install 58 | make test 59 | python -c 'import screed; print(screed.__version__)' # double-check version number 60 | 61 | 62 | # Test via pip 63 | cd ../../testenv2 64 | source bin/activate 65 | pip install -e \ 66 | git+https://github.com/dib-lab/screed.git@v${new_version}-${rc}#egg=screed 67 | cd src/screed 68 | make dist 69 | make install 70 | pip install pytest 71 | pytest screed 72 | python -c 'import screed; print(screed.__version__)' # double-check version number 73 | cp dist/screed-1.1rc1.tar.gz ../../../testenv3 74 | 75 | # test if the dist made in testenv2 is complete enough to build another 76 | # functional dist 77 | 78 | cd ../../../testenv3 79 | source bin/activate 80 | pip install pytest 81 | pip install screed*tar.gz 82 | python -c 'import screed; print(screed.__version__)' 83 | tar xzf screed*tar.gz 84 | cd screed* 85 | make dist 86 | make test 87 | 88 | #. Do any final testing (acceptance tests, etc.) A good test is to install 89 | the new version of screed and then run the sourmash tests. 90 | 91 | How to make a final release 92 | =========================== 93 | 94 | When you have a thoroughly tested release candidate, cut a release like so: 95 | 96 | #. Delete the release candidate tag and push the tag updates to GitHub:: 97 | 98 | cd ../../screed 99 | git tag -d v${new_version}-${rc} 100 | git push --delete origin v${new_version}-${rc} 101 | 102 | #. Create the final tag and publish the new release on PyPI (requires an 103 | authorized account) :: 104 | 105 | git tag v${new_version} 106 | git push --tags origin 107 | make dist 108 | twine upload dist/screed-${new_version}.tar.gz 109 | 110 | #.
Add the release on GitHub, using the tag you just pushed. Name it "Version 111 | X.Y.Z" and copy/paste in the release notes. 112 | 113 | #. Update the Read the Docs to point to the new version. Visit 114 | https://readthedocs.org/builds/screed/ and ‘Build Version: master’ to pick up 115 | the new tag. Once that build has finished check the “Activate” box next to 116 | the new version at https://readthedocs.org/dashboard/screed/versions/ under 117 | “Choose Active Versions”. Finally change the default version at 118 | https://readthedocs.org/dashboard/screed/advanced/ to the new version. 119 | 120 | #. Delete any RC tags created:: 121 | 122 | git tag -d v${new_version}-${rc} 123 | git push origin :refs/tags/v${new_version}-${rc} 124 | 125 | #. Tweet about the new release 126 | 127 | #. Send email including the release notes to khmer@lists.idyll.org and 128 | khmer-announce@lists.idyll.org 129 | 130 | Notes on this document 131 | ====================== 132 | This is the procedure for cutting a new release of screed. It has been adapted 133 | from the release documentation for the khmer project, found at 134 | http://khmer.readthedocs.org/en/v1.1/release.html. 135 | 136 | -------------------------------------------------------------------------------- /doc/example.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | screed examples 3 | =============== 4 | 5 | ..
contents:: 6 | 7 | Basic Usage 8 | =========== 9 | 10 | Load screed, index the database, and return a dictionary-like object: 11 | 12 | >>> import screed 13 | >>> db = screed.read_fasta_sequences('../screed/tests/test.fa') 14 | 15 | Get the list of sequence names, sort alphabetically, and look at the 16 | first one: 17 | 18 | >>> names = db.keys() 19 | >>> names.sort() 20 | >>> names[0] 21 | u'ENSMICT00000000730' 22 | 23 | Retrieve that record: 24 | 25 | >>> r = db[names[0]] 26 | >>> print r.keys() 27 | [u'description', u'id', u'name', u'sequence'] 28 | 29 | Print out the internal ID number and the name: 30 | 31 | >>> print r.id 32 | 13 33 | >>> print r.name 34 | ENSMICT00000000730 35 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. screed documentation master file, created by 2 | sphinx-quickstart on Wed Jun 6 16:32:37 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | screed - short read sequence utils 7 | ================================== 8 | 9 | :Copyright: 2008, Michigan State University. 10 | :Copyright: 2015, The Regents of the University of California. 11 | :Authors: Alex Nolley, C. Titus Brown 12 | :Contact: ctb@msu.edu 13 | :License: BSD 14 | 15 | Contents: 16 | 17 | ..
toctree:: 18 | :maxdepth: 2 19 | 20 | screed 21 | example 22 | 23 | dev/index 24 | release-notes/index 25 | user/known-issues 26 | 27 | CODE_OF_CONDUCT 28 | LICENSE 29 | 30 | Indices and tables 31 | ================== 32 | 33 | * :ref:`genindex` 34 | * :ref:`modindex` 35 | * :ref:`search` 36 | -------------------------------------------------------------------------------- /doc/release-notes/RELEASE-0.5.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Release v0.5 3 | ============ 4 | 5 | We are proud to announce the release of screed v0.5. screed is a database engine 6 | capable of storing and retrieving short-read sequence data. screed is designed 7 | to be fast and adaptable to different sequence file formats. This marks the 8 | first release of screed which we consider stable and complete. 9 | 10 | Features: 11 | - Read sequence data from FASTA/FASTQ files into screed databases 12 | - Save screed databases back to FASTA/FASTQ files 13 | - Lookup sequence data by index (offset) or name 14 | - Native support for sequence substring slicing 15 | - Convert between FASTA <-> FASTQ file formats 16 | 17 | screed is written entirely in Python and uses the Sqlite database for backend 18 | storage. screed can be downloaded from the public git repository: 19 | http://github.com/acr/screed.git 20 | 21 | screed is licensed under the BSD license which can be viewed in the 22 | doc/LICENSE.txt file. 23 | -------------------------------------------------------------------------------- /doc/release-notes/RELEASE-0.8.md: -------------------------------------------------------------------------------- 1 | 2 | # Release v0.8 3 | 4 | We are pleased to announce the release of Screed v0.8. Screed is a database 5 | engine capable of storing and retrieving short-read sequence data and is 6 | designed to be fast and adaptable to different sequence file formats.
7 | 8 | This version of Screed contains developer documentation for contributing to the 9 | Screed project and a code of conduct for interacting with other contributors 10 | and project maintainers. Documentation is available at 11 | http://screed.readthedocs.org/en/v0.8/ 12 | 13 | ## New items of note: 14 | 15 | This release successfully installs and passes its unit tests on 16 | Ubuntu 14.04 and the latest release of Mac OS X 10 "Yosemite". It 17 | also passes the khmer acceptance tests as per the [eelpond testing 18 | protocol.](https://github.com/dib-lab/literate-resting/blob/master/kp/README.txt) 19 | 20 | This release of screed has renamed the 'accuracy' attribute of read records to 21 | 'quality;' this API change will need to be adopted by all users wanting to 22 | upgrade to this version. Unlike the khmer project, Screed is not currently 23 | under semantic versioning. It will be with the 1.0 release. 24 | 25 | - Screed now has automatic compression detection via magic bit sniffing 26 | for gzip and bzip2 compressed files (from @mr-c in dib-lab/khmer#432) 27 | - Screed now supports streaming of uncompressed FASTA and FASTQ formatted 28 | nucleotide sequence data. bzip2 compressed FASTA and FASTQ formatted 29 | nucleotide sequence data can also be streamed but not gzip compressed 30 | FASTA and FASTQ formatted nucleotide sequence data. 
(from @mr-c, see 31 | dib-lab/khmer#633) 32 | - Screed now has a Changelog, developer documentation and a code of conduct 33 | (from @ctb, @mr-c, @bocajnotnef in dib-lab/khmer#625) 34 | - Versions are now autogenerated using git tags via Versioneer (from 35 | @bocajnotnef in cadceb5) 36 | - Documentation is now autogenerated using Doxygen (from @mr-c in d8ed05b) 37 | 38 | ## Notable bugs fixed/issues closed: 39 | - A khmer script was not accepting reads on the stdin dib-lab/khmer#633 40 | by @mr-c 41 | - screed returning the wrong version and breaking dev installs 42 | dib-lab/khmer#803 by @mr-c 43 | 44 | 45 | ## Known Issues 46 | 47 | These are all pre-existing 48 | 49 | - Screed records cannot be sliced requiring un-Pythonic techniques to achieve 50 | the same behavior. This will be included in a future release. This is being 51 | tracked in dib-lab/khmer#768 52 | - Screed self-tests do not use a temporary directory which causes 53 | tests run from package-based installs to fail. This is being tracked in 54 | dib-lab/khmer#748 55 | - Screed does not support gzip file streaming. This is an issue with Python 56 | 2.x and will likely *not* be fixed in future releases. This is being tracked 57 | in dib-lab/khmer#700 58 | - Screed is overly tolerant of spaces in fast{a,q} which is against spec. This 59 | is being tracked in dib-lab/khmer#108 60 | 61 | ## Contributors 62 | 63 | @bocajnotnef @mr-c @brtaylor92 @wrightmhw @kdmurray91 @luizirber @ctb 64 | 65 | -------------------------------------------------------------------------------- /doc/release-notes/RELEASE-0.8.rst: -------------------------------------------------------------------------------- 1 | Release v0.8 2 | ============ 3 | 4 | We are pleased to announce the release of Screed v0.8. Screed is a 5 | database engine capable of storing and retrieving short-read sequence 6 | data and is designed to be fast and adaptable to different sequence file 7 | formats. 
8 | 9 | This version of Screed contains developer documentation for contributing 10 | to the Screed project and a code of conduct for interacting with other 11 | contributors and project maintainers. Documentation is available at 12 | http://screed.readthedocs.org/en/v0.8/ 13 | 14 | New items of note: 15 | ------------------ 16 | 17 | This release successfully installs and passes its unit tests on Ubuntu 18 | 14.04 and the latest release of Mac OS X 10 "Yosemite". It also passes 19 | the khmer acceptance tests as per the `eelpond testing 20 | protocol. `__ 21 | 22 | This release of screed has renamed the 'accuracy' attribute of read 23 | records to 'quality;' this API change will need to be adopted by all 24 | users wanting to upgrade to this version. Unlike the khmer project, 25 | Screed is not currently under semantic versioning. It will be with the 26 | 1.0 release. 27 | 28 | - Screed now has automatic compression detection via magic bit sniffing 29 | for gzip and bzip2 compressed files (from @mr-c in dib-lab/khmer#432) 30 | - Screed now supports streaming of uncompressed FASTA and FASTQ 31 | formatted nucleotide sequence data. bzip2 compressed FASTA and FASTQ 32 | formatted nucleotide sequence data can also be streamed but not gzip 33 | compressed FASTA and FASTQ formatted nucleotide sequence data. 
(from 34 | @mr-c, see dib-lab/khmer#633) 35 | - Screed now has a Changelog, developer documentation and a code of 36 | conduct (from @ctb, @mr-c, @bocajnotnef in dib-lab/khmer#625) 37 | - Versions are now autogenerated using git tags via Versioneer (from 38 | @bocajnotnef in cadceb5) 39 | - Documentation is now autogenerated using Doxygen (from @mr-c in 40 | d8ed05b) 41 | 42 | Notable bugs fixed/issues closed: 43 | --------------------------------- 44 | 45 | - A khmer script was not accepting reads on the stdin dib-lab/khmer#633 46 | by @mr-c 47 | - screed returning the wrong version and breaking dev installs 48 | dib-lab/khmer#803 by @mr-c 49 | 50 | Known Issues 51 | ------------ 52 | 53 | These are all pre-existing 54 | 55 | - Screed records cannot be sliced requiring un-Pythonic techniques to 56 | achieve the same behavior. This will be included in a future release. 57 | This is being tracked in dib-lab/khmer#768 58 | - Screed self-tests do not use a temporary directory which causes tests 59 | run from package-based installs to fail. This is being tracked in 60 | dib-lab/khmer#748 61 | - Screed does not support gzip file streaming. This is an issue with 62 | Python 2.x and will likely *not* be fixed in future releases. This is 63 | being tracked in dib-lab/khmer#700 64 | - Screed is overly tolerant of spaces in fast{a,q} which is against 65 | spec. This is being tracked in dib-lab/khmer#108 66 | 67 | Contributors 68 | ------------ 69 | 70 | @bocajnotnef @mr-c @brtaylor92 @wrightmhw @kdmurray91 @luizirber @ctb 71 | -------------------------------------------------------------------------------- /doc/release-notes/RELEASE-0.9.md: -------------------------------------------------------------------------------- 1 | 2 | # Release v0.9 3 | 4 | We are pleased to announce the release of Screed v0.9. Screed is a database 5 | engine capable of storing and retrieving short-read sequence data and is 6 | designed to be fast and adaptable to different sequence file formats. 
7 | 8 | This version of Screed features Python 3 syntax with compatibility with Python 2. Additional changes have broken backwards compatibility in several small ways in preparation for our 1.0 release and adoption of strict semantic versioning from there on out. 9 | 10 | It is also the first release since our move to the University of California, Davis and also under our new name, the Lab for Data Intensive Biology. 11 | 12 | Documentation is available at http://screed.readthedocs.org/en/v0.9/ 13 | 14 | ## New items of note: 15 | 16 | - Now a primarily Python 3 codebase with Python 2 compatibility. https://github.com/dib-lab/screed/pull/41 @luizirber & @mr-c 17 | 18 | - Tests now correctly run using temporary directories and the test data is now shipped allowing the tests to be run after installation. https://github.com/dib-lab/screed/pull/30 @bocajnotnef https://github.com/dib-lab/screed/pull/40 @mr-c 19 | - The private method `screed/screedRecord._screed_record_dict()` has been renamed to `screed.screedRecord.Record()`. This is **not** a backwards compatible change. https://github.com/dib-lab/screed/pull/35 @sguermond 20 | - `screed.open()` now accepts `-` as a synonym for STDIN and is now an (optional) context manager. It no longer defaults to parsing out a separate description from the name. The description field will be removed altogether from the next release. This is **not** a backwards compatible change. https://github.com/dib-lab/screed/pull/36 @anotherthomas https://github.com/dib-lab/screed/pull/39 https://github.com/dib-lab/screed/pull/41 @luizirber https://github.com/dib-lab/screed/pull/43 @ctb 21 | - The FASTQ parser was improved and it no longer hangs in the presence of empty lines.
https://github.com/dib-lab/screed/pull/38 @proteasome 22 | - Screed records now slice correctly https://github.com/dib-lab/screed/pull/41 @wrightmhw @luizirber 23 | 24 | 25 | ## Other bugs fixed/issues closed: 26 | 27 | - Release notes are now a part of the documentation. https://github.com/dib-lab/screed/pull/33 @bocajnotnef 28 | - A test was made more robust to prevent hangs. https://github.com/dib-lab/screed/pull/37 @anotherthomas 29 | 30 | ## Known Issues 31 | 32 | These are all pre-existing 33 | 34 | - Screed does not support gzip file streaming. This is an issue with Python 2.x and will likely *not* be fixed in future releases. This is being tracked in ged-lab/khmer#700 35 | - Screed is overly tolerant of spaces in fast{a,q} which is against spec. This is being tracked in ged-lab/khmer#108 36 | 37 | ## Contributors 38 | 39 | @luizirber @mr-c @bocajnotnef @ctb \*@proteasome \*@anotherthomas \*@sguermond 40 | 41 | \* Indicates new contributors 42 | -------------------------------------------------------------------------------- /doc/release-notes/RELEASE-0.9.rst: -------------------------------------------------------------------------------- 1 | Release v0.9 2 | ============ 3 | 4 | We are pleased to announce the release of Screed v0.9. Screed is a 5 | database engine capable of storing and retrieving short-read sequence 6 | data and is designed to be fast and adaptable to different sequence file 7 | formats. 8 | 9 | This version of Screed features Python 3 syntax with compatibility with 10 | Python 2. Additional changes have broken backwards compatibility in 11 | several small ways in preparation for our 1.0 release and adoption of 12 | strict semantic versioning from there on out. 13 | 14 | It is also the first release since our move to the University of Davis, 15 | California and also under our new name, the Lab for Data Intensive 16 | Biology. 
17 | 18 | Documentation is available at http://screed.readthedocs.org/en/v0.9/ 19 | 20 | New items of note: 21 | ------------------ 22 | 23 | - Now a primarily Python 3 codebase with Python 2 compatibility. 24 | https://github.com/dib-lab/screed/pull/41 @luizirber & @mr-c 25 | 26 | - Tests now correctly run using temporary directories and the test data 27 | is now shipped allowing the tests to be run after installation. 28 | https://github.com/dib-lab/screed/pull/30 @bocajnotnef 29 | https://github.com/dib-lab/screed/pull/40 @mr-c 30 | - The private method ``screed/screedRecord._screed_record_dict()`` has 31 | been renamed to ``screed.screedRecord.Record()``. This is **not** a 32 | backwards compatible change. 33 | https://github.com/dib-lab/screed/pull/35 @sguermond 34 | - ``screed.open()`` now accepts ``-`` as a synonym for STDIN and is now 35 | an (optional) context manager. It no longer defaults to parsing out a 36 | separate description from the name. The description field will be 37 | removed altogether from the next release. This is **not** a backwards 38 | compatible change. https://github.com/dib-lab/screed/pull/36 39 | @anotherthomas https://github.com/dib-lab/screed/pull/39 40 | https://github.com/dib-lab/screed/pull/41 @luizirber 41 | https://github.com/dib-lab/screed/pull/43 @ctb 42 | - The FASTQ parser was improved and it no longer hangs in the presence 43 | of empty lines. https://github.com/dib-lab/screed/pull/38 @proteasome 44 | - Screed records now slice correctly 45 | https://github.com/dib-lab/screed/pull/41 @wrightmhw @luizirber 46 | 47 | Other bugs fixed/issues closed: 48 | ------------------------------- 49 | 50 | - Release notes are now a part of the documentation. 51 | https://github.com/dib-lab/screed/pull/33 @bocajnotnef 52 | - A test was made more robust to prevent hangs.
53 | https://github.com/dib-lab/screed/pull/37 @anotherthomas 54 | 55 | Known Issues 56 | ------------ 57 | 58 | These are all pre-existing 59 | 60 | - Screed does not support gzip file streaming. This is an issue with 61 | Python 2.x and will likely *not* be fixed in future releases. This is 62 | being tracked in ged-lab/khmer#700 63 | - Screed is overly tolerant of spaces in fast{a,q} which is against 64 | spec. This is being tracked in ged-lab/khmer#108 65 | 66 | Contributors 67 | ------------ 68 | 69 | @luizirber @mr-c @bocajnotnef @ctb \*@proteasome \*@anotherthomas 70 | \*@sguermond 71 | 72 | \* Indicates new contributors 73 | -------------------------------------------------------------------------------- /doc/release-notes/index.rst: -------------------------------------------------------------------------------- 1 | .. vim set filetype=rst 2 | 3 | Release notes for past versions of screed 4 | ========================================= 5 | 6 | Contents: 7 | 8 | .. toctree:: 9 | :maxdepth: 1 10 | 11 | RELEASE-0.5 12 | RELEASE-0.8 13 | -------------------------------------------------------------------------------- /doc/run-doctests.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | import doctest 3 | import sys 4 | 5 | for filename in sys.argv[1:]: 6 | print '... running doctests on', filename 7 | doctest.testfile(filename) 8 | 9 | print '*** SUCCESS ***' 10 | -------------------------------------------------------------------------------- /doc/screed.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | User Manual 3 | =========== 4 | 5 | .. note:: 6 | 7 | Some doctests are included in :doc:`example`. The examples in this 8 | document are meant for human consumption only. They will not work in 9 | doctests! 10 | 11 | screed parses FASTA and FASTQ files, generates databases, and lets you query 12 | these databases. 
Values such as sequence name, sequence description, sequence 13 | quality, and the sequence itself can be retrieved from these databases. 14 | 15 | Installation 16 | ============ 17 | 18 | The following software packages are required to run screed: 19 | 20 | * Python 2 (2.7) or Python 3 (3.3 or newer) 21 | * pytest (only required for running tests) 22 | 23 | Use pip to download and install Screed and its dependencies:: 24 | 25 | pip install screed 26 | 27 | To run the optional tests type:: 28 | 29 | python -m screed.tests 30 | 31 | Command-line Quick Start 32 | ======================== 33 | 34 | Creating a database 35 | ------------------- 36 | 37 | .. code:: 38 | 39 | $ screed db 40 | 41 | Dumping a database to a file 42 | ---------------------------- 43 | 44 | .. code:: 45 | 46 | $ screed dump_fasta 47 | $ screed dump_fastq 48 | 49 | If no output file is provided, sequences are written to the terminal (stdout) by 50 | default. 51 | 52 | Python Quick Start 53 | ================== 54 | 55 | Reading FASTA/FASTQ files 56 | ------------------------- 57 | 58 | >>> import screed 59 | >>> with screed.open(filename) as seqfile: 60 | ... for read in seqfile: 61 | ... print(read.name, read.sequence) 62 | 63 | Here, :code:`filename` can be a FASTA or FASTQ file, and can be uncompressed, 64 | gzip-compressed, or bzip2-compressed. screed natively supports FASTA and FASTQ 65 | database creation. If your sequences are in a different format see the 66 | developer documentation on :doc:`dev/parsers`. 67 | 68 | Creating a database 69 | ------------------- 70 | 71 | >>> import screed 72 | >>> screed.make_db('screed/tests/test-data/test.fa') 73 | 74 | This loads a FASTA file :code:`screed/tests/test-data/test.fa` into a screed database 75 | named :code:`screed/tests/test-data/test.fa_screed`.
A couple of things to note: 76 | 77 | * The screed database is independent of the text file from which it was derived, 78 | so moving, renaming or deleting :code:`screed/tests/test-data/test.fa` will not affect 79 | the newly created database. 80 | * The :code:`make_db` function inferred the file type as FASTA automatically. 81 | The :code:`read_fasta_sequences()` and :code:`read_fastq_sequences()` 82 | functions are available if you'd prefer to be explicit. 83 | 84 | >>> screed.read_fasta_sequences('screed/tests/test-data/test.fa') 85 | >>> screed.read_fastq_sequences('screed/tests/test-data/test.fastq') 86 | 87 | Opening a database 88 | ------------------ 89 | 90 | The class :code:`ScreedDB` is used to read screed databases, regardless of what 91 | file format they were derived from (FASTA/FASTQ/hava/etc.). One reader to rule 92 | them all! 93 | 94 | From the Python prompt, import the ScreedDB class and load some databases:: 95 | 96 | >>> from screed import ScreedDB 97 | >>> fadb = ScreedDB('screed/tests/test-data/test.fa') 98 | >>> fqdb = ScreedDB('screed/tests/test-data/test.fastq') 99 | 100 | Notice how you didn't need to write the '_screed' at the end of the file names? 101 | screed automatically adds that to the file name if you didn't. 102 | 103 | Database dictionary interface 104 | ----------------------------- 105 | 106 | Since screed emulates a read-only dictionary interface, any methods that don't 107 | modify a dictionary are supported:: 108 | 109 | >>> fadb.keys() 110 | >>> fqdb.keys() 111 | 112 | Each record in the database contains 'fields' such as name and sequence 113 | information. If the database was derived from a FASTQ file, quality and optional 114 | annotation strings are included. Conversely, FASTA-derived databases have a 115 | description field.
116 | 117 | To retrieve the names of records in the database:: 118 | 119 | >>> names = fadb.keys() 120 | 121 | The size of the databases (number of sequence records) is easily found:: 122 | 123 | >>> len(fadb) 124 | 22 125 | >>> len(fqdb) 126 | 125 127 | 128 | Retrieving records from a database 129 | ---------------------------------- 130 | 131 | A record is the standard container unit in screed. Each has *fields* that vary 132 | slightly depending on what kind of file the database was derived from. For 133 | instance, a FASTQ-derived screed database has an id, a name, a quality score and 134 | a sequence. A FASTA-derived screed database has an id, name, description and a 135 | sequence. 136 | 137 | Retrieving entire records:: 138 | 139 | >>> records = [r for r in fadb.itervalues()] 140 | 141 | Each record is a dictionary of fields. The names of fields are keys into this 142 | dictionary with the actual information as values. For example:: 143 | 144 | >>> record = fadb[fadb.keys()[0]] 145 | >>> index = record['id'] 146 | >>> name = record['name'] 147 | >>> description = record['description'] 148 | >>> sequence = record['sequence'] 149 | 150 | What this does is retrieve the first record object in the screed database, then 151 | retrieve the index, name, description and sequence from the record object using 152 | standard dictionary key -> value pairs. 153 | 154 | Retrieving partial sequences (slicing) 155 | -------------------------------------- 156 | 157 | screed supports the concept of retrieving a *slice* or a subset of a sequence 158 | string. The motivation is speed: if you have a database entry with a very long 159 | sequence string but only want a small portion of the string, it is faster to 160 | retrieve only the portion than to retrieve the entire string and then perform 161 | standard Python string slicing. 162 | 163 | By default, screed's FASTA database creator sets up the :code:`sequence` column 164 | to support slicing. 
For example, if you have an entry with name :code:`someSeq` 165 | which has a 10K long sequence, and you want a slice of the sequence spanning 166 | positions 4000 to 4080:: 167 | 168 | >>> seq = db['someSeq'].sequence 169 | >>> slice = seq[4000:4080] 170 | 171 | This is much faster than, say:: 172 | 173 | >>> seq = str(db['someSeq'].sequence) 174 | >>> slice = seq[4000:4080] 175 | 176 | Because deep down, less information is being read off the disk. The :code:`str()` 177 | method above causes the entire sequence to be retrieved as a string. Then Python 178 | slicing is done on the string :code:`seq` and the subset stored in 179 | :code:`slice`. 180 | 181 | Retrieving records *via* index 182 | ------------------------------ 183 | 184 | Sometimes you don't care what the name of a sequence is; you're only interested 185 | in its position in the database. In these cases, retrieval via index is the 186 | method you'll want to use:: 187 | 188 | >>> record = fqdb.loadRecordByIndex(5) 189 | 190 | An index is like an offset into the database. The order records were kept in the 191 | FASTA or FASTQ file determines the index in their resulting screed database. The 192 | first record in a sequence file will have an index of 0, the second, an index of 193 | 1 and so on. 194 | 195 | File Formats As Understood By Screed 196 | ==================================== 197 | 198 | While the screed database remains non-specific to file formats, the included 199 | FASTA and FASTQ parsers expect specific formats. These parsers attempt to handle 200 | the most common attributes of sequence files, though they cannot support all 201 | features. 202 | 203 | FASTQ 204 | ----- 205 | 206 | The FASTQ parsing function is :code:`read_fastq_sequences()` and is located in 207 | the screed module. 208 | 209 | The first line in a record must begin with '@' and is followed by a record 210 | identifier (a name). An optional annotations string may be included after a 211 | space on the same line.
212 | 213 | The second line begins the sequence line(s) which may be line wrapped. screed 214 | defines no limit on the length of sequence lines and no limit on how many 215 | sequence lines a record may contain. 216 | 217 | After the sequence line(s) comes a '+' character on a new line. Some FASTQ 218 | formats require the first line to be repeated after the '+' character, but since 219 | this adds no new information to the record, :code:`read_fastq_sequences()` will 220 | ignore this if it is included. 221 | 222 | The quality line(s) is last. Like the sequence line(s) this may be line wrapped. 223 | :code:`read_fastq_sequences()` will raise an exception if the quality and 224 | sequence strings are of unequal length. screed performs no checking for valid 225 | quality scores. 226 | 227 | FASTA 228 | ----- 229 | 230 | The FASTA parsing function is :code:`read_fasta_sequences()` and is also located in the 231 | screed module. 232 | 233 | The first line in a record must begin with '>' and is followed by the 234 | sequence's name and an optional description. If the description is included, it 235 | is separated from the name with a space. Note that though the FASTA format 236 | doesn't require named records, screed does. Without a unique name, screed can't 237 | look up sequences by name. 238 | 239 | The second line begins the line(s) of sequence. Like the FASTQ parser, 240 | :code:`read_fasta_sequences()` allows any number of lines of any length. 241 | 242 | FASTA <-> FASTQ Conversion 243 | ========================== 244 | 245 | As an extra nicety, screed can convert FASTA files to FASTQ and back again. 246 | 247 | FASTA to FASTQ 248 | -------------- 249 | 250 | The function used for this process is called 'ToFastq' and is located 251 | in the screed module. It takes the path to a screed database as the 252 | first argument and a path to the desired FASTQ file as the second 253 | argument.
There is also a shell interface if the screed module is in 254 | your PYTHONPATH:: 255 | 256 | $ python -m screed dump_fastq dbfile [outputfile] 257 | 258 | The FASTA name attribute is directly dumped from the file. The 259 | sequence attribute is also dumped pretty much directly, but is line 260 | wrapped to 80 characters if it is longer. 261 | 262 | Any description line in the FASTA database is stored as a FASTQ annotation 263 | string with no other interpretation done. 264 | 265 | Finally, as there is no quality or quality score in a FASTA file, a 266 | default one is generated. The generation of the quality follows the 267 | Sanger FASTQ conventions. The score is 1 (ASCII: '"') meaning a 268 | probability of about 79% that the read is incorrect (roughly a 1 in 5 269 | chance of being correct). This PHRED quality score is calculated from the Sanger 270 | format: Q = -10 log10(p) where p is the probability of an incorrect 271 | read. Obviously this is a very rough way of providing a quality score 272 | and it is only intended to fill in the requirements of a FASTQ 273 | file. Any application needing a true measurement of the quality 274 | should not rely on this automatic conversion. 275 | 276 | FASTQ to FASTA 277 | -------------- 278 | 279 | The function used for this process is called 'ToFasta' and is located 280 | in the screed module. It takes the path to a screed database as the 281 | first argument and a path to the desired FASTA file as the second 282 | argument. Like the ToFastq function before, there is a shell interface 283 | to ToFasta if the screed module is in your PYTHONPATH:: 284 | 285 | $ python -m screed dump_fasta dbfile [outputfile] 286 | 287 | As above, the name and sequence attributes are directly dumped from 288 | the FASTQ database to the FASTA file with the sequence line wrapping 289 | to 80 characters. 290 | 291 | If it exists, the FASTQ annotation tag is stored as the FASTA description tag. 292 | As there is no equivalent in FASTA, the FASTQ quality score is ignored.
293 | -------------------------------------------------------------------------------- /doc/user/known-issues.rst: -------------------------------------------------------------------------------- 1 | .. vim: set filetype=rst 2 | 3 | ============ 4 | Known Issues 5 | ============ 6 | 7 | This document details the known issues in the current release of screed. All 8 | issues for screed are tracked at https://github.com/dib-lab/khmer/labels/screed 9 | 10 | List of known issues 11 | ==================== 12 | 13 | Screed does not support gzip file streaming. This is an issue 14 | with Python 2.x and will likely *not* be fixed in future 15 | releases. https://github.com/dib-lab/khmer/issues/700 16 | 17 | Screed is overly tolerant of spaces in fast{q,a} which is against 18 | spec. https://github.com/dib-lab/khmer/issues/108 19 | -------------------------------------------------------------------------------- /legacy/ChangeLog: -------------------------------------------------------------------------------- 1 | 2016-11-14 Daniel Standage 2 | 3 | * screed/{__init__.py,fasta.py,fastq.py,openscreed.py}: Remove implementation 4 | (and related references) of Writer classes. 5 | * screed/tests/{test_open.py,test_open_cm.py}: Remove Writer tests. 6 | * screed/{screedRecord.py,{tests/test_{fasta,fastq}.py}}: New write_fastx 7 | implementation and associated tests. 8 | 9 | 2016-11-14 Luiz Irber 10 | 11 | * Makefile,doc/dev/release-checklist.rst,doc/screed.rst,jenkins-build.sh, 12 | pytest.ini, setup.{cfg,py}, tox.ini, screed/tests/{__main__,havaGen, 13 | screed_tst_utils,test_pygr_api, test_streaming}: Replace nose and adapt 14 | for pytest. 15 | 16 | 2016-10-13 Daniel Standage 17 | 18 | * .travis.yml: Reduce the size of the CI build. 19 | 20 | 2016-10-07 Luiz Irber 21 | 22 | * screed/f{a,q}dbm.py: Fix import errors on Python 3. 23 | * screed/tests/test_shell.py: check for return code and rewrite the test to 24 | work more like the expected usage in the shell. 
25 | 26 | 2016-10-06 Luiz Irber 27 | 28 | * tox.ini: Use codecov for coverage reports, add Python 3.5 to builds. 29 | * .travis.yml: Activate Python 3.5 build. 30 | * .github/{CONTRIBUTING,PULL_REQUEST_TEMPLATE}.md: Add GitHub templates. 31 | * Makefile: Throw an error If there are pep8 warnings. 32 | * doc/dev/coding-guidelinbes-and-review.rst: Update checklist 33 | * screed/{dna,openscreed,tests/test_attriberror}.py: Fix pep8 warnings. 34 | 35 | 2016-10-04 Luiz Irber 36 | 37 | * screed/screedRecord.py: Implement comparison using total_ordering 38 | decorator from functools. 39 | * screed/tests/test_attriberror.py: Fix syntax errors for Python 3 and 40 | remove tests for not implemented methods (they are implemented now). 41 | 42 | 2016-06-10 Titus Brown 43 | 44 | * screed/dna.py: Fix reverse complement calculation for Python 2.7 45 | 46 | 2015-06-22 Jacob Fenton 47 | 48 | * screed/tests/test_attriberror.py: added tests to check screed db attribute 49 | exception throwing 50 | * screed/screedRecord.py: removed __cmp__ function, explicitly disallowed 51 | all rich comparator functions that aren't == or != 52 | 53 | 2015-06-10 Michael R. Crusoe 54 | 55 | * doc/user/known-issues.rst: removed two fixed issues 56 | * doc/screed.rst: updated install & test instructions 57 | * doc/screed.html: removed un-needed file 58 | 59 | 2015-06-05 Titus Brown 60 | 61 | * screed/{fasta.py,fastq.py,seqparse.py}: Set parse_description default 62 | to False. 63 | * screed/tests/test*.py: updated tests appropriately. 64 | 65 | 2015-06-05 Luiz Irber 66 | 67 | * screed/screedRecord.py: Simplify implementation of record slicing. 68 | * screed/tests/test_fast{a,q}.py: Loop over distinct slices during test. 
69 | 70 | 2015-06-05 Michael Wright 71 | 72 | * screed/screedRecord.py: Allow slicing of screed records to fix issue #768 73 | 74 | 2015-06-05 en zyme 75 | 76 | * screed/tests/fastq.py: check for empty line in two places 77 | 78 | 2015-05-29 Luiz Irber 79 | 80 | * screed/openscreed.py: Add missing "close" method to context manager. 81 | 82 | 2015-05-27 Michael R. Crusoe 83 | 84 | * MANIFEST.in: ship the recently relocated test data, fixed reference to 85 | renamed LICENSE file 86 | * doc/dev/CODE_OF_CONDUCT.rst: drop unused symlink 87 | * doc/dev/release-checklist.rst: fix line wrap 88 | 89 | 2015-05-12 Luiz Irber 90 | 91 | * screed/openscreed.py: Implement open as a context manager, keep backward 92 | compatibility. 93 | * screed/tests/test_open_cm.py: Add same tests as test_open.py, but using 94 | a context manager to make sure file is closed after being used. 95 | 96 | 2015-04-15 Thomas Fenzl 97 | 98 | * screed/tests/screed_tst_utils.py: removed unnecessary import 99 | * screed/tests/test_streaming.py: changed execution order to handle 100 | missing import files better 101 | * screed/openscreed.py: pylint-ified 102 | 103 | 2015-04-15 Thomas Fenzl 104 | 105 | * Makefile: added setup.py develop to test goal 106 | * screed/openscreed.py,screed/tests/test_open.py: added handling of '-' 107 | 108 | 2015-04-09 Sarah Guermond 109 | 110 | * screed/screedRecord.py: renamed _screed_record_dict() to Rename() 111 | * screed/__init__.py: added import for Record 112 | * screed/fasta.py: changed _screed_record_dict() to Rename() 113 | * screed/fastq.py: changed _screed_record_dict() to Rename() 114 | 115 | 2015-04-09 Jacob Fenton 116 | 117 | * doc/dev/release-checklist.txt: added "making final release" notes 118 | * Makefile: copied over @mr-c's md-to-rst release notes conversion target 119 | * doc/dev/release-notes/RELEASE-0.8.txt: added rst version of release notes 120 | for sphinx 121 | * doc/dev/release-notes/index.txt: added rst version of 0.8 release notes to 122 | 
toctree 123 | 124 | 2015-04-07 Jacob Fenton 125 | 126 | * screed/tests/test_{dictionary, fasta, fasta_recover, fastq, fastq_recover, 127 | hava_methods, shell}.py: changed tests to use tempdirs 128 | * screed/tests/screed_tst_utils.py: copied in khmer test utils 129 | * screed/tests/{empty.fa, test-whitespace.fa, test.fa, test.fa.bz2, 130 | test.fa.gz, test.fa.zip, test.fastq, test.fastq.bz2, test.hava}: moved test 131 | data to screed/tests/test-data/ directory 132 | 133 | 2015-04-04 Jacob Fenton 134 | 135 | * doc/dev/release{.txt,-checklist.txt}: renamed/restored release.txt to 136 | release-checklist.txt 137 | 138 | 2015-03-06 Kevin Murray 139 | 140 | * screed/screedRecord.py: Fix a typo in a try: except: block. 141 | s/AttributError/AttributeError/ 142 | 143 | 2015-02-23 Gabriel Pratt 144 | 145 | * Fixed Issue 705 len(read) != len(read.sequence) 146 | 147 | 2015-02-23 Michael R. Crusoe 148 | 149 | * Doxyfile.in: make documentation generation reproducible; removed 150 | timestamp 151 | 152 | 2015-02-23 Michael R. Crusoe 153 | 154 | * doc/dev/release.txt: Fix formatting 155 | * MANIFEST.in: include the MANIFEST.in template, the license and other 156 | files 157 | * versioneer.py,screed/{__init__,_version}.py: upgrade versioneer to 0.13 158 | 159 | 2015-02-23 Michael R. Crusoe 160 | 161 | * setup.py: work around versioneer bug: 162 | https://github.com/warner/python-versioneer/issues/52 163 | 164 | 2014-12-07 Michael R. 
Crusoe 165 | 166 | * Initial jenkins-build.sh 167 | 168 | 2014-12-03 Jacob Fenton 169 | 170 | * ChangeLog: updated to include major revisions since 0.7.1 171 | * CODE_OF_CONDUCT: copied in code of conduct from khmer project 172 | * docs/dev/{CODE_OF_CONDUCT, coding-guidelines-and-review, index, 173 | releases}.txt, docs/index.txt: added screed dev docs 174 | * setup.py, .gitattributes, __init__.py, _version.py, versioneer.py: 175 | installed versioneer version naming system 176 | * MANIFEST.in: now includes versioneer files and empty testing file 177 | 178 | 2014-11-02 Michael R. Crusoe 179 | 180 | * Doxyfile, Makefile: added Doxygen support, coverage & pylint make targets 181 | 182 | 2014-10-27 Ben Taylor 183 | 184 | * benchmarks/fqGen.py, benchmarks/fqToFaConvert.py, 185 | benchmarks/mysql/mysqlCreateTimeit.py, benchmarks/screedCreateTimeit.py, 186 | benchmarks/pgres/pgresCreateTimeit.py, gibtests/__init__.py, 187 | doc/screed.html, doc/screed.txt, screed/conversion.py, screed/fastq.py, 188 | screed/tests/test_fastq.py: Changed all uses of "accuracy" to "quality" 189 | * screed/tests/test_dna.py: Added basic test coverage for screed/dna.py 190 | 191 | 2014-05-16 Michael R. Crusoe 192 | 193 | * screed/openscreed.py: added sniffing of compression types, including zip 194 | -------------------------------------------------------------------------------- /legacy/jenkins-build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if type python2> /dev/null 2>&1 4 | then 5 | PYTHON_EXECUTABLE=$(which python2) 6 | else 7 | PYTHON_EXECUTABLE=$(which python) 8 | fi 9 | virtualenv -p ${PYTHON_EXECUTABLE} .env 10 | 11 | . 
.env/bin/activate 12 | make install-dependencies > install_dependencies.out 13 | make develop 14 | make coverage.xml 15 | make tests.xml 16 | if type doxygen >/dev/null 2>&1 17 | then 18 | make doxygen 2>&1 > doxygen.out 19 | fi 20 | make pylint_report.txt 21 | make pep8_report.txt 22 | if type sloccount >/dev/null 2>&1 23 | then 24 | make sloccount.sc 25 | fi 26 | 27 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools >= 48", 4 | "setuptools_scm[toml] >= 4, <6", 5 | "setuptools_scm_git_archive", 6 | "wheel >= 0.29.0", 7 | ] 8 | build-backend = 'setuptools.build_meta' 9 | 10 | [tool.setuptools_scm] 11 | write_to = "screed/version.py" 12 | git_describe_command = "git describe --dirty --tags --long --match v* --first-parent" 13 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # If you change anything in addopts, 3 | # don't forget to update screed/tests/__main__.py too! 4 | addopts = -m "not known_failing" -v 5 | testpaths = screed/tests 6 | -------------------------------------------------------------------------------- /screed/DBConstants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2008, Michigan State University. 
2 | 3 | """ 4 | Defines some constant strings identifications used in multiple 5 | files throughout screed 6 | """ 7 | 8 | # Name of table holding information about rest of db 9 | _SCREEDADMIN = 'SCREEDADMIN' 10 | 11 | # Names of _SCREEDADMIN columns 12 | _FIELDNAME = 'FIELDNAME' 13 | _ROLENAME = 'ROLE' 14 | _PRIMARY_KEY = 'id' 15 | 16 | # Names of roles 17 | _STANDARD_TEXT = 'STANDARDATTR' 18 | _SLICEABLE_TEXT = 'SLICEABLEATTR' 19 | _INDEXED_TEXT_KEY = 'TEXTKEYATTR' 20 | _PRIMARY_KEY_ROLE = 'INTKEYATTR' 21 | 22 | # Name of table holding sequence information 23 | _DICT_TABLE = 'DICTIONARY_TABLE' 24 | 25 | # The file extension given to all screed databases 26 | fileExtension = '_screed' 27 | -------------------------------------------------------------------------------- /screed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2008, Michigan State University. 2 | # Copyright (c) 2021, University of California 3 | 4 | """ 5 | screed is a database tool useful for retrieving arbitrary kinds of sequence 6 | data through a on-disk database that emulates a read-only Python dictionary. 7 | 8 | For opening a screed database, the 'ScreedDB' class is used. This class 9 | accepts a string file path to a pre-created screed database. Read-only 10 | dictionary methods are implemented here. 11 | 12 | For creating a screed database, the 'create_db' function is used. This 13 | function accepts an iterator as an argument which will yield records 14 | from its respective sequence file. create_db will sequentially pull 15 | records from the iterator, writing them to disk in a screed database 16 | until the iterator is done. 17 | 18 | Automatic ways for parsing FASTA and FASTQ files are accessed through 19 | the read_fast*_sequences functions. These parse the given sequence 20 | file into a screed database. 
21 | 22 | Conversion between sequence file types is provided in the ToFastq and 23 | ToFasta functions 24 | """ 25 | 26 | from __future__ import absolute_import 27 | 28 | from screed.openscreed import ScreedDB 29 | from screed.openscreed import Open as open 30 | from screed.conversion import ToFastq 31 | from screed.conversion import ToFasta 32 | from screed.createscreed import create_db, make_db 33 | from screed.seqparse import read_fastq_sequences 34 | from screed.seqparse import read_fasta_sequences 35 | from screed.dna import rc 36 | from screed.screedRecord import Record 37 | 38 | 39 | from importlib.metadata import version, PackageNotFoundError 40 | try: 41 | VERSION = version(__name__) 42 | except PackageNotFoundError: # pragma: no cover 43 | try: 44 | from .version import version as VERSION # noqa 45 | except ImportError: # pragma: no cover 46 | raise ImportError( 47 | "Failed to find (autogenerated) version.py. " 48 | "This might be because you are installing from GitHub's tarballs, " 49 | "use the PyPI ones." 50 | ) 51 | __version__ = VERSION 52 | -------------------------------------------------------------------------------- /screed/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2016, The Regents of the University of California. 4 | 5 | from __future__ import absolute_import, print_function 6 | 7 | import argparse 8 | import sys 9 | 10 | from . import createscreed 11 | from . import dump_fasta 12 | from . import dump_fastq 13 | 14 | 15 | class ScreedCommands(object): 16 | 17 | def __init__(self): 18 | parser = argparse.ArgumentParser( 19 | description="", 20 | usage='''screed [] 21 | 22 | Available: 23 | 24 | db Creates a screed database. 
_MAXLINELEN = 80
# ASCII 34, i.e. a Phred+33 score of 1 (~79% chance of an incorrect read)
_null_quality = '\"'


def GetComments(value):
    """
    Return the free-text comment attached to a record.

    'description' (FASTA-style) takes precedence over 'annotations'
    (FASTQ-style); an empty string is returned when neither key is present.
    """
    if 'description' in value:
        return value['description']
    if 'annotations' in value:
        return value['annotations']
    return ''


def linewrap(longString):
    """
    Insert a newline after every _MAXLINELEN characters of longString.

    No trailing newline is added, and an empty input yields an empty string.
    """
    return '\n'.join(longString[start:start + _MAXLINELEN]
                     for start in range(0, len(longString), _MAXLINELEN))


def GenerateQuality(value):
    """
    Return the line-wrapped quality string for a record.

    The record's own 'quality' is used when present; otherwise a null
    quality (one _null_quality character per base) is generated.
    """
    if 'quality' in value:
        return linewrap(value['quality'])

    return linewrap(_null_quality * len(str(value['sequence'])))


def ToFastq(dbFile, outputFile):
    """
    Dump the screed database at dbFile to outputFile in FASTQ format.

    Returns the number of records written (0 for an empty database).
    The database handle and the output file are closed even if writing
    fails part-way through.
    """
    # Open the database first so a bad database path cannot leave a
    # freshly truncated output file behind.
    db = ScreedDB(dbFile)
    count = 0
    try:
        with open(outputFile, 'wb') as outFile:
            for value in db.itervalues():
                line = '@%s %s\n%s\n+\n%s\n' % (
                    value['name'],
                    GetComments(value),
                    linewrap(str(value['sequence'])),
                    GenerateQuality(value))
                outFile.write(line.encode('UTF-8'))
                count += 1
    finally:
        db.close()

    return count


def ToFasta(dbFile, outputFile):
    """
    Dump the screed database at dbFile to outputFile in FASTA format.

    Returns the number of records written (0 for an empty database).
    The database handle and the output file are closed even if writing
    fails part-way through.
    """
    db = ScreedDB(dbFile)
    count = 0
    try:
        with open(outputFile, 'wb') as outFile:
            for value in db.itervalues():
                line = '>%s %s\n%s\n' % (value['name'], GetComments(value),
                                         linewrap(str(value['sequence'])))
                outFile.write(line.encode('UTF-8'))
                count += 1
    finally:
        db.close()

    return count
rcrditer is an iterator returning records over a 23 | sequence dataset. Records yielded are in dictionary form 24 | """ 25 | try: 26 | sqlite3 27 | except NameError: 28 | raise Exception("error: sqlite3 is needed for this functionality" + 29 | " but is not installed.") 30 | 31 | if not filepath.endswith(DBConstants.fileExtension): 32 | filepath += DBConstants.fileExtension 33 | 34 | if os.path.exists(filepath): # Remove existing files 35 | os.unlink(filepath) 36 | 37 | con = sqlite3.connect(filepath) 38 | cur = con.cursor() 39 | 40 | # Sqlite PRAGMA settings for speed 41 | cur.execute("PRAGMA synchronous='OFF'") 42 | cur.execute("PRAGMA locking_mode=EXCLUSIVE") 43 | 44 | # Create the admin table 45 | cur.execute('CREATE TABLE %s (%s INTEGER PRIMARY KEY, ' 46 | '%s TEXT, %s TEXT)' % (DBConstants._SCREEDADMIN, 47 | DBConstants._PRIMARY_KEY, 48 | DBConstants._FIELDNAME, 49 | DBConstants._ROLENAME)) 50 | query = 'INSERT INTO %s (%s, %s) VALUES (?, ?)' % \ 51 | (DBConstants._SCREEDADMIN, DBConstants._FIELDNAME, 52 | DBConstants._ROLENAME) 53 | 54 | # Put the primary key in as an attribute 55 | cur.execute(query, (DBConstants._PRIMARY_KEY, 56 | DBConstants._PRIMARY_KEY_ROLE)) 57 | for attribute, role in fields: 58 | cur.execute(query, (attribute, role)) 59 | 60 | # Setup the dictionary table creation field substring 61 | fieldsub = ','.join(['%s TEXT' % field for field, role in fields]) 62 | 63 | # Create the dictionary table 64 | cur.execute('CREATE TABLE %s (%s INTEGER PRIMARY KEY, %s)' % 65 | (DBConstants._DICT_TABLE, DBConstants._PRIMARY_KEY, 66 | fieldsub)) 67 | 68 | # Setup the 'qmarks' sqlite substring 69 | qmarks = ','.join(['?' 
def make_db(filename):
    """
    Build a screed database from the sequence file at 'filename'.

    The file is opened with openscreed.Open (descriptions are parsed), the
    field layout is chosen from the name of the parser that Open selected
    (FASTQ or FASTA), and the records are handed to create_db.
    """
    iterfunc = openscreed.Open(filename, parse_description=True)

    # Map the selected parser function's name to its field definition
    field_mapping = {
        fastq.fastq_iter.__name__: fastq.FieldTypes,
        fasta.fasta_iter.__name__: fasta.FieldTypes
    }

    fieldTypes = field_mapping[iterfunc.iter_fn.__name__]

    # Create the screed db
    create_db(filename, fieldTypes, iterfunc)


def main(args):
    """
    Shell entry point: parse 'args' (a list of argument strings), build the
    database for the given filename, report where it was saved, and exit
    with status 0.
    """
    parser = argparse.ArgumentParser(description="A shell interface to the "
                                     "screed database writing function")
    parser.add_argument('filename')
    args = parser.parse_args(args)

    make_db(args.filename)

    print("Database saved in {}{}".format(args.filename,
                                          DBConstants.fileExtension))
    # sys.exit rather than the site-injected exit(), which is not
    # guaranteed to exist (e.g. under `python -S`).
    sys.exit(0)


if __name__ == "__main__":
    main(sys.argv[1:])
legal_dna = "ACGTN"


def is_DNA(seq):
    """
    Report whether every element of 'seq' is a legal DNA character.

    Returns 1 if all elements are found in `legal_dna`, otherwise 0.

    c.f. http://www.ncbi.nlm.nih.gov/BLAST/fasta.html
    """
    return 1 if all(base in legal_dna for base in seq) else 0


def reverse_complement(s):
    """
    Return the reverse complement of 's' (upper-cased first).

    Asserts that the upper-cased sequence is legal DNA.
    """
    upper = s.upper()
    assert is_DNA(upper), "Your sequence must be DNA!"

    return complement(reverse(upper))


rc = reverse_complement  # alias 'rc' to 'reverse_complement'

__complementTranslation = dict(zip("ACGTN", "TGCAN"))


def complement(s):
    """
    Return 's' with each base replaced by its complement.
    """
    return "".join(map(__complementTranslation.__getitem__, s))


def reverse(s):
    """
    Return 's' back to front, as a string.
    """
    return "".join(s[::-1])
# Shell interface to the ToFasta screed conversion function
def main(args):
    """
    Convert a screed database to FASTA.

    'args' is a list of argument strings: the database file, optionally
    followed by the output file (default '/dev/stdout'). Exits with
    status 1 if the database file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Convert a screed database to a FASTA file")
    parser.add_argument('dbfile')
    parser.add_argument('outputfile', default='/dev/stdout', nargs='?')
    args = parser.parse_args(args)

    if not os.path.isfile(args.dbfile):
        print("No such file: %s" % args.dbfile)
        # sys.exit rather than the site-injected exit() helper
        sys.exit(1)

    n = ToFasta(args.dbfile, args.outputfile)

    sys.stderr.write('Wrote {} records in FASTA format.\n'.format(n))


if __name__ == '__main__':
    # Pass the argument *list*; a bare sys.argv[1] string would be split
    # into single characters by argparse.
    main(sys.argv[1:])
# Shell interface to the ToFastq screed conversion function
def main(args):
    """
    Convert a screed database to FASTQ.

    'args' is a list of argument strings: the database file, optionally
    followed by the output file (default '/dev/stdout'). Exits with
    status 1 if the database file does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Convert a screed database to a FASTQ file")
    parser.add_argument('dbfile')
    parser.add_argument('outputfile', default='/dev/stdout', nargs='?')
    args = parser.parse_args(args)

    if not os.path.isfile(args.dbfile):
        print("No such file: %s" % args.dbfile)
        # sys.exit rather than the site-injected exit() helper
        sys.exit(1)

    n = ToFastq(args.dbfile, args.outputfile)

    sys.stderr.write('Wrote {} records in FASTQ format.\n'.format(n))


if __name__ == '__main__':
    # Pass the argument *list*; a bare sys.argv[1] string would be split
    # into single characters by argparse.
    main(sys.argv[1:])
def fastq_iter(handle, line=None, parse_description=False):
    """
    Generate records from a FASTQ file handle opened for reading.

    'line', when given, is used as the first header line instead of reading
    one from 'handle'. With parse_description=True the header is split at
    the first space into name and annotations; otherwise the whole header
    is the name. Raises IOError when a header does not start with '@' or
    when the sequence and quality strings differ in length.
    """
    def next_line():
        # One stripped line from the handle, passed through to_str
        return to_str(handle.readline().strip())

    header = handle.readline() if line is None else line
    header = to_str(header.strip())

    while header:
        if not header.startswith('@'):
            raise IOError("Bad FASTQ format: no '@' at beginning of line")

        # Name and (optional) annotations
        title = header[1:]
        name, annotations = title, ''
        if parse_description:
            head, sep, tail = title.partition(' ')
            if sep:
                name, annotations = head, tail

        # Sequence lines run until a '+' (or '#') line, blank line or EOF
        seq_parts = []
        current = next_line()
        while current and not current.startswith(('+', '#')):
            seq_parts.append(current)
            current = next_line()
        sequence = ''.join(seq_parts)

        # Quality lines: read until as many characters as the sequence
        # have been collected, or a blank line / EOF is reached
        qual_parts = []
        collected = 0
        current = next_line()
        while current != '' and collected < len(sequence):
            qual_parts.append(current)
            collected += len(current)
            current = next_line()
        quality = ''.join(qual_parts)

        if len(sequence) != len(quality):
            raise IOError('sequence and quality strings must be '
                          'of equal length')

        # The line read after the quality block is the next header
        header = current
        yield Record(name=name, annotations=annotations,
                     sequence=sequence, quality=quality)
def hava_iter(handle):
    """
    Iterator over a 'hava' sequence file, returning records. handle
    is a handle to a file opened for reading.

    Each record is six consecutive lines, in the order hava, quarzk,
    muchalo, fakours, selimizicka, marshoon. Every line is stripped and
    passed through to_str(). Iteration stops at the first empty 'hava'
    line (which includes end of file).

    Every yielded record is a fresh dict, so records collected by the
    caller (e.g. with list()) stay independent of one another.
    """
    trailing_fields = ('quarzk', 'muchalo', 'fakours', 'selimizicka',
                       'marshoon')

    line = to_str(handle.readline().strip())
    while line:
        # A new dict per record: reusing one dict would make every record
        # already handed out change when the next one is parsed.
        data = {'hava': line}
        for field in trailing_fields:
            data[field] = to_str(handle.readline().strip())

        # Read the next record's first line before yielding, as before.
        line = to_str(handle.readline().strip())
        yield data
import screedRecord 20 | from .fastq import fastq_iter 21 | from .fasta import fasta_iter 22 | from .utils import to_str 23 | 24 | 25 | def _normalize_filename(filename): 26 | """Map '-' to '/dev/stdin' to handle the usual shortcut.""" 27 | if filename == '-': 28 | filename = '/dev/stdin' 29 | return filename 30 | 31 | 32 | class Open(object): 33 | def __init__(self, filename, *args, **kwargs): 34 | self.sequencefile = None 35 | self.iter_fn = self.open_reader(filename, *args, **kwargs) 36 | if self.iter_fn: 37 | self.__name__ = self.iter_fn.__name__ 38 | 39 | def open_reader(self, filename, *args, **kwargs): 40 | """ 41 | Make a best-effort guess as to how to parse the given sequence file. 42 | 43 | Handles '-' as shortcut for stdin. 44 | Deals with .gz, FASTA, and FASTQ records. 45 | """ 46 | magic_dict = { 47 | b"\x1f\x8b\x08": "gz", 48 | b"\x42\x5a\x68": "bz2", 49 | # "\x50\x4b\x03\x04": "zip" 50 | } # Inspired by http://stackoverflow.com/a/13044946/1585509 51 | filename = _normalize_filename(filename) 52 | bufferedfile = io.open(file=filename, mode='rb', buffering=8192) 53 | num_bytes_to_peek = max(len(x) for x in magic_dict) 54 | file_start = bufferedfile.peek(num_bytes_to_peek) 55 | compression = None 56 | for magic, ftype in magic_dict.items(): 57 | if file_start.startswith(magic): 58 | compression = ftype 59 | break 60 | if compression == 'bz2': 61 | sequencefile = bz2.BZ2File(filename=bufferedfile) 62 | peek = sequencefile.peek(1) 63 | elif compression == 'gz': 64 | if not bufferedfile.seekable(): 65 | bufferedfile.close() 66 | raise ValueError("gziped data not streamable, pipe " 67 | "through zcat first") 68 | peek = gzip.GzipFile(filename=filename).read(1) 69 | sequencefile = gzip.GzipFile(filename=filename) 70 | bufferedfile.close() 71 | else: 72 | peek = bufferedfile.peek(1) 73 | sequencefile = bufferedfile 74 | 75 | iter_fn = None 76 | try: 77 | first_char = peek[0] 78 | except IndexError as err: 79 | sequencefile.close() 80 | return [] # empty file 
class ScreedDB(MutableMapping):

    """
    Core on-disk dictionary interface for reading screed databases. Accepts a
    path string to a screed database.

    The mapping is read-only: every mutating method raises
    NotImplementedError.
    """

    def __init__(self, filepath):
        """
        Open the screed database at 'filepath' (the screed file extension is
        appended when missing) and read its field layout.

        Raises ValueError when the file does not exist and TypeError when
        the sqlite file does not hold exactly the two screed tables.
        """
        self._db = None
        try:
            sqlite3
        except NameError:
            raise Exception("error: sqlite3 is needed for this " +
                            "functionality, but is not installed.")

        self._filepath = filepath
        if not self._filepath.endswith(DBConstants.fileExtension):
            self._filepath += DBConstants.fileExtension

        if not os.path.exists(self._filepath):
            raise ValueError('No such file: %s' % self._filepath)

        self._db = sqlite3.connect(self._filepath)
        cursor = self._db.cursor()

        # Make sure the database is a prepared screed database
        query = "SELECT name FROM sqlite_master WHERE type='table' "\
                "ORDER BY name"
        res = cursor.execute(query)
        try:
            # Unpacking None (fewer than two tables) raises TypeError too
            dictionary_table, = res.fetchone()
            admin_table, = res.fetchone()

            if dictionary_table != DBConstants._DICT_TABLE:
                raise TypeError
            if admin_table != DBConstants._SCREEDADMIN:
                raise TypeError

        except TypeError:
            self.close()
            raise TypeError("Database %s is not a proper screed database"
                            % self._filepath)

        nothing = res.fetchone()
        if nothing is not None:
            self.close()
            # Report the path that was actually opened
            raise TypeError("Database %s has too many tables."
                            % self._filepath)

        # Store the fields of the admin table in a tuple
        query = "SELECT %s, %s FROM %s" % \
            (DBConstants._FIELDNAME,
             DBConstants._ROLENAME,
             DBConstants._SCREEDADMIN)
        res = cursor.execute(query)
        self.fields = tuple([(str(field), role) for field, role in res])

        # Indexed text column for querying, search fields to find
        self._queryBy = self.fields[1][0]
        for fieldname, role in self.fields:
            if role == DBConstants._INDEXED_TEXT_KEY:
                self._queryBy = fieldname

        # Sqlite PRAGMA settings for speed
        cursor.execute("PRAGMA cache_size=2000")

        # Retrieve the length of the database. MAX() is NULL for an empty
        # table, which must count as zero records rather than None.
        query = 'SELECT MAX(%s) FROM %s' % (DBConstants._PRIMARY_KEY,
                                            DBConstants._DICT_TABLE)
        max_id, = cursor.execute(query).fetchone()
        self._len = max_id if max_id is not None else 0

    def __del__(self):
        """
        Alias for close()
        """
        self.close()

    def close(self):
        """
        Closes the sqlite database handle. Safe to call more than once.
        """
        # getattr: __del__ may run on an instance whose __init__ never ran
        db = getattr(self, '_db', None)
        if db is not None:
            db.close()
            self._db = None

    def __getitem__(self, key):
        """
        Retrieves from database the record with the key 'key'.
        Raises KeyError when no such record exists.
        """
        cursor = self._db.cursor()
        key = str(key)  # So lazy retrieval objects are evaluated
        query = 'SELECT %s FROM %s WHERE %s=?' % (self._queryBy,
                                                  DBConstants._DICT_TABLE,
                                                  self._queryBy)
        res = cursor.execute(query, (key,))
        if res.fetchone() is None:
            raise KeyError("Key %s not found" % key)
        return screedRecord._buildRecord(self.fields, self._db,
                                         key,
                                         self._queryBy)

    def values(self):
        """
        Retrieves all records from the database and returns them as a list
        """
        return list(self.itervalues())

    def items(self):
        """
        Retrieves all records from the database and returns them as a list
        of the pairs produced by iteritems()
        """
        return list(self.iteritems())

    def loadRecordByIndex(self, index):
        """
        Retrieves record from database at the given (zero-based) index.
        Raises KeyError when the index is not present.
        """
        cursor = self._db.cursor()
        index = int(index) + 1  # Hack to make indexing start at 0
        query = 'SELECT %s FROM %s WHERE %s=?' % (DBConstants._PRIMARY_KEY,
                                                  DBConstants._DICT_TABLE,
                                                  DBConstants._PRIMARY_KEY)
        res = cursor.execute(query, (index,))
        if res.fetchone() is None:
            raise KeyError("Index %d not found" % index)
        return screedRecord._buildRecord(self.fields, self._db,
                                         index,
                                         DBConstants._PRIMARY_KEY)

    def __len__(self):
        """
        Returns the number of records in the database
        """
        return self._len

    def keys(self):
        """
        Returns a list of keys in the database
        """
        return list(self.iterkeys())

    def __repr__(self):
        """
        Returns a string with some general information about the database
        """
        return "<%s, '%s'>" % (self.__class__.__name__,
                               self._filepath)

    def itervalues(self):
        """
        Iterator over records in the database
        """
        for index in range(1, self.__len__() + 1):
            yield screedRecord._buildRecord(self.fields, self._db,
                                            index,
                                            DBConstants._PRIMARY_KEY)

    def iterkeys(self):
        """
        Iterator over keys in the database
        """
        cursor = self._db.cursor()
        query = 'SELECT %s FROM %s ORDER BY id' % (
            self._queryBy, DBConstants._DICT_TABLE)
        for key, in cursor.execute(query):
            yield key

    def __iter__(self):
        """
        Iterates over the keys, like a dict
        """
        return self.iterkeys()

    def iteritems(self):
        """
        Iterator returning (index, record) pairs, where index is the
        record's primary-key field
        """
        for v in self.itervalues():
            yield v[DBConstants._PRIMARY_KEY], v

    def has_key(self, key):
        """
        Returns true if given key exists in database, false otherwise
        """
        return key in self

    def copy(self):
        """
        Returns this same instance (the database is read-only, so no
        separate copy is made)
        """
        return self

    def __contains__(self, key):
        """
        Returns true if given key exists in database, false otherwise
        """
        cursor = self._db.cursor()
        query = 'SELECT %s FROM %s WHERE %s = ?' % \
            (self._queryBy, DBConstants._DICT_TABLE, self._queryBy)
        if cursor.execute(query, (key,)).fetchone() is None:
            return False
        return True

    # Here follow the methods that are not implemented. Their signatures
    # accept what the mapping protocol passes in, so callers get
    # NotImplementedError instead of a confusing TypeError.

    def __setitem__(self, something, value=None):
        """
        Not implemented (Read-only database)
        """
        raise NotImplementedError

    def __delitem__(self, something):
        """
        Not implemented (Read-only database)
        """
        raise NotImplementedError

    def clear(self):
        """
        Not implemented (Read-only database)
        """
        raise NotImplementedError

    def update(self, something=(), **kwargs):
        """
        Not implemented (Read-only database)
        """
        raise NotImplementedError

    def setdefault(self, something, default=None):
        """
        Not implemented (Read-only database)
        """
        raise NotImplementedError

    def pop(self, *args):
        """
        Not implemented (Read-only database)
        """
        raise NotImplementedError

    def popitem(self):
        """
        Not implemented (Read-only database)
        """
        raise NotImplementedError
class ScreedSequenceDB(SequenceDB):

    """SequenceDB implementation based on screed; retrieve seqs by name."""
    # NOTE(review): presumably the class pygr's SequenceDB instantiates for
    # each sequence it hands out -- confirm against pygr.
    itemClass = ScreedSequence

    def __init__(self, filepath):
        """Remember 'filepath' and build the name-keyed seqInfoDict on it."""
        self.filepath = filepath
        # seqInfoDict is assigned before the base-class initializer runs.
        self.seqInfoDict = _ScreedSeqInfoDict_ByName(filepath)
        SequenceDB.__init__(self)

    def _set_seqtype(self):
        """Set the sequence type of this database to DNA_SEQTYPE."""
        self._seqtype = DNA_SEQTYPE

    def __repr__(self):
        """Return "<ClassName 'filepath'>"."""
        return "<%s '%s'>" % (self.__class__.__name__, self.filepath)

    # override inherited __reduce__/__getstate__/__setstate__ from SequenceDB.
    def __reduce__(self):
        """Pickle as a call to ScreedSequenceDB(filepath)."""
        return (ScreedSequenceDB, (self.filepath,))
103 | def __reduce__(self): 104 | return (ScreedSequenceDB_ByIndex, (self.filepath,)) 105 | 106 | 107 | class _ScreedSequenceInfo(object): 108 | 109 | """Objects to put in seqInfoDict values, for holding screed record info.""" 110 | 111 | def __init__(self, id, record): 112 | self.id = id 113 | self.record = record 114 | self.length = len(record.sequence) 115 | 116 | 117 | class _ScreedSeqInfoDict_ByName(object, UserDict.DictMixin): 118 | 119 | """seqInfoDict implementation that uses names to retrieve records.""" 120 | 121 | def __init__(self, filepath): 122 | self.sdb = ScreedDB(filepath) 123 | 124 | def __getitem__(self, k): 125 | v = self.sdb[k] 126 | return _ScreedSequenceInfo(k, v) 127 | 128 | def keys(self): 129 | return self.sdb.keys() 130 | 131 | def itervalues(self): 132 | i = 0 133 | max_index = len(self.sdb) 134 | while i < max_index: 135 | v = self.sdb.loadRecordByIndex(i) 136 | yield _ScreedSequenceInfo(v.name, v) 137 | i += 1 138 | 139 | def iteritems(self): 140 | for v in self.itervalues(): 141 | yield v.record.name, v 142 | 143 | 144 | class _ScreedSeqInfoDict_ByIndex(object, UserDict.DictMixin): 145 | 146 | """seqInfoDict implementation that uses indices to retrieve records.""" 147 | 148 | def __init__(self, filepath): 149 | self.sdb = ScreedDB(filepath) 150 | 151 | def __getitem__(self, k): 152 | n = int(k) 153 | v = self.sdb.loadRecordByIndex(n) 154 | return _ScreedSequenceInfo(k, v) 155 | 156 | def keys(self): 157 | return range(0, len(self.sdb)) 158 | 159 | def iterkeys(self): 160 | i = 0 161 | max_index = len(self.sdb) 162 | while i < max_index: 163 | yield i 164 | i += 1 165 | 166 | 167 | if __name__ == '__main__': 168 | import sys 169 | filename = sys.argv[1] 170 | 171 | db = ScreedSequenceDB(filename) 172 | for k in db: 173 | print(k, repr(db[k]), db[k].name) 174 | 175 | db = ScreedSequenceDB_ByIndex(filename) 176 | for k in db: 177 | print(k, repr(db[k]), db[k].name) 178 | 
class Record(MutableMapping):
    """
    Simple dict-like record interface with bag behavior.

    Fields live in the plain dict 'd' and can be read either as items
    (record['name']) or as attributes (record.name). len(record) is the
    length of the sequence, not the number of fields. Slicing a record
    returns a new Record whose 'sequence' (and 'quality', when present)
    are sliced accordingly.
    """

    def __init__(self, name=None, sequence=None, **kwargs):
        d = dict()
        if name is not None:
            d['name'] = name
        if sequence is not None:
            d['sequence'] = sequence

        d.update(kwargs)

        # A quality of None means "no quality": drop the field entirely
        if 'quality' in d and d['quality'] is None:
            del d['quality']
        self.d = d

    def __setitem__(self, name, value):
        self.d[name] = value

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails. Go through
        # __dict__ instead of self.d: on an instance that has no 'd' yet
        # (as created by copy.copy / copy.deepcopy / pickle before the
        # state is restored) 'self.d' would re-enter __getattr__ forever.
        try:
            return self.__dict__['d'][name]
        except KeyError:
            raise AttributeError(name)

    def __len__(self):
        return len(self.sequence)

    def keys(self):
        return self.d.keys()

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            # Slice the sequence-like fields, keep everything else as is
            trimmed = dict(self.d)
            trimmed['sequence'] = trimmed['sequence'][idx]
            if 'quality' in trimmed:
                trimmed['quality'] = trimmed['quality'][idx]
            return Record(**trimmed)
        return self.d[idx]

    def __delitem__(self, key):
        del self.d[key]

    def __iter__(self):
        return iter(self.d)

    def __repr__(self):
        return repr(self.d)
| dbOjb = database handle 77 | attrName = name of attr in db 78 | rowName = index/name of row 79 | queryBy = by name or index 80 | """ 81 | self._dbObj = dbObj 82 | self._attrName = attrName 83 | self._rowName = rowName 84 | self._queryBy = queryBy 85 | 86 | def __getitem__(self, sliceObj): 87 | """ 88 | Slicing interface. Returns the slice range given. 89 | *.start + 1 to be compatible with sqlite's 1 not 0 scheme 90 | """ 91 | if not isinstance(sliceObj, slice): 92 | raise TypeError('__getitem__ argument must be of slice type') 93 | if not sliceObj.start <= sliceObj.stop: # String reverse in future? 94 | raise ValueError('start must be less than stop in slice object') 95 | length = sliceObj.stop - sliceObj.start 96 | 97 | query = 'SELECT substr(%s, %d, %d) FROM %s WHERE %s = ?' \ 98 | % (self._attrName, sliceObj.start + 1, length, 99 | DBConstants._DICT_TABLE, 100 | self._queryBy) 101 | cur = self._dbObj.cursor() 102 | result = cur.execute(query, (str(self._rowName),)) 103 | try: 104 | subStr, = result.fetchone() 105 | except TypeError: 106 | raise KeyError("Key %s not found" % self._rowName) 107 | return str(subStr) 108 | 109 | def __len__(self): 110 | """ 111 | Returns the length of the string 112 | """ 113 | return len(self.__str__()) 114 | 115 | def __repr__(self): 116 | """ 117 | Prints out the name of the class and the name of the sliceable attr 118 | """ 119 | return "<%s '%s'>" % (self.__class__.__name__, self._attrName) 120 | 121 | def __eq__(self, given): 122 | """ 123 | Compares attribute to given object in string form 124 | """ 125 | if isinstance(given, bytes): 126 | return given == self.__str__() 127 | else: 128 | return str(given) == self.__str__() 129 | 130 | def __lt__(self, given): 131 | if isinstance(given, bytes): 132 | return self.__str__() < given 133 | else: 134 | return self.__str__() < str(given) 135 | 136 | def __str__(self): 137 | """ 138 | Returns the full attribute as a string 139 | """ 140 | query = 'SELECT %s FROM %s WHERE %s = ?' 
\ 141 | % (self._attrName, DBConstants._DICT_TABLE, self._queryBy) 142 | cur = self._dbObj.cursor() 143 | result = cur.execute(query, (str(self._rowName),)) 144 | try: 145 | record, = result.fetchone() 146 | except TypeError: 147 | raise KeyError("Key %s not found" % self._rowName) 148 | return str(record) 149 | 150 | 151 | def _buildRecord(fieldTuple, dbObj, rowName, queryBy): 152 | """ 153 | Constructs a dict-like object with record attribute names as keys and 154 | _screed_attr objects as values 155 | """ 156 | 157 | # Separate the lazy and full retrieval objects 158 | kvResult = [] 159 | fullRetrievals = [] 160 | for fieldname, role in fieldTuple: 161 | if role == DBConstants._SLICEABLE_TEXT: 162 | kvResult.append((fieldname, _screed_attr(dbObj, 163 | fieldname, 164 | rowName, 165 | queryBy))) 166 | else: 167 | fullRetrievals.append(fieldname) 168 | 169 | # Retrieve the full text fields from the db 170 | subs = ','.join(fullRetrievals) 171 | query = 'SELECT %s FROM %s WHERE %s=?' % \ 172 | (subs, DBConstants._DICT_TABLE, queryBy) 173 | cur = dbObj.cursor() 174 | res = cur.execute(query, (rowName,)) 175 | 176 | # Add the full text fields to the result tuple list 177 | data = tuple([str(r) for r in res.fetchone()]) 178 | kvResult.extend(zip(fullRetrievals, data)) 179 | 180 | # Hack to make indexing start at 0 181 | hackedResult = [] 182 | for key, value in kvResult: 183 | if key == DBConstants._PRIMARY_KEY: 184 | hackedResult.append((key, int(value) - 1)) 185 | else: 186 | hackedResult.append((key, value)) 187 | 188 | return Record(**dict(hackedResult)) 189 | 190 | 191 | def write_fastx(record, fileobj): 192 | """Write sequence record to 'fileobj' in FASTA/FASTQ format.""" 193 | isbytesio = isinstance(fileobj, BytesIO) 194 | iswb = hasattr(fileobj, 'mode') and fileobj.mode == 'wb' 195 | outputvalid = isbytesio or iswb 196 | if not outputvalid: 197 | message = ('cannot call "write_fastx" on object, must be of a file ' 198 | 'handle with mode "wb" or an instance of 
def write_fastx_pair(read1, read2, fileobj):
    """Write a pair of sequence records to 'fileobj' in FASTA/FASTQ format.

    Both reads are written with write_fastx(), read1 first. When read1
    carries a 'quality' attribute, read2 must carry one as well.
    """
    if hasattr(read1, 'quality'):
        assert hasattr(read2, 'quality')
    # write_fastx is the single-record writer in this module; the name
    # 'write_record' used here before does not exist.
    write_fastx(read1, fileobj)
    write_fastx(read2, fileobj)
def read_hava_sequences(filename):
    """
    Function to parse text from the given HAVA file into a screed database.

    Returns a ScreedDB opened on the database that was created. The input
    file is closed again even when building the database fails.
    """
    # Will raise an exception if the file doesn't exist
    with open(filename, "rb") as theFile:
        # Setup the iterator function
        iterfunc = hava.hava_iter(theFile)

        # Create the screed db
        create_db(filename, hava.FieldTypes, iterfunc)

    return ScreedDB(filename)
class collectionOFiles(object):
    """
    A set of 'hava' output files that all receive the same records but are
    closed after different numbers of records.

    File i is named '<baseName>_<i>'. File 0 takes totalSize records; file
    i (i > 0) takes totalSize // (2 * i) records. Every file receives at
    least one record before its quota is checked.
    """

    def __init__(self, baseName, divisions, totalSize):
        self.baseName = baseName
        self.divisions = divisions
        self.totalSize = totalSize

        # filename -> (open handle, record quota, records written so far)
        self.fileHandles = {}
        for i in range(0, divisions):
            filename = self.baseName + "_%d" % i
            fh = open(filename, "wb")
            divisor = i * 2
            if divisor == 0:
                divisor = 1
            # Integer division keeps the quota a whole record count
            self.fileHandles[filename] = (fh, self.totalSize // divisor, 0)

    def writeRecord(self, hava, quarzk, muchalo, fakours, selimizicka,
                    marshoon):
        """Append one six-line record to every file that is still open and
        close the files that have reached their quota."""
        # The files are opened in binary mode, so encode the record once
        payload = ("%s\n%s\n%s\n%s\n%s\n%s\n" %
                   (hava, quarzk, muchalo, fakours, selimizicka,
                    marshoon)).encode('utf-8')

        toRemove = []
        for filename in self.fileHandles:
            fh, limit, count = self.fileHandles[filename]
            fh.write(payload)
            count += 1
            if count >= limit:
                fh.close()
                toRemove.append(filename)
            else:
                # Re-assigning an existing key does not resize the dict,
                # so this is safe while iterating over it
                self.fileHandles[filename] = (fh, limit, count)

        for filename in toRemove:
            self.fileHandles.pop(filename)

    def finished(self):
        """Return True once every file has been filled and closed."""
        return len(self.fileHandles) == 0
counter = 0 75 | lenString = 80 76 | allowedQuarzk = ['A', 'T', 'C', 'G'] 77 | allowedMuchalo = "A B C D E F G H I J K L M N O P".split(' ') 78 | allowedFakours = "1 2 3 4 5 6 7 8 9".split(' ') 79 | allowedSelimizicka = ["b"] 80 | allowedMarshoon = "A 1 B 2 C 3 D 4 E 5 G 6 F 7".split(' ') 81 | while not cof.finished(): 82 | hava = "test_00%d" % counter 83 | quarzk = genString(lenString, allowedQuarzk) 84 | muchalo = genString(lenString, allowedMuchalo) 85 | fakours = genString(lenString, allowedFakours) 86 | selimizicka = genString(lenString, allowedSelimizicka) 87 | marshoon = genString(lenString, allowedMarshoon) 88 | cof.writeRecord(hava, quarzk, muchalo, fakours, selimizicka, marshoon) 89 | counter += 1 90 | return 91 | 92 | 93 | if __name__ == '__main__': 94 | if len(sys.argv) != 4: 95 | print("Usage: ") 96 | exit(1) 97 | 98 | filename = sys.argv[1] 99 | size = int(sys.argv[2]) 100 | divisions = int(sys.argv[3]) 101 | 102 | createHavaFiles(filename, size, divisions) 103 | -------------------------------------------------------------------------------- /screed/tests/screed_tst_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # This file is part of screed, http://github.com/dib-lab/screed/, and is 3 | # Copyright (C) Michigan State University, 2009-2015. It is licensed under 4 | # the three-clause BSD license; see doc/LICENSE.txt. 
def get_temp_filename(filename, tempdir=None):
    """Return the path of 'filename' inside a temporary directory.

    When 'tempdir' is given it is used as is. Otherwise a fresh directory
    with the prefix 'screedtest_' is created and remembered in
    cleanup_list.
    """
    if tempdir is not None:
        return os.path.join(tempdir, filename)

    scratch = tempfile.mkdtemp(prefix='screedtest_')
    cleanup_list.append(scratch)
    return os.path.join(scratch, filename)
CCTGGATTCATAACATCTATAATAATTTTTATATGTGGTAGAGTAATATTAGCTGATTCC 5 | TTTGCCTCCTGTTCCTTCCCCTCATTCAGGCAGCTGGCCAGGTTTGTGCTCCTTATCTCG 6 | CAGAAGAGATGTGATAGCAGGCAGAGAATTAAAGTCTTCCTGGCTTTTGGTTTCAGAAGC 7 | TGCCTTGGGAAGGAAGCAAACAAACATGCCACAGATAAAATATTTGAAAGAAAAGATAAT 8 | GAAAGTAGAAAAGGGTTCCCTGTTCTTGTGGGGAGGAAGTGA 9 | 10 | >ENSMICT00000012401 cdna:novel scaffold:micMur1:scaffold_184912:461:550:-1 gene:ENSMICG00000012409 11 | GAACAGTCTCCTTTGGTTTGTGAAAAGAAACAAAAGAGTGTGGGGGTCGGGGAGCTCATC 12 | CAGCACTTCGTCGATTTCATGACCAACCAG 13 | -------------------------------------------------------------------------------- /screed/tests/test-data/test.fa.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dib-lab/screed/f290cadb01112510a7aa8f3942238ac617b6d4da/screed/tests/test-data/test.fa.bz2 -------------------------------------------------------------------------------- /screed/tests/test-data/test.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dib-lab/screed/f290cadb01112510a7aa8f3942238ac617b6d4da/screed/tests/test-data/test.fa.gz -------------------------------------------------------------------------------- /screed/tests/test-data/test.fa.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dib-lab/screed/f290cadb01112510a7aa8f3942238ac617b6d4da/screed/tests/test-data/test.fa.zip -------------------------------------------------------------------------------- /screed/tests/test-data/test.fastq.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dib-lab/screed/f290cadb01112510a7aa8f3942238ac617b6d4da/screed/tests/test-data/test.fastq.bz2 -------------------------------------------------------------------------------- /screed/tests/test-data/test.fastq.gz: 
class test_comparisons():
    """Compare record sequences from a screed database with a non-string.

    Every test walks the keys of the database opened in ``setup_method``
    and compares each record's ``sequence`` with a ``nostring`` instance,
    checking the boolean that the operator produces.
    """

    def setup_method(self):
        # Work on a copy inside a temporary directory so the file that
        # teardown_method removes is never created next to the test data.
        self._testfile = utils.get_temp_filename('test.fa')
        shutil.copy(utils.get_test_data('test.fa'), self._testfile)
        screed.read_fasta_sequences(self._testfile)

        self._db = screed.ScreedDB(self._testfile)
        self._ns = nostring()

    def teardown_method(self):
        # Named teardown_method so that it pairs with setup_method (pytest's
        # xunit-style hooks).  The previous name, `teardown`, is a nose-style
        # hook that newer pytest releases no longer call, which left the
        # database file behind after every test.
        os.unlink(self._testfile + fileExtension)

    def test_eq(self):
        for k in self._db:
            record = self._db.get(k)
            res = (record.sequence == self._ns)
            assert res is False, res

    def test_neq(self):
        for k in self._db:
            record = self._db.get(k)
            res = (record.sequence != self._ns)
            assert res is True, res

    def test_comp_greateq(self):
        for k in self._db:
            record = self._db.get(k)
            res = (record.sequence >= self._ns)
            assert res is True, res

    def test_comp_lesseq(self):
        for k in self._db:
            record = self._db.get(k)
            res = (record.sequence <= self._ns)
            assert res is False, res

    def test_comp_less(self):
        for k in self._db:
            record = self._db.get(k)
            res = (record.sequence < self._ns)
            assert res is False, res

    def test_comp_great(self):
        for k in self._db:
            record = self._db.get(k)
            res = (record.sequence > self._ns)
            assert res is True, res
def test_make_db():
    """Build a database from a copy of test.fa and open it."""
    _testfa = utils.get_temp_filename('test.fa')
    shutil.copy(utils.get_test_data('test.fa'), _testfa)
    screed.make_db(_testfa)

    db = screed.ScreedDB(_testfa)

    os.unlink(_testfa + fileExtension)


def test_no_sqlite_openscreed():
    """ScreedDB must report a missing sqlite3 module with a clear message."""
    import screed.openscreed

    # Simulate an interpreter without sqlite3 by removing the module-level
    # name; the finally clause always puts it back for the other tests.
    saveme = screed.openscreed.sqlite3
    del screed.openscreed.sqlite3

    try:
        try:
            screed.openscreed.ScreedDB('xxx')
        except Exception as e:
            assert 'sqlite3 is needed' in str(e)
        else:
            # Without this branch the test passed when nothing was raised.
            raise AssertionError("ScreedDB should fail without sqlite3")
    finally:
        screed.openscreed.sqlite3 = saveme


def test_no_sqlite_createscreed():
    """create_db must report a missing sqlite3 module with a clear message."""
    import screed.createscreed

    # Same technique as test_no_sqlite_openscreed, on the createscreed module.
    saveme = screed.createscreed.sqlite3
    del screed.createscreed.sqlite3

    try:
        try:
            screed.createscreed.create_db(None, None, None)
        except Exception as e:
            assert 'sqlite3 is needed' in str(e)
        else:
            # Without this branch the test passed when nothing was raised.
            raise AssertionError("create_db should fail without sqlite3")
    finally:
        screed.createscreed.sqlite3 = saveme


def test_nodb():
    """
    Tests if screed throws an appropriate exception if it is
    asked to open a non-existent screed database
    """
    try:
        screed.ScreedDB('foo')
    except ValueError:
        pass
    else:
        raise AssertionError("opening a missing database should raise "
                             "ValueError")


def test_wrongdb():
    """
    Tests if screed throws an appropriate exception if it is
    asked to open a file that isn't a screed database
    """
    blah = 'blah_screed'
    # An empty file: it exists, but it is not a screed database.
    with open(blah, 'wb'):
        pass

    try:
        screed.ScreedDB(blah)
    except TypeError:
        pass
    else:
        raise AssertionError("opening a non-screed file should raise "
                             "TypeError")
    finally:
        # Remove the scratch file on every path, including unexpected
        # exception types, which previously left it behind.
        os.unlink(blah)
class Test_dict_methods(object):

    """
    Make sure that screed returns sensible results for standard dictionary
    queries.
    """

    def setup_method(self):
        self._testfa = utils.get_temp_filename('test.fa')
        shutil.copy(utils.get_test_data('test.fa'), self._testfa)

        screed.read_fasta_sequences(self._testfa)
        self.db = screed.ScreedDB(self._testfa)

    def teardown_method(self):
        os.unlink(self._testfa + fileExtension)

    def test_iter_stuff(self):
        """keys/values/items must agree with their iter* counterparts."""
        db = self.db
        keys = db.keys()
        ikeys = list(db.iterkeys())
        assert all(key in ikeys for key in keys)

        values = db.values()
        ivalues = list(db.itervalues())
        assert all(value in ivalues for value in values)

        items = db.items()
        iitems = list(db.iteritems())
        assert all(item in iitems for item in items)

    def test_contains(self):
        """Membership and ``get`` must agree for present and absent keys."""
        # This method used to be defined twice.  The first definition was
        # silently replaced by the second and referenced an undefined name
        # (`db`), so its `get` check never ran.  Both are merged here.
        for k in self.db:
            assert k in self.db

        assert self.db.get('FOO') is None

        assert 'FOO' not in self.db

    def test_get(self):
        """``get`` and item access return the record named by the key."""
        for k in self.db:
            record = self.db.get(k)
            assert record.name == k

            record = self.db[k]
            assert record.name == k

        try:
            self.db['FOO']
            assert False, "the previous line should raise a KeyError"
        except KeyError:
            pass

    def test_missing(self):
        """
        Make sure that unsupported dict attributes are actually missing.
        """
        db = self.db

        # Each `assert 0` is only reached when the call above it did not
        # raise; AssertionError is not caught by the except clause.
        try:
            db.clear()
            assert 0
        except NotImplementedError:
            pass

        try:
            db.update({})
            assert 0
        except NotImplementedError:
            pass

        try:
            db.setdefault(None)
            assert 0
        except NotImplementedError:
            pass

        try:
            db.pop()
            assert 0
        except NotImplementedError:
            pass

        try:
            db.popitem()
            assert 0
        except NotImplementedError:
            pass
def test_new_record():
    """Each parsed FASTA record must keep its own data."""
    # Regression test: the record dict used to be reused between sequence
    # loads, so every record looked identical when the caller held on to
    # the returned dictionaries.
    handle = StringIO(">1\nACTG\n>2\nACGG\n")

    parsed = [entry for entry in screed.fasta.fasta_iter(handle)]
    assert parsed[0]['name'] == '1'
    assert parsed[1]['name'] == '2'
    # check for legacy attribute
    assert not hasattr(parsed[0], 'accuracy')
class Test_fasta_whitespace(object):
    """Check that the record after a blank line in the FASTA file is indexed.
    """

    def setup_method(self):
        target = utils.get_temp_filename('test-whitespace.fa')
        self._testfa = target
        shutil.copy(utils.get_test_data('test-whitespace.fa'), target)

        screed.read_fasta_sequences(target)
        self.db = screed.ScreedDB(target)

    def test_for_omitted_record(self):
        # second record of test-whitespace.fa, the one after the blank line
        wanted = 'ENSMICT00000012401'
        assert wanted in self.db

    def teardown_method(self):
        db_file = self._testfa + fileExtension
        os.unlink(db_file)
def test_fasta_slicing():
    """Slicing a record slices its sequence and keeps name/description."""
    testfa = utils.get_temp_filename('test.fa')
    shutil.copy(utils.get_test_data('test.fa'), testfa)

    with screed.open(testfa) as sequences:
        record = next(sequences)

    head = record[:10]
    assert head['sequence'] == "TGCAGAAAAT"

    windows = (
        slice(5, 10), slice(2, 26), slice(5, -1, 2),
        slice(-2, -10, 1), slice(-1, 5, 2), slice(5),
    )
    for window in windows:
        piece = record[window]

        # Every field is checked through both key and attribute access.
        assert piece['name'] == record['name']
        assert piece.name == record.name

        assert piece['description'] == record['description']
        assert piece.description == record.description

        assert piece['sequence'] == record['sequence'][window]
        assert piece.sequence == record.sequence[window]
def test_parse_description_true():
    # With parse_description=True the text after the first space on the
    # header line ("FOO") must not end up in the record name.

    s = StringIO("@1 FOO\nACTG\n+\nAAAA\n@2\nACGG\n+\nAAAA\n")

    records = list(screed.fastq.fastq_iter(s, parse_description=True))
    assert records[0]['name'] == '1'
    assert records[1]['name'] == '2'


def test_parse_description_false():
    # With parse_description=False the whole header line, including the
    # text after the first space, is kept as the record name.

    s = StringIO("@1 FOO\nACTG\n+\nAAAA\n@2\nACGG\n+\nAAAA\n")

    records = list(screed.fastq.fastq_iter(s, parse_description=False))
    assert records[0]['name'] == '1 FOO'
    assert records[1]['name'] == '2'

    # also is default behavior
    s = StringIO("@1 FOO\nACTG\n+\nAAAA\n@2\nACGG\n+\nAAAA\n")

    records = list(screed.fastq.fastq_iter(s))
    assert records[0]['name'] == '1 FOO'
    assert records[1]['name'] == '2'
def test_output_sans_desc():
    """A record without a description is written as a bare FASTQ entry."""
    read = FakeRecord()
    fields = (('name', 'foo'), ('sequence', 'ATCG'), ('quality', '####'))
    for attr, value in fields:
        setattr(read, attr, value)

    sink = BytesIO()
    write_fastx(read, sink)
    written = sink.getvalue().decode('utf-8')
    assert written == '@foo\nATCG\n+\n####\n'


def test_output_with_desc():
    """The description follows the name on the header line."""
    read = FakeRecord()
    fields = (('name', 'foo'), ('description', 'bar'),
              ('sequence', 'ATCG'), ('quality', '####'))
    for attr, value in fields:
        setattr(read, attr, value)

    sink = BytesIO()
    write_fastx(read, sink)
    written = sink.getvalue().decode('utf-8')
    assert written == '@foo bar\nATCG\n+\n####\n'
def test_output_bad_mode():
    """write_fastx must reject a text-mode file object with AttributeError."""
    read = FakeRecord()
    read.name = 'foo'
    read.description = 'bar'
    read.sequence = 'ATCG'
    read.quality = '####'

    fileobj = StringIO()
    with pytest.raises(AttributeError) as ae:
        write_fastx(read, fileobj)
    # The check sits outside the `with` block: a statement placed after the
    # raising call inside the block would never run.  It inspects the
    # exception itself (`ae.value`) instead of the text form of pytest's
    # ExceptionInfo wrapper.
    assert 'cannot call "write_fastx" on object' in str(ae.value)
class test_fq_recover(test_fastq.Test_fastq):
    """Run the Test_fastq suite on a database rebuilt from a FASTQ dump."""

    def setup_method(self):
        recovered = utils.get_temp_filename('fastqRecovery')
        self._fileName = recovered

        original = utils.get_temp_filename('test.fastq')
        self._testfq = original
        shutil.copy(utils.get_test_data('test.fastq'), original)

        # test.fastq -> db -> FASTQ text -> db; the inherited tests then
        # run against the second database.
        screed.read_fastq_sequences(original)
        screed.ToFastq(original, recovered)
        screed.read_fastq_sequences(recovered)
        self.db = screed.ScreedDB(recovered)

    def teardown_method(self):
        leftovers = (
            self._fileName,
            self._fileName + fileExtension,
            self._testfq + fileExtension,
        )
        for path in leftovers:
            os.unlink(path)
import screed_tst_utils as utils 7 | import shutil 8 | 9 | testha = utils.get_temp_filename('test.hava') 10 | shutil.copy(utils.get_test_data('test.hava'), testha) 11 | 12 | 13 | class test_hava(object): 14 | 15 | def setup_method(self): 16 | screed.seqparse.read_hava_sequences(testha) 17 | self._db = screed.ScreedDB(testha) 18 | 19 | def teardown_method(self): 20 | b = 7 21 | # os.unlink(testha + fileExtension) 22 | 23 | def test_contains(self): 24 | assert 'test_006' in self._db 25 | 26 | def test_beginning_key_retrieval(self): 27 | result = self._db['test_000'] 28 | assert result.hava == 'test_000' 29 | assert result.quarzk == 'ACGGTGACGGTCACCGTCGACGGCCCAAGCCCATCGAACG'\ 30 | 'TACCACCCCCACCTATCGTCACGCTGGTGGAGAGCCAATG' 31 | assert result.muchalo == 'AFPPCLHBCCILGMMOCHKNNDBKCCPNHAMKJOCCDJA'\ 32 | 'OEPNMHFHCBAJOKEMMMBHCPHIOAEPFFCAOJPGIMKGK' 33 | assert result.fakours == '218583165871861127719451483455294521865'\ 34 | '68176931571171542294878855181415261425688' 35 | assert result.selimizicka == 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'\ 36 | 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb' 37 | assert result.marshoon == 'C7AF246AC7AAEABE5A557FCBC6FD5F5263BCDE'\ 38 | '4E745BEF1GG7DD1AB511GBC63A4GF1F4E1A154B35D' 39 | 40 | def test_middle_key_retrieval(self): 41 | result = self._db['test_0063'] 42 | assert result.hava == 'test_0063' 43 | assert result.quarzk == 'CAACACGATCAAGTTTGGTAAGAATTCCGCCTTAAGCTTT'\ 44 | 'CTAGAACGATAGTTGCCCCCAATCTGGTTCGAAATCTCTT' 45 | assert result.muchalo == 'GMDAPLMOOFANDHHMLBPIKGHIAFFFOABFMNNJNIJ'\ 46 | 'ILEEFEPOCAJLNDLIFBPMGKOFJIEFAHNJPIOFAJMLM' 47 | assert result.fakours == '392363971393898522756138876485334274384'\ 48 | '39122136418369146118333919885587613673488' 49 | assert result.selimizicka == 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'\ 50 | 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb' 51 | assert result.marshoon == 'FC25E2CFC2BAFA7A2AA4757F3GFFFEE37G7752'\ 52 | 'FCDBAEADBA1AC7374FB5C15552E6E2GG6GFF62C6GE' 53 | 54 | def 
def test_open_stdin():
    """Test feeding data through stdin.

    Uses a subprocess with the data file directly used as stdin."""
    filename1 = utils.get_test_data('test.fa')
    # Use the interpreter that is running the tests: a bare "python" may be
    # missing from PATH or may be a different installation without screed.
    command = [sys.executable, "-c", "from __future__ import print_function;"
               "import screed; print(list(screed.open('-')))"]
    with open(filename1, 'rb') as data_file:
        output = subprocess.Popen(command,
                                  stdin=data_file, universal_newlines=True,
                                  stdout=subprocess.PIPE).communicate()[0]
    # Both alternatives are membership tests.  The earlier form,
    # `assert "literal" or "literal" in output`, was always true because a
    # non-empty string literal is truthy.
    assert ("'name': 'ENSMICT00000012722'" in output
            or "'name': u'ENSMICT00000012722'" in output), output
def test_simple_close():
    """A handle from screed.open can be iterated and closed explicitly."""
    filename = utils.get_test_data('test.fa')

    n = -1
    f = screed.open(filename, parse_description=True)
    try:
        for n, record in enumerate(f):
            assert record.name == 'ENSMICT00000012722'
            break

        assert n == 0, n
    finally:
        # Close the handle even when one of the assertions above fails.
        f.close()
def test_unknown_fileformat():
    """Opening a file that is not sequence data must raise ValueError."""
    try:
        # This Python source file stands in for "not a sequence file".
        with screed.open(__file__):
            pass
    except ValueError as err:
        assert "unknown file format" in str(err)
    else:
        # Without this branch the test passed when nothing was raised.
        raise AssertionError("screed.open should reject an unknown format")
sorted([i for i in db.iteritems()]) 42 | o = sorted([(v.record.id, v) for v in db.itervalues()]) 43 | 44 | assert m == n 45 | assert m == o, (m, o) 46 | 47 | 48 | def test_name_index_equality(): 49 | db1 = ScreedSequenceDB(testfa) 50 | db2 = ScreedSequenceDB_ByIndex(testfa) 51 | 52 | # must use something other than the obj itself for comparison... 53 | v1 = sorted([(v.name, v.seq) for v in db1.itervalues()]) 54 | v2 = sorted([(v.name, v.seq) for v in db2.itervalues()]) 55 | assert v1 == v2, (v1, v2) 56 | 57 | 58 | def test_seqinfodict_by_name(): 59 | db1 = ScreedSequenceDB(testfa) 60 | sd = db1.seqInfoDict 61 | 62 | m = sorted([y.id for (x, y) in sd.iteritems()]) 63 | n = sorted([x.id for x in sd.itervalues()]) 64 | 65 | assert m == n, (m, n) 66 | 67 | 68 | def test_seqinfodict_by_index(): 69 | db1 = ScreedSequenceDB_ByIndex(testfa) 70 | sd = db1.seqInfoDict 71 | 72 | m = sorted([x for (x, y) in sd.iteritems()]) 73 | n = sorted([x for x in sd.iterkeys()]) 74 | 75 | assert m == n, (m, n) 76 | 77 | 78 | def test_pickle_ByName(): 79 | db = ScreedSequenceDB(testfa) 80 | ofp = StringIO() 81 | 82 | dump(db, ofp) 83 | 84 | ifp = StringIO(ofp.getvalue()) 85 | db2 = load(ifp) 86 | assert db.filepath == db2.filepath 87 | 88 | 89 | def test_pickle_ByIndex(): 90 | db = ScreedSequenceDB_ByIndex(testfa) 91 | ofp = StringIO() 92 | 93 | dump(db, ofp) 94 | 95 | ifp = StringIO(ofp.getvalue()) 96 | db2 = load(ifp) 97 | assert db.filepath == db2.filepath 98 | -------------------------------------------------------------------------------- /screed/tests/test_record.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals, print_function 2 | from screed import Record 3 | import pytest 4 | 5 | 6 | def test_create_quality_none(): 7 | r = Record(name='foo', sequence='ATGACG', quality=None) 8 | assert not hasattr(r, 'quality') 9 | 10 | 11 | def test_len(): 12 | r = Record(name='foo', sequence='ATGACG') 
13 | assert len(r) == 6 14 | 15 | 16 | # copied over from khmer tests/test_read_parsers.py 17 | def test_read_type_basic(): 18 | name = "895:1:1:1246:14654 1:N:0:NNNNN" 19 | sequence = "ACGT" 20 | r = Record(name, sequence) 21 | 22 | assert r.name == name 23 | assert r.sequence == sequence 24 | assert not hasattr(r, 'quality'), x 25 | assert not hasattr(r, 'annotations'), x 26 | 27 | 28 | # copied over from khmer tests/test_read_parsers.py 29 | def test_read_type_attributes(): 30 | r = Record(sequence='ACGT', quality='good', name='1234', annotations='ann') 31 | assert r.sequence == 'ACGT' 32 | assert r.quality == 'good' 33 | assert r.name == '1234' 34 | assert r.annotations == 'ann' 35 | -------------------------------------------------------------------------------- /screed/tests/test_shell.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from . import test_fasta 3 | from . import test_fastq 4 | import os 5 | import subprocess 6 | import screed 7 | from screed.DBConstants import fileExtension 8 | from . 
import screed_tst_utils as utils 9 | import shutil 10 | 11 | 12 | class Test_fa_shell_command(test_fasta.Test_fasta): 13 | """ 14 | Tests the functionality of the 'db' command in creating a 15 | screed database correctly from the shell 16 | """ 17 | 18 | def setup_method(self): 19 | thisdir = os.path.dirname(__file__) 20 | 21 | self._testfa = utils.get_temp_filename('test.fa') 22 | shutil.copy(utils.get_test_data('test.fa'), self._testfa) 23 | 24 | cmd = ['screed', 'db', self._testfa] 25 | ret = subprocess.check_call(cmd, stdout=subprocess.PIPE) 26 | assert ret == 0, ret 27 | self.db = screed.ScreedDB(self._testfa) 28 | 29 | def teardown_method(self): 30 | os.unlink(self._testfa + fileExtension) 31 | 32 | 33 | class Test_fq_shell_command(test_fastq.Test_fastq): 34 | 35 | """ 36 | Tests the functionality of the 'db' command in creating a 37 | screed database correctly from the shell 38 | """ 39 | 40 | def setup_method(self): 41 | thisdir = os.path.dirname(__file__) 42 | 43 | self._testfq = utils.get_temp_filename('test.fastq') 44 | shutil.copy(utils.get_test_data('test.fastq'), self._testfq) 45 | 46 | cmd = ['screed', 'db', self._testfq] 47 | ret = subprocess.check_call(cmd, stdout=subprocess.PIPE) 48 | assert ret == 0, ret 49 | self.db = screed.ScreedDB(self._testfq) 50 | 51 | def teardown_method(self): 52 | os.unlink(self._testfq + fileExtension) 53 | 54 | 55 | class Test_fa_shell_module(test_fasta.Test_fasta): 56 | 57 | """ 58 | Tests the functionality of the 'db' command in creating a 59 | screed database correctly from the shell 60 | """ 61 | 62 | def setup_method(self): 63 | thisdir = os.path.dirname(__file__) 64 | 65 | self._testfa = utils.get_temp_filename('test.fa') 66 | shutil.copy(utils.get_test_data('test.fa'), self._testfa) 67 | 68 | cmd = ['python', '-m', 'screed', 'db', self._testfa] 69 | ret = subprocess.check_call(cmd, stdout=subprocess.PIPE) 70 | assert ret == 0, ret 71 | self.db = screed.ScreedDB(self._testfa) 72 | 73 | def teardown_method(self): 
74 | os.unlink(self._testfa + fileExtension) 75 | 76 | 77 | class Test_fq_shell_module(test_fastq.Test_fastq): 78 | 79 | """ 80 | Tests the functionality of the 'db' command in creating a 81 | screed database correctly from the shell 82 | """ 83 | 84 | def setup_method(self): 85 | thisdir = os.path.dirname(__file__) 86 | 87 | self._testfq = utils.get_temp_filename('test.fastq') 88 | shutil.copy(utils.get_test_data('test.fastq'), self._testfq) 89 | 90 | cmd = ['python', '-m', 'screed', 'db', self._testfq] 91 | ret = subprocess.check_call(cmd, stdout=subprocess.PIPE) 92 | assert ret == 0, ret 93 | self.db = screed.ScreedDB(self._testfq) 94 | 95 | def teardown_method(self): 96 | os.unlink(self._testfq + fileExtension) 97 | 98 | 99 | class Test_convert_shell(test_fasta.Test_fasta): 100 | 101 | """ 102 | Tests the ability to convert a fasta db to a fastq file, parse it into 103 | a fastq db, save to a fasta file, parse the fasta file into a fasta 104 | db and then run the fasta suite, all from the command line. 
105 | """ 106 | 107 | def setup_method(self): 108 | 109 | self._fqName = utils.get_temp_filename('fa_to_fq') 110 | self._faName = utils.get_temp_filename('fq_to_fa') 111 | self._testfa = utils.get_temp_filename('test.fa') 112 | shutil.copy(utils.get_test_data('test.fa'), self._testfa) 113 | 114 | cmd = ['screed', 'db', self._testfa] 115 | ret = subprocess.check_call(cmd, stdout=subprocess.PIPE) 116 | assert ret == 0, ret 117 | 118 | cmd = ['screed', 'dump_fastq', self._testfa, self._fqName] 119 | ret = subprocess.check_call(cmd, stdout=subprocess.PIPE) 120 | assert ret == 0, ret 121 | 122 | cmd = ['screed', 'db', self._fqName] 123 | ret = subprocess.check_call(cmd, stdout=subprocess.PIPE) 124 | assert ret == 0, ret 125 | 126 | cmd = ['screed', 'dump_fasta', self._fqName, self._faName] 127 | ret = subprocess.check_call(cmd, stdout=subprocess.PIPE) 128 | assert ret == 0, ret 129 | 130 | cmd = ['screed', 'db', self._faName] 131 | ret = subprocess.check_call(cmd, stdout=subprocess.PIPE) 132 | assert ret == 0, ret 133 | 134 | self.db = screed.ScreedDB(self._faName) 135 | 136 | def teardown_method(self): 137 | os.unlink(self._fqName) 138 | os.unlink(self._fqName + fileExtension) 139 | os.unlink(self._faName) 140 | os.unlink(self._faName + fileExtension) 141 | os.unlink(self._testfa + fileExtension) 142 | -------------------------------------------------------------------------------- /screed/tests/test_streaming.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2008-2015, Michigan State University 2 | 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | 6 | import tempfile 7 | import os 8 | import sys 9 | import io 10 | import threading 11 | import subprocess 12 | 13 | import pytest 14 | 15 | import screed 16 | from . import screed_tst_utils as utils 17 | from . import test_fasta 18 | from . 
import test_fastq 19 | from screed.DBConstants import fileExtension 20 | 21 | 22 | def streamer_reader(ifilename, exception): 23 | try: 24 | for read in screed.open(ifilename): 25 | pass 26 | except Exception as e: 27 | exception.append(e) 28 | 29 | 30 | def streamer(ifilename): 31 | 32 | # Get temp filenames, etc. 33 | in_dir = tempfile.mkdtemp(prefix="screedtest_") 34 | fifo = os.path.join(in_dir, 'fifo') 35 | ifile = io.open(ifilename, 'rb') 36 | 37 | # make a fifo to simulate streaming 38 | os.mkfifo(fifo) 39 | 40 | exception = [] 41 | # FIFOs MUST BE OPENED FOR READING BEFORE THEY ARE WRITTEN TO 42 | # If this isn't done, they will BLOCK and things will hang. 43 | # rvalues will hold the return from the threaded function 44 | thread = threading.Thread(target=streamer_reader, args=[fifo, exception]) 45 | thread.start() 46 | 47 | fifofile = io.open(fifo, 'wb') 48 | # read binary to handle compressed files 49 | chunk = ifile.read(8192) 50 | while len(chunk) > 0: 51 | fifofile.write(chunk) 52 | chunk = ifile.read(8192) 53 | 54 | fifofile.close() 55 | 56 | thread.join() 57 | 58 | if len(exception) > 0: 59 | raise exception[0] 60 | 61 | 62 | def test_stream_fa(): 63 | streamer(utils.get_test_data('test.fa')) 64 | 65 | 66 | def test_stream_fq(): 67 | streamer(utils.get_test_data('test.fastq')) 68 | 69 | 70 | @pytest.mark.xfail() 71 | def test_stream_fa_gz(): 72 | streamer(utils.get_test_data('test.fa.gz')) 73 | 74 | 75 | def test_stream_gz_fail(): 76 | try: 77 | streamer(utils.get_test_data('test.fastq.gz')) 78 | assert 0, "This should not work yet" 79 | except ValueError as err: 80 | print(str(err)) 81 | 82 | 83 | @pytest.mark.xfail() 84 | def test_stream_fq_gz(): 85 | streamer(utils.get_test_data('test.fastq.gz')) 86 | 87 | 88 | def test_stream_fa_bz2(): 89 | streamer(utils.get_test_data('test.fa.bz2')) 90 | 91 | 92 | def test_stream_fq_bz2(): 93 | streamer(utils.get_test_data('test.fastq.bz2')) 94 | 
-------------------------------------------------------------------------------- /screed/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016, The Regents of the University of California. 2 | 3 | 4 | def to_str(line): 5 | try: 6 | line = line.decode('utf-8') 7 | except AttributeError: 8 | pass 9 | 10 | return line 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = screed 3 | description = a Python library for loading FASTA and FASTQ sequences 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown; charset=UTF-8 6 | url = https://github.com/dib-lab/screed 7 | author = Alex Nolley, C. Titus Brown 8 | author_email = ctbrown@ucdavis.edu, 9 | license = BSD 3-clause 10 | license_file = doc/LICENSE.rst 11 | classifiers = 12 | Development Status :: 5 - Production/Stable 13 | Environment :: Console 14 | Environment :: MacOS X 15 | Intended Audience :: Science/Research 16 | License :: OSI Approved :: BSD License 17 | Natural Language :: English 18 | Operating System :: POSIX :: Linux 19 | Operating System :: MacOS :: MacOS X 20 | Programming Language :: Python :: 3.7 21 | Programming Language :: Python :: 3.8 22 | Programming Language :: Python :: 3.9 23 | Topic :: Scientific/Engineering :: Bio-Informatics 24 | project_urls = 25 | Documentation = https://screed.readthedocs.io 26 | Source = https://github.com/dib-lab/screed 27 | Tracker = https://github.com/dib-lab/screed/issues 28 | 29 | [options] 30 | zip_safe = False 31 | packages = find: 32 | platforms = any 33 | include_package_data = True 34 | python_requires = >=3.7 35 | setup_requires = 36 | setuptools_scm 37 | 38 | [bdist_wheel] 39 | universal = 1 40 | 41 | [aliases] 42 | test=pytest 43 | 44 | [options.entry_points] 45 | console_scripts = 46 | screed = 
screed.__main__:main
47 | 
48 | # Optional dependency groups, e.g. `pip install screed[test]`.
49 | [options.extras_require]
50 | test =
51 |     pytest >= 6.2.2
52 |     pycodestyle
53 |     pytest-cov
54 |     importlib_resources;python_version<'3.9'
55 | all =
56 |     %(test)s
57 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # setup() is called without arguments; the package metadata is declared
3 | # in setup.cfg.
4 | import setuptools
5 | 
6 | if __name__ == "__main__":
7 |     setuptools.setup()
8 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py38, py39, py310
3 | minversion = 3.12
4 | isolated_build = true
5 | skip_missing_interpreters = true
6 | 
7 | # Each environment runs pytest with coverage (skipping tests marked
8 | # `known_failing`) plus the Makefile's pep8 and doc targets.
9 | [testenv]
10 | # CI / GitHub variables handed through to the test environment
11 | passenv =
12 |   CI
13 |   GITHUB_ACTION
14 |   GITHUB_REF
15 |   GITHUB_HEAD_REF
16 |   GITHUB_RUN_ID
17 |   GITHUB_SHA
18 |   GITHUB_REPOSITORY
19 | allowlist_externals = make
20 | commands =
21 |   make install-dependencies
22 |   pytest --cov -m 'not known_failing' --cov-report xml
23 |   make pep8
24 |   make doc
25 | deps =
26 |   pytest
27 |   pytest-cov
28 |   sphinx
29 | 
30 | # NOTE(review): presumably read by the tox-gh-actions plugin to map the
31 | # runner's Python version to a tox environment -- confirm.
32 | [gh-actions]
33 | python =
34 |   3.8: py38
35 |   3.9: py39
36 |   3.10: py310
37 | 
--------------------------------------------------------------------------------