├── LICENSE ├── README.md ├── requirements.txt ├── setup.py └── sn.py /LICENSE: -------------------------------------------------------------------------------- 1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 2 | Version 2, December 2004 3 | 4 | Copyright (C) 2012 Sergei Lebedev 5 | 6 | Everyone is permitted to copy and distribute verbatim or modified 7 | copies of this license document, and changing it is allowed as long 8 | as the name is changed. 9 | 10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 12 | 13 | 0. You just DO WHAT THE FUCK YOU WANT TO. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | _____ _ _ _____ 2 | / ____| \ | | __ \ 3 | | (___ | \| | |__) | _ 4 | \___ \| . ` | ___/ | | | 5 | ____) | |\ | | | |_| | 6 | |_____/|_| \_|_| \__, | 7 | __/ | 8 | |___/ 9 | 10 | An easy to use wrapper-library for working with [openSNP](http://opensnp.org) 11 | data. The current implementation only supports local or downloaded files, 12 | but [JSON API](http://opensnp.org/faq#api) interaction is planned. 13 | 14 | All you need to remember at this point is a single function: 15 | 16 | ```python 17 | >>> import sn 18 | >>> snps = sn.parse("72.ftdna-illumina.36") 19 | >>> sns 20 | >>> snps[:1] 21 | [_SNP(name='rs3094315', variation=None, chromosome='1', position=742429, strand=None, genotype='AA')] 22 | ``` 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyVCF==0.4.4 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from distutils.core import setup 5 | 6 | # Use 2to3 build conversion if required 7 | try: 8 | from distutils.command.build_py import build_py_2to3 as build_py 9 | except ImportError: 10 | # 2.x 11 | from distutils.command.build_py import build_py 12 | 13 | setup( 14 | name="snpy", 15 | description="A wrapper-library for working with openSNP data", 16 | license="WTFPL", 17 | version="0.1", 18 | author="Sergei Lebedev", 19 | author_email="superbobry@gmail.com", 20 | url="http://github.com/superbobry/snpy/", 21 | classifiers=[ 22 | "Intended Audience :: Developers", 23 | "License :: Public Domain", 24 | "Operating System :: OS Independent", 25 | "Programming Language :: Python", 26 | "Topic :: Scientific/Engineering :: Bio-Informatics", 27 | ], 28 | py_modules=["sn"], 29 | platforms="any", 30 | cmdclass={"build_py": build_py} 31 | ) 32 | -------------------------------------------------------------------------------- /sn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | snpy 4 | ~~~~ 5 | 6 | ``snpy`` provides a fancy API for accessing `openSNP`_ data. The 7 | current implementation only supports working with *local* or downloaded 8 | files, but JSON API interaction is planned. 9 | 10 | .. _openSNP: http://opensnp.org 11 | 12 | :copyright: 2012 by Sergei Lebedev 13 | :license: WTFPL, see LICENSE for details 14 | """ 15 | 16 | import csv 17 | import os.path 18 | from collections import namedtuple 19 | 20 | try: 21 | import vcf 22 | except ImportError: 23 | vcf = None # 23andMe exome data won't be supported. 24 | 25 | 26 | class SNPyError(Exception): 27 | """Generic ``snpy`` exception class.""" 28 | 29 | 30 | class UnknownSource(SNPyError): 31 | """Raised when a parsed file has unknown source type.""" 32 | 33 | 34 | _SNP = namedtuple("_SNP", ["name", 35 | "variation", 36 | "chromosome", 37 | "position", 38 | "strand", 39 | "genotype"]) 40 | 41 | 42 | class SNP(_SNP): 43 | """A wrapper for SNP data, provided by various formats.""" 44 | __slots__ = () 45 | 46 | def __new__(cls, name, chromosome, position, genotype, 47 | variation=None, strand=None): 48 | return super(SNP, cls).__new__(cls, name, variation, chromosome, 49 | int(position), strand, genotype) 50 | 51 | 52 | def _23andme(path): 53 | handle = csv.DictReader(open(path, "r"), 54 | fieldnames=["name", "chromosome", "position", "genotype"], 55 | delimiter="\t") 56 | 57 | for row in handle: 58 | if not row["name"].startswith("#"): 59 | yield SNP(**row) 60 | 61 | 62 | def _23andme_exome(path): 63 | if vcf is None: 64 | raise RuntimeError("PyVCF not available, please 'easy_install' it.") 65 | 66 | for r in vcf.VCFReader(open(path, "r")): 67 | if not r.is_snp: 68 | continue # XXX Is it even possible? 69 | 70 | for sample in r.samples: 71 | yield SNP(name=r.ID, chromosome=r.CHROM, position=r.POS, 72 | genotype=sample.gt_bases.replace("/", "")) 73 | 74 | def _23andme_ancestry(path): 75 | handle = csv.DictReader(open(path, "r"), 76 | fieldnames=["name", "chromosome", "position", "allele1", "allele2"], 77 | delimiter="\t") 78 | for row in handle: 79 | if not row["name"].startswith(("#", "rsid")): 80 | row["genotype"] = "{}{}".format(row.pop("allele1"), row.pop("allele2")) 81 | yield SNP(**row) 82 | 83 | def _genes_for_good(path): 84 | if vcf is None: 85 | raise RuntimeError("PyVCF not available, please 'easy_install' it.") 86 | 87 | try: 88 | for r in vcf.VCFReader(open(path, "rb"), compressed=True): 89 | if not r.is_snp: 90 | continue # XXX Is it even possible? 91 | for sample in r.samples: 92 | yield SNP(name=r.ID, chromosome=r.CHROM, position=r.POS, 93 | genotype=sample.gt_bases.replace("/", "")) 94 | except OSError: 95 | # the gfg format is is likely version 1.1 96 | for snp in _23andme(path): 97 | yield snp 98 | 99 | def _iyg(path): 100 | handle = csv.DictReader(open(path, "r"), 101 | fieldnames=["name", "genotype"], delimiter="\t") 102 | for row in handle: 103 | yield SNP(name=row["name"], chromosome=None, position=0, 104 | genotype=row["genotype"]) 105 | 106 | def decodeme(path): 107 | handle = csv.DictReader(open(path, "r"), 108 | fieldnames=["name", "variation", "chromosome", "position", 109 | "strand", "genotype"]) 110 | 111 | for row in handle: 112 | # A flanky header criterion -- the last column should be 113 | # 'XX', where X is one of "ACGT-". 114 | if len(row["genotype"]) == 2: 115 | yield SNP(**row) 116 | 117 | 118 | def ftdna(path): 119 | handle = csv.DictReader(open(path, "r"), 120 | fieldnames=["name", "chromosome", "position", "genotype"]) 121 | 122 | for row in handle: 123 | if row["position"].isdigit(): 124 | yield SNP(**row) 125 | 126 | 127 | def guess_source(path): 128 | name, ext = os.path.split(path) 129 | if ext == "vcf": 130 | return ext # VCF is easy ;) 131 | 132 | # Okay, maybe it's in openSNP format: ``format.submission_id``. 133 | try: 134 | source, _ = path.rsplit(os.path.extsep, 2)[-2:] 135 | except ValueError: 136 | raise UnknownSource(path) 137 | else: 138 | return source 139 | 140 | 141 | def parse(path, source=None): 142 | """Returns a generator yielding :class:`SNP` from an openSNP file 143 | at a given location. 144 | 145 | :param str path: path to openSNP file, *all* formats are supported. 146 | :param str source: should be one of ``"23andme"``, ``"vcf"``, 147 | ``"decodeme"`` or ``"ftdna"``; `source`` will be extracted from 148 | the filename, if not provided explicitly. 149 | :returns list: of :class:`SNP` instances from a given file. 150 | :raises RuntimeError: if a given file cannot be parsed. 151 | """ 152 | if source is None: 153 | source = guess_source(path) 154 | 155 | try: 156 | handler = {"23andme": _23andme, 157 | "23andme-exome-vcf": _23andme_exome, 158 | "ancestry": _23andme_ancestry, 159 | "genes-for-good": _genes_for_good, 160 | "IYG": _iyg, 161 | "ftdna-illumina": ftdna, 162 | "decodeme": decodeme, 163 | "vcf": _23andme_exome, 164 | "ftdna": ftdna}[source] 165 | except KeyError: 166 | raise UnknownSource(path) 167 | else: 168 | return handler(path) 169 | --------------------------------------------------------------------------------