├── LICENSE
├── README.md
├── requirements.txt
├── setup.py
└── sn.py


/LICENSE:
--------------------------------------------------------------------------------
 1 |             DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
 2 |                     Version 2, December 2004
 3 | 
 4 |  Copyright (C) 2012 Sergei Lebedev <superbobry@gmail.com>
 5 | 
 6 |  Everyone is permitted to copy and distribute verbatim or modified
 7 |  copies of this license document, and changing it is allowed as long
 8 |  as the name is changed.
 9 | 
10 |             DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
11 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
12 | 
13 |   0. You just DO WHAT THE FUCK YOU WANT TO.
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 |        _____ _   _ _____
 2 |       / ____| \ | |  __ \
 3 |      | (___ |  \| | |__) |   _
 4 |       \___ \| . ` |  ___/ | | |
 5 |       ____) | |\  | |   | |_| |
 6 |      |_____/|_| \_|_|    \__, |
 7 |                           __/ |
 8 |                          |___/
 9 | 
10 | An easy to use wrapper-library for working with [openSNP](http://opensnp.org)
11 | data. The current implementation only supports local or downloaded files,
12 | but [JSON API](http://opensnp.org/faq#api) interaction is planned.
13 | 
14 | All you need to remember at this point is a single function:
15 | 
16 | ```python
17 | >>> import sn
18 | >>> snps = sn.parse("72.ftdna-illumina.36")
 19 | >>> snps = list(snps)  # parse() returns a generator; make it a list before slicing
20 | >>> snps[:1]
21 | [_SNP(name='rs3094315', variation=None, chromosome='1', position=742429, strand=None, genotype='AA')]
22 | ```
23 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | PyVCF==0.4.4
2 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from distutils.core import setup
 5 | 
 6 | # Use 2to3 build conversion if required
 7 | try:
 8 |     from distutils.command.build_py import build_py_2to3 as build_py
 9 | except ImportError:
10 |     # 2.x
11 |     from distutils.command.build_py import build_py
12 | 
# Register the single-module ``sn`` distribution with distutils.
setup(
    # Identity
    name="snpy",
    version="0.1",
    description="A wrapper-library for working with openSNP data",
    url="http://github.com/superbobry/snpy/",
    license="WTFPL",

    # Authorship
    author="Sergei Lebedev",
    author_email="superbobry@gmail.com",

    # Index metadata
    classifiers=[
        "Intended Audience :: Developers",
        "License :: Public Domain",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
    ],
    platforms="any",

    # Contents and build hooks; ``build_py`` is whichever implementation
    # the import fallback above managed to load.
    py_modules=["sn"],
    cmdclass={"build_py": build_py},
)
32 | 


--------------------------------------------------------------------------------
/sn.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 |     snpy
  4 |     ~~~~
  5 | 
  6 |     ``snpy`` provides a fancy API for accessing `openSNP`_ data. The
  7 |     current implementation only supports working with *local* or downloaded
  8 |     files, but JSON API interaction is planned.
  9 | 
 10 |     .. _openSNP: http://opensnp.org
 11 | 
 12 |     :copyright: 2012 by Sergei Lebedev
 13 |     :license: WTFPL, see LICENSE for details
 14 | """
 15 | 
 16 | import csv
 17 | import os.path
 18 | from collections import namedtuple
 19 | 
 20 | try:
 21 |     import vcf
 22 | except ImportError:
 23 |     vcf = None  # 23andMe exome data won't be supported.
 24 | 
 25 | 
class SNPyError(Exception):
    """Generic ``snpy`` exception class.

    Serves as the common base for the library's own exceptions, so that
    callers can catch every ``snpy``-specific failure with a single
    ``except SNPyError`` clause.
    """
 28 | 
 29 | 
class UnknownSource(SNPyError):
    """Raised when a parsed file has unknown source type.

    Both :func:`guess_source` and :func:`parse` raise it with the offending
    path as the only argument.
    """
 32 | 
 33 | 
 34 | _SNP = namedtuple("_SNP", ["name",
 35 |                            "variation",
 36 |                            "chromosome",
 37 |                            "position",
 38 |                            "strand",
 39 |                            "genotype"])
 40 | 
 41 | 
 42 | class SNP(_SNP):
 43 |     """A wrapper for SNP data, provided by various formats."""
 44 |     __slots__ = ()
 45 |     
 46 |     def __new__(cls, name, chromosome, position, genotype,
 47 |                 variation=None, strand=None):
 48 |         return super(SNP, cls).__new__(cls, name, variation, chromosome,
 49 |                                        int(position), strand, genotype)
 50 | 
 51 | 
 52 | def _23andme(path):
 53 |     handle = csv.DictReader(open(path, "r"),
 54 |         fieldnames=["name", "chromosome", "position", "genotype"],
 55 |         delimiter="\t")
 56 | 
 57 |     for row in handle:
 58 |         if not row["name"].startswith("#"):
 59 |             yield SNP(**row)
 60 | 
 61 | 
 62 | def _23andme_exome(path):
 63 |     if vcf is None:
 64 |         raise RuntimeError("PyVCF not available, please 'easy_install' it.")
 65 | 
 66 |     for r in vcf.VCFReader(open(path, "r")):
 67 |         if not r.is_snp:
 68 |             continue  # XXX Is it even possible?
 69 | 
 70 |         for sample in r.samples:
 71 |             yield SNP(name=r.ID, chromosome=r.CHROM, position=r.POS,
 72 |                       genotype=sample.gt_bases.replace("/", ""))
 73 | 
 74 | def _23andme_ancestry(path):
 75 |     handle = csv.DictReader(open(path, "r"),
 76 |         fieldnames=["name", "chromosome", "position", "allele1", "allele2"],
 77 |         delimiter="\t")
 78 |     for row in handle:
 79 |         if not row["name"].startswith(("#", "rsid")):
 80 |             row["genotype"] = "{}{}".format(row.pop("allele1"), row.pop("allele2"))
 81 |             yield SNP(**row)
 82 | 
 83 | def _genes_for_good(path):
 84 |     if vcf is None:
 85 |         raise RuntimeError("PyVCF not available, please 'easy_install' it.")
 86 | 
 87 |     try:
 88 |         for r in vcf.VCFReader(open(path, "rb"), compressed=True):
 89 |             if not r.is_snp:
 90 |                 continue  # XXX Is it even possible?
 91 |             for sample in r.samples:
 92 |                 yield SNP(name=r.ID, chromosome=r.CHROM, position=r.POS,
 93 |                         genotype=sample.gt_bases.replace("/", ""))
 94 |     except OSError:
 95 |         # the gfg format is is likely version 1.1
 96 |         for snp in _23andme(path):
 97 |             yield snp
 98 | 
 99 | def _iyg(path):
100 |     handle = csv.DictReader(open(path, "r"),
101 |             fieldnames=["name", "genotype"], delimiter="\t")
102 |     for row in handle:
103 |         yield SNP(name=row["name"], chromosome=None, position=0,
104 |                 genotype=row["genotype"])
105 | 
def decodeme(path):
    """Yield :class:`SNP` records from a deCODEme file.

    The file is read with the default (comma-separated) CSV dialect and
    the columns ``name``, ``variation``, ``chromosome``, ``position``,
    ``strand`` and ``genotype``. Only rows whose last column is exactly
    two characters long are treated as data.

    :param str path: path to the file.
    :returns: a generator of :class:`SNP` instances.
    """
    fieldnames = ["name", "variation", "chromosome", "position",
                  "strand", "genotype"]

    # ``with`` guarantees the file is closed once the generator is
    # exhausted or discarded, instead of leaking the handle.
    with open(path, "r") as handle:
        for row in csv.DictReader(handle, fieldnames=fieldnames):
            genotype = row["genotype"]
            # A flaky header criterion -- the last column should be
            # 'XX', where X is one of "ACGT-". Rows with too few columns
            # have ``genotype`` set to ``None`` by ``DictReader`` and are
            # skipped rather than crashing ``len()``.
            if genotype is not None and len(genotype) == 2:
                yield SNP(**row)
116 | 
117 | 
def ftdna(path):
    """Yield :class:`SNP` records from a Family Tree DNA file.

    The file is read with the default (comma-separated) CSV dialect and
    the columns ``name``, ``chromosome``, ``position`` and ``genotype``.
    Rows whose ``position`` is not made of digits only (such as the header)
    are skipped.

    :param str path: path to the file.
    :returns: a generator of :class:`SNP` instances.
    """
    fieldnames = ["name", "chromosome", "position", "genotype"]

    # ``with`` guarantees the file is closed once the generator is
    # exhausted or discarded, instead of leaking the handle.
    with open(path, "r") as handle:
        for row in csv.DictReader(handle, fieldnames=fieldnames):
            position = row["position"]
            # Rows with too few columns have ``position`` set to ``None``
            # by ``DictReader``; skip them instead of failing on
            # ``None.isdigit()``.
            if position and position.isdigit():
                yield SNP(**row)
125 | 
126 | 
def guess_source(path):
    """Guess the source (file format) of an openSNP file from its name.

    Files with a ``.vcf`` extension are reported as ``"vcf"``. Any other
    file name is expected to end in ``<source>.<submission_id>``, as in
    ``72.ftdna-illumina.36``, and the ``<source>`` part is returned.

    :param str path: path to the file; only its last component is used.
    :returns str: the guessed source name.
    :raises UnknownSource: if the file name has no ``.``-separated source
        part.
    """
    base = os.path.basename(path)

    # ``os.path.splitext`` returns the extension with its leading dot.
    # A file named exactly "vcf" is still accepted, because the previous
    # implementation recognised (only) that spelling.
    ext = os.path.splitext(base)[1]
    if ext.lower() == ".vcf" or base == "vcf":
        return "vcf"  # VCF is easy ;)

    # Okay, maybe it's in openSNP format: ``format.submission_id``.
    # Splitting the base name rather than the whole path keeps dots in
    # directory names from being mistaken for separators.
    parts = base.rsplit(os.path.extsep, 2)[-2:]
    if len(parts) != 2:
        raise UnknownSource(path)
    return parts[0]
139 | 
140 | 
def parse(path, source=None):
    """Return a generator yielding :class:`SNP` from an openSNP file
    at a given location.

    :param str path: path to the openSNP file.
    :param str source: should be one of ``"23andme"``,
        ``"23andme-exome-vcf"``, ``"ancestry"``, ``"genes-for-good"``,
        ``"IYG"``, ``"ftdna-illumina"``, ``"decodeme"``, ``"vcf"`` or
        ``"ftdna"``; ``source`` will be guessed from the file name with
        :func:`guess_source`, if not provided explicitly.
    :returns: a lazy generator of :class:`SNP` instances; the file is not
        opened until iteration starts.
    :raises UnknownSource: if ``source`` is not one of the supported
        values, or cannot be guessed from ``path``.
    :raises RuntimeError: on iteration of a VCF-based source when PyVCF
        is not installed.
    """
    if source is None:
        source = guess_source(path)

    handlers = {
        "23andme": _23andme,
        "23andme-exome-vcf": _23andme_exome,
        "ancestry": _23andme_ancestry,
        "genes-for-good": _genes_for_good,
        "IYG": _iyg,
        "ftdna-illumina": ftdna,
        "decodeme": decodeme,
        "vcf": _23andme_exome,
        "ftdna": ftdna,
    }

    # A plain lookup keeps the ``UnknownSource`` traceback free of an
    # unrelated chained ``KeyError``.
    handler = handlers.get(source)
    if handler is None:
        raise UnknownSource(path)
    return handler(path)
169 | 


--------------------------------------------------------------------------------