├── .coveragerc ├── .gitignore ├── .landscape.yaml ├── .travis.yml ├── COPYING ├── README.rst ├── continuous_integration ├── install.sh └── requirements.txt ├── docs ├── Makefile ├── _static │ └── logo.png ├── api.rst ├── apiref │ ├── modules.rst │ └── pysemantic.rst ├── conf.py ├── examples.rst ├── examples │ └── introduction.ipynb ├── index.rst ├── make.bat ├── schema_ref.rst └── tutorial │ ├── notebooks │ ├── demo_project.yml │ ├── dummy_data.csv │ ├── loading_datasets.ipynb │ └── naive_cleaning.ipynb │ └── slides │ ├── images │ └── dc_logo.jpg │ └── presentation.tex ├── pysemantic ├── __init__.py ├── cli.py ├── custom_traits.py ├── errors.py ├── exporters.py ├── loggers.py ├── project.py ├── tests │ ├── __init__.py │ ├── test_base.py │ ├── test_cli.py │ ├── test_custom_traits.py │ ├── test_project.py │ ├── test_utils.py │ ├── test_validator.py │ └── testdata │ │ ├── bad_iris.csv │ │ ├── iris.csv │ │ ├── person_activity.tsv │ │ ├── test.conf │ │ ├── test_dictionary.yaml │ │ ├── test_excel.yaml │ │ └── test_spreadsheet.xlsx ├── utils.py └── validator.py ├── requirements.txt └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */python?.?/* 4 | */lib-python/?.?/*.py 5 | */unittest2/* 6 | */dist-packages/pandas/* 7 | */dist-packages/numpy/* 8 | */dist-packages/nose/* 9 | */dist-packages/pyyaml/* 10 | */dist-packages/traits/* 11 | */dist-packages/docopt/* 12 | 13 | [report] 14 | exclude_lines = 15 | # Have to re-enable the standard pragma 16 | pragma: no cover 17 | 18 | # Don't complain about missing debug-only code: 19 | def __repr__ 20 | if self\.debug 21 | 22 | # Don't complain if tests don't hit defensive assertion code: 23 | raise AssertionError 24 | raise NotImplementedError 25 | 26 | # Don't complain if non-runnable code isn't run: 27 | if 0: 28 | if __name__ == .__main__.: 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # Vim swap files 57 | *.swp 58 | 59 | # ipynb checkpoints 60 | docs/examples/.ipynb_checkpoints/ 61 | .idea/ 62 | venv/ -------------------------------------------------------------------------------- /.landscape.yaml: -------------------------------------------------------------------------------- 1 | doc-warnings: yes 2 | test-warnings: yes 3 | strictness: veryhigh 4 | mccabe: 5 | run: false 6 | pylint: 7 | disable: 8 | - too-few-public-methods 9 | - no-self-use 10 | - too-many-instance-attributes 11 | - invalid-name 12 | - missing-docstring 13 | - star-args 14 | - logging-format-interpolation 15 | - bad-builtin 16 | ignore-paths: 17 | - docs 18 | pep8: 19 | disable: 20 | - E731 21 | - E126 22 | - E128 23 | - E115 24 | - N802 25 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | virtualenv: 4 | system_site_packages: false 5 | env: 6 | - DISTRIB="conda" COVERAGE="true" PYTHON_VERSION="2.7" 7 | addons: 8 | apt_packages: 9 | - libbz2-dev 10 | - libhdf5-serial-dev 11 | - liblzo2-dev 12 | # command to install dependencies 13 | install: source continuous_integration/install.sh 14 | # command to run tests 15 | script: nosetests -sv --with-coverage --cover-package=pysemantic 16 | after_success: 17 | coveralls 18 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2014–2015 Authors 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the names of the PySemantic Developers, nor the names of its 16 | contributors may be used to endorse or promote products derived from this 17 | software without specific prior written permission. 18 | 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 24 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 30 | DAMAGE. 31 | 32 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | |Travis|_ |Coveralls|_ |Landscape|_ |RTFD|_ 4 | 5 | .. |Travis| image:: https://travis-ci.org/jaidevd/pysemantic.svg?branch=master 6 | .. _Travis: https://travis-ci.org/jaidevd/pysemantic 7 | 8 | .. |Coveralls| image:: https://coveralls.io/repos/jaidevd/pysemantic/badge.svg?branch=master 9 | .. _Coveralls: https://coveralls.io/r/jaidevd/pysemantic?branch=master 10 | 11 | .. |Landscape| image:: https://landscape.io/github/jaidevd/pysemantic/master/landscape.svg?style=flat 12 | .. _Landscape: https://landscape.io/github/jaidevd/pysemantic/master 13 | 14 | .. |RTFD| image:: https://readthedocs.org/projects/pysemantic/badge/?version=latest 15 | .. _RTFD: https://readthedocs.org/projects/pysemantic/?badge=latest 16 | 17 | .. image:: docs/_static/logo.png 18 | 19 | pysemantic 20 | ========== 21 | A traits-based data validation and data cleaning module for pandas data structures. 22 | 23 | Dependencies 24 | ------------ 25 | * Traits 26 | * PyYaml 27 | * pandas 28 | * docopt 29 | 30 | Quick Start 31 | ----------- 32 | 33 | Installing with pip 34 | +++++++++++++++++++ 35 | 36 | Run:: 37 | 38 | $ pip install pysemantic 39 | 40 | Installing from source 41 | ++++++++++++++++++++++ 42 | 43 | You can install pysemantic by cloning this repository, installing the 44 | dependencies and running:: 45 | 46 | $ python setup.py install 47 | 48 | in the root directory of your local clone. 49 | 50 | Usage 51 | +++++ 52 | 53 | Create an empty file named ``pysemantic.conf`` in your home directory. This can be as simple as running:: 54 | 55 | $ touch ~/pysemantic.conf 56 | 57 | After installing pysemantic, you should have a command line script called 58 | ``semantic``. Try it out by running:: 59 | 60 | $ semantic list 61 | 62 | This should print nothing, which means that you don't have any projects registered 63 | under pysemantic. A *project* in pysemantic is just a collection of *datasets*. 64 | pysemantic manages your datasets like an IDE manages source code files in that 65 | it groups them under different projects, and each project has its own tree 66 | structure, build toolchains, requirements, etc. Similarly, different 67 | pysemantic projects group under them a set of datasets, and manage them 68 | according to their respective user-defined specifications. Projects are 69 | uniquely identified by their names. 70 | 71 | For now, let's add and configure a demo project called, simply, 72 | "pysemantic_demo". You can create a project and register it with pysemantic 73 | using the ``add`` subcommand of the ``semantic`` script as follows:: 74 | 75 | $ semantic add pysemantic_demo 76 | 77 | As you can see, this does not fit the supported usage of the ``add`` subcommand. 78 | We additionally need a file containing the specifications for this project.
79 | (Note that this file, containing the specifications, is referred to throughout 80 | the documentation interchangeably as a *specfile* or a *data dictionary*.) 81 | Before we create this file, let's download the well-known Fisher iris dataset, 82 | which we will use as the sample dataset for this demo. You can download it 83 | `here `_. 84 | 85 | Once the dataset is downloaded, fire up your favourite text editor and create a 86 | file named ``demo_specs.yaml``. Fill it up with the following content. 87 | 88 | .. code-block:: yaml 89 | 90 | iris: 91 | path: /absolute/path/to/iris.csv 92 | 93 | Now we can use this file as the data dictionary of the ``pysemantic_demo`` 94 | project. Let's tell pysemantic that we want to do so, by running the following 95 | command:: 96 | 97 | $ semantic add pysemantic_demo /path/to/demo_specs.yaml 98 | 99 | We're all set. To see how we did, start a Python interpreter and type the 100 | following statements:: 101 | 102 | >>> from pysemantic import Project 103 | >>> demo = Project("pysemantic_demo") 104 | >>> iris = demo.load_dataset("iris") 105 | 106 | Voila! The Python object named ``iris`` is actually a pandas DataFrame containing 107 | the iris dataset! Well, nothing really remarkable so far. In fact, we cloned 108 | and installed a module, wrote two seemingly unnecessary files, and typed three 109 | lines of Python code to do something that could have been achieved by simply 110 | writing:: 111 | 112 | >>> iris = pandas.read_csv("/path/to/iris.csv") 113 | 114 | Most datasets, however, are not as well behaved as this one. In fact, they can 115 | be a nightmare to deal with. Pysemantic can be far more intricate and far 116 | smarter than this when dealing with mangled, badly encoded, ugly data with 117 | inconsistent data types. Check the IPython notebooks in the examples to see how to use Pysemantic for 118 | such data. 119 | -------------------------------------------------------------------------------- /continuous_integration/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is inspired by the **pgmpy** implementation of continuous test 3 | # integration. It is meant to "install" all the packages required for installing 4 | # pysemantic.
5 | 6 | # License: The BSD 3-clause License 7 | 8 | set -e 9 | 10 | sudo apt-get update -qq 11 | sudo apt-get install build-essential -qq 12 | 13 | if [[ "$DISTRIB" == "conda" ]]; then 14 | # Deactivate the travis-provided virtual environment and setup a 15 | # conda-based environment instead 16 | deactivate 17 | 18 | # Use the miniconda installer for faster download / install of conda 19 | # itself 20 | wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh \ 21 | -O miniconda.sh 22 | bash miniconda.sh -b -p $HOME/miniconda 23 | export PATH=$HOME/miniconda/bin:$PATH 24 | hash -r 25 | conda config --set always_yes yes --set changeps1 no 26 | conda update conda 27 | conda info -a 28 | 29 | conda create -n testenv python=$PYTHON_VERSION --file continuous_integration/requirements.txt 30 | source activate testenv 31 | fi 32 | 33 | if [[ "$COVERAGE" == "true" ]]; then 34 | pip install coverage coveralls 35 | fi 36 | 37 | # Build pysemantic (install in development mode) 38 | python setup.py develop 39 | -------------------------------------------------------------------------------- /continuous_integration/requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | traits 3 | pandas 4 | xlrd 5 | openpyxl 6 | pytables 7 | docopt 8 | nose 9 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pysemantic.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pysemantic.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pysemantic" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pysemantic" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaidevd/pysemantic/1b928446e431a69060bbc9d29b8a7c7a6f2b8c0c/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | API Reference 3 | ============= 4 | 5 | .. 
toctree:: 6 | :maxdepth: 2 7 | 8 | apiref/pysemantic 9 | apiref/modules 10 | 11 | -------------------------------------------------------------------------------- /docs/apiref/modules.rst: -------------------------------------------------------------------------------- 1 | pysemantic 2 | ========== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | pysemantic 8 | -------------------------------------------------------------------------------- /docs/apiref/pysemantic.rst: -------------------------------------------------------------------------------- 1 | pysemantic package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pysemantic.cli module 8 | --------------------- 9 | 10 | .. automodule:: pysemantic.cli 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pysemantic.custom_traits module 16 | ------------------------------- 17 | 18 | .. automodule:: pysemantic.custom_traits 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pysemantic.errors module 24 | ------------------------ 25 | 26 | .. automodule:: pysemantic.errors 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pysemantic.exporters module 32 | --------------------------- 33 | 34 | .. automodule:: pysemantic.exporters 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pysemantic.loggers module 40 | ------------------------- 41 | 42 | .. automodule:: pysemantic.loggers 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | pysemantic.project module 48 | ------------------------- 49 | 50 | .. automodule:: pysemantic.project 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | pysemantic.utils module 56 | ----------------------- 57 | 58 | .. automodule:: pysemantic.utils 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | pysemantic.validator module 64 | --------------------------- 65 | 66 | .. automodule:: pysemantic.validator 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | 72 | Module contents 73 | --------------- 74 | 75 | .. automodule:: pysemantic 76 | :members: 77 | :undoc-members: 78 | :show-inheritance: 79 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pysemantic documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Apr 2 17:44:24 2015. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 
31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.intersphinx', 34 | 'sphinx.ext.todo', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.mathjax', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # The suffix of source filenames. 43 | source_suffix = '.rst' 44 | 45 | # The encoding of source files. 46 | #source_encoding = 'utf-8-sig' 47 | 48 | # The master toctree document. 49 | master_doc = 'index' 50 | 51 | # General information about the project. 52 | project = u'pysemantic' 53 | copyright = u'2015, Jaidev Deshpande' 54 | 55 | # The version info for the project you're documenting, acts as replacement for 56 | # |version| and |release|, also used in various other places throughout the 57 | # built documents. 58 | # 59 | # The short X.Y version. 60 | version = '0.1.1' 61 | # The full version, including alpha/beta/rc tags. 62 | release = '0.0.1' 63 | 64 | # The language for content autogenerated by Sphinx. Refer to documentation 65 | # for a list of supported languages. 66 | #language = None 67 | 68 | # There are two options for replacing |today|: either, you set today to some 69 | # non-false value, then it is used: 70 | #today = '' 71 | # Else, today_fmt is used as the format for a strftime call. 72 | #today_fmt = '%B %d, %Y' 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | exclude_patterns = ['_build'] 77 | 78 | # The reST default role (used for this markup: `text`) to use for all 79 | # documents. 80 | #default_role = None 81 | 82 | # If true, '()' will be appended to :func: etc. cross-reference text. 83 | #add_function_parentheses = True 84 | 85 | # If true, the current module name will be prepended to all description 86 | # unit titles (such as .. function::). 87 | #add_module_names = True 88 | 89 | # If true, sectionauthor and moduleauthor directives will be shown in the 90 | # output. They are ignored by default. 91 | #show_authors = False 92 | 93 | # The name of the Pygments (syntax highlighting) style to use. 94 | pygments_style = 'sphinx' 95 | 96 | # A list of ignored prefixes for module index sorting. 97 | #modindex_common_prefix = [] 98 | 99 | # If true, keep warnings as "system message" paragraphs in the built documents. 100 | #keep_warnings = False 101 | 102 | 103 | # -- Options for HTML output ---------------------------------------------- 104 | 105 | # The theme to use for HTML and HTML Help pages. See the documentation for 106 | # a list of builtin themes. 107 | html_theme = 'default' 108 | 109 | # Theme options are theme-specific and customize the look and feel of a theme 110 | # further. For a list of options available for each theme, see the 111 | # documentation. 112 | #html_theme_options = {} 113 | 114 | # Add any paths that contain custom themes here, relative to this directory. 115 | #html_theme_path = [] 116 | 117 | # The name for this set of Sphinx documents. If None, it defaults to 118 | # " v documentation". 119 | #html_title = None 120 | 121 | # A shorter title for the navigation bar. Default is the same as html_title. 122 | #html_short_title = None 123 | 124 | # The name of an image file (relative to this directory) to place at the top 125 | # of the sidebar. 126 | #html_logo = None 127 | 128 | # The name of an image file (within the static path) to use as favicon of the 129 | # docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 130 | # pixels large. 131 | #html_favicon = None 132 | 133 | # Add any paths that contain custom static files (such as style sheets) here, 134 | # relative to this directory. They are copied after the builtin static files, 135 | # so a file named "default.css" will overwrite the builtin "default.css". 136 | html_static_path = ['_static'] 137 | 138 | # Add any extra paths that contain custom files (such as robots.txt or 139 | # .htaccess) here, relative to this directory. These files are copied 140 | # directly to the root of the documentation. 141 | #html_extra_path = [] 142 | 143 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 144 | # using the given strftime format. 145 | #html_last_updated_fmt = '%b %d, %Y' 146 | 147 | # If true, SmartyPants will be used to convert quotes and dashes to 148 | # typographically correct entities. 149 | #html_use_smartypants = True 150 | 151 | # Custom sidebar templates, maps document names to template names. 152 | #html_sidebars = {} 153 | 154 | # Additional templates that should be rendered to pages, maps page names to 155 | # template names. 156 | #html_additional_pages = {} 157 | 158 | # If false, no module index is generated. 159 | #html_domain_indices = True 160 | 161 | # If false, no index is generated. 162 | #html_use_index = True 163 | 164 | # If true, the index is split into individual pages for each letter. 165 | #html_split_index = False 166 | 167 | # If true, links to the reST sources are added to the pages. 168 | #html_show_sourcelink = True 169 | 170 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 171 | #html_show_sphinx = True 172 | 173 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 174 | #html_show_copyright = True 175 | 176 | # If true, an OpenSearch description file will be output, and all pages will 177 | # contain a tag referring to it. The value of this option must be the 178 | # base URL from which the finished HTML is served. 179 | #html_use_opensearch = '' 180 | 181 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 182 | #html_file_suffix = None 183 | 184 | # Output file base name for HTML help builder. 185 | htmlhelp_basename = 'pysemanticdoc' 186 | 187 | 188 | # -- Options for LaTeX output --------------------------------------------- 189 | 190 | latex_elements = { 191 | # The paper size ('letterpaper' or 'a4paper'). 192 | #'papersize': 'letterpaper', 193 | 194 | # The font size ('10pt', '11pt' or '12pt'). 195 | #'pointsize': '10pt', 196 | 197 | # Additional stuff for the LaTeX preamble. 198 | #'preamble': '', 199 | } 200 | 201 | # Grouping the document tree into LaTeX files. List of tuples 202 | # (source start file, target name, title, 203 | # author, documentclass [howto, manual, or own class]). 204 | latex_documents = [ 205 | ('index', 'pysemantic.tex', u'pysemantic Documentation', 206 | u'Jaidev Deshpande', 'manual'), 207 | ] 208 | 209 | # The name of an image file (relative to this directory) to place at the top of 210 | # the title page. 211 | #latex_logo = None 212 | 213 | # For "manual" documents, if this is true, then toplevel headings are parts, 214 | # not chapters. 215 | #latex_use_parts = False 216 | 217 | # If true, show page references after internal links. 218 | #latex_show_pagerefs = False 219 | 220 | # If true, show URL addresses after external links. 
221 | #latex_show_urls = False 222 | 223 | # Documents to append as an appendix to all manuals. 224 | #latex_appendices = [] 225 | 226 | # If false, no module index is generated. 227 | #latex_domain_indices = True 228 | 229 | 230 | # -- Options for manual page output --------------------------------------- 231 | 232 | # One entry per manual page. List of tuples 233 | # (source start file, name, description, authors, manual section). 234 | man_pages = [ 235 | ('index', 'pysemantic', u'pysemantic Documentation', 236 | [u'Jaidev Deshpande'], 1) 237 | ] 238 | 239 | # If true, show URL addresses after external links. 240 | #man_show_urls = False 241 | 242 | 243 | # -- Options for Texinfo output ------------------------------------------- 244 | 245 | # Grouping the document tree into Texinfo files. List of tuples 246 | # (source start file, target name, title, author, 247 | # dir menu entry, description, category) 248 | texinfo_documents = [ 249 | ('index', 'pysemantic', u'pysemantic Documentation', 250 | u'Jaidev Deshpande', 'pysemantic', 'One line description of project.', 251 | 'Miscellaneous'), 252 | ] 253 | 254 | # Documents to append as an appendix to all manuals. 255 | #texinfo_appendices = [] 256 | 257 | # If false, no module index is generated. 258 | #texinfo_domain_indices = True 259 | 260 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 261 | #texinfo_show_urls = 'footnote' 262 | 263 | # If true, do not generate a @detailmenu in the "Top" node's menu. 264 | #texinfo_no_detailmenu = False 265 | 266 | 267 | # Example configuration for intersphinx: refer to the Python standard library. 268 | intersphinx_mapping = {'http://docs.python.org/': None} 269 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | * Introduction_ to PySemantic. 9 | 10 | .. _Introduction: http://nbviewer.ipython.org/github/jaidevd/pysemantic/blob/master/docs/examples/introduction.ipynb 11 | -------------------------------------------------------------------------------- /docs/examples/introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Introduction\n", 8 | "=======\n", 9 | "\n", 10 | "Data is dirty. Any dataset that isn't properly curated and stored can suffer from many problems, like having mixed data types, not being properly encoded or escaped, an uneven number of fields, and so on. None of these problems are unsolvable. In fact, most of us are pretty good at cleaning data. Normally, when we know little or nothing about a given dataset, we proceed in a very predictable manner. We first try to read the data naively and see if errors are raised by the parser. If they are, we try to fix our function calls. When those are fixed, we try to run some sanity checks on the data, and end up filtering the dataset, sometimes quite heavily.\n", 11 | "\n", 12 | "The problem with this process is that it is iterative, and worse, it is _reactive_. Everybody in the team has to do it if they are to use the dataset. Sure, one can simply clean it up and dump it in a new file with just a few lines of code. But we shouldn't have to run that script every time we encounter a new dataset. We would be much more comfortable if data is cleaned as it is read.
It is much more efficient if data cleaning is a part of _data ingestion_.\n", 13 | "\n", 14 | "This can be achieved by having a centralized schema for every dataset. This schema can house the rules that the clean dataset must follow, so as to further aid its analysis. Of course, this schema can be expressed via a simple Python script which is shared with everyone who is doing analysis on the dataset in question. But the number of datasets that someone has to deal with over the timeline of a particular project can quickly get out of hand, and so can their cleaning scripts. Secondly, and more importantly, cleaning data via ad-hoc Python scripts is non-trivial. Readable as Python scripts might be, it's not always easy for everyone in the team to change the cleaning process. Moreover, there are no Python libraries that offer an abstraction at the level of cleaning and validating data.\n", 15 | "\n", 16 | "Therefore, if one has to go through the process of data validation and cleaning in a customizable, modular way, one has to make sure that:\n", 17 | "\n", 18 | "* the specifications for all datasets are in one place, not in different scripts.\n", 19 | "* datasets are grouped under a suitable name that pertains to particular projects. (In PySemantic such a group is called a `Project`, as we shall see).\n", 20 | "* strict validation and cleaning rules must be applied to all aspects of a dataset\n", 21 | "* the process of validation and cleaning has to be identically reproducible by everyone who works on the data\n", 22 | "\n", 23 | "PySemantic makes all that happen.\n", 24 | "\n", 25 | "1. Getting Started\n", 26 | "==========\n", 27 | "\n", 28 | "Let's get our hands dirty. We'll explore more features as we go along. Before you proceed further, please make sure that you have gone through the quick start section [here](https://github.com/jaidevd/pysemantic#quick-start).\n", 29 | "\n", 30 | "By now you should have added a project named `pysemantic_demo`, and used the project object to load the iris dataset. Let's take a more detailed look at what is happening here.\n", 31 | "\n", 32 | "1.1 The Project class\n", 33 | "------------------------\n", 34 | "\n", 35 | "A first-class citizen of the pysemantic namespace is the [Project class](https://github.com/jaidevd/pysemantic/tree/master/pysemantic/project.py#L247). This class has everything you need to add, remove, read, or write datasets. In PySemantic, all datasets are classified under projects represented by instances of the Project class. Each project is identified by a unique name. This name is used to instantiate the Project class and perform operations on all datasets registered under it. You can think of these \"projects\" under pysemantic in the same way as an IDE organizes software projects. Each project in an IDE has a set of files containing source code, a set of build tools and a few other things that make a project self-contained. Similarly, each project in PySemantic has its own datasets, which in turn have their schema and their validation rules. Currently, for this example, the iris dataset is loaded naively, without any rules."
36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "from pysemantic import Project" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "demo = Project(\"pysemantic_demo\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "iris = demo.load_dataset(\"iris\")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/html": [ 81 | "
<div><table border=\"1\" class=\"dataframe\"><thead><tr><th></th><th>Sepal Length</th><th>Sepal Width</th><th>Petal Length</th><th>Petal Width</th><th>Species</th></tr></thead><tbody><tr><th>0</th><td>5.1</td><td>3.5</td><td>1.4</td><td>0.2</td><td>setosa</td></tr><tr><th>1</th><td>4.9</td><td>3.0</td><td>1.4</td><td>0.2</td><td>setosa</td></tr><tr><th>2</th><td>4.7</td><td>3.2</td><td>1.3</td><td>0.2</td><td>setosa</td></tr><tr><th>3</th><td>4.6</td><td>3.1</td><td>1.5</td><td>0.2</td><td>setosa</td></tr><tr><th>4</th><td>5.0</td><td>3.6</td><td>1.4</td><td>0.2</td><td>setosa</td></tr></tbody></table></div>" 137 | ], 138 | "text/plain": [ 139 | " Sepal Length Sepal Width Petal Length Petal Width Species\n", 140 | "0 5.1 3.5 1.4 0.2 setosa\n", 141 | "1 4.9 3.0 1.4 0.2 setosa\n", 142 | "2 4.7 3.2 1.3 0.2 setosa\n", 143 | "3 4.6 3.1 1.5 0.2 setosa\n", 144 | "4 5.0 3.6 1.4 0.2 setosa" 145 | ] 146 | }, 147 | "execution_count": 4, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "iris.head(5)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "source": [ 162 | "This is the Fisher iris dataset that we know so well. Now imagine that someone was curating for us more samples of these flowers and sending us the measurements for 150 more flowers (sepal length, sepal width, petal length, petal width, and the species). That would amount to 150 more rows in the dataset. Now suppose that our data acquisition methods were flawed, and the data that came in was dirty. A sample of such a dirty dataset can be found [here](https://github.com/jaidevd/pysemantic/tree/master/pysemantic/tests/testdata/bad_iris.csv). Try loading this file into a pandas dataframe directly, using the `pandas.read_csv` function. Notice that there's a column called `id`, which contains 10-digit strings. These IDs could correspond to some IDs automatically generated by the system storing the data. If they're really just IDs, they should be read as strings, but there was no way for pandas to know that these are as good as strings (other examples of this are phone numbers and zipcodes). In pandas, this can be fixed by using the `dtype` argument of `pandas.read_csv`. To make this preference persist in pysemantic, we can add this dataset to our data dictionary (`demo_specs.yaml`) by adding to it the following lines:\n", 163 | "\n", 164 | "```yaml\n", 165 | "bad_iris:\n", 166 | " path: /absolute/path/to/bad_iris.csv\n", 167 | " dtypes:\n", 168 | " - id: !!python/name:__builtin__.str\n", 169 | "```\n", 170 | "\n", 171 | "The last line tells pandas that the column `id` is to be read as a string, not as the default integer. Any type can thus be specified for any column, by adding a line formatted as follows:\n", 172 | "\n", 173 | "```yaml\n", 174 | " - column_name: yaml-dump-of-python-type\n", 175 | "```\n", 176 | "\n", 177 | "for the given column. (Similarly, we can specify types for the other columns in the dataset too, but this isn't required since the default works fine for them.)
You can try out how the Project object can infer these new specifications by doing the following:" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 6, 183 | "metadata": { 184 | "collapsed": false 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "demo.reload_data_dict() # Re-reads the data dictionary specifications\n", 189 | "bad_iris = demo.load_dataset(\"bad_iris\")" 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "Python 2", 196 | "language": "python", 197 | "name": "python2" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 2 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython2", 209 | "version": "2.7.10" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 0 214 | } 215 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pysemantic documentation master file, created by 2 | sphinx-quickstart on Thu Apr 2 17:44:24 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pysemantic's documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | examples 15 | schema_ref 16 | api 17 | 18 | 19 | 20 | Indices and tables 21 | ================== 22 | 23 | * :ref:`genindex` 24 | * :ref:`modindex` 25 | * :ref:`search` 26 | 27 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. 
doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pysemantic.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pysemantic.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/schema_ref.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Schema Configuration Reference 3 | ============================== 4 | 5 | Every project in PySemantic can be configured via a data dictionary or a 6 | schema, which is a yaml file. This file houses the details of how PySemantic 7 | should treat a project's constituent datasets. A typical data dictionary 8 | follows the following pattern: 9 | 10 | .. code-block:: yaml 11 | 12 | dataset_name: 13 | dataset_param_1: value1 14 | dataset_param_2: value2 15 | # etc 16 | 17 | PySemantic reads this as a dictionary where the parameter names are keys and 18 | their values are the values in the dictionary. Thus, the schema for a whole 19 | project is a dictionary of dictionaries. 
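For instance, a minimal data dictionary for a project with two datasets might look like the sketch below (the dataset names, file paths and column names here are hypothetical; each of the parameters used is described in the sections that follow):

.. code-block:: yaml

    iris:
      path: /absolute/path/to/iris.csv
      delimiter: ','
      dtypes:
        Species: !!python/name:__builtin__.str
    activity:
      path: /absolute/path/to/person_activity.tsv
      delimiter: "\t"
      column_rules:
        duration:
          min: 0.0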
20 | 21 | -------------------------- 22 | Basic Schema Configuration 23 | -------------------------- 24 | 25 | Here is a list of different dataset parameters that PySemantic is sensitive 26 | to: 27 | 28 | * ``path`` (Required, except when the ``source`` parameter is "mysql") The path to the file containing the data. Note that the path must either be absolute, or relative to the directory containing the schema. This can also be a list of files if the dataset spans multiple files. If that is the case, the path parameter can be specified as: 29 | 30 | .. code-block:: yaml 31 | 32 | path: 33 | - absolute/path/to/file/1 34 | - absolute/path/to/file/2 35 | # etc 36 | 37 | # or 38 | 39 | path: 40 | - foo/bar/baz 41 | # where foo is a directory in the directory that contains the schema. 42 | 43 | * ``delimiter`` (Optional, default: ``,``) The delimiter used in the file. This has to be a character delimiter, not words like "comma" or "tab". 44 | 45 | * ``md5`` (Optional) The MD5 checksum of the file to read. This is necessary 46 | because sometimes we read files and, after processing them, rewrite them to the same 47 | path. This parameter helps keep track of whether the file is correct. 48 | 49 | * ``header``: (Optional) The header row of the file. 50 | 51 | * ``index_col``: (Optional) Name of the column that forms the index of the 52 | dataframe. This can be a single string or a list of strings. If a list is 53 | provided, the dataframe becomes multi-indexed. 54 | 55 | * ``sheetname``: (Optional) Name of the sheet containing the dataset in an 56 | MS Excel spreadsheet. This comes into play only when ``path`` points to an 57 | Excel file. For other types of files, this is ignored. When ``path`` is an 58 | Excel file and this parameter is not provided, it is assumed to be the same 59 | as the name of the dataset. For example: 60 | 61 | .. code-block:: yaml 62 | 63 | iris: 64 | path: /path/to/iris.xlsx 65 | 66 | The schema above assumes that the iris dataset resides in a sheet named 67 | "iris". If instead the name of the sheet is different, you can specify it 68 | as: 69 | 70 | .. code-block:: yaml 71 | 72 | iris: 73 | path: /path/to/iris.xlsx 74 | sheetname: name_of_sheet 75 | 76 | This parameter can also be a list, to enable the combination of multiple 77 | sheets into a dataframe, as follows: 78 | 79 | .. code-block:: yaml 80 | 81 | iris: 82 | path: /path/to/iris.xlsx 83 | sheetname: 84 | - sheet1 85 | - sheet2 86 | 87 | This will combine the data from sheet1 and sheet2 into a single dataframe. 88 | 89 | * ``column_names``: (Optional) Specify the names of columns to use in the 90 | loaded dataframe. This option can have multiple types of values. It can be: 91 | 92 | 1. A list of strings to use as column names: 93 | 94 | .. code-block:: yaml 95 | 96 | column_names: 97 | - column_1 98 | - column_2 99 | - column_3 100 | 101 | 2. A dictionary that maps original column names to new ones: 102 | 103 | .. code-block:: yaml 104 | 105 | column_names: 106 | org_colname_1: new_colname_a 107 | org_colname_2: new_colname_b 108 | org_colname_3: new_colname_c 109 | 110 | 3. A Python function that translates the name of every column in the loaded 111 | dataframe: 112 | 113 | .. code-block:: yaml 114 | 115 | column_names: !!python/name:module_name.translate_column_name 116 | 117 | * ``nrows``: (Optional) Method to select which rows are read from the dataset. 118 | This option, like ``column_names``, can be specified in many ways. It can be: 119 | 120 | 1. An integer (default): Number of rows to read from the file.
If this 121 | option is not specified, all rows from the file are read. 122 | 123 | .. code-block:: yaml 124 | 125 | nrows: 100 126 | 127 | 2. A dictionary that recognizes specific keys: 128 | 129 | * ``random``: A boolean that directs PySemantic to shuffle the selected rows after loading the dataset. 130 | For example, including the following lines in the schema 131 | 132 | .. code-block:: yaml 133 | 134 | nrows: 135 | random: true 136 | 137 | will shuffle the dataset before returning it. 138 | 139 | * ``range``: A list of two integers, which denote the first and the 140 | last index of the range of rows to be read. For example, the 141 | following lines 142 | 143 | .. code-block:: yaml 144 | 145 | nrows: 146 | range: 147 | - 10 148 | - 50 149 | 150 | will only select the 10th to the 50th (exclusive) rows. 151 | 152 | * ``count``: An integer that can be used in conjunction with either 153 | or both of the above options, to denote the number of rows to read 154 | from a random selection or a range. 155 | 156 | .. code-block:: yaml 157 | 158 | nrows: 159 | range: 160 | - 10 161 | - 50 162 | count: 10 163 | random: true 164 | 165 | The lines shown above will direct PySemantic to load 10 rows at 166 | random between the 10th and the 50th rows of a dataset. 167 | 168 | * ``shuffle``: A boolean to be used with ``count`` to shuffle the top ``count`` rows before returning the dataframe. 169 | 170 | .. code-block:: yaml 171 | 172 | nrows: 173 | count: 10 174 | shuffle: True 175 | 176 | The above schema will read the first ten rows from the dataset and 177 | shuffle them. 178 | 179 | 3. A callable which returns a logical array which has the same number of elements as the number of rows in the dataset. The output of this callable is used as a logical index for slicing the dataset. For example, suppose we wanted to extract all even-numbered rows from a dataset, then we could make a callable as follows: 180 | 181 | .. code-block:: python 182 | 183 | iseven = lambda x: np.remainder(x, 2) == 0 184 | 185 | Suppose this function resides in a module called ``foo.bar``, then we 186 | can include it in the schema as follows: 187 | 188 | .. code-block:: yaml 189 | 190 | nrows: !!python/name:foo.bar.iseven 191 | 192 | This will cause PySemantic to load only the even-numbered rows. 193 | 194 | * ``use_columns``: (Optional) The list of the columns to read from the dataset. The format for specifying this parameter is as follows: 195 | 196 | .. code-block:: yaml 197 | 198 | use_columns: 199 | - column_1 200 | - column_2 201 | - column_3 202 | 203 | If this parameter is not specified, all columns present in the dataset are read. 204 | 205 | * ``exclude_columns``: This option can be used to specify columns that are 206 | explicitly to be ignored. This is useful when there are a large number of 207 | columns in the dataset and we only wish to exclude a few. Note that this 208 | option overrides the ``use_columns`` option, i.e. if a column name is present 209 | in both lists, it will be dropped. 210 | 211 | * ``na_values``: A string or a list of values that are considered as NAs by the pandas parsers, applicable to the whole dataframe. 212 | 213 | * ``converters``: A dictionary of functions to be applied to columns when loading data. Any Python callable can be added to this dictionary. This parameter makes up the ``converters`` argument of Pandas parsers. The usage is as follows: 214 | 215 | ..
code-block:: yaml 216 | 217 | converters: 218 | col_a: !!python/name:numpy.int 219 | 220 | This results in the ``numpy.int`` function being called on the column ``col_a``. 221 | 222 | * ``dtypes`` (Optional) Data types of the columns to be read. Since types in Python are native objects, PySemantic expects them to be so in the schema. This can be formatted as follows: 223 | 224 | .. code-block:: yaml 225 | 226 | dtypes: 227 | column_name: !!python/name:python_object 228 | 229 | For example, if you have three columns named ``foo``, ``bar``, and ``baz``, 230 | which have the types ``string``, ``integer`` and ``float`` respectively, then your schema 231 | should look like: 232 | 233 | .. code-block:: yaml 234 | 235 | dtypes: 236 | foo: !!python/name:__builtin__.str 237 | bar: !!python/name:__builtin__.int 238 | baz: !!python/name:__builtin__.float 239 | 240 | Non-builtin types can be specified too: 241 | 242 | .. code-block:: yaml 243 | 244 | dtypes: 245 | datetime_column: !!python/name:datetime.date 246 | 247 | *Note*: You can figure out the YAML representation of a Python type by doing 248 | the following: 249 | 250 | .. code-block:: python 251 | 252 | import yaml 253 | x = type(foo) # where foo is the object whose type is to be yamlized 254 | print yaml.dump(x) 255 | 256 | * ``parse_dates`` (Optional) Columns containing Date/Time values can be parsed into native NumPy datetime objects. This argument can be a list, or a dictionary. If it is a dictionary of the following form: 257 | 258 | .. code-block:: yaml 259 | 260 | parse_dates: 261 | output_col_name: 262 | - col_a 263 | - col_b 264 | 265 | it will parse columns ``col_a`` and ``col_b`` as datetime columns, and put the result in a column named ``output_col_name``. Specifying the output name is optional. You may declare the schema as a list, as follows: 266 | 267 | .. code-block:: yaml 268 | 269 | parse_dates: 270 | - col_a 271 | - col_b 272 | 273 | In this case the parser will independently parse columns ``col_a`` and ``col_b`` as datetime columns. 274 | 275 | *NOTE*: Specifying this parameter will make PySemantic ignore any columns that have been declared as having the datetime type in the ``dtypes`` parameter. 276 | 277 | * ``pickle`` (Optional) Absolute path to a file which contains pickled arguments for the 278 | parser. This option can be used if readability or declarativeness is not a concern. The file should contain a pickled dictionary that is directly passed 279 | to the parser, i.e. if the loaded pickled data is in a dict named ``data``, 280 | then the parser invocation becomes ``parser(**data)``. 281 | 282 | *NOTE*: If any of the above options are present, they will override the corresponding arguments contained in the pickle file. In PySemantic, declarative statements have the right of way. 283 | 284 | ---------------------------- 285 | Column Schema Configuration 286 | ---------------------------- 287 | 288 | PySemantic also allows specifying rules and validators independently for each 289 | column. This can be done using the ``column_rules`` parameter of the dataset 290 | schema. Here is a typical format: 291 | 292 | .. code-block:: yaml 293 | 294 | dataset_name: 295 | column_rules: 296 | column_1_name: 297 | # rules to be applied to the column 298 | column_2_name: 299 | # rules to be applied to the column 300 | 301 | The following parameters can be supplied to any column under ``column_rules``: 302 | 303 | * ``is_drop_na`` ([true|false], default false) Setting this to ``true`` causes PySemantic to drop all NA values in the column.
304 | * ``is_drop_duplicates`` ([true|false], default false) Setting this to ``true`` causes PySemantic to drop all duplicated values in the column. 305 | * ``unique_values``: These are the unique values that are expected in a column. The value of this parameter has to be a YAML list. Any value not found in this list will be dropped when cleaning the dataset. 306 | * ``exclude``: These are the values that are to be explicitly excluded from the column. This comes in handy when a column has too many unique values, and a handful of them have to be dropped. Note that this value has to be a list. 307 | * ``min``: Minimum value allowed in a column if the column holds numerical data. By default, the minimum is -np.inf. Any value less than this one is dropped. 308 | * ``max``: Maximum value allowed in a column if the column holds numerical data. By default, the maximum is np.inf. Any value greater than this one is dropped. 309 | * ``regex``: A regular expression that each element of the column must match, if the column holds text data. Any element of the column not matching this regex is dropped. 310 | * ``na_values``: A list of values that are considered as NAs by the pandas parsers, applicable to this column. 311 | * ``postprocessors``: A list of callables that are called one by one on the column. Any Python function that accepts a series and returns a series can be a postprocessor. 312 | 313 | 314 | Here is a more extensive example of the usage of this schema. 315 | 316 | .. code-block:: yaml 317 | 318 | iris: 319 | path: /home/username/src/pysemantic/testdata/iris.csv 320 | converters: 321 | Sepal Width: !!python/name:numpy.floor 322 | column_rules: 323 | Sepal Length: 324 | min: 2.0 325 | Petal Length: 326 | max: 4.0 327 | Petal Width: 328 | exclude: 329 | - 3.14 330 | Species: 331 | unique_values: 332 | - setosa 333 | - versicolor 334 | postprocessors: 335 | - !!python/name:module_name.foo 336 | 337 | This would cause PySemantic to produce a dataframe corresponding to the Fisher 338 | iris dataset, which has the following characteristics: 339 | 340 | 1. It contains no observations where the sepal length is less than 2 cm. 341 | 2. It contains no observations where the petal length is more than 4 cm. 342 | 3. The sepal width only contains integers. 343 | 4. The petal width column will not contain the specific value 3.14. 344 | 5. The species column will only contain the values "setosa" and "versicolor", i.e. it will not contain the value "virginica". 345 | 6. The species column in the dataframe will be processed by the ``module_name.foo`` function. 346 | 347 | 348 | ------------------------------ 349 | DataFrame Schema Configuration 350 | ------------------------------ 351 | 352 | A few rules can also be enforced at the dataframe level, instead of at the 353 | level of individual columns in the dataset. They are: 354 | 355 | * ``drop_duplicates`` ([true|false], default true). This behaves in the same 356 | way as ``is_drop_duplicates`` for series schema, with the exception that here 357 | the default is True. 358 | * ``drop_na`` ([true|false], default true). This behaves in the same 359 | way as ``is_drop_na`` for series schema, with the exception that here 360 | the default is True. 361 | 362 | 363 | ---------------- 364 | Reading from SQL 365 | ---------------- 366 | 367 | *Note*: This has not yet been tested.
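The parameters recognized for SQL-backed datasets are described in the paragraphs that follow. As a quick orientation, here is a minimal sketch of what such a dataset entry could look like; the dataset name, table name and connection details below are hypothetical and stand in for your own values:

.. code-block:: yaml

    users_table:
      source: mysql
      table_name: users
      config:
        hostname: 127.0.0.1
        db_name: my_database
        username: my_username
        password: my_password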
368 | 369 | PySemantic can automatically create the function calls required to download a 370 | SQL table as a dataframe - by using a wrapper around the 371 | ``pandas.read_sql_table`` function. The configuration parameters are as 372 | follows: 373 | 374 | * ``source``: This is simply a string saying "mysql", which lets pysemantic 375 | know that the dataset is to be downloaded from a MySQL database. 376 | * ``table_name``: Name of the table to be read. If this argument is not 377 | present, pysemantic expects to find the ``query`` parameter. 378 | * ``query``: SQL query to run and extract the resulting rows into a pandas 379 | dataframe 380 | * ``config``: This is a dictionary that contains the configuration required to 381 | connect to the MySQL server. The configuration must have the following 382 | elements: 383 | 384 | 1. ``hostname``: The IP address or the hostname of the machine hosting the MySQL server. 385 | 2. ``db_name``: Name of the database from which to read the table. 386 | 3. ``username``: The SQL username 387 | 4. ``password``: The SQL password 388 | * ``chunksize``: (Integer, optional) If this is specified, Pandas returns an 389 | iterator in which every iteration contains ``chunksize`` rows. 390 | -------------------------------------------------------------------------------- /docs/tutorial/notebooks/demo_project.yml: -------------------------------------------------------------------------------- 1 | dummy_data: 2 | path: /home/jaidev/src/pysemantic/docs/tutorial/notebooks/dummy_data.csv 3 | dtypes: 4 | zip: !!python/name:__builtin__.str 5 | date: !!python/name:datetime.date 6 | column_rules: 7 | X: 8 | maximum: 9.0 9 | email: 10 | regex: (^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$) 11 | -------------------------------------------------------------------------------- /docs/tutorial/notebooks/loading_datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "What we did last time\n", 8 | "------------------------\n", 9 | "\n", 10 | "1. Specified the data type of a column\n", 11 | "2. Parsed timestamps into Pythonic timestamps\n", 12 | "3. Dropped outliers from a numerical array\n", 13 | "4. Checked text for valid email addresses" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from pysemantic import Project\n", 25 | "import numpy as np" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "demo = Project(\"demo\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "data = demo.load_dataset(\"dummy_data\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | "
datezipXemail
02015-02-21 01:05:03136115.014501jeff.dasovich@enron.com
22015-02-27 10:16:34028888.918459karen.denne@enron.com
32015-02-20 19:11:00078275.664665enron-owner@lists.qgadc.com
42015-02-21 13:20:11238876.159554jeff.dasovich@enron.com
52015-02-22 04:17:01354615.618556jeff.dasovich@enron.com
\n", 109 | "
" 110 | ], 111 | "text/plain": [ 112 | " date zip X email\n", 113 | "0 2015-02-21 01:05:03 13611 5.014501 jeff.dasovich@enron.com\n", 114 | "2 2015-02-27 10:16:34 02888 8.918459 karen.denne@enron.com\n", 115 | "3 2015-02-20 19:11:00 07827 5.664665 enron-owner@lists.qgadc.com\n", 116 | "4 2015-02-21 13:20:11 23887 6.159554 jeff.dasovich@enron.com\n", 117 | "5 2015-02-22 04:17:01 35461 5.618556 jeff.dasovich@enron.com" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "data.head()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "date datetime64[ns]\n", 141 | "zip object\n", 142 | "X float64\n", 143 | "email object\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "for column in data:\n", 149 | " print column, data[column].dtype" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "False\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "zip_lengths = data['zip'].apply(len)\n", 169 | "print np.any(zip_lengths != 5)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 7, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "8.99728774235\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "print data['X'].max()" 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 2", 195 | "language": "python", 196 | "name": "python2" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 2 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython2", 208 | "version": "2.7.9" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 0 213 | } 214 | -------------------------------------------------------------------------------- /docs/tutorial/slides/images/dc_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaidevd/pysemantic/1b928446e431a69060bbc9d29b8a7c7a6f2b8c0c/docs/tutorial/slides/images/dc_logo.jpg -------------------------------------------------------------------------------- /docs/tutorial/slides/presentation.tex: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | % Beamer Presentation 3 | % LaTeX Template 4 | % Version 1.0 (10/11/12) 5 | % 6 | % This template has been downloaded from: 7 | % http://www.LaTeXTemplates.com 8 | % 9 | % License: 10 | % CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/) 11 | % 12 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 13 | 14 | %---------------------------------------------------------------------------------------- 15 | % PACKAGES AND THEMES 16 | %---------------------------------------------------------------------------------------- 17 | 18 | 19 | 20 | \documentclass{beamer} 21 | 22 | 23 | \definecolor{dcorange}{HTML}{F05A28} 24 | \setbeamercolor{structure}{bg=black, fg=dcorange} 25 | 26 | 27 | 
\usetheme{Warsaw} 28 | 29 | \usepackage{graphicx} % Allows including images 30 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 31 | \usepackage{hyperref} % Allows the use of \toprule, \midrule and \bottomrule in tables 32 | \usepackage{textpos} 33 | 34 | %---------------------------------------------------------------------------------------- 35 | % TITLE PAGE 36 | %---------------------------------------------------------------------------------------- 37 | 38 | \titlegraphic{\includegraphics[width=.6\textwidth,height=.3\textheight]{images/dc_logo.jpg}} 39 | \title[Introduction to PySemantic]{Introduction to PySemantic} 40 | % The short title appears at the bottom of every slide, the full title is only on the title page 41 | 42 | %\titlegraphic{\includegraphics[width=.5\textwidth,height=.5\textheight]{dc_logo.jpg}} 43 | \author{Jaidev Deshpande} % Your name 44 | \institute[Cube26 Pvt Limited] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 45 | 46 | 47 | %{ 48 | %Cube26 Pvt Ltd \\ % Your institution for the title page 49 | %%\includegraphics[width=2cm]{dc_logo.jpg} 50 | %\medskip 51 | %\textit{deshpande.jaidev@gmail.com} % Your email address 52 | %} 53 | \date{\today} % Date, can be changed to a custom date 54 | 55 | %\titlegraphic{\includegraphics[width=2cm]{dc_logo.jpg}} 56 | 57 | %\addtobeamertemplate{frametitle}{}{% 58 | % \begin{textblock*}{100mm}(.85\textwidth,-1cm) 59 | % \includegraphics[height=1cm,width=2cm]{dc_logo.jpg} 60 | %\end{textblock*}} 61 | 62 | 63 | 64 | \begin{document} 65 | 66 | \begin{frame} 67 | \titlepage % Print the title page as the first slide 68 | \end{frame} 69 | 70 | \begin{frame} 71 | \frametitle{Motivation} 72 | \begin{itemize} 73 | \item Typical data analysis pipeline:\\ 74 | Data Ingest $\rightarrow$ Exploratory Analysis $\rightarrow$ Feature Engineering $\rightarrow$ Machine Learning $\rightarrow$ Insights! 75 | \item Data scientists often work in large teams. 76 | \item Communication about data ingest is important. 77 | \item Messy data $\Rightarrow$ more communication. 78 | \end{itemize} 79 | \end{frame} 80 | 81 | \begin{frame} 82 | \frametitle{Why PySemantic?} 83 | \begin{itemize} 84 | \item Problem: How do I effectively communicate about data? 85 | \item Existing solutions:\\ 86 | \begin{enumerate} 87 | \item Text documentation 88 | \item Ad-hoc scripts to clean or validate the data 89 | \item Version control 90 | \end{enumerate} 91 | \item Don't scale with the diversity of the data. 92 | \item The process is \textit{reactive} 93 | \item The process is unnecessarily redundant. 94 | \end{itemize} 95 | \end{frame} 96 | 97 | \begin{frame} 98 | \frametitle{Why PySemantic?} 99 | \begin{itemize} 100 | \item Group all datasets under \textit{projects}. 101 | \item A centralized data dictionary that holds properties of all 102 | datasets under a project. 103 | \item A single entry point into the software framework required for 104 | reading, cleaning and validating a dataset. 105 | \item Reproducibility across teams and individuals. 
106 | \end{itemize} 107 | \end{frame} 108 | 109 | \begin{frame} 110 | \frametitle{Getting Started} 111 | \url{https://github.com/jaidevd/pysemantic} 112 | \end{frame} 113 | 114 | 115 | %---------------------------------------------------------------------------------------- 116 | 117 | \end{document} 118 | -------------------------------------------------------------------------------- /pysemantic/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | 10 | from pysemantic.project import Project 11 | 12 | __version__ = "0.1.1" 13 | 14 | __all__ = ['Project', 'test'] 15 | -------------------------------------------------------------------------------- /pysemantic/cli.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3 clause license. 8 | 9 | """semantic 10 | 11 | Usage: 12 | semantic list [--project=] 13 | semantic add PROJECT_NAME PROJECT_SPECFILE 14 | semantic remove PROJECT_NAME [--dataset=] 15 | semantic set-schema PROJECT_NAME SCHEMA_FPATH 16 | semantic set-specs PROJECT_NAME --dataset= [--path=] [--dlm=] 17 | semantic add-dataset DATASET_NAME --project= --path= --dlm= 18 | semantic export PROJECT_NAME [--dataset=] OUTPATH 19 | 20 | Options: 21 | -h --help Show this screen 22 | -d --dataset= Name of the dataset to modify 23 | --path= Path to a dataset 24 | --dlm= Declare the delimiter for a dataset 25 | -p --project= Name of the project to modify 26 | -v --version Print the version of PySemantic 27 | 28 | """ 29 | 30 | import os.path as op 31 | 32 | from docopt import docopt 33 | 34 | from pysemantic import project as pr 35 | from pysemantic.errors import MissingProject 36 | 37 | 38 | def cli(arguments): 39 | """cli - The main CLI argument parser. 40 | 41 | :param arguments: command line arguments, as parsed by docopt 42 | :type arguments: dict 43 | :return: None 44 | """ 45 | if arguments.get("list", False): 46 | if arguments['--project'] is None: 47 | pr.view_projects() 48 | else: 49 | proj_name = arguments.get('--project') 50 | dataset_names = pr.get_datasets(proj_name) 51 | for name in dataset_names: 52 | print name 53 | elif arguments.get("add", False): 54 | proj_name = arguments.get("PROJECT_NAME") 55 | proj_spec = arguments.get("PROJECT_SPECFILE") 56 | proj_spec = op.abspath(proj_spec) 57 | pr.add_project(proj_name, proj_spec) 58 | elif arguments.get("remove", False): 59 | proj_name = arguments.get("PROJECT_NAME") 60 | if arguments['--dataset'] is None: 61 | if not pr.remove_project(proj_name): 62 | print "The project {0} doesn't exist.".format(proj_name) 63 | else: 64 | pr.remove_dataset(proj_name, arguments['--dataset']) 65 | elif arguments.get("set-schema", False): 66 | try: 67 | proj_name = arguments.get("PROJECT_NAME") 68 | proj_spec = arguments.get("SCHEMA_FPATH") 69 | proj_spec = op.abspath(proj_spec) 70 | pr.set_schema_fpath(proj_name, proj_spec) 71 | except MissingProject: 72 | msg = """Project {} not found in the configuration. 
Please use 73 | $ semantic add 74 | to register the project.""".format(arguments.get("PROJECT_NAME")) 75 | print msg 76 | elif arguments.get("set-specs", False): 77 | proj_name = arguments.get("PROJECT_NAME") 78 | dataset_name = arguments.get("--dataset") 79 | newspecs = {} 80 | if arguments.get("--path", False): 81 | newspecs['path'] = arguments.get("--path") 82 | if arguments.get("--dlm", False): 83 | newspecs['delimiter'] = arguments.get("--dlm") 84 | pr.set_schema_specs(proj_name, dataset_name, **newspecs) 85 | elif arguments.get("add-dataset", False): 86 | proj_name = arguments.get('--project') 87 | dataset_name = arguments.get("DATASET_NAME") 88 | specs = dict(path=arguments["--path"], delimiter=arguments["--dlm"]) 89 | pr.add_dataset(proj_name, dataset_name, specs) 90 | elif arguments.get("export", False): 91 | project = pr.Project(arguments.get("PROJECT_NAME")) 92 | project.export_dataset(arguments.get("--dataset"), 93 | outpath=arguments.get("OUTPATH")) 94 | 95 | 96 | def main(): 97 | arguments = docopt(__doc__, version="semantic v0.1.1") 98 | cli(arguments) 99 | -------------------------------------------------------------------------------- /pysemantic/custom_traits.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Customized traits for advanced validation.""" 10 | 11 | import os.path as op 12 | 13 | from traits.api import File, List, TraitError 14 | 15 | 16 | class ValidTraitList(List): 17 | 18 | """A List trait whose every element should be valid trait.""" 19 | 20 | def validate(self, obj, name, value): 21 | validated_value = super(ValidTraitList, self).validate(obj, name, 22 | value) 23 | for trait_name in validated_value: 24 | trait = obj.trait(trait_name) 25 | trait.validate(obj, trait_name, getattr(obj, trait_name)) 26 | return validated_value 27 | 28 | 29 | class AbsFile(File): 30 | 31 | """A File trait whose value must be an absolute path, to an existing 32 | file. 33 | """ 34 | 35 | exists = True 36 | 37 | def validate(self, obj, name, value): 38 | validated_value = super(AbsFile, self).validate(obj, name, value) 39 | if validated_value and op.isabs(validated_value) and op.isfile(value): 40 | return validated_value 41 | elif not op.isfile(value): 42 | raise TraitError("The filepath does not exist.") 43 | 44 | self.error(obj, name, value) 45 | -------------------------------------------------------------------------------- /pysemantic/errors.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Errors.""" 10 | 11 | 12 | class MissingProject(Exception): 13 | 14 | """Error raised when project is not found.""" 15 | 16 | 17 | class MissingConfigError(Exception): 18 | 19 | """Error raised when the pysemantic configuration file is not found.""" 20 | 21 | 22 | class ParserArgumentError(Exception): 23 | 24 | """Error raised when no valid parser arguments are inferred from the 25 | schema.""" 26 | -------------------------------------------------------------------------------- /pysemantic/exporters.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """ 10 | Exporters from PySemantic to databases or other data sinks. 11 | """ 12 | 13 | 14 | class AbstractExporter(object): 15 | """Abstract exporter for dataframes that have been cleaned.""" 16 | 17 | def get(self, **kwargs): 18 | raise NotImplementedError 19 | 20 | def set(self, **kwargs): 21 | raise NotImplementedError 22 | 23 | 24 | class AerospikeExporter(AbstractExporter): 25 | """Example class for exporting to an aerospike database.""" 26 | 27 | def __init__(self, config, dataframe): 28 | self.dataframe = dataframe 29 | self.namespace = config['namespace'] 30 | self.set_name = config['set'] 31 | self.port = config['port'] 32 | self.hostname = config['hostname'] 33 | 34 | def set(self, key_tuple, bins): 35 | self.client.put(key_tuple, bins) 36 | 37 | def run(self): 38 | import aerospike 39 | self.client = aerospike.client({'hosts': [(self.hostname, 40 | self.port)], 41 | 'policies':{'timeout': 60000}}).connect() 42 | for ix in self.dataframe.index: 43 | self.set((self.namespace, self.set_name, ix), 44 | self.dataframe.ix[ix].to_dict()) 45 | self.client.close() 46 | -------------------------------------------------------------------------------- /pysemantic/loggers.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Loggers""" 10 | 11 | import os 12 | import os.path as op 13 | import logging 14 | import time 15 | 16 | 17 | LOGDIR = op.join(op.expanduser("~"), ".pysemantic") 18 | if not op.exists(LOGDIR): 19 | os.mkdir(LOGDIR) 20 | 21 | 22 | def setup_logging(project_name): 23 | logfile = "{0}_{1}.log".format(project_name, time.time()) 24 | logging.basicConfig(filename=op.join(LOGDIR, logfile), 25 | level=logging.INFO) 26 | logging.info("Project {0} started.".format(project_name)) 27 | -------------------------------------------------------------------------------- /pysemantic/project.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """The Project class.""" 10 | 11 | import os 12 | import textwrap 13 | import pprint 14 | import logging 15 | import json 16 | from ConfigParser import RawConfigParser 17 | import os.path as op 18 | import yaml 19 | import pandas as pd 20 | import numpy as np 21 | from pysemantic.validator import SchemaValidator, DataFrameValidator, \ 22 | ParseErrorHandler 23 | from pysemantic.errors import MissingProject, MissingConfigError, \ 24 | ParserArgumentError 25 | from pysemantic.loggers import setup_logging 26 | from pysemantic.utils import TypeEncoder 27 | from pysemantic.exporters import AerospikeExporter 28 | 29 | try: 30 | from yaml import CDumper as Dumper 31 | from yaml import CLoader as Loader 32 | except ImportError: 33 | from yaml import Dumper 34 | from yaml import Loader 35 | 36 | CONF_FILE_NAME = os.environ.get("PYSEMANTIC_CONFIG", "pysemantic.conf") 37 | logger = logging.getLogger(__name__) 38 | 39 | 40 | def locate_config_file(): 41 | """Locates the configuration file used by semantic. 42 | 43 | :return: Path of the pysemantic config file. 
44 | :rtype: str 45 | :Example: 46 | 47 | >>> locate_config_file() 48 | '/home/username/pysemantic.conf' 49 | """ 50 | paths = [op.join(os.getcwd(), CONF_FILE_NAME), 51 | op.join(op.expanduser('~'), CONF_FILE_NAME)] 52 | for path in paths: 53 | if op.exists(path): 54 | logger.info("Config file found at {0}".format(path)) 55 | return path 56 | raise MissingConfigError("No pysemantic configuration file was found at" 57 | " {0} or {1}".format(*paths)) 58 | 59 | 60 | def get_default_specfile(project_name): 61 | """Returns the specifications file used by the given project. The \ 62 | configuration file is searched for first in the current directory \ 63 | and then in the home directory. 64 | 65 | :param project_name: Name of the project for which to get the specfile. 66 | :type project_name: str 67 | :return: Path to the data dictionary of the project. 68 | :rtype: str 69 | :Example: 70 | 71 | >>> get_default_specfile('skynet') 72 | '/home/username/projects/skynet/schema.yaml' 73 | """ 74 | path = locate_config_file() 75 | parser = RawConfigParser() 76 | parser.read(path) 77 | return parser.get(project_name, 'specfile') 78 | 79 | 80 | def add_project(project_name, specfile): 81 | """Add a project to the global configuration file. 82 | 83 | :param project_name: Name of the project 84 | :param specfile: path to the data dictionary used by the project. 85 | :type project_name: str 86 | :type specfile: str 87 | :return: None 88 | """ 89 | if not op.isabs(specfile): 90 | raise ValueError("Path to the schema should be absolute.") 91 | path = locate_config_file() 92 | parser = RawConfigParser() 93 | parser.read(path) 94 | parser.add_section(project_name) 95 | parser.set(project_name, "specfile", specfile) 96 | with open(path, "w") as f: 97 | parser.write(f) 98 | 99 | 100 | def add_dataset(project_name, dataset_name, dataset_specs): 101 | """Add a dataset to a project. 102 | 103 | :param project_name: Name of the project to which the dataset is to be \ 104 | added. 105 | :param dataset_name: Name of the dataset to be added. 106 | :param dataset_specs: Specifications of the dataset. 107 | :type project_name: str 108 | :type dataset_name: str 109 | :type dataset_specs: dict 110 | :return: None 111 | """ 112 | data_dict = get_default_specfile(project_name) 113 | with open(data_dict, "r") as f: 114 | spec = yaml.load(f, Loader=Loader) 115 | spec[dataset_name] = dataset_specs 116 | with open(data_dict, "w") as f: 117 | yaml.dump(spec, f, Dumper=Dumper, default_flow_style=False) 118 | 119 | 120 | def remove_dataset(project_name, dataset_name): 121 | """Removes a dataset from a project. 122 | 123 | :param project_name: Name of the project 124 | :param dataset_name: Name of the dataset to remove 125 | :type project_name: str 126 | :type dataset_name: str 127 | :return: None 128 | """ 129 | data_dict = get_default_specfile(project_name) 130 | with open(data_dict, "r") as f: 131 | spec = yaml.load(f, Loader=Loader) 132 | del spec[dataset_name] 133 | with open(data_dict, "w") as f: 134 | yaml.dump(spec, f, Dumper=Dumper, default_flow_style=False) 135 | 136 | 137 | def get_datasets(project_name=None): 138 | """Get names of all datasets registered under the project `project_name`. 139 | 140 | :param project_name: name of the project to list the datasets from. If \ 141 | `None` (default), datasets under all projects are returned.
142 | :type project_name: str 143 | :return: List of datasets listed under `project_name`, or if \ 144 | `project_name` is `None`, returns dictionary such that \ 145 | {project_name: [list of projects]} 146 | :rtype: dict or list 147 | :Example: 148 | 149 | >>> get_datasets('skynet') 150 | ['sarah_connor', 'john_connor', 'kyle_reese'] 151 | >>> get_datasets() 152 | {'skynet': ['sarah_connor', 'john_connor', 'kyle_reese'], 153 | 'south park': ['stan', 'kyle', 'cartman', 'kenny']} 154 | """ 155 | if project_name is not None: 156 | specs = get_schema_specs(project_name) 157 | return specs.keys() 158 | else: 159 | dataset_names = {} 160 | projects = get_projects() 161 | for project_name, _ in projects: 162 | dataset_names[project_name] = get_datasets(project_name) 163 | return dataset_names 164 | 165 | 166 | def set_schema_fpath(project_name, schema_fpath): 167 | """Set the schema path for a given project. 168 | 169 | :param project_name: Name of the project 170 | :param schema_fpath: path to the yaml file to be used as the schema for \ 171 | the project. 172 | :type project_name: str 173 | :type schema_fpath: str 174 | :return: True, if setting the schema path was successful. 175 | :Example: 176 | 177 | >>> set_schema_fpath('skynet', '/path/to/new/schema.yaml') 178 | True 179 | """ 180 | path = locate_config_file() 181 | parser = RawConfigParser() 182 | parser.read(path) 183 | if project_name in parser.sections(): 184 | if not parser.remove_option(project_name, "specfile"): 185 | raise MissingProject 186 | else: 187 | parser.set(project_name, "specfile", schema_fpath) 188 | with open(path, "w") as f: 189 | parser.write(f) 190 | return True 191 | raise MissingProject 192 | 193 | 194 | def get_projects(): 195 | """Get the list of projects currently registered with pysemantic as a 196 | list. 197 | 198 | :return: List of tuples, such that each tuple is (project_name, \ 199 | location_of_specfile) 200 | :rtype: list 201 | :Example: 202 | 203 | >>> get_projects() 204 | ['skynet', 'south park'] 205 | """ 206 | path = locate_config_file() 207 | parser = RawConfigParser() 208 | parser.read(path) 209 | projects = [] 210 | for section in parser.sections(): 211 | project_name = section 212 | specfile = parser.get(section, "specfile") 213 | projects.append((project_name, specfile)) 214 | return projects 215 | 216 | 217 | def get_schema_specs(project_name, dataset_name=None): 218 | """Get the specifications of a dataset as specified in the schema. 219 | 220 | :param project_name: Name of project 221 | :param dataset_name: name of the dataset for which to get the schema. If \ 222 | None (default), schema for all datasets is returned. 223 | :type project_name: str 224 | :type dataset_name: str 225 | :return: schema for dataset 226 | :rtype: dict 227 | :Example: 228 | 229 | >>> get_schema_specs('skynet') 230 | {'sarah connor': {'path': '/path/to/sarah_connor.csv', 231 | 'delimiter': ','}, 232 | 'kyle reese': {'path': '/path/to/kyle_reese.tsv', 233 | 'delimiter':, '\t'} 234 | 'john connor': {'path': '/path/to/john_connor.txt', 235 | 'delimiter':, ' '} 236 | } 237 | """ 238 | schema_file = get_default_specfile(project_name) 239 | with open(schema_file, "r") as f: 240 | specs = yaml.load(f, Loader=Loader) 241 | if dataset_name is not None: 242 | return specs[dataset_name] 243 | return specs 244 | 245 | 246 | def set_schema_specs(project_name, dataset_name, **kwargs): 247 | """Set the schema specifications for a dataset. 248 | 249 | :param project_name: Name of the project containing the dataset. 
250 | :param dataset_name: Name of the dataset for which the schema is being set. 251 | :param kwargs: Schema fields that are dumped into the schema files. 252 | :type project_name: str 253 | :type dataset_name: str 254 | :return: None 255 | :Example: 256 | 257 | >>> set_schema_specs('skynet', 'kyle reese', 258 | path='/path/to/new/file.csv', delimiter=new_delimiter) 259 | """ 260 | schema_file = get_default_specfile(project_name) 261 | with open(schema_file, "r") as f: 262 | specs = yaml.load(f, Loader=Loader) 263 | for key, value in kwargs.iteritems(): 264 | specs[dataset_name][key] = value 265 | with open(schema_file, "w") as f: 266 | yaml.dump(specs, f, Dumper=Dumper, default_flow_style=False) 267 | 268 | 269 | def view_projects(): 270 | """View a list of all projects currently registered with pysemantic. 271 | 272 | :Example: 273 | 274 | >>> view_projects() 275 | Project skynet with specfile at /path/to/skynet.yaml 276 | Project south park with specfile at /path/to/south_park.yaml 277 | """ 278 | projects = get_projects() 279 | if len(projects) > 0: 280 | for project_name, specfile in projects: 281 | print "Project {0} with specfile at {1}".format(project_name, 282 | specfile) 283 | else: 284 | msg = textwrap.dedent("""\ 285 | No projects found. You can add projects using the 286 | $ semantic add 287 | command. 288 | """) 289 | print msg 290 | 291 | 292 | def remove_project(project_name): 293 | """Remove a project from the global configuration file. 294 | 295 | :param project_name: Name of the project to remove. 296 | :type project_name: str 297 | :return: True if the project existed 298 | :rtype: bool 299 | :Example: 300 | 301 | >>> view_projects() 302 | Project skynet with specfile at /path/to/skynet.yaml 303 | Project south park with specfile at /path/to/south_park.yaml 304 | >>> remove_project('skynet') 305 | >>> view_projects() 306 | Project south park with specfile at /path/to/south_park.yaml 307 | """ 308 | path = locate_config_file() 309 | parser = RawConfigParser() 310 | parser.read(path) 311 | result = parser.remove_section(project_name) 312 | if result: 313 | with open(path, "w") as f: 314 | parser.write(f) 315 | return result 316 | 317 | 318 | class Project(object): 319 | """The Project class, the entry point for most things in this module.""" 320 | 321 | def __init__(self, project_name=None, parser=None, schema=None): 322 | """The Project class. 323 | 324 | :param project_name: Name of the project as specified in the \ 325 | pysemantic configuration file. If this is ``None``, then the 326 | ``schema`` parameter is expected to contain the schema 327 | dictionary. (see below) 328 | :param parser: The parser to be used for reading dataset files. The \ 329 | default is `pandas.read_table`. 330 | :param schema: Dictionary containing the schema for the project. When 331 | this argument is supplied (not ``None``), the ``project_name`` is 332 | ignored, no specfile is read, and all the specifications for the data 333 | are inferred from this dictionary. 334 | """ 335 | if project_name is not None: 336 | setup_logging(project_name) 337 | self.project_name = project_name 338 | self.specfile = get_default_specfile(self.project_name) 339 | logger.info( 340 | "Schema for project {0} found at {1}".format(project_name, 341 | self.specfile)) 342 | else: 343 | setup_logging("no_name") 344 | logger.info("Schema defined by user at runtime.
Not reading any " 345 | "specfile.") 346 | self.specfile = None 347 | self.validators = {} 348 | if parser is not None: 349 | self.user_specified_parser = True 350 | else: 351 | self.user_specified_parser = False 352 | self.parser = parser 353 | if self.specfile is not None: 354 | with open(self.specfile, 'r') as f: 355 | specifications = yaml.load(f, Loader=Loader) 356 | else: 357 | specifications = schema 358 | self.column_rules = {} 359 | self.df_rules = {} 360 | for name, specs in specifications.iteritems(): 361 | self.column_rules[name] = specs.get('column_rules', {}) 362 | self.df_rules[name] = specs.get('dataframe_rules', {}) 363 | self.specifications = specifications 364 | 365 | def export_dataset(self, dataset_name, dataframe=None, outpath=None): 366 | """Export a dataset to an exporter defined in the schema. If nothing is 367 | specified in the schema, simply export to a CSV file such named 368 | .csv 369 | 370 | :param dataset_name: Name of the dataset to exporter. 371 | :param dataframe: Pandas dataframe to export. If None (default), this \ 372 | dataframe is loaded using the `load_dataset` method. 373 | :type dataset_name: Str 374 | """ 375 | if dataframe is None: 376 | dataframe = self.load_dataset(dataset_name) 377 | config = self.specifications[dataset_name].get('exporter') 378 | if outpath is None: 379 | outpath = dataset_name + ".csv" 380 | if config is not None: 381 | if config['kind'] == "aerospike": 382 | config['namespace'] = self.project_name 383 | config['set'] = dataset_name 384 | exporter = AerospikeExporter(config, dataframe) 385 | exporter.run() 386 | else: 387 | suffix = outpath.split('.')[-1] 388 | if suffix in ("h5", "hdf"): 389 | group = r'/{0}/{1}'.format(self.project_name, dataset_name) 390 | dataframe.to_hdf(outpath, group) 391 | elif suffix == "csv": 392 | dataframe.to_csv(outpath, index=False) 393 | 394 | def reload_data_dict(self): 395 | """Reload the data dictionary and re-populate the schema.""" 396 | 397 | with open(self.specfile, "r") as f: 398 | specifications = yaml.load(f, Loader=Loader) 399 | self.validators = {} 400 | self.column_rules = {} 401 | self.df_rules = {} 402 | logger.info("Reloading project information.") 403 | self.specifications = specifications 404 | for name, specs in specifications.iteritems(): 405 | logger.info("Schema for dataset {0}:".format(name)) 406 | logger.info(json.dumps(specs, cls=TypeEncoder)) 407 | self._init_validate(name) 408 | self.column_rules[name] = specs.get('column_rules', {}) 409 | self.df_rules[name] = specs.get('dataframe_rules', {}) 410 | 411 | @property 412 | def datasets(self): 413 | """"List the datasets registered under the parent project. 414 | 415 | :Example: 416 | 417 | >>> project = Project('skynet') 418 | >>> project.datasets 419 | ['sarah connor', 'john connor', 'kyle reese'] 420 | """ 421 | return self.specifications.keys() 422 | 423 | def _init_validate(self, dataset_name): 424 | """Given a dataset name, create a SchemaValidator object and add to the 425 | cache. 
426 | 427 | :param dataset_name: Name of the dataset 428 | """ 429 | specs = self.specifications.get(dataset_name) 430 | is_pickled = specs.get("pickle", False) 431 | if self.specfile is not None: 432 | validator = SchemaValidator.from_specfile(specfile=self.specfile, 433 | name=dataset_name, 434 | is_pickled=is_pickled) 435 | else: 436 | validator = SchemaValidator(specification=specs, 437 | name=dataset_name, 438 | is_pickled=is_pickled) 439 | self.validators[dataset_name] = validator 440 | 441 | def get_dataset_specs(self, dataset_name): 442 | """Returns the specifications for the specified dataset in the project. 443 | 444 | :param dataset_name: Name of the dataset 445 | :type dataset_name: str 446 | :return: Parser arguments required to import the dataset in pandas. 447 | :rtype: dict 448 | """ 449 | if dataset_name not in self.validators: 450 | self._init_validate(dataset_name) 451 | return self.validators[dataset_name].get_parser_args() 452 | 453 | def get_project_specs(self): 454 | """Returns a dictionary containing the schema for all datasets listed 455 | under this project. 456 | 457 | :return: Parser arguments for all datasets listed under the project. 458 | :rtype: dict 459 | """ 460 | specs = {} 461 | for name, basespecs in self.specifications.iteritems(): 462 | if name not in self.validators: 463 | self._init_validate(name) 464 | validator = self.validators[name] 465 | specs[name] = validator.get_parser_args() 466 | return specs 467 | 468 | def view_dataset_specs(self, dataset_name): 469 | """Pretty print the specifications for a dataset. 470 | 471 | :param dataset_name: Name of the dataset 472 | :type dataset_name: str 473 | """ 474 | specs = self.get_dataset_specs(dataset_name) 475 | pprint.pprint(specs) 476 | 477 | def update_dataset(self, dataset_name, dataframe, path=None, **kwargs): 478 | """This is tricky.""" 479 | org_specs = self.get_dataset_specs(dataset_name) 480 | if path is None: 481 | path = org_specs['filepath_or_buffer'] 482 | sep = kwargs.get('sep', org_specs['sep']) 483 | index = kwargs.get('index', False) 484 | dataframe.to_csv(path, sep=sep, index=index) 485 | dtypes = {} 486 | for col in dataframe: 487 | dtype = dataframe[col].dtype 488 | if dtype == np.dtype('O'): 489 | dtypes[col] = str 490 | elif dtype == np.dtype('float'): 491 | dtypes[col] = float 492 | elif dtype == np.dtype('int'): 493 | dtypes[col] = int 494 | else: 495 | dtypes[col] = dtype 496 | new_specs = {'path': path, 'delimiter': sep, 'dtypes': dtypes} 497 | with open(self.specfile, "r") as fid: 498 | specs = yaml.load(fid, Loader=Loader) 499 | dataset_specs = specs[dataset_name] 500 | dataset_specs.update(new_specs) 501 | if "column_rules" in dataset_specs: 502 | col_rules = dataset_specs['column_rules'] 503 | cols_to_remove = [] 504 | for colname in col_rules.iterkeys(): 505 | if colname not in dataframe.columns: 506 | cols_to_remove.append(colname) 507 | for colname in cols_to_remove: 508 | del col_rules[colname] 509 | logger.info("Attempting to update schema for dataset {0} to:".format( 510 | dataset_name)) 511 | logger.info(json.dumps(dataset_specs, cls=TypeEncoder)) 512 | with open(self.specfile, "w") as fid: 513 | yaml.dump(specs, fid, Dumper=Dumper, 514 | default_flow_style=False) 515 | 516 | def _sql_read(self, parser_args): 517 | if parser_args.get('table_name'): 518 | if parser_args.get('query'): 519 | return pd.read_sql_query(sql=parser_args.get('query'), 520 | con=parser_args['con']) 521 | return pd.read_sql_table( 522 | table_name=parser_args.get('table_name'), 523 | 
con=parser_args.get('con'), 524 | columns=parser_args.get('use_columns'), 525 | index_col=parser_args.get('index_col') 526 | ) 527 | elif parser_args.get('query'): 528 | return pd.read_sql_query(sql=parser_args.get('query'), 529 | con=parser_args['con']) 530 | 531 | def _sql_iterator(self, parser_args): 532 | dfs = [] 533 | if parser_args.get('table_name'): 534 | if parser_args.get('query'): 535 | iterator = pd.read_sql_query(sql=parser_args.get('query'), 536 | con=parser_args['con'], 537 | chunksize=parser_args['chunksize']) 538 | else: 539 | iterator = pd.read_sql_table( 540 | table_name=parser_args.get('table_name'), 541 | con=parser_args.get('con'), 542 | chunksize=parser_args.get('chunksize'), 543 | columns=parser_args.get('use_columns'), 544 | index_col=parser_args.get('index_col') 545 | ) 546 | else: 547 | iterator = pd.read_sql_query(sql=parser_args.get('query'), 548 | con=parser_args['con'], 549 | chunksize=parser_args['chunksize']) 550 | while True: 551 | try: 552 | dfs.append(iterator.next()) 553 | except StopIteration: 554 | break 555 | except Exception as err: 556 | logger.debug("SQL iterator failed: {}".format(err)) 557 | break 558 | dfs.append(None) 559 | return pd.concat(dfs) 560 | 561 | def load_dataset(self, dataset_name): 562 | """Load and return a dataset. 563 | 564 | :param dataset_name: Name of the dataset 565 | :type dataset_name: str 566 | :return: A pandas DataFrame containing the dataset. 567 | :rtype: pandas.DataFrame 568 | :Example: 569 | 570 | >>> demo_project = Project('pysemantic_demo') 571 | >>> iris = demo_project.load_dataset('iris') 572 | >>> type(iris) 573 | pandas.core.DataFrame 574 | """ 575 | if dataset_name not in self.validators: 576 | self._init_validate(dataset_name) 577 | validator = self.validators[dataset_name] 578 | column_rules = self.column_rules.get(dataset_name, {}) 579 | df_rules = self.df_rules.get(dataset_name, {}) 580 | parser_args = validator.get_parser_args() 581 | df_rules.update(validator.df_rules) 582 | logger.info("Attempting to load dataset {} with args:".format( 583 | dataset_name)) 584 | if validator.is_spreadsheet: 585 | parser_args.pop('usecols', None) 586 | logger.info(json.dumps(parser_args, cls=TypeEncoder)) 587 | if isinstance(parser_args, dict): 588 | if validator.is_mysql or validator.is_postgresql: 589 | if not ( 590 | parser_args.get('table_name') or parser_args.get('query')): 591 | raise ParserArgumentError( 592 | "No table_name or query was provided for the " 593 | "postgres configuration.") 594 | elif validator.sql_validator.chunksize is not None: 595 | df = self._sql_iterator(parser_args) 596 | else: 597 | df = self._sql_read(parser_args) 598 | else: 599 | with ParseErrorHandler(parser_args, self) as handler: 600 | df = handler.load() 601 | if df is None: 602 | raise ParserArgumentError("No valid parser arguments were " + 603 | "inferred from the schema.") 604 | if validator.is_spreadsheet and isinstance(validator.sheetname, 605 | list): 606 | df = pd.concat(df.itervalues(), axis=0) 607 | logger.info("Success!") 608 | df_validator = DataFrameValidator(data=df, rules=df_rules, 609 | column_rules=column_rules) 610 | logger.info("Commence cleaning dataset:") 611 | logger.info("DataFrame rules:") 612 | logger.info(json.dumps(df_rules, cls=TypeEncoder)) 613 | logger.info("Column rules:") 614 | logger.info(json.dumps(column_rules, cls=TypeEncoder)) 615 | 616 | return df_validator.clean() 617 | else: 618 | dfs = [] 619 | for argset in parser_args: 620 | with ParseErrorHandler(argset, self) as handler: 621 | _df = 
handler.load() 622 | df_validator = DataFrameValidator(data=_df, 623 | column_rules=column_rules) 624 | dfs.append(df_validator.clean()) 625 | df = pd.concat(dfs, axis=0) 626 | return df.set_index(np.arange(df.shape[0])) 627 | 628 | def load_datasets(self): 629 | """Load and return all datasets. 630 | 631 | :return: dictionary like {dataset_name: dataframe} 632 | :rtype: dict 633 | """ 634 | datasets = {} 635 | for name in self.specifications.iterkeys(): 636 | if name not in self.validators: 637 | self._init_validate(name) 638 | datasets[name] = self.load_dataset(name) 639 | return datasets 640 | -------------------------------------------------------------------------------- /pysemantic/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """The tests module.""" 2 | -------------------------------------------------------------------------------- /pysemantic/tests/test_base.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Base classes and functions for tests.""" 10 | 11 | import os 12 | import unittest 13 | import tempfile 14 | import shutil 15 | import os.path as op 16 | from copy import deepcopy 17 | from ConfigParser import RawConfigParser 18 | 19 | import yaml 20 | import numpy as np 21 | import pandas as pd 22 | 23 | from pysemantic import project as pr 24 | from pysemantic.utils import colnames 25 | 26 | try: 27 | from yaml import CLoader as Loader 28 | from yaml import CDumper as Dumper 29 | except ImportError: 30 | from yaml import Loader 31 | from yaml import Dumper 32 | 33 | TEST_CONFIG_FILE_PATH = op.join(op.abspath(op.dirname(__file__)), "testdata", 34 | "test.conf") 35 | TEST_DATA_DICT = op.join(op.abspath(op.dirname(__file__)), "testdata", 36 | "test_dictionary.yaml") 37 | TEST_XL_DICT = op.join(op.abspath(op.dirname(__file__)), "testdata", 38 | "test_excel.yaml") 39 | 40 | 41 | def _path_fixer(filepath, root=None): 42 | """Change all the relative paths in `filepath` to absolute ones. 43 | 44 | :param filepath: File to be changed 45 | :param root: Root path with which the relative paths are prefixed. If None 46 | (default), the directory with this file is the root. 
47 | """ 48 | if root is None: 49 | root = op.join(op.abspath(op.dirname(__file__))) 50 | if filepath.endswith((".yaml", ".yml")): 51 | with open(filepath, "r") as fileobj: 52 | data = yaml.load(fileobj, Loader=Loader) 53 | for specs in data.itervalues(): 54 | specs['path'] = op.join(root, specs['path']) 55 | with open(filepath, "w") as fileobj: 56 | yaml.dump(data, fileobj, Dumper=Dumper, 57 | default_flow_style=False) 58 | elif filepath.endswith(".conf"): 59 | parser = RawConfigParser() 60 | parser.read(filepath) 61 | for section in parser.sections(): 62 | path = parser.get(section, "specfile") 63 | parser.remove_option(section, "specfile") 64 | parser.set(section, "specfile", op.join(root, path)) 65 | with open(filepath, "w") as fileobj: 66 | parser.write(fileobj) 67 | 68 | 69 | def _remove_project(project_name, project_files=None): 70 | pr.remove_project(project_name) 71 | if project_files is not None: 72 | if hasattr(project_files, "__iter__"): 73 | for path in project_files: 74 | if op.isfile(path): 75 | os.unlink(path) 76 | elif op.isdir(path): 77 | shutil.rmtree(path) 78 | else: 79 | if op.isfile(project_files): 80 | os.unlink(project_files) 81 | elif op.isdir(project_files): 82 | shutil.rmtree(project_files) 83 | 84 | 85 | class DummyProjectFactory(object): 86 | 87 | def __init__(self, schema, df, exporter="to_csv", **kwargs): 88 | self.tempdir = tempfile.mkdtemp() 89 | data_fpath = op.join(self.tempdir, "data.dat") 90 | if ("index" not in kwargs) and ("index_label" not in kwargs): 91 | kwargs['index'] = False 92 | getattr(df, exporter)(data_fpath, **kwargs) 93 | schema['data']['path'] = data_fpath 94 | schema_fpath = op.join(self.tempdir, "schema.yml") 95 | with open(schema_fpath, "w") as f_schema: 96 | yaml.dump(schema, f_schema, Dumper=yaml.CDumper) 97 | self.schema_fpath = schema_fpath 98 | 99 | def __enter__(self): 100 | pr.add_project("dummy_project", self.schema_fpath) 101 | return pr.Project("dummy_project") 102 | 103 | def __exit__(self, type, value, traceback): 104 | _remove_project("dummy_project", self.tempdir) 105 | 106 | 107 | class BaseTestCase(unittest.TestCase): 108 | 109 | """Base test class, introduces commonly required methods.""" 110 | 111 | def assertKwargsEqual(self, dict1, dict2): 112 | """Assert that dictionaries are equal, to a deeper extent.""" 113 | self.assertEqual(len(dict1.keys()), len(dict2.keys())) 114 | for key, value in dict1.iteritems(): 115 | self.assertIn(key, dict2) 116 | left = value 117 | right = dict2[key] 118 | if isinstance(left, (tuple, list)): 119 | self.assertItemsEqual(left, right) 120 | elif isinstance(left, dict): 121 | self.assertDictEqual(left, right) 122 | else: 123 | self.assertEqual(left, right) 124 | 125 | def assertKwargsEmpty(self, data): 126 | """Assert that a dictionary is empty.""" 127 | for value in data.itervalues(): 128 | self.assertIn(value, ("", 0, 1, [], (), {}, None, False)) 129 | 130 | def assertDataFrameEqual(self, dframe1, dframe2): 131 | """Assert that two dataframes are equal by their columns, indices and 132 | values.""" 133 | self.assertTrue(np.all(dframe1.index.values == dframe2.index.values)) 134 | self.assertTrue(np.all(dframe1.columns == dframe2.columns)) 135 | for col in dframe1: 136 | if dframe1[col].dtype in (np.dtype(float), np.dtype(int)): 137 | np.testing.assert_allclose(dframe1[col], dframe2[col]) 138 | else: 139 | self.assertTrue(np.all(dframe1[col] == dframe2[col])) 140 | self.assertEqual(dframe1[col].dtype, dframe2[col].dtype) 141 | 142 | def assertSeriesEqual(self, s1, s2): 143 | """Assert 
that two series are equal by their indices and values.""" 144 | self.assertEqual(s1.shape, s2.shape) 145 | self.assertTrue(np.all(s1.values == s2.values)) 146 | self.assertTrue(np.all(s1.index == s2.index)) 147 | 148 | 149 | class BaseProjectTestCase(BaseTestCase): 150 | 151 | """Base class for tests of the Project module.""" 152 | 153 | @classmethod 154 | def setUpClass(cls): 155 | cls.maxDiff = None 156 | # modify the testdata dict to have absolute paths 157 | with open(TEST_DATA_DICT, "r") as fileobj: 158 | test_data = yaml.load(fileobj, Loader=Loader) 159 | for _, specs in test_data.iteritems(): 160 | path = op.join(op.abspath(op.dirname(__file__)), specs['path']) 161 | specs['path'] = path 162 | # Put in the multifile specs 163 | cls.copied_iris_path = test_data['iris']['path'].replace("iris", 164 | "iris2") 165 | dframe = pd.read_csv(test_data['iris']['path']) 166 | dframe.to_csv(cls.copied_iris_path, index=False) 167 | 168 | copied_iris_specs = deepcopy(test_data['iris']) 169 | copied_iris_specs['path'] = [copied_iris_specs['path'], 170 | cls.copied_iris_path] 171 | copied_iris_specs['nrows'] = [150, 150] 172 | test_data['multi_iris'] = copied_iris_specs 173 | 174 | with open(TEST_DATA_DICT, "w") as fileobj: 175 | yaml.dump(test_data, fileobj, Dumper=Dumper, 176 | default_flow_style=False) 177 | cls.data_specs = test_data 178 | _path_fixer(TEST_XL_DICT) 179 | 180 | # Fix config file to have absolute paths 181 | 182 | config_fname = op.basename(TEST_CONFIG_FILE_PATH) 183 | cls.test_conf_file = op.join(os.getcwd(), config_fname) 184 | parser = RawConfigParser() 185 | parser.read(TEST_CONFIG_FILE_PATH) 186 | for project in ("pysemantic", "test_excel"): 187 | specfile = parser.get(project, 'specfile') 188 | specfile = op.join(op.abspath(op.dirname(__file__)), specfile) 189 | parser.remove_option(project, "specfile") 190 | parser.set(project, "specfile", specfile) 191 | with open(cls.test_conf_file, 'w') as fileobj: 192 | parser.write(fileobj) 193 | pr.CONF_FILE_NAME = config_fname 194 | 195 | @classmethod 196 | def tearDownClass(cls): 197 | try: 198 | # modify the testdata back 199 | with open(TEST_DATA_DICT, "r") as fileobj: 200 | test_data = yaml.load(fileobj, Loader=Loader) 201 | test_data['iris']['path'] = op.join("testdata", "iris.csv") 202 | test_data['random_row_iris']['path'] = op.join("testdata", "iris.csv") 203 | test_data['bad_iris']['path'] = op.join("testdata", "bad_iris.csv") 204 | test_data['person_activity']['path'] = op.join("testdata", 205 | "person_activity.tsv") 206 | del test_data['multi_iris'] 207 | with open(TEST_DATA_DICT, "w") as fileobj: 208 | test_data = yaml.dump(test_data, fileobj, Dumper=Dumper, 209 | default_flow_style=False) 210 | 211 | with open(TEST_XL_DICT, "r") as fileobj: 212 | test_data = yaml.load(fileobj, Loader=Loader) 213 | xl_path = op.join("testdata", "test_spreadsheet.xlsx") 214 | test_data['iris']['path'] = xl_path 215 | test_data['person_activity']['path'] = xl_path 216 | test_data['iris_renamed']['path'] = xl_path 217 | with open(TEST_XL_DICT, "w") as fileobj: 218 | test_data = yaml.dump(test_data, fileobj, Dumper=Dumper, 219 | default_flow_style=False) 220 | 221 | # Change the config files back 222 | parser = RawConfigParser() 223 | parser.read(cls.test_conf_file) 224 | parser.remove_option("pysemantic", "specfile") 225 | parser.set("pysemantic", "specfile", 226 | op.join("testdata", "test_dictionary.yaml")) 227 | parser.remove_option("test_excel", "specfile") 228 | parser.set("test_excel", "specfile", 229 | op.join("testdata", 
"test_excel.yaml")) 230 | with open(TEST_CONFIG_FILE_PATH, 'w') as fileobj: 231 | parser.write(fileobj) 232 | 233 | finally: 234 | os.unlink(cls.test_conf_file) 235 | os.unlink(cls.copied_iris_path) 236 | 237 | def setUp(self): 238 | iris_specs = _get_iris_args() 239 | copied_iris_specs = deepcopy(iris_specs) 240 | copied_iris_specs.update( 241 | {'filepath_or_buffer': iris_specs['filepath_or_buffer'].replace( 242 | "iris", "iris2")}) 243 | multi_iris_specs = [iris_specs, copied_iris_specs] 244 | person_activity_specs = _get_person_activity_args() 245 | random_row_iris_specs = {'nrows': {'random': True, 'count': 50}, 246 | 'error_bad_lines': False, 247 | 'filepath_or_buffer': op.join( 248 | op.abspath(op.dirname(__file__)), 249 | "testdata", "iris.csv")} 250 | expected = {'iris': iris_specs, 251 | 'person_activity': person_activity_specs, 252 | 'multi_iris': multi_iris_specs, 253 | 'random_row_iris': random_row_iris_specs} 254 | self.expected_specs = expected 255 | self.project = pr.Project(project_name="pysemantic") 256 | 257 | 258 | class TestConfig(BaseTestCase): 259 | 260 | """Test the configuration management utilities.""" 261 | 262 | @classmethod 263 | def setUpClass(cls): 264 | # Fix the relative paths in the conig file. 265 | parser = RawConfigParser() 266 | parser.read(TEST_CONFIG_FILE_PATH) 267 | cls.old_fpath = parser.get("pysemantic", "specfile") 268 | parser.set("pysemantic", "specfile", op.abspath(cls.old_fpath)) 269 | with open(TEST_CONFIG_FILE_PATH, "w") as fileobj: 270 | parser.write(fileobj) 271 | cls._parser = parser 272 | pr.CONF_FILE_NAME = "test.conf" 273 | 274 | @classmethod 275 | def tearDownClass(cls): 276 | cls._parser.set("pysemantic", "specfile", cls.old_fpath) 277 | with open(TEST_CONFIG_FILE_PATH, "w") as fileobj: 278 | cls._parser.write(fileobj) 279 | 280 | def setUp(self): 281 | self.testParser = RawConfigParser() 282 | for section in self._parser.sections(): 283 | self.testParser.add_section(section) 284 | for item in self._parser.items(section): 285 | self.testParser.set(section, item[0], item[1]) 286 | 287 | def test_loader_default_location(self): 288 | """Test if the config looks for the files in the correct places.""" 289 | # Put the test config file in the current and home directories, with 290 | # some modifications. 
291 | cwd_file = op.join(os.getcwd(), "test.conf") 292 | home_file = op.join(op.expanduser('~'), "test.conf") 293 | 294 | try: 295 | self.testParser.set("pysemantic", "specfile", os.getcwd()) 296 | with open(cwd_file, "w") as fileobj: 297 | self.testParser.write(fileobj) 298 | specfile = pr.get_default_specfile("pysemantic") 299 | self.assertEqual(specfile, os.getcwd()) 300 | 301 | os.unlink(cwd_file) 302 | 303 | self.testParser.set("pysemantic", "specfile", op.expanduser('~')) 304 | with open(home_file, "w") as fileobj: 305 | self.testParser.write(fileobj) 306 | specfile = pr.get_default_specfile("pysemantic") 307 | self.assertEqual(specfile, op.expanduser('~')) 308 | 309 | finally: 310 | os.unlink(home_file) 311 | 312 | 313 | def _dummy_postproc(series): 314 | return pd.Series([x if "v" in x else "" for x in series], 315 | index=series.index) 316 | 317 | 318 | def _get_iris_args(): 319 | """Get the ideal parser arguments for the iris dataset.""" 320 | filepath = op.join(op.dirname(__file__), "testdata", "iris.csv") 321 | names = colnames(filepath) 322 | return dict(filepath_or_buffer=op.abspath(filepath), 323 | sep=",", nrows=150, error_bad_lines=False, 324 | dtype={'Petal Length': float, 325 | 'Petal Width': float, 326 | 'Sepal Length': float, 327 | 'Sepal Width': float, 328 | 'Species': str}, 329 | usecols=names, na_values=None, parse_dates=False, 330 | converters=None, header='infer', index_col=None) 331 | 332 | 333 | def _get_person_activity_args(): 334 | """Get the ideal parser arguments for the activity dataset.""" 335 | filepath = op.join(op.dirname(__file__), "testdata", "person_activity.tsv") 336 | names = colnames(filepath, sep='\t') 337 | return dict(filepath_or_buffer=op.abspath(filepath), 338 | error_bad_lines=False, usecols=names, na_values=None, 339 | converters=None, header='infer', index_col=None, 340 | sep="\t", nrows=100, dtype={'sequence_name': str, 341 | 'tag': str, 342 | 'x': float, 343 | 'y': float, 344 | 'z': float, 345 | 'activity': str}, 346 | parse_dates=['date']) 347 | 348 | if __name__ == '__main__': 349 | unittest.main() 350 | -------------------------------------------------------------------------------- /pysemantic/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 
8 | 9 | """Tests for the cli.""" 10 | 11 | import os 12 | import shutil 13 | import subprocess 14 | import tempfile 15 | import unittest 16 | import os.path as op 17 | from copy import deepcopy 18 | from ConfigParser import RawConfigParser 19 | 20 | import yaml 21 | import pandas as pd 22 | import numpy as np 23 | 24 | from pysemantic.tests.test_base import (BaseTestCase, TEST_CONFIG_FILE_PATH, 25 | TEST_DATA_DICT) 26 | from pysemantic import project as pr 27 | 28 | try: 29 | from yaml import CLoader as Loader 30 | from yaml import CDumper as Dumper 31 | except ImportError: 32 | from yaml import Loader as Loader 33 | from yaml import Dumper as Dumper 34 | 35 | try: 36 | import tables 37 | PYTABLES_NOT_INSTALLED = False 38 | except ImportError: 39 | PYTABLES_NOT_INSTALLED = True 40 | 41 | 42 | class TestCLI(BaseTestCase): 43 | 44 | """Test the pysemantic CLI.""" 45 | 46 | @classmethod 47 | def setUpClass(cls): 48 | os.environ['PYSEMANTIC_CONFIG'] = "test.conf" 49 | pr.CONF_FILE_NAME = "test.conf" 50 | cls.testenv = os.environ 51 | cls.test_config_path = op.join(os.getcwd(), "test.conf") 52 | shutil.copy(TEST_CONFIG_FILE_PATH, cls.test_config_path) 53 | # Change the relative paths in the config file to absolute paths 54 | parser = RawConfigParser() 55 | parser.read(cls.test_config_path) 56 | for section in parser.sections(): 57 | schema_path = parser.get(section, "specfile") 58 | parser.remove_option(section, "specfile") 59 | parser.set(section, "specfile", 60 | op.join(op.abspath(op.dirname(__file__)), schema_path)) 61 | with open(cls.test_config_path, "w") as fileobj: 62 | parser.write(fileobj) 63 | # change the relative paths in the test dictionary to absolute paths 64 | with open(TEST_DATA_DICT, "r") as fileobj: 65 | cls.org_specs = yaml.load(fileobj, Loader=Loader) 66 | new_specs = deepcopy(cls.org_specs) 67 | for _, specs in new_specs.iteritems(): 68 | path = specs['path'] 69 | specs['path'] = op.join(op.abspath(op.dirname(__file__)), path) 70 | # Rewrite this to the file 71 | with open(TEST_DATA_DICT, "w") as fileobj: 72 | yaml.dump(new_specs, fileobj, Dumper=Dumper, 73 | default_flow_style=False) 74 | 75 | @classmethod 76 | def tearDownClass(cls): 77 | os.unlink(cls.test_config_path) 78 | # Rewrite the original specs back to the config dir 79 | with open(TEST_DATA_DICT, "w") as fileobj: 80 | yaml.dump(cls.org_specs, fileobj, Dumper=Dumper, 81 | default_flow_style=False) 82 | 83 | def setUp(self): 84 | pr.add_project("dummy_project", "/foo/bar.yaml") 85 | 86 | def tearDown(self): 87 | pr.remove_project("dummy_project") 88 | 89 | def test_set_specification(self): 90 | """Test if the set-specs subcommand of the CLI works properly.""" 91 | org_specs = pr.get_schema_specs("pysemantic") 92 | cmd = ['semantic', 'set-specs', 'pysemantic', '--dataset', 'iris', 93 | '--dlm', '|'] 94 | try: 95 | subprocess.check_call(cmd, env=self.testenv) 96 | new_specs = pr.get_schema_specs("pysemantic", "iris") 97 | self.assertEqual(new_specs['delimiter'], '|') 98 | finally: 99 | for dataset_name, specs in org_specs.iteritems(): 100 | pr.set_schema_specs("pysemantic", dataset_name, **specs) 101 | 102 | def test_list_projects(self): 103 | """Test if the `list` subcommand of the CLI works properly.""" 104 | cmd = ['semantic', 'list'] 105 | output = subprocess.check_output(cmd, env=self.testenv).splitlines() 106 | path = op.join(op.abspath(op.dirname(__file__)), 107 | "testdata/test_dictionary.yaml") 108 | excel_path = op.join(op.abspath(op.dirname(__file__)), 109 | "testdata/test_excel.yaml") 110 | dummy_data 
= [("pysemantic", path), ("test_excel", excel_path), 111 | ("dummy_project", "/foo/bar.yaml")] 112 | for i, config in enumerate(dummy_data): 113 | ideal = "Project {0} with specfile at {1}".format(*config) 114 | self.assertEqual(ideal, output[i]) 115 | 116 | def test_list_datasets(self): 117 | """Test if the `list` subcommand works for listing datasets.""" 118 | command = "semantic list --project pysemantic" 119 | cmd = command.split(' ') 120 | datasets = pr.get_datasets("pysemantic") 121 | output = subprocess.check_output(cmd, env=self.testenv).splitlines() 122 | self.assertItemsEqual(datasets, output) 123 | 124 | def test_add(self): 125 | """Test if the `add` subcommand can add projects to the config file.""" 126 | try: 127 | cmd = ['semantic', 'add', 'dummy_added_project', '/tmp/dummy.yaml'] 128 | subprocess.check_call(cmd, env=self.testenv) 129 | projects = pr.get_projects() 130 | self.assertIn(("dummy_added_project", "/tmp/dummy.yaml"), projects) 131 | finally: 132 | pr.remove_project("dummy_added_project") 133 | 134 | def test_add_dataset(self): 135 | """Test if the add-dataset subcommand adds datasets to projects.""" 136 | tempdir = tempfile.mkdtemp() 137 | outfile = op.join(tempdir, "testdata.csv") 138 | dframe = pd.DataFrame(np.random.random((10, 2)), columns=['a', 'b']) 139 | dframe.to_csv(outfile, index=False) 140 | cmd = ("semantic add-dataset testdata --project pysemantic --path {}" 141 | " --dlm ,") 142 | cmd = cmd.format(outfile).split(" ") 143 | try: 144 | subprocess.check_call(cmd, env=self.testenv) 145 | _pr = pr.Project("pysemantic") 146 | self.assertIn("testdata", _pr.datasets) 147 | specs = dict(path=outfile, delimiter=',') 148 | actual = pr.get_schema_specs("pysemantic", "testdata") 149 | self.assertKwargsEqual(specs, actual) 150 | finally: 151 | pr.remove_dataset("pysemantic", "testdata") 152 | shutil.rmtree(tempdir) 153 | 154 | def test_remove_dataset(self): 155 | """Test if removing datasets works from the command line.""" 156 | # Add a temporary dataset and try to remove it. 
157 | tempdir = tempfile.mkdtemp() 158 | outfile = op.join(tempdir, "testdata.csv") 159 | dframe = pd.DataFrame(np.random.random((10, 2)), columns=['a', 'b']) 160 | dframe.to_csv(outfile, index=False) 161 | specs = dict(path=outfile, delimiter=',') 162 | pr.add_dataset("pysemantic", "testdata", specs) 163 | try: 164 | command = "semantic remove pysemantic --dataset testdata" 165 | cmd = command.split(' ') 166 | subprocess.check_call(cmd, env=self.testenv) 167 | datasets = pr.get_datasets("pysemantic") 168 | self.assertNotIn("testdata", datasets) 169 | finally: 170 | datasets = pr.get_datasets("pysemantic") 171 | if "testdata" in datasets: 172 | pr.remove_dataset("pysemantic", "testdata") 173 | shutil.rmtree(tempdir) 174 | 175 | def test_remove(self): 176 | """Test if the remove subcommand can remove projects.""" 177 | pr.add_project("dummy_project_2", "/foo/baz.yaml") 178 | try: 179 | cmd = ['semantic', 'remove', 'dummy_project_2'] 180 | subprocess.check_call(cmd, env=self.testenv) 181 | projects = pr.get_projects() 182 | proj_names = [p[0] for p in projects] 183 | self.assertNotIn("dummy_project_2", proj_names) 184 | finally: 185 | pr.remove_project("dummy_project_2") 186 | 187 | def test_remove_nonexistent_project(self): 188 | """Check if attempting to remove a nonexistent project fails.""" 189 | cmd = ['semantic', 'remove', 'foobar'] 190 | output = subprocess.check_output(cmd, env=self.testenv) 191 | self.assertEqual(output.strip(), "The project foobar doesn't exist.") 192 | 193 | def test_set_schema(self): 194 | """Test if the set-schema subcommand works.""" 195 | cmd = ['semantic', 'set-schema', 'dummy_project', '/tmp/baz.yaml'] 196 | subprocess.check_call(cmd, env=self.testenv) 197 | self.assertEqual(pr.get_default_specfile('dummy_project'), 198 | '/tmp/baz.yaml') 199 | 200 | @unittest.skipIf(PYTABLES_NOT_INSTALLED, "HDF export needs PyTables.") 201 | def test_export_hdf(self): 202 | """Test if exporting a dataset to hdf works.""" 203 | tempdir = tempfile.mkdtemp() 204 | cmd = "semantic export pysemantic --dataset iris {0}" 205 | cmd = cmd.format(op.join(tempdir, "iris.h5")) 206 | cmd = cmd.split() 207 | try: 208 | subprocess.check_call(cmd, env=self.testenv) 209 | self.assertTrue(op.exists(op.join(tempdir, "iris.h5"))) 210 | finally: 211 | shutil.rmtree(tempdir) 212 | 213 | def test_set_schema_nonexistent_project(self): 214 | """Test if the set-schema prints proper warnings when trying to set 215 | schema file for nonexistent project. 216 | """ 217 | cmd = ['semantic', 'set-schema', 'dummy_project_3', '/foo'] 218 | output = subprocess.check_output(cmd, env=self.testenv) 219 | msg = """Project {} not found in the configuration. Please use 220 | $ semantic add 221 | to register the project.""".format("dummy_project_3") 222 | self.assertEqual(output.strip(), msg) 223 | 224 | def test_relative_path(self): 225 | """Check if the set-schema and add subcommands convert relative paths 226 | from the cmdline to absolute paths in the config file. 
227 | """ 228 | try: 229 | cmd = ['semantic', 'set-schema', 'dummy_project', './foo.yaml'] 230 | subprocess.check_call(cmd, env=self.testenv) 231 | self.assertTrue(op.isabs(pr.get_default_specfile( 232 | 'dummy_project'))) 233 | pr.remove_project("dummy_project") 234 | cmd = ['semantic', 'add', 'dummy_project', './foo.yaml'] 235 | subprocess.check_call(cmd, env=self.testenv) 236 | self.assertTrue(op.isabs(pr.get_default_specfile( 237 | 'dummy_project'))) 238 | finally: 239 | pr.remove_project("dummy_project_1") 240 | 241 | if __name__ == '__main__': 242 | unittest.main() 243 | -------------------------------------------------------------------------------- /pysemantic/tests/test_custom_traits.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Tests for the custom_traits module.""" 10 | 11 | import unittest 12 | import os.path as op 13 | 14 | from traits.api import HasTraits, Either, List, Str, TraitError 15 | 16 | from pysemantic.custom_traits import AbsFile, ValidTraitList 17 | from pysemantic.tests.test_base import TEST_DATA_DICT 18 | 19 | 20 | class TestCustomTraits(unittest.TestCase): 21 | 22 | """ Testcase for the custom_traits module. This consists purely of testing 23 | whether validation is happening correctly on the custom_traits. 24 | """ 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | class CustomTraits(HasTraits): 29 | def __init__(self, **kwargs): 30 | super(CustomTraits, self).__init__(**kwargs) 31 | self.required = ['filepath'] 32 | filepath = AbsFile 33 | filelist = Either(List(AbsFile), AbsFile) 34 | required = ValidTraitList(Str) 35 | 36 | cls.custom_traits = CustomTraits 37 | 38 | def setUp(self): 39 | self.traits = self.custom_traits(filepath=op.abspath(__file__)) 40 | self.setter = lambda x, y: setattr(self.traits, x, y) 41 | 42 | def test_validtraitlist_trait(self): 43 | """Test if `pysemantic.custom_traits.ValidTraitList` works properly.""" 44 | self.assertItemsEqual(self.traits.required, ['filepath']) 45 | 46 | def test_absfile_either_list_traits(self): 47 | """Test if the AbsFile trait works within Either and List traits. 
48 | """ 49 | self.traits.filelist = op.abspath(__file__) 50 | self.traits.filelist = [op.abspath(__file__), TEST_DATA_DICT] 51 | self.assertRaises(TraitError, self.setter, "filelist", 52 | [op.basename(__file__)]) 53 | self.assertRaises(TraitError, self.setter, "filelist", ["/foo/bar"]) 54 | self.assertRaises(TraitError, self.setter, "filelist", 55 | op.basename(__file__)) 56 | self.assertRaises(TraitError, self.setter, "filelist", "/foo/bar") 57 | 58 | def test_absolute_filepath_nonexistent(self): 59 | """Test if the Absfile trait raises the correct error when the filepath 60 | is absolute but doesn't exist.""" 61 | self.assertRaisesRegexp(TraitError, 'The filepath does not exist.', 62 | self.setter, "filepath", '/foo/bar') 63 | 64 | def test_absolute_path_file_trait(self): 65 | """Test if the `traits.AbsFile` trait works correctly.""" 66 | self.traits.filepath = op.abspath(__file__) 67 | self.assertRaises(TraitError, self.setter, "filepath", 68 | op.basename(__file__)) 69 | self.assertRaises(TraitError, self.setter, "filepath", "foo/bar") 70 | self.assertRaises(TraitError, self.setter, "filepath", "/foo/bar") 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /pysemantic/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """ 10 | Tests for a the pysemantic.utils module. 11 | """ 12 | 13 | import unittest 14 | import os.path as op 15 | from pysemantic.utils import colnames, get_md5_checksum 16 | 17 | 18 | class TestUtils(unittest.TestCase): 19 | 20 | def setUp(self): 21 | self.filepath = op.join(op.abspath(op.dirname(__file__)), "testdata", 22 | "iris.csv") 23 | 24 | def test_colnames(self): 25 | """Test if the column names are read correctly from a file.""" 26 | ideal = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 27 | 'Species'] 28 | actual = colnames(self.filepath) 29 | self.assertItemsEqual(actual, ideal) 30 | 31 | def test_colnames_infer_parser_from_extension(self): 32 | """Test if the colnames function can infer the correct parser from the 33 | file extension.""" 34 | filepath = op.join(op.abspath(op.dirname(__file__)), "testdata", 35 | "person_activity.tsv") 36 | ideal = "sequence_name tag date x y z activity".split() 37 | actual = colnames(filepath) 38 | self.assertItemsEqual(actual, ideal) 39 | 40 | def test_colnames_parser_arg(self): 41 | """Test if the colnames are read if the parser is specified.""" 42 | filepath = op.join(op.abspath(op.dirname(__file__)), "testdata", 43 | "person_activity.tsv") 44 | ideal = "sequence_name tag date x y z activity".split() 45 | from pandas import read_table 46 | actual = colnames(filepath, parser=read_table) 47 | self.assertItemsEqual(actual, ideal) 48 | 49 | def test_colnames_infer_parser_from_sep(self): 50 | """Test if the colnames are read if the separator is specified.""" 51 | filepath = op.join(op.abspath(op.dirname(__file__)), "testdata", 52 | "person_activity.tsv") 53 | ideal = "sequence_name tag date x y z activity".split() 54 | actual = colnames(filepath, sep='\\t') 55 | self.assertItemsEqual(actual, ideal) 56 | 57 | def test_md5(self): 58 | """Test the md5 checksum calculator.""" 59 | ideal = "9b3ecf3031979169c0ecc5e03cfe20a6" 60 | actual = get_md5_checksum(self.filepath) 61 | 
self.assertEqual(ideal, actual) 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /pysemantic/tests/test_validator.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Tests for the validator module.""" 10 | 11 | import os 12 | import os.path as op 13 | import cPickle 14 | import unittest 15 | import tempfile 16 | import warnings 17 | import shutil 18 | from copy import deepcopy 19 | 20 | import numpy as np 21 | import pandas as pd 22 | import yaml 23 | from traits.api import TraitError 24 | 25 | from pysemantic.tests.test_base import (BaseTestCase, TEST_DATA_DICT, 26 | _get_iris_args, _dummy_postproc, 27 | _get_person_activity_args) 28 | from pysemantic.validator import (SeriesValidator, SchemaValidator, 29 | DataFrameValidator) 30 | from pysemantic.utils import get_md5_checksum 31 | 32 | try: 33 | from yaml import CLoader as Loader 34 | from yaml import CDumper as Dumper 35 | except ImportError: 36 | from yaml import Loader 37 | from yaml import Dumper 38 | 39 | 40 | class TestSchemaValidator(BaseTestCase): 41 | 42 | """Test the `pysemantic.validator.SchemaValidatorClass`.""" 43 | 44 | @classmethod 45 | def setUpClass(cls): 46 | cls.maxDiff = None 47 | cls.specfile = op.join(op.dirname(__file__), "testdata", 48 | "test_dictionary.yaml") 49 | with open(cls.specfile, "r") as fileobj: 50 | cls._basespecs = yaml.load(fileobj, Loader=Loader) 51 | cls.specs = deepcopy(cls._basespecs) 52 | 53 | # fix the paths in basespecs if they aren't absolute 54 | for _, dataspec in cls.specs.iteritems(): 55 | if not op.isabs(dataspec['path']): 56 | dataspec['path'] = op.join(op.abspath(op.dirname(__file__)), 57 | dataspec['path']) 58 | # The updated values also need to be dumped into the yaml file, because 59 | # some functionality of the validator depends on parsing it. 60 | with open(cls.specfile, "w") as fileobj: 61 | yaml.dump(cls.specs, fileobj, Dumper=Dumper, 62 | default_flow_style=False) 63 | 64 | cls.ideal_activity_parser_args = _get_person_activity_args() 65 | cls.ideal_iris_parser_args = _get_iris_args() 66 | 67 | @classmethod 68 | def tearDownClass(cls): 69 | with open(cls.specfile, "w") as fileobj: 70 | yaml.dump(cls._basespecs, fileobj, Dumper=Dumper, 71 | default_flow_style=False) 72 | 73 | def setUp(self): 74 | # FIXME: This should not be necessary, but without it, a couple of 75 | # tests strangely fail. I think one or both of the following two tests 76 | # are messing up the base specifications. 77 | self.basespecs = deepcopy(self.specs) 78 | 79 | def test_parse_dates_list(self): 80 | """Test if arguments to `parse_dates` are put into a list.""" 81 | specs = deepcopy(self.basespecs['person_activity']) 82 | specs['parse_dates'] = specs['parse_dates'][0] 83 | validator = SchemaValidator(specification=specs) 84 | parser_args = validator.get_parser_args() 85 | self.assertTrue(isinstance(parser_args['parse_dates'], list)) 86 | df = pd.read_csv(**parser_args) 87 | self.assertEqual(df['date'].dtype, np.dtype(' 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """ 10 | Misecellaneous bells and whistles. 
11 | """ 12 | 13 | import sys 14 | import json 15 | import pandas as pd 16 | import numpy as np 17 | import datetime 18 | 19 | DATA_TYPES = {'String': str, 'Date/Time': datetime.date, 'Float': float, 20 | 'Integer': int} 21 | 22 | 23 | class TypeEncoder(json.JSONEncoder): 24 | 25 | def default(self, obj): 26 | if isinstance(obj, type): 27 | return str(obj) 28 | elif isinstance(obj, set): 29 | return list(obj) 30 | elif callable(obj): 31 | return ".".join((obj.__module__, obj.__name__)) 32 | elif isinstance(obj, np.ndarray): 33 | return np.array_str(obj) 34 | else: 35 | if "Engine" in str(obj): 36 | return str(obj) 37 | return json.JSONEncoder.default(self, obj) 38 | 39 | 40 | def generate_questionnaire(filepath): 41 | """Generate a questionnaire for data at `filepath`. 42 | 43 | This questionnaire will be presented to the client, which helps us 44 | automatically generate the schema. 45 | 46 | :param filepath: Path to the file that needs to be ingested. 47 | :type filepath: str 48 | :return: A dictionary of questions and their possible answers. The format 49 | of the dictionary is such that every key is a question to be put to the 50 | client, and its value is a list of possible answers. The first item in the 51 | list is the default value. 52 | :rtype: dict 53 | """ 54 | qdict = {} 55 | if filepath.endswith(".tsv"): 56 | dataframe = pd.read_table(filepath) 57 | else: 58 | dataframe = pd.read_csv(filepath) 59 | for col in dataframe.columns: 60 | qstring = "What is the data type of {}?".format(col) 61 | if "float" in str(dataframe[col].dtype).lower(): 62 | defaultType = "Float" 63 | elif "object" in str(dataframe[col].dtype).lower(): 64 | defaultType = "String" 65 | elif "int" in str(dataframe[col].dtype).lower(): 66 | defaultType = "Integer" 67 | typeslist = DATA_TYPES.keys() 68 | typeslist.remove(defaultType) 69 | typeslist = [defaultType] + typeslist 70 | qdict[qstring] = typeslist 71 | return qdict 72 | 73 | 74 | def colnames(filename, parser=None, **kwargs): 75 | """ 76 | Read the column names of a delimited file, without actually reading the 77 | whole file. This is simply a wrapper around `pandas.read_csv`, which reads 78 | only one row and returns the column names. 79 | 80 | 81 | :param filename: Path to the file to be read 82 | :param kwargs: Arguments to be passed to the `pandas.read_csv` 83 | :type filename: str 84 | :rtype: list 85 | 86 | :Example: 87 | 88 | Suppose we want to see the column names of the Fisher iris dataset. 89 | 90 | >>> colnames("/path/to/iris.csv") 91 | ['Sepal Length', 'Petal Length', 'Sepal Width', 'Petal Width', 'Species'] 92 | 93 | """ 94 | if 'nrows' in kwargs: 95 | UserWarning("The nrows parameter is pointless here. This function only" 96 | "reads one row.") 97 | kwargs.pop('nrows') 98 | if parser is None: 99 | if "sep" in kwargs: 100 | sep = kwargs.get('sep') 101 | if sep == r"\t": 102 | parser = pd.read_table 103 | kwargs.pop('sep') 104 | else: 105 | parser = pd.read_csv 106 | elif filename.endswith('.tsv'): 107 | parser = pd.read_table 108 | else: 109 | parser = pd.read_csv 110 | return parser(filename, nrows=1, **kwargs).columns.tolist() 111 | 112 | 113 | def get_md5_checksum(filepath): 114 | """Get the md5 checksum of a file. 115 | 116 | :param filepath: Path to the file of which to calculate the md5 checksum. 117 | :type filepath: Str 118 | :return: MD5 checksum of the file. 
119 | :rtype: Str 120 | :Example: 121 | 122 | >>> get_md5_checksum('pysemantic/tests/testdata/iris.csv') 123 | '9b3ecf3031979169c0ecc5e03cfe20a6' 124 | 125 | """ 126 | import subprocess 127 | if sys.platform == "darwin": 128 | cmd = "md5 -q {}".format(filepath).split() 129 | else: 130 | cmd = "md5sum {}".format(filepath).split() 131 | return subprocess.check_output(cmd).rstrip().split()[0] 132 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | traits 3 | pandas 4 | docopt 5 | nose 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os.path as op 3 | 4 | CONF_PATH = op.join(op.expanduser("~"), "pysemantic.conf") 5 | if not op.exists(CONF_PATH): 6 | with open(CONF_PATH, "w") as fid: 7 | fid.write("# Config file added by the pysemantic setup script.") 8 | fid.write("\n") 9 | print "Config file added at {}".format(CONF_PATH) 10 | 11 | NAME = "pysemantic" 12 | 13 | setup( 14 | name=NAME, 15 | version='0.1.1', 16 | author='Jaidev Deshpande', 17 | author_email='deshpande.jaidev@gmail.com', 18 | description="A traits based data validation module for pandas data structures.", 19 | url="https://github.com/jaidevd/pysemantic", 20 | long_description=open("README.rst").read(), 21 | entry_points={ 22 | 'console_scripts': ['semantic = pysemantic.cli:main'], 23 | }, 24 | packages=find_packages(), 25 | install_requires=['pyyaml', 'traits', 'pandas', 'docopt'] 26 | ) 27 | --------------------------------------------------------------------------------
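For reference, the project-level workflow that the tests above exercise can be driven end to end from Python. The sketch below is illustrative only and is not part of the repository: "my_project", "my_dictionary.yaml" and "mydata.csv" are placeholder names, and only calls that appear in the test suite (add_project, get_datasets, get_schema_specs, Project, remove_project) are used.

# Hypothetical usage sketch (not part of the repository); all names are placeholders.
import os.path as op
import pandas as pd
import yaml
from pysemantic import project as pr

# Write a small CSV and a one-dataset data dictionary pointing at it, mirroring
# the specs that pr.get_schema_specs() returns in test_cli.py (path + delimiter).
df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
df.to_csv("mydata.csv", index=False)
schema = {"mydata": {"path": op.abspath("mydata.csv"), "delimiter": ","}}
with open("my_dictionary.yaml", "w") as fileobj:
    yaml.dump(schema, fileobj, default_flow_style=False)

# Register the dictionary under a project name, inspect it, then clean up.
pr.add_project("my_project", op.abspath("my_dictionary.yaml"))
print pr.get_datasets("my_project")                # ['mydata']
print pr.get_schema_specs("my_project", "mydata")  # {'path': ..., 'delimiter': ','}
project = pr.Project(project_name="my_project")    # the object the tests build
pr.remove_project("my_project")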