├── .coveragerc ├── .gitignore ├── .landscape.yaml ├── .travis.yml ├── COPYING ├── README.rst ├── continuous_integration ├── install.sh └── requirements.txt ├── docs ├── Makefile ├── _static │ └── logo.png ├── api.rst ├── apiref │ ├── modules.rst │ └── pysemantic.rst ├── conf.py ├── examples.rst ├── examples │ └── introduction.ipynb ├── index.rst ├── make.bat ├── schema_ref.rst └── tutorial │ ├── notebooks │ ├── demo_project.yml │ ├── dummy_data.csv │ ├── loading_datasets.ipynb │ └── naive_cleaning.ipynb │ └── slides │ ├── images │ └── dc_logo.jpg │ └── presentation.tex ├── pysemantic ├── __init__.py ├── cli.py ├── custom_traits.py ├── errors.py ├── exporters.py ├── loggers.py ├── project.py ├── tests │ ├── __init__.py │ ├── test_base.py │ ├── test_cli.py │ ├── test_custom_traits.py │ ├── test_project.py │ ├── test_utils.py │ ├── test_validator.py │ └── testdata │ │ ├── bad_iris.csv │ │ ├── iris.csv │ │ ├── person_activity.tsv │ │ ├── test.conf │ │ ├── test_dictionary.yaml │ │ ├── test_excel.yaml │ │ └── test_spreadsheet.xlsx ├── utils.py └── validator.py ├── requirements.txt └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */python?.?/* 4 | */lib-python/?.?/*.py 5 | */unittest2/* 6 | */dist-packages/pandas/* 7 | */dist-packages/numpy/* 8 | */dist-packages/nose/* 9 | */dist-packages/pyyaml/* 10 | */dist-packages/traits/* 11 | */dist-packages/docopt/* 12 | 13 | [report] 14 | exclude_lines = 15 | # Have to re-enable the standard pragma 16 | pragma: no cover 17 | 18 | # Don't complain about missing debug-only code: 19 | def __repr__ 20 | if self\.debug 21 | 22 | # Don't complain if tests don't hit defensive assertion code: 23 | raise AssertionError 24 | raise NotImplementedError 25 | 26 | # Don't complain if non-runnable code isn't run: 27 | if 0: 28 | if __name__ == .__main__.: 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # Vim swap files 57 | *.swp 58 | 59 | # ipynb checkpoints 60 | docs/examples/.ipynb_checkpoints/ 61 | .idea/ 62 | venv/ -------------------------------------------------------------------------------- /.landscape.yaml: -------------------------------------------------------------------------------- 1 | doc-warnings: yes 2 | test-warnings: yes 3 | strictness: veryhigh 4 | mccabe: 5 | run: false 6 | pylint: 7 | disable: 8 | - too-few-public-methods 9 | - no-self-use 10 | - too-many-instance-attributes 11 | - invalid-name 12 | - missing-docstring 13 | - star-args 14 | - logging-format-interpolation 15 | - bad-builtin 16 | ignore-paths: 17 | - docs 18 | pep8: 19 | disable: 20 | - E731 21 | - E126 22 | - E128 23 | - E115 24 | - N802 25 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: python 3 | virtualenv: 4 | system_site_packages: false 5 | env: 6 | - DISTRIB="conda" COVERAGE="true" PYTHON_VERSION="2.7" 7 | addons: 8 | apt_packages: 9 | - libbz2-dev 10 | - libhdf5-serial-dev 11 | - liblzo2-dev 12 | # command to install dependencies 13 | install: source continuous_integration/install.sh 14 | # command to run tests 15 | script: nosetests -sv --with-coverage --cover-package=pysemantic 16 | after_success: 17 | coveralls 18 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2014–2015 Authors 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the names of the PySemantic Developers, nor the names of its 16 | contributors may be used to endorse or promote products derived from this 17 | software without specific prior written permission. 18 | 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 24 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 30 | DAMAGE. 31 | 32 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | |Travis|_ |Coveralls|_ |Landscape|_ |RTFD|_ 4 | 5 | .. |Travis| image:: https://travis-ci.org/jaidevd/pysemantic.svg?branch=master 6 | .. _Travis: https://travis-ci.org/jaidevd/pysemantic 7 | 8 | .. |Coveralls| image:: https://coveralls.io/repos/jaidevd/pysemantic/badge.svg?branch=master 9 | .. _Coveralls: https://coveralls.io/r/jaidevd/pysemantic?branch=master 10 | 11 | .. |Landscape| image:: https://landscape.io/github/jaidevd/pysemantic/master/landscape.svg?style=flat 12 | .. _Landscape: https://landscape.io/github/jaidevd/pysemantic/master 13 | 14 | .. |RTFD| image:: https://readthedocs.org/projects/pysemantic/badge/?version=latest 15 | .. _RTFD: https://readthedocs.org/projects/pysemantic/?badge=latest 16 | 17 | .. image:: docs/_static/logo.png 18 | 19 | pysemantic 20 | ========== 21 | A traits-based data validation and data cleaning module for pandas data structures. 22 | 23 | Dependencies 24 | ------------ 25 | * Traits 26 | * PyYaml 27 | * pandas 28 | * docopt 29 | 30 | Quick Start 31 | ----------- 32 | 33 | Installing with pip 34 | +++++++++++++++++++ 35 | 36 | Run:: 37 | 38 | $ pip install pysemantic 39 | 40 | Installing from source 41 | ++++++++++++++++++++++ 42 | 43 | You can install pysemantic by cloning this repository, installing the 44 | dependencies and running:: 45 | 46 | $ python setup.py install 47 | 48 | in the root directory of your local clone. 49 | 50 | Usage 51 | +++++ 52 | 53 | Create an empty file named ``pysemantic.conf`` in your home directory. This can be as simple as running:: 54 | 55 | $ touch ~/pysemantic.conf 56 | 57 | After installing pysemantic, you should have a command line script called 58 | ``semantic``. Try it out by running:: 59 | 60 | $ semantic list 61 | 62 | This should print nothing, which means that you don't have any projects registered 63 | under pysemantic. A *project* in pysemantic is just a collection of *datasets*. 64 | pysemantic manages your datasets like an IDE manages source code files in that 65 | it groups them under different projects, and each project has its own tree 66 | structure, build toolchains, requirements, etc. Similarly, different 67 | pysemantic projects group under them a set of datasets, and manage them 68 | according to their respective user-defined specifications. Projects are 69 | uniquely identified by their names. 70 | 71 | For now, let's add and configure a demo project called, simply, 72 | "pysemantic_demo". You can create a project and register it with pysemantic 73 | using the ``add`` subcommand of the ``semantic`` script as follows:: 74 | 75 | $ semantic add pysemantic_demo 76 | 77 | As you can see, this does not fit the supported usage of the ``add`` subcommand. 78 | We additionally need a file containing the specifications for this project.
79 | (Note that this file, containing the specifications, is referred to throughout 80 | the documentation interchangeably as a *specfile* or a *data dictionary*.) 81 | Before we create this file, let's download the well-known Fisher iris dataset, 82 | which we will use as the sample dataset for this demo. You can download it 83 | `here `_. 84 | 85 | Once the dataset is downloaded, fire up your favourite text editor and create a 86 | file named ``demo_specs.yaml``. Fill it up with the following content. 87 | 88 | .. code-block:: yaml 89 | 90 | iris: 91 | path: /absolute/path/to/iris.csv 92 | 93 | Now we can use this file as the data dictionary of the ``pysemantic_demo`` 94 | project. Let's tell pysemantic that we want to do so, by running the following 95 | command:: 96 | 97 | $ semantic add pysemantic_demo /path/to/demo_specs.yaml 98 | 99 | We're all set. To see how we did, start a Python interpreter and type the 100 | following statements:: 101 | 102 | >>> from pysemantic import Project 103 | >>> demo = Project("pysemantic_demo") 104 | >>> iris = demo.load_dataset("iris") 105 | 106 | Voila! The Python object named ``iris`` is actually a pandas DataFrame containing 107 | the iris dataset! Well, nothing really remarkable so far. In fact, we cloned 108 | and installed a module, wrote two seemingly unnecessary files, and typed three 109 | lines of Python code to do something that could have been achieved by simply 110 | writing:: 111 | 112 | >>> iris = pandas.read_csv("/path/to/iris.csv") 113 | 114 | Most datasets, however, are not as well behaved as this one. In fact, they can 115 | be a nightmare to deal with. Pysemantic can be far more intricate and far 116 | smarter than this when dealing with mangled, badly encoded, ugly data with 117 | inconsistent data types. Check the IPython notebooks in the examples to see how to use Pysemantic for 118 | such data. 119 | -------------------------------------------------------------------------------- /continuous_integration/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # This script is inspired by the **pgmpy** implementation of continuous test 3 | # integration. It is meant to "install" all the packages required for installing 4 | # pysemantic.
5 | 6 | # License: The BSD 3-clause License 7 | 8 | set -e 9 | 10 | sudo apt-get update -qq 11 | sudo apt-get install build-essential -qq 12 | 13 | if [[ "$DISTRIB" == "conda" ]]; then 14 | # Deactivate the travis-provided virtual environment and setup a 15 | # conda-based environment instead 16 | deactivate 17 | 18 | # Use the miniconda installer for faster download / install of conda 19 | # itself 20 | wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh \ 21 | -O miniconda.sh 22 | bash miniconda.sh -b -p $HOME/miniconda 23 | export PATH=$HOME/miniconda/bin:$PATH 24 | hash -r 25 | conda config --set always_yes yes --set changeps1 no 26 | conda update conda 27 | conda info -a 28 | 29 | conda create -n testenv python=$PYTHON_VERSION --file continuous_integration/requirements.txt 30 | source activate testenv 31 | fi 32 | 33 | if [[ "$COVERAGE" == "true" ]]; then 34 | pip install coverage coveralls 35 | fi 36 | 37 | # Build pysemantic (install in development mode) 38 | python setup.py develop 39 | -------------------------------------------------------------------------------- /continuous_integration/requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | traits 3 | pandas 4 | xlrd 5 | openpyxl 6 | pytables 7 | docopt 8 | nose 9 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pysemantic.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pysemantic.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pysemantic" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pysemantic" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaidevd/pysemantic/1b928446e431a69060bbc9d29b8a7c7a6f2b8c0c/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | API Reference 3 | ============= 4 | 5 | .. 
toctree:: 6 | :maxdepth: 2 7 | 8 | apiref/pysemantic 9 | apiref/modules 10 | 11 | -------------------------------------------------------------------------------- /docs/apiref/modules.rst: -------------------------------------------------------------------------------- 1 | pysemantic 2 | ========== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | pysemantic 8 | -------------------------------------------------------------------------------- /docs/apiref/pysemantic.rst: -------------------------------------------------------------------------------- 1 | pysemantic package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pysemantic.cli module 8 | --------------------- 9 | 10 | .. automodule:: pysemantic.cli 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pysemantic.custom_traits module 16 | ------------------------------- 17 | 18 | .. automodule:: pysemantic.custom_traits 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pysemantic.errors module 24 | ------------------------ 25 | 26 | .. automodule:: pysemantic.errors 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pysemantic.exporters module 32 | --------------------------- 33 | 34 | .. automodule:: pysemantic.exporters 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pysemantic.loggers module 40 | ------------------------- 41 | 42 | .. automodule:: pysemantic.loggers 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | pysemantic.project module 48 | ------------------------- 49 | 50 | .. automodule:: pysemantic.project 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | pysemantic.utils module 56 | ----------------------- 57 | 58 | .. automodule:: pysemantic.utils 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | pysemantic.validator module 64 | --------------------------- 65 | 66 | .. automodule:: pysemantic.validator 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | 72 | Module contents 73 | --------------- 74 | 75 | .. automodule:: pysemantic 76 | :members: 77 | :undoc-members: 78 | :show-inheritance: 79 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pysemantic documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Apr 2 17:44:24 2015. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 
31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.intersphinx', 34 | 'sphinx.ext.todo', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.mathjax', 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # The suffix of source filenames. 43 | source_suffix = '.rst' 44 | 45 | # The encoding of source files. 46 | #source_encoding = 'utf-8-sig' 47 | 48 | # The master toctree document. 49 | master_doc = 'index' 50 | 51 | # General information about the project. 52 | project = u'pysemantic' 53 | copyright = u'2015, Jaidev Deshpande' 54 | 55 | # The version info for the project you're documenting, acts as replacement for 56 | # |version| and |release|, also used in various other places throughout the 57 | # built documents. 58 | # 59 | # The short X.Y version. 60 | version = '0.1.1' 61 | # The full version, including alpha/beta/rc tags. 62 | release = '0.0.1' 63 | 64 | # The language for content autogenerated by Sphinx. Refer to documentation 65 | # for a list of supported languages. 66 | #language = None 67 | 68 | # There are two options for replacing |today|: either, you set today to some 69 | # non-false value, then it is used: 70 | #today = '' 71 | # Else, today_fmt is used as the format for a strftime call. 72 | #today_fmt = '%B %d, %Y' 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | exclude_patterns = ['_build'] 77 | 78 | # The reST default role (used for this markup: `text`) to use for all 79 | # documents. 80 | #default_role = None 81 | 82 | # If true, '()' will be appended to :func: etc. cross-reference text. 83 | #add_function_parentheses = True 84 | 85 | # If true, the current module name will be prepended to all description 86 | # unit titles (such as .. function::). 87 | #add_module_names = True 88 | 89 | # If true, sectionauthor and moduleauthor directives will be shown in the 90 | # output. They are ignored by default. 91 | #show_authors = False 92 | 93 | # The name of the Pygments (syntax highlighting) style to use. 94 | pygments_style = 'sphinx' 95 | 96 | # A list of ignored prefixes for module index sorting. 97 | #modindex_common_prefix = [] 98 | 99 | # If true, keep warnings as "system message" paragraphs in the built documents. 100 | #keep_warnings = False 101 | 102 | 103 | # -- Options for HTML output ---------------------------------------------- 104 | 105 | # The theme to use for HTML and HTML Help pages. See the documentation for 106 | # a list of builtin themes. 107 | html_theme = 'default' 108 | 109 | # Theme options are theme-specific and customize the look and feel of a theme 110 | # further. For a list of options available for each theme, see the 111 | # documentation. 112 | #html_theme_options = {} 113 | 114 | # Add any paths that contain custom themes here, relative to this directory. 115 | #html_theme_path = [] 116 | 117 | # The name for this set of Sphinx documents. If None, it defaults to 118 | # " v documentation". 119 | #html_title = None 120 | 121 | # A shorter title for the navigation bar. Default is the same as html_title. 122 | #html_short_title = None 123 | 124 | # The name of an image file (relative to this directory) to place at the top 125 | # of the sidebar. 126 | #html_logo = None 127 | 128 | # The name of an image file (within the static path) to use as favicon of the 129 | # docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 130 | # pixels large. 131 | #html_favicon = None 132 | 133 | # Add any paths that contain custom static files (such as style sheets) here, 134 | # relative to this directory. They are copied after the builtin static files, 135 | # so a file named "default.css" will overwrite the builtin "default.css". 136 | html_static_path = ['_static'] 137 | 138 | # Add any extra paths that contain custom files (such as robots.txt or 139 | # .htaccess) here, relative to this directory. These files are copied 140 | # directly to the root of the documentation. 141 | #html_extra_path = [] 142 | 143 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 144 | # using the given strftime format. 145 | #html_last_updated_fmt = '%b %d, %Y' 146 | 147 | # If true, SmartyPants will be used to convert quotes and dashes to 148 | # typographically correct entities. 149 | #html_use_smartypants = True 150 | 151 | # Custom sidebar templates, maps document names to template names. 152 | #html_sidebars = {} 153 | 154 | # Additional templates that should be rendered to pages, maps page names to 155 | # template names. 156 | #html_additional_pages = {} 157 | 158 | # If false, no module index is generated. 159 | #html_domain_indices = True 160 | 161 | # If false, no index is generated. 162 | #html_use_index = True 163 | 164 | # If true, the index is split into individual pages for each letter. 165 | #html_split_index = False 166 | 167 | # If true, links to the reST sources are added to the pages. 168 | #html_show_sourcelink = True 169 | 170 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 171 | #html_show_sphinx = True 172 | 173 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 174 | #html_show_copyright = True 175 | 176 | # If true, an OpenSearch description file will be output, and all pages will 177 | # contain a tag referring to it. The value of this option must be the 178 | # base URL from which the finished HTML is served. 179 | #html_use_opensearch = '' 180 | 181 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 182 | #html_file_suffix = None 183 | 184 | # Output file base name for HTML help builder. 185 | htmlhelp_basename = 'pysemanticdoc' 186 | 187 | 188 | # -- Options for LaTeX output --------------------------------------------- 189 | 190 | latex_elements = { 191 | # The paper size ('letterpaper' or 'a4paper'). 192 | #'papersize': 'letterpaper', 193 | 194 | # The font size ('10pt', '11pt' or '12pt'). 195 | #'pointsize': '10pt', 196 | 197 | # Additional stuff for the LaTeX preamble. 198 | #'preamble': '', 199 | } 200 | 201 | # Grouping the document tree into LaTeX files. List of tuples 202 | # (source start file, target name, title, 203 | # author, documentclass [howto, manual, or own class]). 204 | latex_documents = [ 205 | ('index', 'pysemantic.tex', u'pysemantic Documentation', 206 | u'Jaidev Deshpande', 'manual'), 207 | ] 208 | 209 | # The name of an image file (relative to this directory) to place at the top of 210 | # the title page. 211 | #latex_logo = None 212 | 213 | # For "manual" documents, if this is true, then toplevel headings are parts, 214 | # not chapters. 215 | #latex_use_parts = False 216 | 217 | # If true, show page references after internal links. 218 | #latex_show_pagerefs = False 219 | 220 | # If true, show URL addresses after external links. 
221 | #latex_show_urls = False 222 | 223 | # Documents to append as an appendix to all manuals. 224 | #latex_appendices = [] 225 | 226 | # If false, no module index is generated. 227 | #latex_domain_indices = True 228 | 229 | 230 | # -- Options for manual page output --------------------------------------- 231 | 232 | # One entry per manual page. List of tuples 233 | # (source start file, name, description, authors, manual section). 234 | man_pages = [ 235 | ('index', 'pysemantic', u'pysemantic Documentation', 236 | [u'Jaidev Deshpande'], 1) 237 | ] 238 | 239 | # If true, show URL addresses after external links. 240 | #man_show_urls = False 241 | 242 | 243 | # -- Options for Texinfo output ------------------------------------------- 244 | 245 | # Grouping the document tree into Texinfo files. List of tuples 246 | # (source start file, target name, title, author, 247 | # dir menu entry, description, category) 248 | texinfo_documents = [ 249 | ('index', 'pysemantic', u'pysemantic Documentation', 250 | u'Jaidev Deshpande', 'pysemantic', 'One line description of project.', 251 | 'Miscellaneous'), 252 | ] 253 | 254 | # Documents to append as an appendix to all manuals. 255 | #texinfo_appendices = [] 256 | 257 | # If false, no module index is generated. 258 | #texinfo_domain_indices = True 259 | 260 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 261 | #texinfo_show_urls = 'footnote' 262 | 263 | # If true, do not generate a @detailmenu in the "Top" node's menu. 264 | #texinfo_no_detailmenu = False 265 | 266 | 267 | # Example configuration for intersphinx: refer to the Python standard library. 268 | intersphinx_mapping = {'http://docs.python.org/': None} 269 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | * Introduction_ to PySemantic. 9 | 10 | .. _Introduction: http://nbviewer.ipython.org/github/jaidevd/pysemantic/blob/master/docs/examples/introduction.ipynb 11 | -------------------------------------------------------------------------------- /docs/examples/introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Introduction\n", 8 | "=======\n", 9 | "\n", 10 | "Data is dirty. Any dataset that isn't properly curated and stored can suffer from many problems, like having mixed data types, not being properly encoded or escaped, an uneven number of fields, and so on. None of these problems are unsolvable. In fact, most of us are pretty good at cleaning data. Normally, when we know little or nothing about a given dataset, we proceed in a very predictable manner. We first try to read the data naively and see if errors are raised by the parser. If they are, we try to fix our function calls. When those are fixed, we try to run some sanity checks on the data, and end up filtering the dataset, sometimes quite heavily.\n", 11 | "\n", 12 | "The problem with this process is that it is iterative, and worse, it is _reactive_. Everybody in the team has to do it if they are to use the dataset. Sure, one can simply clean it up and dump it in a new file with just a few lines of code. But we shouldn't have to run that script every time we encounter a new dataset. We would be much more comfortable if data is cleaned as it is read.
It is much more efficient if data cleaning is a part of _data ingestion_.\n", 13 | "\n", 14 | "This can be achieved by having a centralized schema for every dataset. This schema can house the rules that the clean dataset must follow, so as to further aid its analysis. Of course, this schema can be expressed via a simple Python script which is shared with everyone who is doing analysis on the dataset in question. But the number of datasets that someone has to deal with over the timeline of a particular project can quickly get out of hand, and so can their cleaning scripts. Secondly, and more importantly, cleaning data via ad-hoc Python scripts is non-trivial. Readable as Python scripts might be, it's not always easy for everyone in the team to change the cleaning process. Moreover, there are no Python libraries that offer an abstraction at the level of cleaning and validating data.\n", 15 | "\n", 16 | "Therefore, if one has to go through the process of data validation and cleaning in a customizable, modular way, one has to make sure that:\n", 17 | "\n", 18 | "* the specifications for all datasets are in one place, not in different scripts.\n", 19 | "* datasets are grouped under a suitable name that pertains to particular projects. (In PySemantic such a group is called a `Project`, as we shall see).\n", 20 | "* strict validation and cleaning rules must be applied to all aspects of a dataset\n", 21 | "* the process of validation and cleaning has to be identically reproducible by everyone who works on the data\n", 22 | "\n", 23 | "PySemantic makes all that happen.\n", 24 | "\n", 25 | "1. Getting Started\n", 26 | "==========\n", 27 | "\n", 28 | "Let's get our hands dirty. We'll explore more features as we go along. Before you proceed further, please make sure that you have gone through the quick start section [here](https://github.com/jaidevd/pysemantic#quick-start).\n", 29 | "\n", 30 | "By now you should have added a project named `pysemantic_demo`, and used the project object to load the iris dataset. Let's take a more detailed look at what is happening here.\n", 31 | "\n", 32 | "1.1 The Project class\n", 33 | "------------------------\n", 34 | "\n", 35 | "A first-class citizen of the pysemantic namespace is the [Project class](https://github.com/jaidevd/pysemantic/tree/master/pysemantic/project.py#L247). This class has everything you need to add, remove, read, or write datasets. In PySemantic, all datasets are classified under projects represented by instances of the Project class. Each project is identified by a unique name. This name is used to instantiate the Project class and perform operations on all datasets registered under it. You can think of these \"projects\" under pysemantic in the same way as an IDE organizes software projects. Each project in an IDE has a set of files containing source code, a set of build tools and a few other things that make a project self-contained. Similarly, each project in PySemantic has its own datasets, which in turn have their schema and their validation rules. Currently, for this example, the iris dataset is loaded naively, without any rules."
36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "from pysemantic import Project" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "demo = Project(\"pysemantic_demo\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "iris = demo.load_dataset(\"iris\")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/html": [ 81 | "
<div><table border=\"1\" class=\"dataframe\"><thead><tr><th></th><th>Sepal Length</th><th>Sepal Width</th><th>Petal Length</th><th>Petal Width</th><th>Species</th></tr></thead><tbody><tr><th>0</th><td>5.1</td><td>3.5</td><td>1.4</td><td>0.2</td><td>setosa</td></tr><tr><th>1</th><td>4.9</td><td>3.0</td><td>1.4</td><td>0.2</td><td>setosa</td></tr><tr><th>2</th><td>4.7</td><td>3.2</td><td>1.3</td><td>0.2</td><td>setosa</td></tr><tr><th>3</th><td>4.6</td><td>3.1</td><td>1.5</td><td>0.2</td><td>setosa</td></tr><tr><th>4</th><td>5.0</td><td>3.6</td><td>1.4</td><td>0.2</td><td>setosa</td></tr></tbody></table></div>" 137 | ], 138 | "text/plain": [ 139 | " Sepal Length Sepal Width Petal Length Petal Width Species\n", 140 | "0 5.1 3.5 1.4 0.2 setosa\n", 141 | "1 4.9 3.0 1.4 0.2 setosa\n", 142 | "2 4.7 3.2 1.3 0.2 setosa\n", 143 | "3 4.6 3.1 1.5 0.2 setosa\n", 144 | "4 5.0 3.6 1.4 0.2 setosa" 145 | ] 146 | }, 147 | "execution_count": 4, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "iris.head(5)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "source": [ 162 | "This is the Fisher iris dataset that we know so well. Now imagine that someone was curating for us more samples of these flowers and sending us the measurements for 150 more flowers (sepal length, sepal width, petal length, petal width, and the species). That would amount to 150 more rows in the dataset. Now suppose that our data acquisition methods were flawed, and the data that came in was dirty. A sample of such a dirty dataset can be found [here](https://github.com/jaidevd/pysemantic/tree/master/pysemantic/tests/testdata/bad_iris.csv). Try loading this file into a pandas dataframe directly, using the `pandas.read_csv` function. Notice that there's a column called `id`, which contains 10-digit strings. These IDs could correspond to some IDs automatically generated by the system storing the data. If they're really just IDs, they should be read as strings, but there was no way for pandas to know that these are as good as strings (other examples of this are phone numbers and zipcodes). In pandas, this can be fixed by using the `dtype` argument of `pandas.read_csv`. To make this preference persist in pysemantic, we can add this dataset to our data dictionary (`demo_specs.yaml`) by adding to it the following lines:\n", 163 | "\n", 164 | "```yaml\n", 165 | "bad_iris:\n", 166 | " path: /absolute/path/to/bad_iris.csv\n", 167 | " dtypes:\n", 168 | " - id: !!python/name:__builtin__.str\n", 169 | "```\n", 170 | "\n", 171 | "The last line tells pandas that the column `id` is to be read as a string, not as the default integer. Any type can thus be specified for any column, by adding a line formatted as follows:\n", 172 | "\n", 173 | "```yaml\n", 174 | " - column_name: yaml-dump-of-python-type\n", 175 | "```\n", 176 | "\n", 177 | "for the given column. (Similarly, we can specify types for the other columns in the dataset too, but this isn't required since the default works fine for them.)
You can try out how the Project object can infer these new specifications by doing the following:" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 6, 183 | "metadata": { 184 | "collapsed": false 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "demo.reload_data_dict() # Re-reads the data dictionary specifications\n", 189 | "bad_iris = demo.load_dataset(\"bad_iris\")" 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "Python 2", 196 | "language": "python", 197 | "name": "python2" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 2 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython2", 209 | "version": "2.7.10" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 0 214 | } 215 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pysemantic documentation master file, created by 2 | sphinx-quickstart on Thu Apr 2 17:44:24 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pysemantic's documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | examples 15 | schema_ref 16 | api 17 | 18 | 19 | 20 | Indices and tables 21 | ================== 22 | 23 | * :ref:`genindex` 24 | * :ref:`modindex` 25 | * :ref:`search` 26 | 27 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. 
doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pysemantic.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pysemantic.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/schema_ref.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Schema Configuration Reference 3 | ============================== 4 | 5 | Every project in PySemantic can be configured via a data dictionary or a 6 | schema, which is a yaml file. This file houses the details of how PySemantic 7 | should treat a project's constituent datasets. A typical data dictionary 8 | follows the following pattern: 9 | 10 | .. code-block:: yaml 11 | 12 | dataset_name: 13 | dataset_param_1: value1 14 | dataset_param_2: value2 15 | # etc 16 | 17 | PySemantic reads this as a dictionary where the parameter names are keys and 18 | their values are the values in the dictionary. Thus, the schema for a whole 19 | project is a dictionary of dictionaries. 
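For instance, a minimal data dictionary for a project with two datasets might look like the sketch below (the dataset names, file paths and column names here are hypothetical; each of the parameters used is described in the sections that follow):

.. code-block:: yaml

    iris:
      path: /absolute/path/to/iris.csv
      delimiter: ','
      dtypes:
        Species: !!python/name:__builtin__.str
    activity:
      path: /absolute/path/to/person_activity.tsv
      delimiter: "\t"
      column_rules:
        duration:
          min: 0.0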
20 | 21 | -------------------------- 22 | Basic Schema Configuration 23 | -------------------------- 24 | 25 | Here is a list of different dataset parameters that PySemantic is sensitive 26 | to: 27 | 28 | * ``path`` (Required, except when the ``source`` parameter is "mysql") The path to the file containing the data. Note that the path must either be absolute, or relative to the directory containing the schema. This can also be a list of files if the dataset spans multiple files. If that is the case, the path parameter can be specified as: 29 | 30 | .. code-block:: yaml 31 | 32 | path: 33 | - absolute/path/to/file/1 34 | - absolute/path/to/file/2 35 | # etc 36 | 37 | # or 38 | 39 | path: 40 | - foo/bar/baz 41 | # where foo is a directory in the directory that contains the schema. 42 | 43 | * ``delimiter`` (Optional, default: ``,``) The delimiter used in the file. This has to be a character delimiter, not words like "comma" or "tab". 44 | 45 | * ``md5`` (Optional) The MD5 checksum of the file to read. This is necessary 46 | because sometimes we read files and, after processing them, rewrite them to the same 47 | path. This parameter helps keep track of whether the file is correct. 48 | 49 | * ``header``: (Optional) The header row of the file. 50 | 51 | * ``index_col``: (Optional) Name of the column that forms the index of the 52 | dataframe. This can be a single string or a list of strings. If a list is 53 | provided, the dataframe becomes multi-indexed. 54 | 55 | * ``sheetname``: (Optional) Name of the sheet containing the dataset in an 56 | MS Excel spreadsheet. This comes into play only when ``path`` points to an 57 | Excel file. For other types of files, this is ignored. When ``path`` is an 58 | Excel file and this parameter is not provided, it is assumed to be the same 59 | as the name of the dataset. For example: 60 | 61 | .. code-block:: yaml 62 | 63 | iris: 64 | path: /path/to/iris.xlsx 65 | 66 | The schema above assumes that the iris dataset resides in a sheet named 67 | "iris". If instead the name of the sheet is different, you can specify it 68 | as: 69 | 70 | .. code-block:: yaml 71 | 72 | iris: 73 | path: /path/to/iris.xlsx 74 | sheetname: name_of_sheet 75 | 76 | This parameter can also be a list, to enable the combination of multiple 77 | sheets into a dataframe, as follows: 78 | 79 | .. code-block:: yaml 80 | 81 | iris: 82 | path: /path/to/iris.xlsx 83 | sheetname: 84 | - sheet1 85 | - sheet2 86 | 87 | This will combine the data from sheet1 and sheet2 into a single dataframe. 88 | 89 | * ``column_names``: (Optional) Specify the names of columns to use in the 90 | loaded dataframe. This option can have multiple types of values. It can be: 91 | 92 | 1. A list of strings to use as column names: 93 | 94 | .. code-block:: yaml 95 | 96 | column_names: 97 | - column_1 98 | - column_2 99 | - column_3 100 | 101 | 2. A dictionary that maps original column names to new ones: 102 | 103 | .. code-block:: yaml 104 | 105 | column_names: 106 | org_colname_1: new_colname_a 107 | org_colname_2: new_colname_b 108 | org_colname_3: new_colname_c 109 | 110 | 3. A Python function that translates the name of every column in the loaded 111 | dataframe: 112 | 113 | .. code-block:: yaml 114 | 115 | column_names: !!python/name:module_name.translate_column_name 116 | 117 | * ``nrows``: (Optional) Method to select which rows are read from the dataset. 118 | This option, like ``column_names``, can be specified in many ways. It can be: 119 | 120 | 1. An integer (default): Number of rows to read from the file.
If this 121 | option is not specified, all rows from the file are read. 122 | 123 | .. code-block:: yaml 124 | 125 | nrows: 100 126 | 127 | 2. A dictionary that recognizes specific keys: 128 | 129 | * ``random``: A boolean that directs PySemantic to shuffle the selected rows after loading the dataset. 130 | For example, including the following lines in the schema 131 | 132 | .. code-block:: yaml 133 | 134 | nrows: 135 | random: true 136 | 137 | will shuffle the dataset before returning it. 138 | 139 | * ``range``: A list of two integers, which denote the first and the 140 | last index of the range of rows to be read. For example, the 141 | following lines 142 | 143 | .. code-block:: yaml 144 | 145 | nrows: 146 | range: 147 | - 10 148 | - 50 149 | 150 | will only select the 10th to the 50th (exclusive) rows. 151 | 152 | * ``count``: An integer that can be used in conjunction with either 153 | or both of the above options, to denote the number of rows to read 154 | from a random selection or a range. 155 | 156 | .. code-block:: yaml 157 | 158 | nrows: 159 | range: 160 | - 10 161 | - 50 162 | count: 10 163 | random: true 164 | 165 | The lines shown above will direct PySemantic to load 10 rows at 166 | random between the 10th and the 50th rows of a dataset. 167 | 168 | * ``shuffle``: A boolean to be used with ``count`` to shuffle the top ``count`` rows before returning the dataframe. 169 | 170 | .. code-block:: yaml 171 | 172 | nrows: 173 | count: 10 174 | shuffle: True 175 | 176 | The above schema will read the first ten rows from the dataset and 177 | shuffle them. 178 | 179 | 3. A callable which returns a logical array which has the same number of elements as the number of rows in the dataset. The output of this callable is used as a logical index for slicing the dataset. For example, suppose we wanted to extract all even-numbered rows from a dataset, then we could make a callable as follows: 180 | 181 | .. code-block:: python 182 | 183 | iseven = lambda x: np.remainder(x, 2) == 0 184 | 185 | Suppose this function resides in a module called ``foo.bar``, then we 186 | can include it in the schema as follows: 187 | 188 | .. code-block:: yaml 189 | 190 | nrows: !!python/name:foo.bar.iseven 191 | 192 | This will cause PySemantic to load only the even-numbered rows. 193 | 194 | * ``use_columns``: (Optional) The list of the columns to read from the dataset. The format for specifying this parameter is as follows: 195 | 196 | .. code-block:: yaml 197 | 198 | use_columns: 199 | - column_1 200 | - column_2 201 | - column_3 202 | 203 | If this parameter is not specified, all columns present in the dataset are read. 204 | 205 | * ``exclude_columns``: This option can be used to specify columns that are 206 | explicitly to be ignored. This is useful when there are a large number of 207 | columns in the dataset and we only wish to exclude a few. Note that this 208 | option overrides the ``use_columns`` option, i.e. if a column name is present 209 | in both lists, it will be dropped. 210 | 211 | * ``na_values``: A string or a list of values that are considered as NAs by the pandas parsers, applicable to the whole dataframe. 212 | 213 | * ``converters``: A dictionary of functions to be applied to columns when loading data. Any Python callable can be added to this dictionary. This parameter makes up the ``converters`` argument of Pandas parsers. The usage is as follows: 214 | 215 | ..
code-block:: yaml 216 | 217 | converters: 218 | col_a: !!python/name:numpy.int 219 | 220 | This results in the ``numpy.int`` function being called on the column ``col_a``. 221 | 222 | * ``dtypes`` (Optional) Data types of the columns to be read. Since types in Python are native objects, PySemantic expects them to be so in the schema. This can be formatted as follows: 223 | 224 | .. code-block:: yaml 225 | 226 | dtypes: 227 | column_name: !!python/name:python_object 228 | 229 | For example, if you have three columns named ``foo``, ``bar``, and ``baz``, 230 | which have the types ``string``, ``integer`` and ``float`` respectively, then your schema 231 | should look like: 232 | 233 | .. code-block:: yaml 234 | 235 | dtypes: 236 | foo: !!python/name:__builtin__.str 237 | bar: !!python/name:__builtin__.int 238 | baz: !!python/name:__builtin__.float 239 | 240 | Non-builtin types can be specified too: 241 | 242 | .. code-block:: yaml 243 | 244 | dtypes: 245 | datetime_column: !!python/name:datetime.date 246 | 247 | *Note*: You can figure out the YAML representation of a Python type by doing 248 | the following: 249 | 250 | .. code-block:: python 251 | 252 | import yaml 253 | x = type(foo) # where foo is the object whose type is to be yamlized 254 | print yaml.dump(x) 255 | 256 | * ``parse_dates`` (Optional) Columns containing Date/Time values can be parsed into native NumPy datetime objects. This argument can be a list, or a dictionary. If it is a dictionary of the following form: 257 | 258 | .. code-block:: yaml 259 | 260 | parse_dates: 261 | output_col_name: 262 | - col_a 263 | - col_b 264 | 265 | it will parse columns ``col_a`` and ``col_b`` as datetime columns, and put the result in a column named ``output_col_name``. Specifying the output name is optional. You may declare the schema as a list, as follows: 266 | 267 | .. code-block:: yaml 268 | 269 | parse_dates: 270 | - col_a 271 | - col_b 272 | 273 | In this case the parser will independently parse columns ``col_a`` and ``col_b`` as datetime columns. 274 | 275 | *NOTE*: Specifying this parameter will make PySemantic ignore any columns that have been declared as having the datetime type in the ``dtypes`` parameter. 276 | 277 | * ``pickle`` (Optional) Absolute path to a file which contains pickled arguments for the 278 | parser. This option can be used if readability or declarativeness is not a concern. The file should contain a pickled dictionary that is directly passed 279 | to the parser, i.e. if the loaded pickled data is in a dict named ``data``, 280 | then the parser invocation becomes ``parser(**data)``. 281 | 282 | *NOTE*: If any of the above options are present, they will override the corresponding arguments contained in the pickle file. In PySemantic, declarative statements have the right of way. 283 | 284 | ---------------------------- 285 | Column Schema Configuration 286 | ---------------------------- 287 | 288 | PySemantic also allows specifying rules and validators independently for each 289 | column. This can be done using the ``column_rules`` parameter of the dataset 290 | schema. Here is a typical format: 291 | 292 | .. code-block:: yaml 293 | 294 | dataset_name: 295 | column_rules: 296 | column_1_name: 297 | # rules to be applied to the column 298 | column_2_name: 299 | # rules to be applied to the column 300 | 301 | The following parameters can be supplied to any column under ``column_rules``: 302 | 303 | * ``is_drop_na`` ([true|false], default false) Setting this to ``true`` causes PySemantic to drop all NA values in the column.
304 | * ``is_drop_duplicates`` ([true|false], default false) Setting this to ``true`` causes PySemantic to drop all duplicated values in the column. 305 | * ``unique_values``: These are the unique values that are expected in a column. The value of this parameter has to be a YAML list. Any value not found in this list will be dropped when cleaning the dataset. 306 | * ``exclude``: These are the values that are to be explicitly excluded from the column. This comes in handy when a column has too many unique values, and a handful of them have to be dropped. Note that this value has to be a list. 307 | * ``min``: Minimum value allowed in a column if the column holds numerical data. By default, the minimum is -np.inf. Any value less than this one is dropped. 308 | * ``max``: Maximum value allowed in a column if the column holds numerical data. By default, the maximum is np.inf. Any value greater than this one is dropped. 309 | * ``regex``: A regular expression that each element of the column must match, if the column holds text data. Any element of the column not matching this regex is dropped. 310 | * ``na_values``: A list of values that are considered as NAs by the pandas parsers, applicable to this column. 311 | * ``postprocessors``: A list of callables that are called one by one on the column. Any Python function that accepts a series and returns a series can be a postprocessor. 312 | 313 | 314 | Here is a more extensive example of the usage of this schema. 315 | 316 | .. code-block:: yaml 317 | 318 | iris: 319 | path: /home/username/src/pysemantic/testdata/iris.csv 320 | converters: 321 | Sepal Width: !!python/name:numpy.floor 322 | column_rules: 323 | Sepal Length: 324 | min: 2.0 325 | Petal Length: 326 | max: 4.0 327 | Petal Width: 328 | exclude: 329 | - 3.14 330 | Species: 331 | unique_values: 332 | - setosa 333 | - versicolor 334 | postprocessors: 335 | - !!python/name:module_name.foo 336 | 337 | This would cause PySemantic to produce a dataframe corresponding to the Fisher 338 | iris dataset, which has the following characteristics: 339 | 340 | 1. It contains no observations where the sepal length is less than 2 cm. 341 | 2. It contains no observations where the petal length is more than 4 cm. 342 | 3. The sepal width only contains integers. 343 | 4. The petal width column will not contain the specific value 3.14. 344 | 5. The species column will only contain the values "setosa" and "versicolor", i.e. it will not contain the value "virginica". 345 | 6. The species column in the dataframe will be processed by the ``module_name.foo`` function. 346 | 347 | 348 | ------------------------------ 349 | DataFrame Schema Configuration 350 | ------------------------------ 351 | 352 | A few rules can also be enforced at the dataframe level, instead of at the 353 | level of individual columns in the dataset. They are: 354 | 355 | * ``drop_duplicates`` ([true|false], default true). This behaves in the same 356 | way as ``is_drop_duplicates`` for series schema, with the exception that here 357 | the default is True. 358 | * ``drop_na`` ([true|false], default true). This behaves in the same 359 | way as ``is_drop_na`` for series schema, with the exception that here 360 | the default is True. 361 | 362 | 363 | ---------------- 364 | Reading from SQL 365 | ---------------- 366 | 367 | *Note*: This has not yet been tested.
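The parameters recognized for SQL-backed datasets are described in the paragraphs that follow. As a quick orientation, here is a minimal sketch of what such a dataset entry could look like; the dataset name, table name and connection details below are hypothetical and stand in for your own values:

.. code-block:: yaml

    users_table:
      source: mysql
      table_name: users
      config:
        hostname: 127.0.0.1
        db_name: my_database
        username: my_username
        password: my_password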
368 | 369 | PySemantic can automatically create the function calls required to download a 370 | SQL table as a dataframe - by using a wrapper around the 371 | ``pandas.read_sql_table`` function. The configuration parameters are as 372 | follows: 373 | 374 | * ``source``: This is simply a string saying "mysql", which lets pysemantic 375 | know that the dataset is to be downloaded from a MySQL database. 376 | * ``table_name``: Name of the table to be read. If this argument is not 377 | present, pysemantic expects to find the ``query`` parameter. 378 | * ``query``: SQL query to run and extract the resulting rows into a pandas 379 | dataframe 380 | * ``config``: This is a dictionary that contains the configuration required to 381 | connect to the MySQL server. The configuration must have the following 382 | elements: 383 | 384 | 1. ``hostname``: The IP address or the hostname of the machine hosting the MySQL server. 385 | 2. ``db_name``: Name of the database from which to read the table. 386 | 3. ``username``: The SQL username 387 | 4. ``password``: The SQL password 388 | * ``chunksize``: (Integer, optional) If this is specified, Pandas returns an 389 | iterator in which every iteration contains ``chunksize`` rows. 390 | -------------------------------------------------------------------------------- /docs/tutorial/notebooks/demo_project.yml: -------------------------------------------------------------------------------- 1 | dummy_data: 2 | path: /home/jaidev/src/pysemantic/docs/tutorial/notebooks/dummy_data.csv 3 | dtypes: 4 | zip: !!python/name:__builtin__.str 5 | date: !!python/name:datetime.date 6 | column_rules: 7 | X: 8 | maximum: 9.0 9 | email: 10 | regex: (^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$) 11 | -------------------------------------------------------------------------------- /docs/tutorial/notebooks/loading_datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "What we did last time\n", 8 | "------------------------\n", 9 | "\n", 10 | "1. Specified the data type of a column\n", 11 | "2. Parsed timestamps into Pythonic timestamps\n", 12 | "3. Dropped outliers from a numerical array\n", 13 | "4. Checked text for valid email addresses" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "from pysemantic import Project\n", 25 | "import numpy as np" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "demo = Project(\"demo\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "data = demo.load_dataset(\"dummy_data\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | "
datezipXemail
02015-02-21 01:05:03136115.014501jeff.dasovich@enron.com
22015-02-27 10:16:34028888.918459karen.denne@enron.com
32015-02-20 19:11:00078275.664665enron-owner@lists.qgadc.com
42015-02-21 13:20:11238876.159554jeff.dasovich@enron.com
52015-02-22 04:17:01354615.618556jeff.dasovich@enron.com
\n", 109 | "
" 110 | ], 111 | "text/plain": [ 112 | " date zip X email\n", 113 | "0 2015-02-21 01:05:03 13611 5.014501 jeff.dasovich@enron.com\n", 114 | "2 2015-02-27 10:16:34 02888 8.918459 karen.denne@enron.com\n", 115 | "3 2015-02-20 19:11:00 07827 5.664665 enron-owner@lists.qgadc.com\n", 116 | "4 2015-02-21 13:20:11 23887 6.159554 jeff.dasovich@enron.com\n", 117 | "5 2015-02-22 04:17:01 35461 5.618556 jeff.dasovich@enron.com" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "data.head()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "date datetime64[ns]\n", 141 | "zip object\n", 142 | "X float64\n", 143 | "email object\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "for column in data:\n", 149 | " print column, data[column].dtype" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "False\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "zip_lengths = data['zip'].apply(len)\n", 169 | "print np.any(zip_lengths != 5)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 7, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "8.99728774235\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "print data['X'].max()" 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 2", 195 | "language": "python", 196 | "name": "python2" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 2 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython2", 208 | "version": "2.7.9" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 0 213 | } 214 | -------------------------------------------------------------------------------- /docs/tutorial/slides/images/dc_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaidevd/pysemantic/1b928446e431a69060bbc9d29b8a7c7a6f2b8c0c/docs/tutorial/slides/images/dc_logo.jpg -------------------------------------------------------------------------------- /docs/tutorial/slides/presentation.tex: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | % Beamer Presentation 3 | % LaTeX Template 4 | % Version 1.0 (10/11/12) 5 | % 6 | % This template has been downloaded from: 7 | % http://www.LaTeXTemplates.com 8 | % 9 | % License: 10 | % CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/) 11 | % 12 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 13 | 14 | %---------------------------------------------------------------------------------------- 15 | % PACKAGES AND THEMES 16 | %---------------------------------------------------------------------------------------- 17 | 18 | 19 | 20 | \documentclass{beamer} 21 | 22 | 23 | \definecolor{dcorange}{HTML}{F05A28} 24 | \setbeamercolor{structure}{bg=black, fg=dcorange} 25 | 26 | 27 | 
\usetheme{Warsaw} 28 | 29 | \usepackage{graphicx} % Allows including images 30 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 31 | \usepackage{hyperref} % Allows the use of \toprule, \midrule and \bottomrule in tables 32 | \usepackage{textpos} 33 | 34 | %---------------------------------------------------------------------------------------- 35 | % TITLE PAGE 36 | %---------------------------------------------------------------------------------------- 37 | 38 | \titlegraphic{\includegraphics[width=.6\textwidth,height=.3\textheight]{images/dc_logo.jpg}} 39 | \title[Introduction to PySemantic]{Introduction to PySemantic} 40 | % The short title appears at the bottom of every slide, the full title is only on the title page 41 | 42 | %\titlegraphic{\includegraphics[width=.5\textwidth,height=.5\textheight]{dc_logo.jpg}} 43 | \author{Jaidev Deshpande} % Your name 44 | \institute[Cube26 Pvt Limited] % Your institution as it will appear on the bottom of every slide, may be shorthand to save space 45 | 46 | 47 | %{ 48 | %Cube26 Pvt Ltd \\ % Your institution for the title page 49 | %%\includegraphics[width=2cm]{dc_logo.jpg} 50 | %\medskip 51 | %\textit{deshpande.jaidev@gmail.com} % Your email address 52 | %} 53 | \date{\today} % Date, can be changed to a custom date 54 | 55 | %\titlegraphic{\includegraphics[width=2cm]{dc_logo.jpg}} 56 | 57 | %\addtobeamertemplate{frametitle}{}{% 58 | % \begin{textblock*}{100mm}(.85\textwidth,-1cm) 59 | % \includegraphics[height=1cm,width=2cm]{dc_logo.jpg} 60 | %\end{textblock*}} 61 | 62 | 63 | 64 | \begin{document} 65 | 66 | \begin{frame} 67 | \titlepage % Print the title page as the first slide 68 | \end{frame} 69 | 70 | \begin{frame} 71 | \frametitle{Motivation} 72 | \begin{itemize} 73 | \item Typical data analysis pipeline:\\ 74 | Data Ingest $\rightarrow$ Exploratory Analysis $\rightarrow$ Feature Engineering $\rightarrow$ Machine Learning $\rightarrow$ Insights! 75 | \item Data scientists often work in large teams. 76 | \item Communication about data ingest is important. 77 | \item Messy data $\Rightarrow$ more communication. 78 | \end{itemize} 79 | \end{frame} 80 | 81 | \begin{frame} 82 | \frametitle{Why PySemantic?} 83 | \begin{itemize} 84 | \item Problem: How do I effectively communicate about data? 85 | \item Existing solutions:\\ 86 | \begin{enumerate} 87 | \item Text documentation 88 | \item Ad-hoc scripts to clean or validate the data 89 | \item Version control 90 | \end{enumerate} 91 | \item Don't scale with the diversity of the data. 92 | \item The process is \textit{reactive} 93 | \item The process is unnecessarily redundant. 94 | \end{itemize} 95 | \end{frame} 96 | 97 | \begin{frame} 98 | \frametitle{Why PySemantic?} 99 | \begin{itemize} 100 | \item Group all datasets under \textit{projects}. 101 | \item A centralized data dictionary that holds properties of all 102 | datasets under a project. 103 | \item A single entry point into the software framework required for 104 | reading, cleaning and validating a dataset. 105 | \item Reproducibility across teams and individuals. 
106 | \end{itemize} 107 | \end{frame} 108 | 109 | \begin{frame} 110 | \frametitle{Getting Started} 111 | \url{https://github.com/jaidevd/pysemantic} 112 | \end{frame} 113 | 114 | 115 | %---------------------------------------------------------------------------------------- 116 | 117 | \end{document} 118 | -------------------------------------------------------------------------------- /pysemantic/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | 10 | from pysemantic.project import Project 11 | 12 | __version__ = "0.1.1" 13 | 14 | __all__ = ['Project', 'test'] 15 | -------------------------------------------------------------------------------- /pysemantic/cli.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3 clause license. 8 | 9 | """semantic 10 | 11 | Usage: 12 | semantic list [--project=] 13 | semantic add PROJECT_NAME PROJECT_SPECFILE 14 | semantic remove PROJECT_NAME [--dataset=] 15 | semantic set-schema PROJECT_NAME SCHEMA_FPATH 16 | semantic set-specs PROJECT_NAME --dataset= [--path=] [--dlm=] 17 | semantic add-dataset DATASET_NAME --project= --path= --dlm= 18 | semantic export PROJECT_NAME [--dataset=] OUTPATH 19 | 20 | Options: 21 | -h --help Show this screen 22 | -d --dataset= Name of the dataset to modify 23 | --path= Path to a dataset 24 | --dlm= Declare the delimiter for a dataset 25 | -p --project= Name of the project to modify 26 | -v --version Print the version of PySemantic 27 | 28 | """ 29 | 30 | import os.path as op 31 | 32 | from docopt import docopt 33 | 34 | from pysemantic import project as pr 35 | from pysemantic.errors import MissingProject 36 | 37 | 38 | def cli(arguments): 39 | """cli - The main CLI argument parser. 40 | 41 | :param arguments: command line arguments, as parsed by docopt 42 | :type arguments: dict 43 | :return: None 44 | """ 45 | if arguments.get("list", False): 46 | if arguments['--project'] is None: 47 | pr.view_projects() 48 | else: 49 | proj_name = arguments.get('--project') 50 | dataset_names = pr.get_datasets(proj_name) 51 | for name in dataset_names: 52 | print name 53 | elif arguments.get("add", False): 54 | proj_name = arguments.get("PROJECT_NAME") 55 | proj_spec = arguments.get("PROJECT_SPECFILE") 56 | proj_spec = op.abspath(proj_spec) 57 | pr.add_project(proj_name, proj_spec) 58 | elif arguments.get("remove", False): 59 | proj_name = arguments.get("PROJECT_NAME") 60 | if arguments['--dataset'] is None: 61 | if not pr.remove_project(proj_name): 62 | print "The project {0} doesn't exist.".format(proj_name) 63 | else: 64 | pr.remove_dataset(proj_name, arguments['--dataset']) 65 | elif arguments.get("set-schema", False): 66 | try: 67 | proj_name = arguments.get("PROJECT_NAME") 68 | proj_spec = arguments.get("SCHEMA_FPATH") 69 | proj_spec = op.abspath(proj_spec) 70 | pr.set_schema_fpath(proj_name, proj_spec) 71 | except MissingProject: 72 | msg = """Project {} not found in the configuration. 
Please use 73 | $ semantic add 74 | to register the project.""".format(arguments.get("PROJECT_NAME")) 75 | print msg 76 | elif arguments.get("set-specs", False): 77 | proj_name = arguments.get("PROJECT_NAME") 78 | dataset_name = arguments.get("--dataset") 79 | newspecs = {} 80 | if arguments.get("--path", False): 81 | newspecs['path'] = arguments.get("--path") 82 | if arguments.get("--dlm", False): 83 | newspecs['delimiter'] = arguments.get("--dlm") 84 | pr.set_schema_specs(proj_name, dataset_name, **newspecs) 85 | elif arguments.get("add-dataset", False): 86 | proj_name = arguments.get('--project') 87 | dataset_name = arguments.get("DATASET_NAME") 88 | specs = dict(path=arguments["--path"], delimiter=arguments["--dlm"]) 89 | pr.add_dataset(proj_name, dataset_name, specs) 90 | elif arguments.get("export", False): 91 | project = pr.Project(arguments.get("PROJECT_NAME")) 92 | project.export_dataset(arguments.get("--dataset"), 93 | outpath=arguments.get("OUTPATH")) 94 | 95 | 96 | def main(): 97 | arguments = docopt(__doc__, version="semantic v0.1.1") 98 | cli(arguments) 99 | -------------------------------------------------------------------------------- /pysemantic/custom_traits.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Customized traits for advanced validation.""" 10 | 11 | import os.path as op 12 | 13 | from traits.api import File, List, TraitError 14 | 15 | 16 | class ValidTraitList(List): 17 | 18 | """A List trait whose every element should be valid trait.""" 19 | 20 | def validate(self, obj, name, value): 21 | validated_value = super(ValidTraitList, self).validate(obj, name, 22 | value) 23 | for trait_name in validated_value: 24 | trait = obj.trait(trait_name) 25 | trait.validate(obj, trait_name, getattr(obj, trait_name)) 26 | return validated_value 27 | 28 | 29 | class AbsFile(File): 30 | 31 | """A File trait whose value must be an absolute path, to an existing 32 | file. 33 | """ 34 | 35 | exists = True 36 | 37 | def validate(self, obj, name, value): 38 | validated_value = super(AbsFile, self).validate(obj, name, value) 39 | if validated_value and op.isabs(validated_value) and op.isfile(value): 40 | return validated_value 41 | elif not op.isfile(value): 42 | raise TraitError("The filepath does not exist.") 43 | 44 | self.error(obj, name, value) 45 | -------------------------------------------------------------------------------- /pysemantic/errors.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Errors.""" 10 | 11 | 12 | class MissingProject(Exception): 13 | 14 | """Error raised when project is not found.""" 15 | 16 | 17 | class MissingConfigError(Exception): 18 | 19 | """Error raised when the pysemantic configuration file is not found.""" 20 | 21 | 22 | class ParserArgumentError(Exception): 23 | 24 | """Error raised when no valid parser arguments are inferred from the 25 | schema.""" 26 | -------------------------------------------------------------------------------- /pysemantic/exporters.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """ 10 | Exporters from PySemantic to databases or other data sinks. 11 | """ 12 | 13 | 14 | class AbstractExporter(object): 15 | """Abstract exporter for dataframes that have been cleaned.""" 16 | 17 | def get(self, **kwargs): 18 | raise NotImplementedError 19 | 20 | def set(self, **kwargs): 21 | raise NotImplementedError 22 | 23 | 24 | class AerospikeExporter(AbstractExporter): 25 | """Example class for exporting to an aerospike database.""" 26 | 27 | def __init__(self, config, dataframe): 28 | self.dataframe = dataframe 29 | self.namespace = config['namespace'] 30 | self.set_name = config['set'] 31 | self.port = config['port'] 32 | self.hostname = config['hostname'] 33 | 34 | def set(self, key_tuple, bins): 35 | self.client.put(key_tuple, bins) 36 | 37 | def run(self): 38 | import aerospike 39 | self.client = aerospike.client({'hosts': [(self.hostname, 40 | self.port)], 41 | 'policies':{'timeout': 60000}}).connect() 42 | for ix in self.dataframe.index: 43 | self.set((self.namespace, self.set_name, ix), 44 | self.dataframe.ix[ix].to_dict()) 45 | self.client.close() 46 | -------------------------------------------------------------------------------- /pysemantic/loggers.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Loggers""" 10 | 11 | import os 12 | import os.path as op 13 | import logging 14 | import time 15 | 16 | 17 | LOGDIR = op.join(op.expanduser("~"), ".pysemantic") 18 | if not op.exists(LOGDIR): 19 | os.mkdir(LOGDIR) 20 | 21 | 22 | def setup_logging(project_name): 23 | logfile = "{0}_{1}.log".format(project_name, time.time()) 24 | logging.basicConfig(filename=op.join(LOGDIR, logfile), 25 | level=logging.INFO) 26 | logging.info("Project {0} started.".format(project_name)) 27 | -------------------------------------------------------------------------------- /pysemantic/project.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """The Project class.""" 10 | 11 | import os 12 | import textwrap 13 | import pprint 14 | import logging 15 | import json 16 | from ConfigParser import RawConfigParser 17 | import os.path as op 18 | import yaml 19 | import pandas as pd 20 | import numpy as np 21 | from pysemantic.validator import SchemaValidator, DataFrameValidator, \ 22 | ParseErrorHandler 23 | from pysemantic.errors import MissingProject, MissingConfigError, \ 24 | ParserArgumentError 25 | from pysemantic.loggers import setup_logging 26 | from pysemantic.utils import TypeEncoder 27 | from pysemantic.exporters import AerospikeExporter 28 | 29 | try: 30 | from yaml import CDumper as Dumper 31 | from yaml import CLoader as Loader 32 | except ImportError: 33 | from yaml import Dumper 34 | from yaml import Loader 35 | 36 | CONF_FILE_NAME = os.environ.get("PYSEMANTIC_CONFIG", "pysemantic.conf") 37 | logger = logging.getLogger(__name__) 38 | 39 | 40 | def locate_config_file(): 41 | """Locates the configuration file used by semantic. 42 | 43 | :return: Path of the pysemantic config file. 
44 | :rtype: str 45 | :Example: 46 | 47 | >>> locate_config_file() 48 | '/home/username/pysemantic.conf' 49 | """ 50 | paths = [op.join(os.getcwd(), CONF_FILE_NAME), 51 | op.join(op.expanduser('~'), CONF_FILE_NAME)] 52 | for path in paths: 53 | if op.exists(path): 54 | logger.info("Config file found at {0}".format(path)) 55 | return path 56 | raise MissingConfigError("No pysemantic configuration file was found at" 57 | " {0} or {1}".format(*paths)) 58 | 59 | 60 | def get_default_specfile(project_name): 61 | """Returns the specifications file used by the given project. The \ 62 | configuration file is searched for first in the current directory \ 63 | and then in the home directory. 64 | 65 | :param project_name: Name of the project for which to get the specfile. 66 | :type project_name: str 67 | :return: Path to the data dictionary of the project. 68 | :rtype: str 69 | :Example: 70 | 71 | >>> get_default_specfile('skynet') 72 | '/home/username/projects/skynet/schema.yaml' 73 | """ 74 | path = locate_config_file() 75 | parser = RawConfigParser() 76 | parser.read(path) 77 | return parser.get(project_name, 'specfile') 78 | 79 | 80 | def add_project(project_name, specfile): 81 | """Add a project to the global configuration file. 82 | 83 | :param project_name: Name of the project 84 | :param specfile: path to the data dictionary used by the project. 85 | :type project_name: str 86 | :type specfile: str 87 | :return: None 88 | """ 89 | if not op.isabs(specfile): 90 | raise ValueError("Path to the schema should be absolute.") 91 | path = locate_config_file() 92 | parser = RawConfigParser() 93 | parser.read(path) 94 | parser.add_section(project_name) 95 | parser.set(project_name, "specfile", specfile) 96 | with open(path, "w") as f: 97 | parser.write(f) 98 | 99 | 100 | def add_dataset(project_name, dataset_name, dataset_specs): 101 | """Add a dataset to a project. 102 | 103 | :param project_name: Name of the project to which the dataset is to be \ 104 | added. 105 | :param dataset_name: Name of the dataset to be added. 106 | :param dataset_specs: Specifications of the dataset. 107 | :type project_name: str 108 | :type dataset_name: str 109 | :type dataset_specs: dict 110 | :return: None 111 | """ 112 | data_dict = get_default_specfile(project_name) 113 | with open(data_dict, "r") as f: 114 | spec = yaml.load(f, Loader=Loader) 115 | spec[dataset_name] = dataset_specs 116 | with open(data_dict, "w") as f: 117 | yaml.dump(spec, f, Dumper=Dumper, default_flow_style=False) 118 | 119 | 120 | def remove_dataset(project_name, dataset_name): 121 | """Removes a dataset from a project. 122 | 123 | :param project_name: Name of the project 124 | :param dataset_name: Name of the dataset to remove 125 | :type project_name: str 126 | :type dataset_name: str 127 | :return: None 128 | """ 129 | data_dict = get_default_specfile(project_name) 130 | with open(data_dict, "r") as f: 131 | spec = yaml.load(f, Loader=Loader) 132 | del spec[dataset_name] 133 | with open(data_dict, "w") as f: 134 | yaml.dump(spec, f, Dumper=Dumper, default_flow_style=False) 135 | 136 | 137 | def get_datasets(project_name=None): 138 | """Get names of all datasets registered under the project `project_name`. 139 | 140 | :param project_name: name of the project to list the datasets from. If \ 141 | `None` (default), datasets under all projects are returned.
142 | :type project_name: str 143 | :return: List of datasets listed under `project_name`, or if \ 144 | `project_name` is `None`, returns dictionary such that \ 145 | {project_name: [list of projects]} 146 | :rtype: dict or list 147 | :Example: 148 | 149 | >>> get_datasets('skynet') 150 | ['sarah_connor', 'john_connor', 'kyle_reese'] 151 | >>> get_datasets() 152 | {'skynet': ['sarah_connor', 'john_connor', 'kyle_reese'], 153 | 'south park': ['stan', 'kyle', 'cartman', 'kenny']} 154 | """ 155 | if project_name is not None: 156 | specs = get_schema_specs(project_name) 157 | return specs.keys() 158 | else: 159 | dataset_names = {} 160 | projects = get_projects() 161 | for project_name, _ in projects: 162 | dataset_names[project_name] = get_datasets(project_name) 163 | return dataset_names 164 | 165 | 166 | def set_schema_fpath(project_name, schema_fpath): 167 | """Set the schema path for a given project. 168 | 169 | :param project_name: Name of the project 170 | :param schema_fpath: path to the yaml file to be used as the schema for \ 171 | the project. 172 | :type project_name: str 173 | :type schema_fpath: str 174 | :return: True, if setting the schema path was successful. 175 | :Example: 176 | 177 | >>> set_schema_fpath('skynet', '/path/to/new/schema.yaml') 178 | True 179 | """ 180 | path = locate_config_file() 181 | parser = RawConfigParser() 182 | parser.read(path) 183 | if project_name in parser.sections(): 184 | if not parser.remove_option(project_name, "specfile"): 185 | raise MissingProject 186 | else: 187 | parser.set(project_name, "specfile", schema_fpath) 188 | with open(path, "w") as f: 189 | parser.write(f) 190 | return True 191 | raise MissingProject 192 | 193 | 194 | def get_projects(): 195 | """Get the list of projects currently registered with pysemantic as a 196 | list. 197 | 198 | :return: List of tuples, such that each tuple is (project_name, \ 199 | location_of_specfile) 200 | :rtype: list 201 | :Example: 202 | 203 | >>> get_projects() 204 | ['skynet', 'south park'] 205 | """ 206 | path = locate_config_file() 207 | parser = RawConfigParser() 208 | parser.read(path) 209 | projects = [] 210 | for section in parser.sections(): 211 | project_name = section 212 | specfile = parser.get(section, "specfile") 213 | projects.append((project_name, specfile)) 214 | return projects 215 | 216 | 217 | def get_schema_specs(project_name, dataset_name=None): 218 | """Get the specifications of a dataset as specified in the schema. 219 | 220 | :param project_name: Name of project 221 | :param dataset_name: name of the dataset for which to get the schema. If \ 222 | None (default), schema for all datasets is returned. 223 | :type project_name: str 224 | :type dataset_name: str 225 | :return: schema for dataset 226 | :rtype: dict 227 | :Example: 228 | 229 | >>> get_schema_specs('skynet') 230 | {'sarah connor': {'path': '/path/to/sarah_connor.csv', 231 | 'delimiter': ','}, 232 | 'kyle reese': {'path': '/path/to/kyle_reese.tsv', 233 | 'delimiter':, '\t'} 234 | 'john connor': {'path': '/path/to/john_connor.txt', 235 | 'delimiter':, ' '} 236 | } 237 | """ 238 | schema_file = get_default_specfile(project_name) 239 | with open(schema_file, "r") as f: 240 | specs = yaml.load(f, Loader=Loader) 241 | if dataset_name is not None: 242 | return specs[dataset_name] 243 | return specs 244 | 245 | 246 | def set_schema_specs(project_name, dataset_name, **kwargs): 247 | """Set the schema specifications for a dataset. 248 | 249 | :param project_name: Name of the project containing the dataset. 
250 | :param dataset_name: Name of the dataset for which the schema is being set. 251 | :param kwargs: Schema fields that are dumped into the schema files. 252 | :type project_name: str 253 | :type dataset_name: str 254 | :return: None 255 | :Example: 256 | 257 | >>> set_schema_specs('skynet', 'kyle reese', 258 | path='/path/to/new/file.csv', delimiter=new_delimiter) 259 | """ 260 | schema_file = get_default_specfile(project_name) 261 | with open(schema_file, "r") as f: 262 | specs = yaml.load(f, Loader=Loader) 263 | for key, value in kwargs.iteritems(): 264 | specs[dataset_name][key] = value 265 | with open(schema_file, "w") as f: 266 | yaml.dump(specs, f, Dumper=Dumper, default_flow_style=False) 267 | 268 | 269 | def view_projects(): 270 | """View a list of all projects currently registered with pysemantic. 271 | 272 | :Example: 273 | 274 | >>> view_projects() 275 | Project skynet with specfile at /path/to/skynet.yaml 276 | Project south park with specfile at /path/to/south_park.yaml 277 | """ 278 | projects = get_projects() 279 | if len(projects) > 0: 280 | for project_name, specfile in projects: 281 | print "Project {0} with specfile at {1}".format(project_name, 282 | specfile) 283 | else: 284 | msg = textwrap.dedent("""\ 285 | No projects found. You can add projects using the 286 | $ semantic add 287 | command. 288 | """) 289 | print msg 290 | 291 | 292 | def remove_project(project_name): 293 | """Remove a project from the global configuration file. 294 | 295 | :param project_name: Name of the project to remove. 296 | :type project_name: str 297 | :return: True if the project existed 298 | :rtype: bool 299 | :Example: 300 | 301 | >>> view_projects() 302 | Project skynet with specfile at /path/to/skynet.yaml 303 | Project south park with specfile at /path/to/south_park.yaml 304 | >>> remove_project('skynet') 305 | >>> view_projects() 306 | Project south park with specfile at /path/to/south_park.yaml 307 | """ 308 | path = locate_config_file() 309 | parser = RawConfigParser() 310 | parser.read(path) 311 | result = parser.remove_section(project_name) 312 | if result: 313 | with open(path, "w") as f: 314 | parser.write(f) 315 | return result 316 | 317 | 318 | class Project(object): 319 | """The Project class, the entry point for most things in this module.""" 320 | 321 | def __init__(self, project_name=None, parser=None, schema=None): 322 | """The Project class. 323 | 324 | :param project_name: Name of the project as specified in the \ 325 | pysemantic configuration file. If this is ``None``, then the 326 | ``schema`` parameter is expected to contain the schema 327 | dictionary. (see below) 328 | :param parser: The parser to be used for reading dataset files. The \ 329 | default is `pandas.read_table`. 330 | :param schema: Dictionary containing the schema for the project. When 331 | this argument is supplied (not ``None``), the ``project_name`` is 332 | ignored, no specfile is read, and all the specifications for the data 333 | are inferred from this dictionary. 334 | """ 335 | if project_name is not None: 336 | setup_logging(project_name) 337 | self.project_name = project_name 338 | self.specfile = get_default_specfile(self.project_name) 339 | logger.info( 340 | "Schema for project {0} found at {1}".format(project_name, 341 | self.specfile)) 342 | else: 343 | setup_logging("no_name") 344 | logger.info("Schema defined by user at runtime.
Not reading any " 345 | "specfile.") 346 | self.specfile = None 347 | self.validators = {} 348 | if parser is not None: 349 | self.user_specified_parser = True 350 | else: 351 | self.user_specified_parser = False 352 | self.parser = parser 353 | if self.specfile is not None: 354 | with open(self.specfile, 'r') as f: 355 | specifications = yaml.load(f, Loader=Loader) 356 | else: 357 | specifications = schema 358 | self.column_rules = {} 359 | self.df_rules = {} 360 | for name, specs in specifications.iteritems(): 361 | self.column_rules[name] = specs.get('column_rules', {}) 362 | self.df_rules[name] = specs.get('dataframe_rules', {}) 363 | self.specifications = specifications 364 | 365 | def export_dataset(self, dataset_name, dataframe=None, outpath=None): 366 | """Export a dataset to an exporter defined in the schema. If nothing is 367 | specified in the schema, simply export to a CSV file such named 368 | .csv 369 | 370 | :param dataset_name: Name of the dataset to exporter. 371 | :param dataframe: Pandas dataframe to export. If None (default), this \ 372 | dataframe is loaded using the `load_dataset` method. 373 | :type dataset_name: Str 374 | """ 375 | if dataframe is None: 376 | dataframe = self.load_dataset(dataset_name) 377 | config = self.specifications[dataset_name].get('exporter') 378 | if outpath is None: 379 | outpath = dataset_name + ".csv" 380 | if config is not None: 381 | if config['kind'] == "aerospike": 382 | config['namespace'] = self.project_name 383 | config['set'] = dataset_name 384 | exporter = AerospikeExporter(config, dataframe) 385 | exporter.run() 386 | else: 387 | suffix = outpath.split('.')[-1] 388 | if suffix in ("h5", "hdf"): 389 | group = r'/{0}/{1}'.format(self.project_name, dataset_name) 390 | dataframe.to_hdf(outpath, group) 391 | elif suffix == "csv": 392 | dataframe.to_csv(outpath, index=False) 393 | 394 | def reload_data_dict(self): 395 | """Reload the data dictionary and re-populate the schema.""" 396 | 397 | with open(self.specfile, "r") as f: 398 | specifications = yaml.load(f, Loader=Loader) 399 | self.validators = {} 400 | self.column_rules = {} 401 | self.df_rules = {} 402 | logger.info("Reloading project information.") 403 | self.specifications = specifications 404 | for name, specs in specifications.iteritems(): 405 | logger.info("Schema for dataset {0}:".format(name)) 406 | logger.info(json.dumps(specs, cls=TypeEncoder)) 407 | self._init_validate(name) 408 | self.column_rules[name] = specs.get('column_rules', {}) 409 | self.df_rules[name] = specs.get('dataframe_rules', {}) 410 | 411 | @property 412 | def datasets(self): 413 | """"List the datasets registered under the parent project. 414 | 415 | :Example: 416 | 417 | >>> project = Project('skynet') 418 | >>> project.datasets 419 | ['sarah connor', 'john connor', 'kyle reese'] 420 | """ 421 | return self.specifications.keys() 422 | 423 | def _init_validate(self, dataset_name): 424 | """Given a dataset name, create a SchemaValidator object and add to the 425 | cache. 
426 | 427 | :param dataset_name: Name of the dataset 428 | """ 429 | specs = self.specifications.get(dataset_name) 430 | is_pickled = specs.get("pickle", False) 431 | if self.specfile is not None: 432 | validator = SchemaValidator.from_specfile(specfile=self.specfile, 433 | name=dataset_name, 434 | is_pickled=is_pickled) 435 | else: 436 | validator = SchemaValidator(specification=specs, 437 | name=dataset_name, 438 | is_pickled=is_pickled) 439 | self.validators[dataset_name] = validator 440 | 441 | def get_dataset_specs(self, dataset_name): 442 | """Returns the specifications for the specified dataset in the project. 443 | 444 | :param dataset_name: Name of the dataset 445 | :type dataset_name: str 446 | :return: Parser arguments required to import the dataset in pandas. 447 | :rtype: dict 448 | """ 449 | if dataset_name not in self.validators: 450 | self._init_validate(dataset_name) 451 | return self.validators[dataset_name].get_parser_args() 452 | 453 | def get_project_specs(self): 454 | """Returns a dictionary containing the schema for all datasets listed 455 | under this project. 456 | 457 | :return: Parser arguments for all datasets listed under the project. 458 | :rtype: dict 459 | """ 460 | specs = {} 461 | for name, basespecs in self.specifications.iteritems(): 462 | if name not in self.validators: 463 | self._init_validate(name) 464 | validator = self.validators[name] 465 | specs[name] = validator.get_parser_args() 466 | return specs 467 | 468 | def view_dataset_specs(self, dataset_name): 469 | """Pretty print the specifications for a dataset. 470 | 471 | :param dataset_name: Name of the dataset 472 | :type dataset_name: str 473 | """ 474 | specs = self.get_dataset_specs(dataset_name) 475 | pprint.pprint(specs) 476 | 477 | def update_dataset(self, dataset_name, dataframe, path=None, **kwargs): 478 | """This is tricky.""" 479 | org_specs = self.get_dataset_specs(dataset_name) 480 | if path is None: 481 | path = org_specs['filepath_or_buffer'] 482 | sep = kwargs.get('sep', org_specs['sep']) 483 | index = kwargs.get('index', False) 484 | dataframe.to_csv(path, sep=sep, index=index) 485 | dtypes = {} 486 | for col in dataframe: 487 | dtype = dataframe[col].dtype 488 | if dtype == np.dtype('O'): 489 | dtypes[col] = str 490 | elif dtype == np.dtype('float'): 491 | dtypes[col] = float 492 | elif dtype == np.dtype('int'): 493 | dtypes[col] = int 494 | else: 495 | dtypes[col] = dtype 496 | new_specs = {'path': path, 'delimiter': sep, 'dtypes': dtypes} 497 | with open(self.specfile, "r") as fid: 498 | specs = yaml.load(fid, Loader=Loader) 499 | dataset_specs = specs[dataset_name] 500 | dataset_specs.update(new_specs) 501 | if "column_rules" in dataset_specs: 502 | col_rules = dataset_specs['column_rules'] 503 | cols_to_remove = [] 504 | for colname in col_rules.iterkeys(): 505 | if colname not in dataframe.columns: 506 | cols_to_remove.append(colname) 507 | for colname in cols_to_remove: 508 | del col_rules[colname] 509 | logger.info("Attempting to update schema for dataset {0} to:".format( 510 | dataset_name)) 511 | logger.info(json.dumps(dataset_specs, cls=TypeEncoder)) 512 | with open(self.specfile, "w") as fid: 513 | yaml.dump(specs, fid, Dumper=Dumper, 514 | default_flow_style=False) 515 | 516 | def _sql_read(self, parser_args): 517 | if parser_args.get('table_name'): 518 | if parser_args.get('query'): 519 | return pd.read_sql_query(sql=parser_args.get('query'), 520 | con=parser_args['con']) 521 | return pd.read_sql_table( 522 | table_name=parser_args.get('table_name'), 523 | 
con=parser_args.get('con'), 524 | columns=parser_args.get('use_columns'), 525 | index_col=parser_args.get('index_col') 526 | ) 527 | elif parser_args.get('query'): 528 | return pd.read_sql_query(sql=parser_args.get('query'), 529 | con=parser_args['con']) 530 | 531 | def _sql_iterator(self, parser_args): 532 | dfs = [] 533 | if parser_args.get('table_name'): 534 | if parser_args.get('query'): 535 | iterator = pd.read_sql_query(sql=parser_args.get('query'), 536 | con=parser_args['con'], 537 | chunksize=parser_args['chunksize']) 538 | else: 539 | iterator = pd.read_sql_table( 540 | table_name=parser_args.get('table_name'), 541 | con=parser_args.get('con'), 542 | chunksize=parser_args.get('chunksize'), 543 | columns=parser_args.get('use_columns'), 544 | index_col=parser_args.get('index_col') 545 | ) 546 | else: 547 | iterator = pd.read_sql_query(sql=parser_args.get('query'), 548 | con=parser_args['con'], 549 | chunksize=parser_args['chunksize']) 550 | while True: 551 | try: 552 | dfs.append(iterator.next()) 553 | except StopIteration: 554 | break 555 | except Exception as err: 556 | logger.debug("SQL iterator failed: {}".format(err)) 557 | break 558 | dfs.append(None) 559 | return pd.concat(dfs) 560 | 561 | def load_dataset(self, dataset_name): 562 | """Load and return a dataset. 563 | 564 | :param dataset_name: Name of the dataset 565 | :type dataset_name: str 566 | :return: A pandas DataFrame containing the dataset. 567 | :rtype: pandas.DataFrame 568 | :Example: 569 | 570 | >>> demo_project = Project('pysemantic_demo') 571 | >>> iris = demo_project.load_dataset('iris') 572 | >>> type(iris) 573 | pandas.core.DataFrame 574 | """ 575 | if dataset_name not in self.validators: 576 | self._init_validate(dataset_name) 577 | validator = self.validators[dataset_name] 578 | column_rules = self.column_rules.get(dataset_name, {}) 579 | df_rules = self.df_rules.get(dataset_name, {}) 580 | parser_args = validator.get_parser_args() 581 | df_rules.update(validator.df_rules) 582 | logger.info("Attempting to load dataset {} with args:".format( 583 | dataset_name)) 584 | if validator.is_spreadsheet: 585 | parser_args.pop('usecols', None) 586 | logger.info(json.dumps(parser_args, cls=TypeEncoder)) 587 | if isinstance(parser_args, dict): 588 | if validator.is_mysql or validator.is_postgresql: 589 | if not ( 590 | parser_args.get('table_name') or parser_args.get('query')): 591 | raise ParserArgumentError( 592 | "No table_name or query was provided for the " 593 | "postgres configuration.") 594 | elif validator.sql_validator.chunksize is not None: 595 | df = self._sql_iterator(parser_args) 596 | else: 597 | df = self._sql_read(parser_args) 598 | else: 599 | with ParseErrorHandler(parser_args, self) as handler: 600 | df = handler.load() 601 | if df is None: 602 | raise ParserArgumentError("No valid parser arguments were " + 603 | "inferred from the schema.") 604 | if validator.is_spreadsheet and isinstance(validator.sheetname, 605 | list): 606 | df = pd.concat(df.itervalues(), axis=0) 607 | logger.info("Success!") 608 | df_validator = DataFrameValidator(data=df, rules=df_rules, 609 | column_rules=column_rules) 610 | logger.info("Commence cleaning dataset:") 611 | logger.info("DataFrame rules:") 612 | logger.info(json.dumps(df_rules, cls=TypeEncoder)) 613 | logger.info("Column rules:") 614 | logger.info(json.dumps(column_rules, cls=TypeEncoder)) 615 | 616 | return df_validator.clean() 617 | else: 618 | dfs = [] 619 | for argset in parser_args: 620 | with ParseErrorHandler(argset, self) as handler: 621 | _df = 
handler.load() 622 | df_validator = DataFrameValidator(data=_df, 623 | column_rules=column_rules) 624 | dfs.append(df_validator.clean()) 625 | df = pd.concat(dfs, axis=0) 626 | return df.set_index(np.arange(df.shape[0])) 627 | 628 | def load_datasets(self): 629 | """Load and return all datasets. 630 | 631 | :return: dictionary like {dataset_name: dataframe} 632 | :rtype: dict 633 | """ 634 | datasets = {} 635 | for name in self.specifications.iterkeys(): 636 | if name not in self.validators: 637 | self._init_validate(name) 638 | datasets[name] = self.load_dataset(name) 639 | return datasets 640 | -------------------------------------------------------------------------------- /pysemantic/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """The tests module.""" 2 | -------------------------------------------------------------------------------- /pysemantic/tests/test_base.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Base classes and functions for tests.""" 10 | 11 | import os 12 | import unittest 13 | import tempfile 14 | import shutil 15 | import os.path as op 16 | from copy import deepcopy 17 | from ConfigParser import RawConfigParser 18 | 19 | import yaml 20 | import numpy as np 21 | import pandas as pd 22 | 23 | from pysemantic import project as pr 24 | from pysemantic.utils import colnames 25 | 26 | try: 27 | from yaml import CLoader as Loader 28 | from yaml import CDumper as Dumper 29 | except ImportError: 30 | from yaml import Loader 31 | from yaml import Dumper 32 | 33 | TEST_CONFIG_FILE_PATH = op.join(op.abspath(op.dirname(__file__)), "testdata", 34 | "test.conf") 35 | TEST_DATA_DICT = op.join(op.abspath(op.dirname(__file__)), "testdata", 36 | "test_dictionary.yaml") 37 | TEST_XL_DICT = op.join(op.abspath(op.dirname(__file__)), "testdata", 38 | "test_excel.yaml") 39 | 40 | 41 | def _path_fixer(filepath, root=None): 42 | """Change all the relative paths in `filepath` to absolute ones. 43 | 44 | :param filepath: File to be changed 45 | :param root: Root path with which the relative paths are prefixed. If None 46 | (default), the directory with this file is the root. 
47 | """ 48 | if root is None: 49 | root = op.join(op.abspath(op.dirname(__file__))) 50 | if filepath.endswith((".yaml", ".yml")): 51 | with open(filepath, "r") as fileobj: 52 | data = yaml.load(fileobj, Loader=Loader) 53 | for specs in data.itervalues(): 54 | specs['path'] = op.join(root, specs['path']) 55 | with open(filepath, "w") as fileobj: 56 | yaml.dump(data, fileobj, Dumper=Dumper, 57 | default_flow_style=False) 58 | elif filepath.endswith(".conf"): 59 | parser = RawConfigParser() 60 | parser.read(filepath) 61 | for section in parser.sections(): 62 | path = parser.get(section, "specfile") 63 | parser.remove_option(section, "specfile") 64 | parser.set(section, "specfile", op.join(root, path)) 65 | with open(filepath, "w") as fileobj: 66 | parser.write(fileobj) 67 | 68 | 69 | def _remove_project(project_name, project_files=None): 70 | pr.remove_project(project_name) 71 | if project_files is not None: 72 | if hasattr(project_files, "__iter__"): 73 | for path in project_files: 74 | if op.isfile(path): 75 | os.unlink(path) 76 | elif op.isdir(path): 77 | shutil.rmtree(path) 78 | else: 79 | if op.isfile(project_files): 80 | os.unlink(project_files) 81 | elif op.isdir(project_files): 82 | shutil.rmtree(project_files) 83 | 84 | 85 | class DummyProjectFactory(object): 86 | 87 | def __init__(self, schema, df, exporter="to_csv", **kwargs): 88 | self.tempdir = tempfile.mkdtemp() 89 | data_fpath = op.join(self.tempdir, "data.dat") 90 | if ("index" not in kwargs) and ("index_label" not in kwargs): 91 | kwargs['index'] = False 92 | getattr(df, exporter)(data_fpath, **kwargs) 93 | schema['data']['path'] = data_fpath 94 | schema_fpath = op.join(self.tempdir, "schema.yml") 95 | with open(schema_fpath, "w") as f_schema: 96 | yaml.dump(schema, f_schema, Dumper=yaml.CDumper) 97 | self.schema_fpath = schema_fpath 98 | 99 | def __enter__(self): 100 | pr.add_project("dummy_project", self.schema_fpath) 101 | return pr.Project("dummy_project") 102 | 103 | def __exit__(self, type, value, traceback): 104 | _remove_project("dummy_project", self.tempdir) 105 | 106 | 107 | class BaseTestCase(unittest.TestCase): 108 | 109 | """Base test class, introduces commonly required methods.""" 110 | 111 | def assertKwargsEqual(self, dict1, dict2): 112 | """Assert that dictionaries are equal, to a deeper extent.""" 113 | self.assertEqual(len(dict1.keys()), len(dict2.keys())) 114 | for key, value in dict1.iteritems(): 115 | self.assertIn(key, dict2) 116 | left = value 117 | right = dict2[key] 118 | if isinstance(left, (tuple, list)): 119 | self.assertItemsEqual(left, right) 120 | elif isinstance(left, dict): 121 | self.assertDictEqual(left, right) 122 | else: 123 | self.assertEqual(left, right) 124 | 125 | def assertKwargsEmpty(self, data): 126 | """Assert that a dictionary is empty.""" 127 | for value in data.itervalues(): 128 | self.assertIn(value, ("", 0, 1, [], (), {}, None, False)) 129 | 130 | def assertDataFrameEqual(self, dframe1, dframe2): 131 | """Assert that two dataframes are equal by their columns, indices and 132 | values.""" 133 | self.assertTrue(np.all(dframe1.index.values == dframe2.index.values)) 134 | self.assertTrue(np.all(dframe1.columns == dframe2.columns)) 135 | for col in dframe1: 136 | if dframe1[col].dtype in (np.dtype(float), np.dtype(int)): 137 | np.testing.assert_allclose(dframe1[col], dframe2[col]) 138 | else: 139 | self.assertTrue(np.all(dframe1[col] == dframe2[col])) 140 | self.assertEqual(dframe1[col].dtype, dframe2[col].dtype) 141 | 142 | def assertSeriesEqual(self, s1, s2): 143 | """Assert 
that two series are equal by their indices and values.""" 144 | self.assertEqual(s1.shape, s2.shape) 145 | self.assertTrue(np.all(s1.values == s2.values)) 146 | self.assertTrue(np.all(s1.index == s2.index)) 147 | 148 | 149 | class BaseProjectTestCase(BaseTestCase): 150 | 151 | """Base class for tests of the Project module.""" 152 | 153 | @classmethod 154 | def setUpClass(cls): 155 | cls.maxDiff = None 156 | # modify the testdata dict to have absolute paths 157 | with open(TEST_DATA_DICT, "r") as fileobj: 158 | test_data = yaml.load(fileobj, Loader=Loader) 159 | for _, specs in test_data.iteritems(): 160 | path = op.join(op.abspath(op.dirname(__file__)), specs['path']) 161 | specs['path'] = path 162 | # Put in the multifile specs 163 | cls.copied_iris_path = test_data['iris']['path'].replace("iris", 164 | "iris2") 165 | dframe = pd.read_csv(test_data['iris']['path']) 166 | dframe.to_csv(cls.copied_iris_path, index=False) 167 | 168 | copied_iris_specs = deepcopy(test_data['iris']) 169 | copied_iris_specs['path'] = [copied_iris_specs['path'], 170 | cls.copied_iris_path] 171 | copied_iris_specs['nrows'] = [150, 150] 172 | test_data['multi_iris'] = copied_iris_specs 173 | 174 | with open(TEST_DATA_DICT, "w") as fileobj: 175 | yaml.dump(test_data, fileobj, Dumper=Dumper, 176 | default_flow_style=False) 177 | cls.data_specs = test_data 178 | _path_fixer(TEST_XL_DICT) 179 | 180 | # Fix config file to have absolute paths 181 | 182 | config_fname = op.basename(TEST_CONFIG_FILE_PATH) 183 | cls.test_conf_file = op.join(os.getcwd(), config_fname) 184 | parser = RawConfigParser() 185 | parser.read(TEST_CONFIG_FILE_PATH) 186 | for project in ("pysemantic", "test_excel"): 187 | specfile = parser.get(project, 'specfile') 188 | specfile = op.join(op.abspath(op.dirname(__file__)), specfile) 189 | parser.remove_option(project, "specfile") 190 | parser.set(project, "specfile", specfile) 191 | with open(cls.test_conf_file, 'w') as fileobj: 192 | parser.write(fileobj) 193 | pr.CONF_FILE_NAME = config_fname 194 | 195 | @classmethod 196 | def tearDownClass(cls): 197 | try: 198 | # modify the testdata back 199 | with open(TEST_DATA_DICT, "r") as fileobj: 200 | test_data = yaml.load(fileobj, Loader=Loader) 201 | test_data['iris']['path'] = op.join("testdata", "iris.csv") 202 | test_data['random_row_iris']['path'] = op.join("testdata", "iris.csv") 203 | test_data['bad_iris']['path'] = op.join("testdata", "bad_iris.csv") 204 | test_data['person_activity']['path'] = op.join("testdata", 205 | "person_activity.tsv") 206 | del test_data['multi_iris'] 207 | with open(TEST_DATA_DICT, "w") as fileobj: 208 | test_data = yaml.dump(test_data, fileobj, Dumper=Dumper, 209 | default_flow_style=False) 210 | 211 | with open(TEST_XL_DICT, "r") as fileobj: 212 | test_data = yaml.load(fileobj, Loader=Loader) 213 | xl_path = op.join("testdata", "test_spreadsheet.xlsx") 214 | test_data['iris']['path'] = xl_path 215 | test_data['person_activity']['path'] = xl_path 216 | test_data['iris_renamed']['path'] = xl_path 217 | with open(TEST_XL_DICT, "w") as fileobj: 218 | test_data = yaml.dump(test_data, fileobj, Dumper=Dumper, 219 | default_flow_style=False) 220 | 221 | # Change the config files back 222 | parser = RawConfigParser() 223 | parser.read(cls.test_conf_file) 224 | parser.remove_option("pysemantic", "specfile") 225 | parser.set("pysemantic", "specfile", 226 | op.join("testdata", "test_dictionary.yaml")) 227 | parser.remove_option("test_excel", "specfile") 228 | parser.set("test_excel", "specfile", 229 | op.join("testdata", 
"test_excel.yaml")) 230 | with open(TEST_CONFIG_FILE_PATH, 'w') as fileobj: 231 | parser.write(fileobj) 232 | 233 | finally: 234 | os.unlink(cls.test_conf_file) 235 | os.unlink(cls.copied_iris_path) 236 | 237 | def setUp(self): 238 | iris_specs = _get_iris_args() 239 | copied_iris_specs = deepcopy(iris_specs) 240 | copied_iris_specs.update( 241 | {'filepath_or_buffer': iris_specs['filepath_or_buffer'].replace( 242 | "iris", "iris2")}) 243 | multi_iris_specs = [iris_specs, copied_iris_specs] 244 | person_activity_specs = _get_person_activity_args() 245 | random_row_iris_specs = {'nrows': {'random': True, 'count': 50}, 246 | 'error_bad_lines': False, 247 | 'filepath_or_buffer': op.join( 248 | op.abspath(op.dirname(__file__)), 249 | "testdata", "iris.csv")} 250 | expected = {'iris': iris_specs, 251 | 'person_activity': person_activity_specs, 252 | 'multi_iris': multi_iris_specs, 253 | 'random_row_iris': random_row_iris_specs} 254 | self.expected_specs = expected 255 | self.project = pr.Project(project_name="pysemantic") 256 | 257 | 258 | class TestConfig(BaseTestCase): 259 | 260 | """Test the configuration management utilities.""" 261 | 262 | @classmethod 263 | def setUpClass(cls): 264 | # Fix the relative paths in the conig file. 265 | parser = RawConfigParser() 266 | parser.read(TEST_CONFIG_FILE_PATH) 267 | cls.old_fpath = parser.get("pysemantic", "specfile") 268 | parser.set("pysemantic", "specfile", op.abspath(cls.old_fpath)) 269 | with open(TEST_CONFIG_FILE_PATH, "w") as fileobj: 270 | parser.write(fileobj) 271 | cls._parser = parser 272 | pr.CONF_FILE_NAME = "test.conf" 273 | 274 | @classmethod 275 | def tearDownClass(cls): 276 | cls._parser.set("pysemantic", "specfile", cls.old_fpath) 277 | with open(TEST_CONFIG_FILE_PATH, "w") as fileobj: 278 | cls._parser.write(fileobj) 279 | 280 | def setUp(self): 281 | self.testParser = RawConfigParser() 282 | for section in self._parser.sections(): 283 | self.testParser.add_section(section) 284 | for item in self._parser.items(section): 285 | self.testParser.set(section, item[0], item[1]) 286 | 287 | def test_loader_default_location(self): 288 | """Test if the config looks for the files in the correct places.""" 289 | # Put the test config file in the current and home directories, with 290 | # some modifications. 
291 | cwd_file = op.join(os.getcwd(), "test.conf") 292 | home_file = op.join(op.expanduser('~'), "test.conf") 293 | 294 | try: 295 | self.testParser.set("pysemantic", "specfile", os.getcwd()) 296 | with open(cwd_file, "w") as fileobj: 297 | self.testParser.write(fileobj) 298 | specfile = pr.get_default_specfile("pysemantic") 299 | self.assertEqual(specfile, os.getcwd()) 300 | 301 | os.unlink(cwd_file) 302 | 303 | self.testParser.set("pysemantic", "specfile", op.expanduser('~')) 304 | with open(home_file, "w") as fileobj: 305 | self.testParser.write(fileobj) 306 | specfile = pr.get_default_specfile("pysemantic") 307 | self.assertEqual(specfile, op.expanduser('~')) 308 | 309 | finally: 310 | os.unlink(home_file) 311 | 312 | 313 | def _dummy_postproc(series): 314 | return pd.Series([x if "v" in x else "" for x in series], 315 | index=series.index) 316 | 317 | 318 | def _get_iris_args(): 319 | """Get the ideal parser arguments for the iris dataset.""" 320 | filepath = op.join(op.dirname(__file__), "testdata", "iris.csv") 321 | names = colnames(filepath) 322 | return dict(filepath_or_buffer=op.abspath(filepath), 323 | sep=",", nrows=150, error_bad_lines=False, 324 | dtype={'Petal Length': float, 325 | 'Petal Width': float, 326 | 'Sepal Length': float, 327 | 'Sepal Width': float, 328 | 'Species': str}, 329 | usecols=names, na_values=None, parse_dates=False, 330 | converters=None, header='infer', index_col=None) 331 | 332 | 333 | def _get_person_activity_args(): 334 | """Get the ideal parser arguments for the activity dataset.""" 335 | filepath = op.join(op.dirname(__file__), "testdata", "person_activity.tsv") 336 | names = colnames(filepath, sep='\t') 337 | return dict(filepath_or_buffer=op.abspath(filepath), 338 | error_bad_lines=False, usecols=names, na_values=None, 339 | converters=None, header='infer', index_col=None, 340 | sep="\t", nrows=100, dtype={'sequence_name': str, 341 | 'tag': str, 342 | 'x': float, 343 | 'y': float, 344 | 'z': float, 345 | 'activity': str}, 346 | parse_dates=['date']) 347 | 348 | if __name__ == '__main__': 349 | unittest.main() 350 | -------------------------------------------------------------------------------- /pysemantic/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 
8 | 9 | """Tests for the cli.""" 10 | 11 | import os 12 | import shutil 13 | import subprocess 14 | import tempfile 15 | import unittest 16 | import os.path as op 17 | from copy import deepcopy 18 | from ConfigParser import RawConfigParser 19 | 20 | import yaml 21 | import pandas as pd 22 | import numpy as np 23 | 24 | from pysemantic.tests.test_base import (BaseTestCase, TEST_CONFIG_FILE_PATH, 25 | TEST_DATA_DICT) 26 | from pysemantic import project as pr 27 | 28 | try: 29 | from yaml import CLoader as Loader 30 | from yaml import CDumper as Dumper 31 | except ImportError: 32 | from yaml import Loader as Loader 33 | from yaml import Dumper as Dumper 34 | 35 | try: 36 | import tables 37 | PYTABLES_NOT_INSTALLED = False 38 | except ImportError: 39 | PYTABLES_NOT_INSTALLED = True 40 | 41 | 42 | class TestCLI(BaseTestCase): 43 | 44 | """Test the pysemantic CLI.""" 45 | 46 | @classmethod 47 | def setUpClass(cls): 48 | os.environ['PYSEMANTIC_CONFIG'] = "test.conf" 49 | pr.CONF_FILE_NAME = "test.conf" 50 | cls.testenv = os.environ 51 | cls.test_config_path = op.join(os.getcwd(), "test.conf") 52 | shutil.copy(TEST_CONFIG_FILE_PATH, cls.test_config_path) 53 | # Change the relative paths in the config file to absolute paths 54 | parser = RawConfigParser() 55 | parser.read(cls.test_config_path) 56 | for section in parser.sections(): 57 | schema_path = parser.get(section, "specfile") 58 | parser.remove_option(section, "specfile") 59 | parser.set(section, "specfile", 60 | op.join(op.abspath(op.dirname(__file__)), schema_path)) 61 | with open(cls.test_config_path, "w") as fileobj: 62 | parser.write(fileobj) 63 | # change the relative paths in the test dictionary to absolute paths 64 | with open(TEST_DATA_DICT, "r") as fileobj: 65 | cls.org_specs = yaml.load(fileobj, Loader=Loader) 66 | new_specs = deepcopy(cls.org_specs) 67 | for _, specs in new_specs.iteritems(): 68 | path = specs['path'] 69 | specs['path'] = op.join(op.abspath(op.dirname(__file__)), path) 70 | # Rewrite this to the file 71 | with open(TEST_DATA_DICT, "w") as fileobj: 72 | yaml.dump(new_specs, fileobj, Dumper=Dumper, 73 | default_flow_style=False) 74 | 75 | @classmethod 76 | def tearDownClass(cls): 77 | os.unlink(cls.test_config_path) 78 | # Rewrite the original specs back to the config dir 79 | with open(TEST_DATA_DICT, "w") as fileobj: 80 | yaml.dump(cls.org_specs, fileobj, Dumper=Dumper, 81 | default_flow_style=False) 82 | 83 | def setUp(self): 84 | pr.add_project("dummy_project", "/foo/bar.yaml") 85 | 86 | def tearDown(self): 87 | pr.remove_project("dummy_project") 88 | 89 | def test_set_specification(self): 90 | """Test if the set-specs subcommand of the CLI works properly.""" 91 | org_specs = pr.get_schema_specs("pysemantic") 92 | cmd = ['semantic', 'set-specs', 'pysemantic', '--dataset', 'iris', 93 | '--dlm', '|'] 94 | try: 95 | subprocess.check_call(cmd, env=self.testenv) 96 | new_specs = pr.get_schema_specs("pysemantic", "iris") 97 | self.assertEqual(new_specs['delimiter'], '|') 98 | finally: 99 | for dataset_name, specs in org_specs.iteritems(): 100 | pr.set_schema_specs("pysemantic", dataset_name, **specs) 101 | 102 | def test_list_projects(self): 103 | """Test if the `list` subcommand of the CLI works properly.""" 104 | cmd = ['semantic', 'list'] 105 | output = subprocess.check_output(cmd, env=self.testenv).splitlines() 106 | path = op.join(op.abspath(op.dirname(__file__)), 107 | "testdata/test_dictionary.yaml") 108 | excel_path = op.join(op.abspath(op.dirname(__file__)), 109 | "testdata/test_excel.yaml") 110 | dummy_data 
= [("pysemantic", path), ("test_excel", excel_path), 111 | ("dummy_project", "/foo/bar.yaml")] 112 | for i, config in enumerate(dummy_data): 113 | ideal = "Project {0} with specfile at {1}".format(*config) 114 | self.assertEqual(ideal, output[i]) 115 | 116 | def test_list_datasets(self): 117 | """Test if the `list` subcommand works for listing datasets.""" 118 | command = "semantic list --project pysemantic" 119 | cmd = command.split(' ') 120 | datasets = pr.get_datasets("pysemantic") 121 | output = subprocess.check_output(cmd, env=self.testenv).splitlines() 122 | self.assertItemsEqual(datasets, output) 123 | 124 | def test_add(self): 125 | """Test if the `add` subcommand can add projects to the config file.""" 126 | try: 127 | cmd = ['semantic', 'add', 'dummy_added_project', '/tmp/dummy.yaml'] 128 | subprocess.check_call(cmd, env=self.testenv) 129 | projects = pr.get_projects() 130 | self.assertIn(("dummy_added_project", "/tmp/dummy.yaml"), projects) 131 | finally: 132 | pr.remove_project("dummy_added_project") 133 | 134 | def test_add_dataset(self): 135 | """Test if the add-dataset subcommand adds datasets to projects.""" 136 | tempdir = tempfile.mkdtemp() 137 | outfile = op.join(tempdir, "testdata.csv") 138 | dframe = pd.DataFrame(np.random.random((10, 2)), columns=['a', 'b']) 139 | dframe.to_csv(outfile, index=False) 140 | cmd = ("semantic add-dataset testdata --project pysemantic --path {}" 141 | " --dlm ,") 142 | cmd = cmd.format(outfile).split(" ") 143 | try: 144 | subprocess.check_call(cmd, env=self.testenv) 145 | _pr = pr.Project("pysemantic") 146 | self.assertIn("testdata", _pr.datasets) 147 | specs = dict(path=outfile, delimiter=',') 148 | actual = pr.get_schema_specs("pysemantic", "testdata") 149 | self.assertKwargsEqual(specs, actual) 150 | finally: 151 | pr.remove_dataset("pysemantic", "testdata") 152 | shutil.rmtree(tempdir) 153 | 154 | def test_remove_dataset(self): 155 | """Test if removing datasets works from the command line.""" 156 | # Add a temporary dataset and try to remove it. 
157 | tempdir = tempfile.mkdtemp() 158 | outfile = op.join(tempdir, "testdata.csv") 159 | dframe = pd.DataFrame(np.random.random((10, 2)), columns=['a', 'b']) 160 | dframe.to_csv(outfile, index=False) 161 | specs = dict(path=outfile, delimiter=',') 162 | pr.add_dataset("pysemantic", "testdata", specs) 163 | try: 164 | command = "semantic remove pysemantic --dataset testdata" 165 | cmd = command.split(' ') 166 | subprocess.check_call(cmd, env=self.testenv) 167 | datasets = pr.get_datasets("pysemantic") 168 | self.assertNotIn("testdata", datasets) 169 | finally: 170 | datasets = pr.get_datasets("pysemantic") 171 | if "testdata" in datasets: 172 | pr.remove_dataset("pysemantic", "testdata") 173 | shutil.rmtree(tempdir) 174 | 175 | def test_remove(self): 176 | """Test if the remove subcommand can remove projects.""" 177 | pr.add_project("dummy_project_2", "/foo/baz.yaml") 178 | try: 179 | cmd = ['semantic', 'remove', 'dummy_project_2'] 180 | subprocess.check_call(cmd, env=self.testenv) 181 | projects = pr.get_projects() 182 | proj_names = [p[0] for p in projects] 183 | self.assertNotIn("dummy_project_2", proj_names) 184 | finally: 185 | pr.remove_project("dummy_project_2") 186 | 187 | def test_remove_nonexistent_project(self): 188 | """Check if attempting to remove a nonexistent project fails.""" 189 | cmd = ['semantic', 'remove', 'foobar'] 190 | output = subprocess.check_output(cmd, env=self.testenv) 191 | self.assertEqual(output.strip(), "The project foobar doesn't exist.") 192 | 193 | def test_set_schema(self): 194 | """Test if the set-schema subcommand works.""" 195 | cmd = ['semantic', 'set-schema', 'dummy_project', '/tmp/baz.yaml'] 196 | subprocess.check_call(cmd, env=self.testenv) 197 | self.assertEqual(pr.get_default_specfile('dummy_project'), 198 | '/tmp/baz.yaml') 199 | 200 | @unittest.skipIf(PYTABLES_NOT_INSTALLED, "HDF export needs PyTables.") 201 | def test_export_hdf(self): 202 | """Test if exporting a dataset to hdf works.""" 203 | tempdir = tempfile.mkdtemp() 204 | cmd = "semantic export pysemantic --dataset iris {0}" 205 | cmd = cmd.format(op.join(tempdir, "iris.h5")) 206 | cmd = cmd.split() 207 | try: 208 | subprocess.check_call(cmd, env=self.testenv) 209 | self.assertTrue(op.exists(op.join(tempdir, "iris.h5"))) 210 | finally: 211 | shutil.rmtree(tempdir) 212 | 213 | def test_set_schema_nonexistent_project(self): 214 | """Test if the set-schema prints proper warnings when trying to set 215 | schema file for nonexistent project. 216 | """ 217 | cmd = ['semantic', 'set-schema', 'dummy_project_3', '/foo'] 218 | output = subprocess.check_output(cmd, env=self.testenv) 219 | msg = """Project {} not found in the configuration. Please use 220 | $ semantic add 221 | to register the project.""".format("dummy_project_3") 222 | self.assertEqual(output.strip(), msg) 223 | 224 | def test_relative_path(self): 225 | """Check if the set-schema and add subcommands convert relative paths 226 | from the cmdline to absolute paths in the config file. 
227 | """ 228 | try: 229 | cmd = ['semantic', 'set-schema', 'dummy_project', './foo.yaml'] 230 | subprocess.check_call(cmd, env=self.testenv) 231 | self.assertTrue(op.isabs(pr.get_default_specfile( 232 | 'dummy_project'))) 233 | pr.remove_project("dummy_project") 234 | cmd = ['semantic', 'add', 'dummy_project', './foo.yaml'] 235 | subprocess.check_call(cmd, env=self.testenv) 236 | self.assertTrue(op.isabs(pr.get_default_specfile( 237 | 'dummy_project'))) 238 | finally: 239 | pr.remove_project("dummy_project_1") 240 | 241 | if __name__ == '__main__': 242 | unittest.main() 243 | -------------------------------------------------------------------------------- /pysemantic/tests/test_custom_traits.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Tests for the custom_traits module.""" 10 | 11 | import unittest 12 | import os.path as op 13 | 14 | from traits.api import HasTraits, Either, List, Str, TraitError 15 | 16 | from pysemantic.custom_traits import AbsFile, ValidTraitList 17 | from pysemantic.tests.test_base import TEST_DATA_DICT 18 | 19 | 20 | class TestCustomTraits(unittest.TestCase): 21 | 22 | """ Testcase for the custom_traits module. This consists purely of testing 23 | whether validation is happening correctly on the custom_traits. 24 | """ 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | class CustomTraits(HasTraits): 29 | def __init__(self, **kwargs): 30 | super(CustomTraits, self).__init__(**kwargs) 31 | self.required = ['filepath'] 32 | filepath = AbsFile 33 | filelist = Either(List(AbsFile), AbsFile) 34 | required = ValidTraitList(Str) 35 | 36 | cls.custom_traits = CustomTraits 37 | 38 | def setUp(self): 39 | self.traits = self.custom_traits(filepath=op.abspath(__file__)) 40 | self.setter = lambda x, y: setattr(self.traits, x, y) 41 | 42 | def test_validtraitlist_trait(self): 43 | """Test if `pysemantic.custom_traits.ValidTraitList` works properly.""" 44 | self.assertItemsEqual(self.traits.required, ['filepath']) 45 | 46 | def test_absfile_either_list_traits(self): 47 | """Test if the AbsFile trait works within Either and List traits. 
48 | """ 49 | self.traits.filelist = op.abspath(__file__) 50 | self.traits.filelist = [op.abspath(__file__), TEST_DATA_DICT] 51 | self.assertRaises(TraitError, self.setter, "filelist", 52 | [op.basename(__file__)]) 53 | self.assertRaises(TraitError, self.setter, "filelist", ["/foo/bar"]) 54 | self.assertRaises(TraitError, self.setter, "filelist", 55 | op.basename(__file__)) 56 | self.assertRaises(TraitError, self.setter, "filelist", "/foo/bar") 57 | 58 | def test_absolute_filepath_nonexistent(self): 59 | """Test if the Absfile trait raises the correct error when the filepath 60 | is absolute but doesn't exist.""" 61 | self.assertRaisesRegexp(TraitError, 'The filepath does not exist.', 62 | self.setter, "filepath", '/foo/bar') 63 | 64 | def test_absolute_path_file_trait(self): 65 | """Test if the `traits.AbsFile` trait works correctly.""" 66 | self.traits.filepath = op.abspath(__file__) 67 | self.assertRaises(TraitError, self.setter, "filepath", 68 | op.basename(__file__)) 69 | self.assertRaises(TraitError, self.setter, "filepath", "foo/bar") 70 | self.assertRaises(TraitError, self.setter, "filepath", "/foo/bar") 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /pysemantic/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """ 10 | Tests for a the pysemantic.utils module. 11 | """ 12 | 13 | import unittest 14 | import os.path as op 15 | from pysemantic.utils import colnames, get_md5_checksum 16 | 17 | 18 | class TestUtils(unittest.TestCase): 19 | 20 | def setUp(self): 21 | self.filepath = op.join(op.abspath(op.dirname(__file__)), "testdata", 22 | "iris.csv") 23 | 24 | def test_colnames(self): 25 | """Test if the column names are read correctly from a file.""" 26 | ideal = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 27 | 'Species'] 28 | actual = colnames(self.filepath) 29 | self.assertItemsEqual(actual, ideal) 30 | 31 | def test_colnames_infer_parser_from_extension(self): 32 | """Test if the colnames function can infer the correct parser from the 33 | file extension.""" 34 | filepath = op.join(op.abspath(op.dirname(__file__)), "testdata", 35 | "person_activity.tsv") 36 | ideal = "sequence_name tag date x y z activity".split() 37 | actual = colnames(filepath) 38 | self.assertItemsEqual(actual, ideal) 39 | 40 | def test_colnames_parser_arg(self): 41 | """Test if the colnames are read if the parser is specified.""" 42 | filepath = op.join(op.abspath(op.dirname(__file__)), "testdata", 43 | "person_activity.tsv") 44 | ideal = "sequence_name tag date x y z activity".split() 45 | from pandas import read_table 46 | actual = colnames(filepath, parser=read_table) 47 | self.assertItemsEqual(actual, ideal) 48 | 49 | def test_colnames_infer_parser_from_sep(self): 50 | """Test if the colnames are read if the separator is specified.""" 51 | filepath = op.join(op.abspath(op.dirname(__file__)), "testdata", 52 | "person_activity.tsv") 53 | ideal = "sequence_name tag date x y z activity".split() 54 | actual = colnames(filepath, sep='\\t') 55 | self.assertItemsEqual(actual, ideal) 56 | 57 | def test_md5(self): 58 | """Test the md5 checksum calculator.""" 59 | ideal = "9b3ecf3031979169c0ecc5e03cfe20a6" 60 | actual = get_md5_checksum(self.filepath) 61 | 
self.assertEqual(ideal, actual) 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /pysemantic/tests/test_validator.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2015 jaidev 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """Tests for the validator module.""" 10 | 11 | import os 12 | import os.path as op 13 | import cPickle 14 | import unittest 15 | import tempfile 16 | import warnings 17 | import shutil 18 | from copy import deepcopy 19 | 20 | import numpy as np 21 | import pandas as pd 22 | import yaml 23 | from traits.api import TraitError 24 | 25 | from pysemantic.tests.test_base import (BaseTestCase, TEST_DATA_DICT, 26 | _get_iris_args, _dummy_postproc, 27 | _get_person_activity_args) 28 | from pysemantic.validator import (SeriesValidator, SchemaValidator, 29 | DataFrameValidator) 30 | from pysemantic.utils import get_md5_checksum 31 | 32 | try: 33 | from yaml import CLoader as Loader 34 | from yaml import CDumper as Dumper 35 | except ImportError: 36 | from yaml import Loader 37 | from yaml import Dumper 38 | 39 | 40 | class TestSchemaValidator(BaseTestCase): 41 | 42 | """Test the `pysemantic.validator.SchemaValidatorClass`.""" 43 | 44 | @classmethod 45 | def setUpClass(cls): 46 | cls.maxDiff = None 47 | cls.specfile = op.join(op.dirname(__file__), "testdata", 48 | "test_dictionary.yaml") 49 | with open(cls.specfile, "r") as fileobj: 50 | cls._basespecs = yaml.load(fileobj, Loader=Loader) 51 | cls.specs = deepcopy(cls._basespecs) 52 | 53 | # fix the paths in basespecs if they aren't absolute 54 | for _, dataspec in cls.specs.iteritems(): 55 | if not op.isabs(dataspec['path']): 56 | dataspec['path'] = op.join(op.abspath(op.dirname(__file__)), 57 | dataspec['path']) 58 | # The updated values also need to be dumped into the yaml file, because 59 | # some functionality of the validator depends on parsing it. 60 | with open(cls.specfile, "w") as fileobj: 61 | yaml.dump(cls.specs, fileobj, Dumper=Dumper, 62 | default_flow_style=False) 63 | 64 | cls.ideal_activity_parser_args = _get_person_activity_args() 65 | cls.ideal_iris_parser_args = _get_iris_args() 66 | 67 | @classmethod 68 | def tearDownClass(cls): 69 | with open(cls.specfile, "w") as fileobj: 70 | yaml.dump(cls._basespecs, fileobj, Dumper=Dumper, 71 | default_flow_style=False) 72 | 73 | def setUp(self): 74 | # FIXME: This should not be necessary, but without it, a couple of 75 | # tests strangely fail. I think one or both of the following two tests 76 | # are messing up the base specifications. 77 | self.basespecs = deepcopy(self.specs) 78 | 79 | def test_parse_dates_list(self): 80 | """Test if arguments to `parse_dates` are put into a list.""" 81 | specs = deepcopy(self.basespecs['person_activity']) 82 | specs['parse_dates'] = specs['parse_dates'][0] 83 | validator = SchemaValidator(specification=specs) 84 | parser_args = validator.get_parser_args() 85 | self.assertTrue(isinstance(parser_args['parse_dates'], list)) 86 | df = pd.read_csv(**parser_args) 87 | self.assertEqual(df['date'].dtype, np.dtype(' 6 | # 7 | # Distributed under terms of the BSD 3-clause license. 8 | 9 | """ 10 | Misecellaneous bells and whistles. 
11 | """ 12 | 13 | import sys 14 | import json 15 | import pandas as pd 16 | import numpy as np 17 | import datetime 18 | 19 | DATA_TYPES = {'String': str, 'Date/Time': datetime.date, 'Float': float, 20 | 'Integer': int} 21 | 22 | 23 | class TypeEncoder(json.JSONEncoder): 24 | 25 | def default(self, obj): 26 | if isinstance(obj, type): 27 | return str(obj) 28 | elif isinstance(obj, set): 29 | return list(obj) 30 | elif callable(obj): 31 | return ".".join((obj.__module__, obj.__name__)) 32 | elif isinstance(obj, np.ndarray): 33 | return np.array_str(obj) 34 | else: 35 | if "Engine" in str(obj): 36 | return str(obj) 37 | return json.JSONEncoder.default(self, obj) 38 | 39 | 40 | def generate_questionnaire(filepath): 41 | """Generate a questionnaire for data at `filepath`. 42 | 43 | This questionnaire will be presented to the client, which helps us 44 | automatically generate the schema. 45 | 46 | :param filepath: Path to the file that needs to be ingested. 47 | :type filepath: str 48 | :return: A dictionary of questions and their possible answers. The format 49 | of the dictionary is such that every key is a question to be put to the 50 | client, and its value is a list of possible answers. The first item in the 51 | list is the default value. 52 | :rtype: dict 53 | """ 54 | qdict = {} 55 | if filepath.endswith(".tsv"): 56 | dataframe = pd.read_table(filepath) 57 | else: 58 | dataframe = pd.read_csv(filepath) 59 | for col in dataframe.columns: 60 | qstring = "What is the data type of {}?".format(col) 61 | if "float" in str(dataframe[col].dtype).lower(): 62 | defaultType = "Float" 63 | elif "object" in str(dataframe[col].dtype).lower(): 64 | defaultType = "String" 65 | elif "int" in str(dataframe[col].dtype).lower(): 66 | defaultType = "Integer" 67 | typeslist = DATA_TYPES.keys() 68 | typeslist.remove(defaultType) 69 | typeslist = [defaultType] + typeslist 70 | qdict[qstring] = typeslist 71 | return qdict 72 | 73 | 74 | def colnames(filename, parser=None, **kwargs): 75 | """ 76 | Read the column names of a delimited file, without actually reading the 77 | whole file. This is simply a wrapper around `pandas.read_csv`, which reads 78 | only one row and returns the column names. 79 | 80 | 81 | :param filename: Path to the file to be read 82 | :param kwargs: Arguments to be passed to the `pandas.read_csv` 83 | :type filename: str 84 | :rtype: list 85 | 86 | :Example: 87 | 88 | Suppose we want to see the column names of the Fisher iris dataset. 89 | 90 | >>> colnames("/path/to/iris.csv") 91 | ['Sepal Length', 'Petal Length', 'Sepal Width', 'Petal Width', 'Species'] 92 | 93 | """ 94 | if 'nrows' in kwargs: 95 | UserWarning("The nrows parameter is pointless here. This function only" 96 | "reads one row.") 97 | kwargs.pop('nrows') 98 | if parser is None: 99 | if "sep" in kwargs: 100 | sep = kwargs.get('sep') 101 | if sep == r"\t": 102 | parser = pd.read_table 103 | kwargs.pop('sep') 104 | else: 105 | parser = pd.read_csv 106 | elif filename.endswith('.tsv'): 107 | parser = pd.read_table 108 | else: 109 | parser = pd.read_csv 110 | return parser(filename, nrows=1, **kwargs).columns.tolist() 111 | 112 | 113 | def get_md5_checksum(filepath): 114 | """Get the md5 checksum of a file. 115 | 116 | :param filepath: Path to the file of which to calculate the md5 checksum. 117 | :type filepath: Str 118 | :return: MD5 checksum of the file. 
119 | :rtype: Str 120 | :Example: 121 | 122 | >>> get_md5_checksum('pysemantic/tests/testdata/iris.csv') 123 | '9b3ecf3031979169c0ecc5e03cfe20a6' 124 | 125 | """ 126 | import subprocess 127 | if sys.platform == "darwin": 128 | cmd = "md5 -q {}".format(filepath).split() 129 | else: 130 | cmd = "md5sum {}".format(filepath).split() 131 | return subprocess.check_output(cmd).rstrip().split()[0] 132 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | traits 3 | pandas 4 | docopt 5 | nose 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os.path as op 3 | 4 | CONF_PATH = op.join(op.expanduser("~"), "pysemantic.conf") 5 | if not op.exists(CONF_PATH): 6 | with open(CONF_PATH, "w") as fid: 7 | fid.write("# Config file added by the pysemantic setup script.") 8 | fid.write("\n") 9 | print "Config file added at {}".format(CONF_PATH) 10 | 11 | NAME = "pysemantic" 12 | 13 | setup( 14 | name=NAME, 15 | version='0.1.1', 16 | author='Jaidev Deshpande', 17 | author_email='deshpande.jaidev@gmail.com', 18 | description="A traits based data validation module for pandas data structures.", 19 | url="https://github.com/jaidevd/pysemantic", 20 | long_description=open("README.rst").read(), 21 | entry_points={ 22 | 'console_scripts': ['semantic = pysemantic.cli:main'], 23 | }, 24 | packages=find_packages(), 25 | install_requires=['pyyaml', 'traits', 'pandas', 'docopt'] 26 | ) 27 | --------------------------------------------------------------------------------
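For reference, the project-level workflow that the tests above exercise can be driven end to end from Python. The sketch below is illustrative only and is not part of the repository: "my_project", "my_dictionary.yaml" and "mydata.csv" are placeholder names, and only calls that appear in the test suite (add_project, get_datasets, get_schema_specs, Project, remove_project) are used.

# Hypothetical usage sketch (not part of the repository); all names are placeholders.
import os.path as op
import pandas as pd
import yaml
from pysemantic import project as pr

# Write a small CSV and a one-dataset data dictionary pointing at it, mirroring
# the specs that pr.get_schema_specs() returns in test_cli.py (path + delimiter).
df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
df.to_csv("mydata.csv", index=False)
schema = {"mydata": {"path": op.abspath("mydata.csv"), "delimiter": ","}}
with open("my_dictionary.yaml", "w") as fileobj:
    yaml.dump(schema, fileobj, default_flow_style=False)

# Register the dictionary under a project name, inspect it, then clean up.
pr.add_project("my_project", op.abspath("my_dictionary.yaml"))
print pr.get_datasets("my_project")                # ['mydata']
print pr.get_schema_specs("my_project", "mydata")  # {'path': ..., 'delimiter': ','}
project = pr.Project(project_name="my_project")    # the object the tests build
pr.remove_project("my_project")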