├── .dockerignore ├── .github └── FUNDING.yml ├── .gitignore ├── .readthedocs.yml ├── Dockerfile ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── Makefile.bat ├── README.md ├── build.sh ├── docs ├── Makefile ├── autobuild.bat ├── autobuild.sh ├── make.bat └── source │ ├── _code │ ├── colon-cat-interaction.csv │ ├── colon-con-interaction.csv │ ├── data.csv │ ├── demo-formulas.py │ ├── demo.py │ ├── divide-interaction.csv │ ├── no-intercept.csv │ ├── star-cat-interaction.csv │ ├── star-con-interaction.csv │ ├── transformed-continuous.csv │ └── two-way-interactions.csv │ ├── _logo │ ├── logo-1000x1000.png │ ├── logo-250x250.png │ └── logo-500x500.png │ ├── _static │ ├── css │ │ └── override.css │ ├── favicon.ico │ └── images │ │ ├── logo-small.png │ │ ├── logo.png │ │ ├── ooc-logo.png │ │ └── ooc-small.png │ ├── _templates │ └── .gitkeep │ ├── conf.py │ ├── index.rst │ ├── modules.rst │ ├── quickstart.rst │ ├── refs.bib │ ├── robots.txt │ ├── ydot.rst │ └── zzz-bib.rst ├── logo.png ├── publish.sh ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_formula.py └── test_spark.py └── ydot ├── __init__.py ├── formula.py └── spark.py /.dockerignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | .idea/ 3 | docs/build/ 4 | .pytest_cache/ 5 | build/ 6 | coverage/ 7 | dist/ 8 | ydot.egg-info/ 9 | docs/build/ 10 | .coverage 11 | .noseids 12 | .ipynb_checkpoints/ 13 | joblib_memmap/ 14 | .DS_store -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: vangj 4 | patreon: vangj 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: https://oneoffcoder.com/ 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | .idea/ 3 | docs/build/ 4 | coverage/ 5 | .coverage 6 | .noseids 7 | dist/ 8 | ydot.egg-info/ 9 | build/ 10 | .ipynb_checkpoints/ 11 | .pypirc 12 | .pypircc 13 | joblib_memmap/ 14 | .pytest_cache/ 15 | .DS_store -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - requirements: requirements.txt 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM oneoffcoder/python-java:latest 2 | 3 | LABEL author="Jee Vang, Ph.D." 4 | LABEL email="vangjee@gmail.com" 5 | 6 | ARG AAPI_VERSION 7 | ARG APYPI_REPO 8 | 9 | ENV API_VERSION=$AAPI_VERSION 10 | ENV PYPI_REPO=$APYPI_REPO 11 | 12 | RUN apt-get update \ 13 | && apt-get upgrade -y 14 | COPY . /code 15 | RUN pip install -r /code/requirements.txt 16 | RUN /code/publish.sh -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2017 Jee Vang 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt README.md 2 | prune tests* -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init clean lint test 2 | .DEFAULT_GOAL := build 3 | 4 | init: 5 | pip install -r requirements.txt 6 | 7 | lint: 8 | python -m flake8 ./ydot 9 | 10 | test: clean lint 11 | nosetests tests 12 | 13 | build: test 14 | python setup.py bdist_egg 15 | 16 | build-dist: compile 17 | python setup.py bdist_egg sdist bdist_wheel 18 | 19 | install: build 20 | python setup.py install 21 | 22 | publish: build 23 | python setup.py sdist upload -r pypi 24 | 25 | compile: 26 | python -m compileall -f ./ydot 27 | 28 | clean: 29 | find . -type f -name '*.pyc' -delete 30 | find . -type d -name '__pycache__' -delete 31 | rm -fr coverage/ 32 | rm -fr dist/ 33 | rm -fr build/ 34 | rm -fr ydot.egg-info/ 35 | rm -fr jupyter/.ipynb_checkpoints/ 36 | rm -fr joblib_memmap/ 37 | rm -fr docs/build/ 38 | rm -fr .pytest_cache/ 39 | rm -f .coverage 40 | rm -f .noseids 41 | 42 | -------------------------------------------------------------------------------- /Makefile.bat: -------------------------------------------------------------------------------- 1 | @ECHO off 2 | if /I %1 == default goto :default 3 | if /I %1 == init goto :init 4 | if /I %1 == lint goto :lint 5 | if /I %1 == test goto :test 6 | if /I %1 == clean goto :clean 7 | if /I %1 == build goto :build 8 | if /I %1 == install goto :install 9 | 10 | goto :eof ::can be omitted to run the `default` function similarly to makefiles 11 | 12 | :default 13 | goto :test 14 | 15 | :init 16 | pip install -r requirements.txt 17 | goto :eof 18 | 19 | :lint 20 | python -m flake8 ./ydot 21 | goto :eof 22 | 23 | :test 24 | nosetests tests 25 | goto :eof 26 | 27 | :clean 28 | del /S *.pyc 29 | rmdir /S /Q coverage 30 | rmdir /S /Q dist 31 | rmdir /S /Q build 32 | rmdir /S /Q ydot.egg-info 33 | rmdir /S /Q jupyter/.ipynb_checkpoints 34 | rmdir /S /Q docs/build 35 | rmdir /S /Q joblib_memmap 36 | rmdir /S /Q .pytest_cache 37 | del .coverage 38 | del .noseids 39 | goto :eof 40 | 41 | :build 42 | python setup.py bdist_egg sdist bdist_wheel 43 | goto :eof 44 | 45 | :install 46 | python setup.py install 47 | goto :eof -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![ydot logo](https://ydot.readthedocs.io/en/latest/_images/logo.png) 2 | 3 | # ydot 4 | 5 | R-like formulas for Spark Dataframes. 6 | 7 | - [Documentation](https://ydot.readthedocs.io/) 8 | - [PyPi](https://pypi.org/project/ydot/) 9 | - [Gitter](https://gitter.im/dataflava/ydot) 10 | 11 | Now you have the expressive power of R-like formulas to produce design matrices for your experimental needs. This API is based on [patsy](https://patsy.readthedocs.io/en/latest/), but for use with Apache Spark dataframes. Given a Spark dataframe, you can express your design matrices with something that resembles the following. 12 | 13 | `y ~ x1 + x2 + (x3 + a + b)**2` 14 | 15 | Here's a short and sweet example.
16 | 17 | ```python 18 | from ydot.spark import smatrices 19 | 20 | spark_df = get_a_spark_dataframe() 21 | formula = 'y ~ x1 + x2 + (x3 + a + b)**2' 22 | y, X = smatrices(formula, spark_df) 23 | ``` 24 | 25 | # Software Copyright 26 | 27 | ``` 28 | Copyright 2020 One-Off Coder 29 | 30 | Licensed under the Apache License, Version 2.0 (the "License"); 31 | you may not use this file except in compliance with the License. 32 | You may obtain a copy of the License at 33 | 34 | http://www.apache.org/licenses/LICENSE-2.0 35 | 36 | Unless required by applicable law or agreed to in writing, software 37 | distributed under the License is distributed on an "AS IS" BASIS, 38 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 39 | See the License for the specific language governing permissions and 40 | limitations under the License. 41 | ``` 42 | 43 | # Book Copyright 44 | 45 | Copyright 2020 One-Off Coder 46 | 47 | This work is licensed under a [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) by [One-Off Coder](https://www.oneoffcoder.com). 48 | 49 | ![Creative Commons Attribution 4.0 International License](https://i.creativecommons.org/l/by/4.0/88x31.png "Creative Commons Attribution 4.0 International License") 50 | 51 | # Art Copyright 52 | 53 | Copyright 2020 Daytchia Vang 54 | 55 | # Citation 56 | 57 | ``` 58 | @misc{oneoffcoder_ydot_2020, 59 | title={ydot, R-like formulas for Spark Dataframes}, 60 | url={https://github.com/oneoffcoder/pyspark-formula}, 61 | author={Jee Vang}, 62 | year={2020}, 63 | month={Dec}} 64 | ``` 65 | 66 | # Sponsor, Love 67 | 68 | - [Patreon](https://www.patreon.com/vangj) 69 | - [GitHub](https://github.com/sponsors/vangj) -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOCKER_FILE=Dockerfile 4 | DOCKER_REPO=ydot 5 | DOCKER_TAG=local 6 | AAPI_VERSION=version 7 | APYPI_REPO=repo 8 | 9 | while getopts v:r: option 10 | do 11 | case "${option}" 12 | in 13 | v) AAPI_VERSION=${OPTARG};; 14 | r) APYPI_REPO=${OPTARG};; 15 | esac 16 | done 17 | 18 | if [[ "version" == $AAPI_VERSION || "repo" == $APYPI_REPO ]]; then 19 | echo "Usage: ./build.sh -r [pypi|testpypi] -v [version]" 20 | echo " -r repository, pypi or testpypi" 21 | echo " -v version e.g. 0.2.5" 22 | else 23 | docker build --no-cache \ 24 | -f $DOCKER_FILE \ 25 | --build-arg AAPI_VERSION=$AAPI_VERSION \ 26 | --build-arg APYPI_REPO=$APYPI_REPO \ 27 | -t ${DOCKER_REPO}:${DOCKER_TAG} . 28 | fi -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/autobuild.bat: -------------------------------------------------------------------------------- 1 | python -m sphinx_autobuild ./source ./build -b html --host 0.0.0.0 --port 8000 -------------------------------------------------------------------------------- /docs/autobuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m sphinx_autobuild ./source ./build -b html --host 0.0.0.0 --port 8000 4 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_code/colon-cat-interaction.csv: -------------------------------------------------------------------------------- 1 | Intercept,b[T.low],b[T.mid],a[T.right]:b[high],a[T.right]:b[low],a[T.right]:b[mid] 2 | 1.0,1.0,0.0,0.0,0.0,0.0 3 | 1.0,1.0,0.0,0.0,0.0,0.0 4 | 1.0,0.0,0.0,1.0,0.0,0.0 5 | 1.0,0.0,1.0,0.0,0.0,1.0 6 | 1.0,1.0,0.0,0.0,0.0,0.0 7 | -------------------------------------------------------------------------------- /docs/source/_code/colon-con-interaction.csv: -------------------------------------------------------------------------------- 1 | Intercept,x1:x2 2 | 1.0,76.83302248278848 3 | 1.0,84.73542172597531 4 | 1.0,55.154885818557126 5 | 1.0,97.44678088481062 6 | 1.0,52.341422295472896 7 | -------------------------------------------------------------------------------- /docs/source/_code/data.csv: -------------------------------------------------------------------------------- 1 | a,b,x1,x2,y 2 | left,low,19.945536387662504,3.85214120038979,0.0 3 | left,low,20.674308066353493,4.098585619118175,1.0 4 | right,high,20.346647025958433,2.7107604387194626,1.0 5 | right,mid,18.699653829045985,5.2111542692543065,1.0 6 | left,low,21.51851187887476,2.432390426907621,1.0 7 | right,mid,20.989823705535017,3.6774523253171734,1.0 8 | right,high,20.277680897136328,2.4873300559969604,0.0 9 | right,mid,19.551410645704927,2.3549674965407372,0.0 10 | right,low,20.96196624352397,3.1665930443154995,0.0 11 | right,mid,19.172421360793678,3.562224297579924,1.0 -------------------------------------------------------------------------------- /docs/source/_code/demo-formulas.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import 
choice 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pyspark.sql import SparkSession 7 | 8 | from ydot.spark import smatrices 9 | 10 | random.seed(37) 11 | np.random.seed(37) 12 | 13 | 14 | def get_spark_dataframe(spark): 15 | n = 100 16 | data = { 17 | 'a': [choice(['left', 'right']) for _ in range(n)], 18 | 'b': [choice(['high', 'mid', 'low']) for _ in range(n)], 19 | 'x1': np.random.normal(20, 1, n), 20 | 'x2': np.random.normal(3, 1, n), 21 | 'y': [choice([1.0, 0.0]) for _ in range(n)] 22 | } 23 | pdf = pd.DataFrame(data) 24 | 25 | sdf = spark.createDataFrame(pdf) 26 | return sdf 27 | 28 | 29 | if __name__ == '__main__': 30 | try: 31 | spark = (SparkSession.builder 32 | .master('local[4]') 33 | .appName('local-testing-pyspark') 34 | .getOrCreate()) 35 | sdf = get_spark_dataframe(spark) 36 | 37 | formulas = [ 38 | { 39 | 'f': 'y ~ np.sin(x1) + np.cos(x2) + a + b', 40 | 'o': 'transformed-continuous.csv' 41 | }, 42 | { 43 | 'f': 'y ~ x1*x2', 44 | 'o': 'star-con-interaction.csv' 45 | }, 46 | { 47 | 'f': 'y ~ a*b', 48 | 'o': 'star-cat-interaction.csv' 49 | }, 50 | { 51 | 'f': 'y ~ x1:x2', 52 | 'o': 'colon-con-interaction.csv' 53 | }, 54 | { 55 | 'f': 'y ~ a:b', 56 | 'o': 'colon-cat-interaction.csv' 57 | }, 58 | { 59 | 'f': 'y ~ (x1 + x2) / (a + b)', 60 | 'o': 'divide-interaction.csv' 61 | }, 62 | { 63 | 'f': 'y ~ x1 + x2 + a - 1', 64 | 'o': 'no-intercept.csv' 65 | } 66 | ] 67 | 68 | for item in formulas: 69 | f = item['f'] 70 | o = item['o'] 71 | 72 | y, X = smatrices(f, sdf) 73 | y = y.toPandas() 74 | X = X.toPandas() 75 | 76 | X.head(5).to_csv(o, index=False) 77 | 78 | s = f""" 79 | .. csv-table:: {f} 80 | :file: _code/{o} 81 | :header-rows: 1 82 | """ 83 | print(s.strip()) 84 | except Exception as e: 85 | print(e) 86 | finally: 87 | try: 88 | spark.stop() 89 | print('closed spark') 90 | except Exception as e: 91 | print(e) 92 | -------------------------------------------------------------------------------- /docs/source/_code/demo.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import choice 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pyspark.sql import SparkSession 7 | 8 | from ydot.spark import smatrices 9 | 10 | random.seed(37) 11 | np.random.seed(37) 12 | 13 | 14 | def get_spark_dataframe(spark): 15 | n = 100 16 | data = { 17 | 'a': [choice(['left', 'right']) for _ in range(n)], 18 | 'b': [choice(['high', 'mid', 'low']) for _ in range(n)], 19 | 'x1': np.random.normal(20, 1, n), 20 | 'x2': np.random.normal(3, 1, n), 21 | 'y': [choice([1.0, 0.0]) for _ in range(n)] 22 | } 23 | pdf = pd.DataFrame(data) 24 | 25 | sdf = spark.createDataFrame(pdf) 26 | return sdf 27 | 28 | 29 | if __name__ == '__main__': 30 | try: 31 | spark = (SparkSession.builder 32 | .master('local[4]') 33 | .appName('local-testing-pyspark') 34 | .getOrCreate()) 35 | sdf = get_spark_dataframe(spark) 36 | 37 | y, X = smatrices('y ~ (x1 + x2 + a + b)**2', sdf) 38 | y = y.toPandas() 39 | X = X.toPandas() 40 | 41 | print(X.head(10)) 42 | X.head(10).to_csv('two-way-interactions.csv', index=False) 43 | except Exception as e: 44 | print(e) 45 | finally: 46 | try: 47 | spark.stop() 48 | print('closed spark') 49 | except Exception as e: 50 | print(e) 51 | -------------------------------------------------------------------------------- /docs/source/_code/divide-interaction.csv: -------------------------------------------------------------------------------- 1 | 
Intercept,x1,x2,x1:x2:a[left],x1:x2:a[right],x1:x2:b[T.low],x1:x2:b[T.mid] 2 | 1.0,19.945536387662504,3.85214120038979,76.83302248278848,0.0,76.83302248278848,0.0 3 | 1.0,20.674308066353493,4.098585619118175,84.73542172597531,0.0,84.73542172597531,0.0 4 | 1.0,20.346647025958433,2.7107604387194626,0.0,55.154885818557126,0.0,0.0 5 | 1.0,18.699653829045985,5.2111542692543065,0.0,97.44678088481062,0.0,97.44678088481062 6 | 1.0,21.51851187887476,2.432390426907621,52.341422295472896,0.0,52.341422295472896,0.0 7 | -------------------------------------------------------------------------------- /docs/source/_code/no-intercept.csv: -------------------------------------------------------------------------------- 1 | a[left],a[right],x1,x2 2 | 1.0,0.0,19.945536387662504,3.85214120038979 3 | 1.0,0.0,20.674308066353493,4.098585619118175 4 | 0.0,1.0,20.346647025958433,2.7107604387194626 5 | 0.0,1.0,18.699653829045985,5.2111542692543065 6 | 1.0,0.0,21.51851187887476,2.432390426907621 7 | -------------------------------------------------------------------------------- /docs/source/_code/star-cat-interaction.csv: -------------------------------------------------------------------------------- 1 | Intercept,a[T.right],b[T.low],b[T.mid],a[T.right]:b[T.low],a[T.right]:b[T.mid] 2 | 1.0,0.0,1.0,0.0,0.0,0.0 3 | 1.0,0.0,1.0,0.0,0.0,0.0 4 | 1.0,1.0,0.0,0.0,0.0,0.0 5 | 1.0,1.0,0.0,1.0,0.0,1.0 6 | 1.0,0.0,1.0,0.0,0.0,0.0 7 | -------------------------------------------------------------------------------- /docs/source/_code/star-con-interaction.csv: -------------------------------------------------------------------------------- 1 | Intercept,x1,x2,x1:x2 2 | 1.0,19.945536387662504,3.85214120038979,76.83302248278848 3 | 1.0,20.674308066353493,4.098585619118175,84.73542172597531 4 | 1.0,20.346647025958433,2.7107604387194626,55.154885818557126 5 | 1.0,18.699653829045985,5.2111542692543065,97.44678088481062 6 | 1.0,21.51851187887476,2.432390426907621,52.341422295472896 7 | -------------------------------------------------------------------------------- /docs/source/_code/transformed-continuous.csv: -------------------------------------------------------------------------------- 1 | Intercept,a[T.right],b[T.low],b[T.mid],np.sin(x1),np.cos(x2) 2 | 1.0,0.0,1.0,0.0,0.8893769205406579,-0.758004200582313 3 | 1.0,0.0,1.0,0.0,0.9679261582216445,-0.5759807266894401 4 | 1.0,1.0,0.0,0.0,0.9972849995254774,-0.9086185088676886 5 | 1.0,1.0,0.0,1.0,-0.14934132364604816,0.4783416124776783 6 | 1.0,0.0,1.0,0.0,0.45523550315103734,-0.7588816501987654 7 | -------------------------------------------------------------------------------- /docs/source/_code/two-way-interactions.csv: -------------------------------------------------------------------------------- 1 | Intercept,a[T.right],b[T.low],b[T.mid],a[T.right]:b[T.low],a[T.right]:b[T.mid],x1,x1:a[T.right],x1:b[T.low],x1:b[T.mid],x2,x2:a[T.right],x2:b[T.low],x2:b[T.mid],x1:x2 2 | 1.0,0.0,1.0,0.0,0.0,0.0,19.945536387662504,0.0,19.945536387662504,0.0,3.85214120038979,0.0,3.85214120038979,0.0,76.83302248278848 3 | 1.0,0.0,1.0,0.0,0.0,0.0,20.674308066353493,0.0,20.674308066353493,0.0,4.098585619118175,0.0,4.098585619118175,0.0,84.73542172597531 4 | 1.0,1.0,0.0,0.0,0.0,0.0,20.346647025958433,20.346647025958433,0.0,0.0,2.7107604387194626,2.7107604387194626,0.0,0.0,55.154885818557126 5 | 1.0,1.0,0.0,1.0,0.0,1.0,18.699653829045985,18.699653829045985,0.0,18.699653829045985,5.2111542692543065,5.2111542692543065,0.0,5.2111542692543065,97.44678088481062 6 | 
1.0,0.0,1.0,0.0,0.0,0.0,21.51851187887476,0.0,21.51851187887476,0.0,2.432390426907621,0.0,2.432390426907621,0.0,52.341422295472896 7 | 1.0,1.0,0.0,1.0,0.0,1.0,20.989823705535017,20.989823705535017,0.0,20.989823705535017,3.6774523253171734,3.6774523253171734,0.0,3.6774523253171734,77.18907599391727 8 | 1.0,1.0,0.0,0.0,0.0,0.0,20.277680897136328,20.277680897136328,0.0,0.0,2.4873300559969604,2.4873300559969604,0.0,0.0,50.437285161362595 9 | 1.0,1.0,0.0,1.0,0.0,1.0,19.551410645704927,19.551410645704927,0.0,19.551410645704927,2.3549674965407372,2.3549674965407372,0.0,2.3549674965407372,46.04293658215565 10 | 1.0,1.0,1.0,0.0,1.0,0.0,20.96196624352397,20.96196624352397,20.96196624352397,0.0,3.1665930443154995,3.1665930443154995,3.1665930443154995,0.0,66.3780165019193 11 | 1.0,1.0,0.0,1.0,0.0,1.0,19.172421360793678,19.172421360793678,0.0,19.172421360793678,3.562224297579924,3.562224297579924,0.0,3.562224297579924,68.29646521485958 12 | -------------------------------------------------------------------------------- /docs/source/_logo/logo-1000x1000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_logo/logo-1000x1000.png -------------------------------------------------------------------------------- /docs/source/_logo/logo-250x250.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_logo/logo-250x250.png -------------------------------------------------------------------------------- /docs/source/_logo/logo-500x500.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_logo/logo-500x500.png -------------------------------------------------------------------------------- /docs/source/_static/css/override.css: -------------------------------------------------------------------------------- 1 | table.expand { 2 | width: 100%; 3 | } 4 | table.rc-headers, th.rc-headers, td.rc-headers { 5 | border: 1px dashed blue; 6 | border-collapse: collapse; 7 | padding: 5px; 8 | } 9 | th.heading, td.heading { 10 | font-weight: bold; 11 | } -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/images/logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/images/logo-small.png -------------------------------------------------------------------------------- /docs/source/_static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/images/logo.png -------------------------------------------------------------------------------- /docs/source/_static/images/ooc-logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/images/ooc-logo.png -------------------------------------------------------------------------------- /docs/source/_static/images/ooc-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/images/ooc-small.png -------------------------------------------------------------------------------- /docs/source/_templates/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_templates/.gitkeep -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../../')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ydot' 21 | copyright = '2020, One-Off Coder' 22 | author = 'Jee Vang, Ph.D.' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '0.0.6' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.todo', 37 | 'sphinx.ext.coverage', 38 | 'sphinx.ext.mathjax', 39 | 'sphinx.ext.githubpages', 40 | 'sphinxcontrib.bibtex', 41 | 'sphinxcontrib.blockdiag', 42 | 'sphinx_sitemap' 43 | ] 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ['_templates'] 47 | 48 | # List of patterns, relative to source directory, that match files and 49 | # directories to ignore when looking for source files. 50 | # This pattern also affects html_static_path and html_extra_path. 51 | exclude_patterns = [] 52 | 53 | 54 | # -- Options for HTML output ------------------------------------------------- 55 | 56 | # The theme to use for HTML and HTML Help pages. See the documentation for 57 | # a list of builtin themes. 58 | # 59 | html_theme = 'sphinx_rtd_theme' 60 | 61 | # Add any paths that contain custom static files (such as style sheets) here, 62 | # relative to this directory. They are copied after the builtin static files, 63 | # so a file named "default.css" will overwrite the builtin "default.css". 
64 | html_static_path = ['_static'] 65 | html_css_files = [ 66 | 'css/override.css', 67 | ] 68 | html_extra_path = ['robots.txt'] 69 | html_show_sourcelink = False 70 | html_show_sphinx = False 71 | html_last_updated_fmt = '%b %d, %Y, %X' 72 | html_logo = '_static/images/logo-small.png' 73 | html_favicon = '_static/favicon.ico' 74 | html_theme_options = { 75 | 'canonical_url': 'https://ydot.readthedocs.io/', 76 | 'analytics_id': 'UA-150762273-1', # Provided by Google in your dashboard 77 | 'logo_only': False, 78 | 'display_version': True, 79 | 'prev_next_buttons_location': 'bottom', 80 | 'style_external_links': True, 81 | 'style_nav_header_background': '#0085CA', 82 | # Toc options 83 | 'collapse_navigation': True, 84 | 'sticky_navigation': True, 85 | 'navigation_depth': 4, 86 | 'includehidden': True, 87 | 'titles_only': False 88 | } -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: R-like formulas for Spark Dataframes. 3 | :keywords: statistics, pyspark, formula, patsy, spark, dataframe, regression, classification, data, machine learning, artificial intelligence 4 | :robots: index, follow 5 | :abstract: A Python API to produce PySpark dataframe models from R-like formula expressions. 6 | :author: Jee Vang, Ph.D. 7 | :contact: g@oneoffcoder.com 8 | :copyright: One-Off Coder 9 | :content: global 10 | :generator: Sphinx 11 | :language: English 12 | :rating: general 13 | :reply-to: info@oneoffcoder.com 14 | :web_author: Jee Vang, Ph.D. 15 | :revisit-after: 1 days 16 | 17 | .. ydot documentation master file, created by 18 | sphinx-quickstart on Sun Dec 6 17:42:42 2020. 19 | You can adapt this file completely to your liking, but it should at least 20 | contain the root `toctree` directive. 21 | 22 | ydot 23 | ==== 24 | 25 | .. image:: _static/images/logo.png 26 | :align: center 27 | :alt: ydot logo. 28 | 29 | ``ydot`` is a Python API to produce PySpark dataframe models from R-like formula expressions. This project is based on `patsy <https://patsy.readthedocs.io/en/latest/>`_ :cite:`2020:patsy`. As a quickstart, let's say you have a Spark dataframe with data as follows. 30 | 31 | .. csv-table:: Dummy Data in a Spark Dataframe 32 | :file: _code/data.csv 33 | :header-rows: 1 34 | 35 | Now, let's say you want to model this dataset as follows. 36 | 37 | - ``y ~ x1 + x2 + a + b`` 38 | 39 | Then all you have to do is use the ``smatrices()`` function. 40 | 41 | .. code-block:: python 42 | :linenos: 43 | 44 | from ydot.spark import smatrices 45 | 46 | formula = 'y ~ x1 + x2 + a + b' 47 | y, X = smatrices(formula, sdf) 48 | 49 | Observe that ``y`` and ``X`` will be Spark dataframes as specified by the formula. Here's a more interesting example where you want a model specified up to all two-way interactions. 50 | 51 | - ``y ~ (x1 + x2 + a + b)**2`` 52 | 53 | Then you could issue the code as below. 54 | 55 | .. code-block:: python 56 | :linenos: 57 | 58 | from ydot.spark import smatrices 59 | 60 | formula = 'y ~ (x1 + x2 + a + b)**2' 61 | y, X = smatrices(formula, sdf) 62 | 63 | Your resulting ``X`` Spark dataframe will look like the following. 64 | 65 | .. csv-table:: Dummy Data Transformed by Formula 66 | :file: _code/two-way-interactions.csv 67 | :header-rows: 1 68 | 69 | In general, what you get with ``patsy`` is what you get with ``ydot``; however, there are exceptions.
For example, the built-in functions such as ``standardize()`` and ``center()`` available with ``patsy`` will not work against Spark dataframes. Additionally, patsy allows for custom transforms, but such transforms (or user-defined functions) must be visible. For now, only numpy-based transforms are allowed against continuous variables (or numeric columns). 70 | 71 | .. toctree:: 72 | :maxdepth: 2 73 | :caption: Contents 74 | 75 | quickstart 76 | zzz-bib 77 | 78 | .. toctree:: 79 | :maxdepth: 2 80 | :caption: API Documentation 81 | 82 | modules 83 | 84 | 85 | 86 | Indices and tables 87 | ================== 88 | 89 | * :ref:`genindex` 90 | * :ref:`modindex` 91 | * :ref:`search` 92 | 93 | About 94 | ===== 95 | 96 | .. image:: _static/images/ooc-logo.png 97 | :alt: One-Off Coder logo. 98 | 99 | One-Off Coder is an educational, service and product company. Please visit us online to discover how we may help you achieve life-long success in your personal coding career or with your company's business goals and objectives. 100 | 101 | - |Website_Link| 102 | - |Facebook_Link| 103 | - |Twitter_Link| 104 | - |Instagram_Link| 105 | - |YouTube_Link| 106 | - |LinkedIn_Link| 107 | 108 | .. |Website_Link| raw:: html 109 | 110 | Website 111 | 112 | .. |Facebook_Link| raw:: html 113 | 114 | Facebook 115 | 116 | .. |Twitter_Link| raw:: html 117 | 118 | Twitter 119 | 120 | .. |Instagram_Link| raw:: html 121 | 122 | Instagram 123 | 124 | .. |YouTube_Link| raw:: html 125 | 126 | YouTube 127 | 128 | .. |LinkedIn_Link| raw:: html 129 | 130 | LinkedIn 131 | 132 | Copyright 133 | ========= 134 | 135 | Documentation 136 | ------------- 137 | 138 | .. raw:: html 139 | 140 | 141 | This work is licensed under a Creative Commons Attribution 4.0 International License by One-Off Coder. 142 | 
143 |
144 | 145 | Creative Commons License 146 | 147 |
148 |
149 | 150 | 151 | Software 152 | -------- 153 | 154 | :: 155 | 156 | Copyright 2020 One-Off Coder 157 | 158 | Licensed under the Apache License, Version 2.0 (the "License"); 159 | you may not use this file except in compliance with the License. 160 | You may obtain a copy of the License at 161 | 162 | http://www.apache.org/licenses/LICENSE-2.0 163 | 164 | Unless required by applicable law or agreed to in writing, software 165 | distributed under the License is distributed on an "AS IS" BASIS, 166 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 167 | See the License for the specific language governing permissions and 168 | limitations under the License. 169 | 170 | Art 171 | --- 172 | 173 | :: 174 | 175 | Copyright 2020 Daytchia Vang 176 | 177 | Citation 178 | ======== 179 | 180 | :: 181 | 182 | @misc{oneoffcoder_ydot_2020, 183 | title={ydot, R-like formulas for Spark Dataframes}, 184 | url={https://github.com/oneoffcoder/pyspark-formula}, 185 | author={Jee Vang}, 186 | year={2020}, 187 | month={Dec}} 188 | 189 | Author 190 | ====== 191 | 192 | Jee Vang, Ph.D. 193 | 194 | - |Patreon_Link| 195 | - |Github_Link| 196 | 197 | .. |Patreon_Link| raw:: html 198 | 199 | Patreon: support is appreciated 200 | 201 | .. |Github_Link| raw:: html 202 | 203 | GitHub: sponsorship will help us change the world for the better 204 | 205 | Help 206 | ==== 207 | 208 | - |Source_Link| 209 | - |Gitter_Link| 210 | 211 | .. |Source_Link| raw:: html 212 | 213 | GitHub: source code 214 | 215 | .. |Gitter_Link| raw:: html 216 | 217 | Gitter: chat -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :maxdepth: 4 3 | 4 | ydot 5 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Basic 5 | ----- 6 | 7 | The best way to learn ``R``-style formula syntax with ``ydot`` is to head on over to `patsy `_ :cite:`2020:patsy` and read the documentation. Below, we show very simple code to transform a Spark dataframe into two design matrices (these are also Spark dataframes), ``y`` and ``X``, using a formula that defines a model up to two-way interactions. 8 | 9 | .. literalinclude:: _code/demo.py 10 | :language: python 11 | :linenos: 12 | 13 | More 14 | ---- 15 | 16 | We use the code below to generate the models (data) below. 17 | 18 | .. literalinclude:: _code/demo-formulas.py 19 | :language: python 20 | :linenos: 21 | 22 | You can use ``numpy`` functions against continuous variables. 23 | 24 | .. csv-table:: y ~ np.sin(x1) + np.cos(x2) + a + b 25 | :file: _code/transformed-continuous.csv 26 | :header-rows: 1 27 | 28 | The ``*`` specifies interactions and keeps lower order terms. 29 | 30 | .. csv-table:: y ~ x1*x2 31 | :file: _code/star-con-interaction.csv 32 | :header-rows: 1 33 | 34 | .. csv-table:: y ~ a*b 35 | :file: _code/star-cat-interaction.csv 36 | :header-rows: 1 37 | 38 | The ``:`` specifies interactions and drops lower order terms. 39 | 40 | .. csv-table:: y ~ x1:x2 41 | :file: _code/colon-con-interaction.csv 42 | :header-rows: 1 43 | 44 | .. csv-table:: y ~ a:b 45 | :file: _code/colon-cat-interaction.csv 46 | :header-rows: 1 47 | 48 | The ``/`` is **quirky** according to the patsy documentation, but it is shorthand for ``a / b = a + a:b``. 49 | 50 | .. 
csv-table:: y ~ (x1 + x2) / (a + b) 51 | :file: _code/divide-interaction.csv 52 | :header-rows: 1 53 | 54 | If you need to drop the ``Intercept``, add ``- 1`` at the end. Note that one of the dummy variables for ``a`` is not dropped. This could be a bug with patsy. 55 | 56 | .. csv-table:: y ~ x1 + x2 + a - 1 57 | :file: _code/no-intercept.csv 58 | :header-rows: 1 -------------------------------------------------------------------------------- /docs/source/refs.bib: -------------------------------------------------------------------------------- 1 | @misc{2020:patsy, 2 | author = {patsy}, 3 | title = {patsy - Describing statistical models in Python}, 4 | url = {https://patsy.readthedocs.io/en/latest/index.html}, 5 | addendum = "(accessed: 12.07.2020)" 6 | } -------------------------------------------------------------------------------- /docs/source/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | Sitemap: https://ydot.readthedocs.io/sitemap.xml -------------------------------------------------------------------------------- /docs/source/ydot.rst: -------------------------------------------------------------------------------- 1 | PySpark Formula 2 | =============== 3 | 4 | Formula 5 | ------- 6 | 7 | The ``formula`` module contains code to extract values from a record (e.g., a Spark dataframe record) based on the model definition. 8 | 9 | .. automodule:: ydot.formula 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | :special-members: __init__ 14 | 15 | Spark 16 | ----- 17 | 18 | The ``spark`` module contains code to transform a Spark dataframe into ``design matrices`` as specified by a formula. 19 | 20 | .. automodule:: ydot.spark 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | :special-members: __init__ -------------------------------------------------------------------------------- /docs/source/zzz-bib.rst: -------------------------------------------------------------------------------- 1 | Bibliography 2 | ------------ 3 | 4 | .. bibliography:: refs.bib 5 | :all: -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/logo.png -------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIST=/code/dist/ydot-${API_VERSION}.tar.gz 4 | 5 | buildCode() { 6 | echo "start the build" 7 | cd /code \ 8 | && make clean \ 9 | && make \ 10 | && python setup.py sdist bdist bdist_wheel \ 11 | && twine check dist/* \ 12 | && cd /code/docs \ 13 | && make html 14 | } 15 | 16 | updateVersion() { 17 | echo "replace version of software to ${API_VERSION}" 18 | sed -i "s/version='0.0.6'/version='${API_VERSION}'/g" /code/setup.py 19 | } 20 | 21 | copyCredentials() { 22 | if [[ -f /code/.pypirc ]]; then 23 | echo "copying over .pypirc" 24 | cp /code/.pypirc /root/.pypirc 25 | fi 26 | } 27 | 28 | publish() { 29 | echo "python publish" 30 | 31 | if [[ -f /root/.pypirc ]]; then 32 | if [[ -f ${SOURCE_DIST} ]]; then 33 | echo "uploading source" 34 | cd /code \ 35 | && make clean \ 36 | && python setup.py sdist \ 37 | && twine upload --repository ${PYPI_REPO} ${SOURCE_DIST} 38 | else 39 | echo "no ${SOURCE_DIST} found!" 
40 | fi 41 | else 42 | echo "no .pypirc found!" 43 | fi 44 | } 45 | 46 | cleanUp() { 47 | if [[ -f /root/.pypirc ]]; then 48 | echo "cleaning up" 49 | rm -f /root/.pypirc 50 | fi 51 | } 52 | 53 | build() { 54 | echo "python build" 55 | buildCode 56 | publish 57 | } 58 | 59 | conda init bash 60 | . /root/.bashrc 61 | updateVersion 62 | copyCredentials 63 | build 64 | cleanUp 65 | 66 | echo "done!" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # TEST 2 | nose 3 | coverage 4 | # LINT OR DIE 5 | flake8 6 | pep8 7 | pyflakes 8 | # LIBS 9 | numpy 10 | scipy 11 | pandas 12 | pyspark 13 | patsy 14 | # DOCUMENTATION 15 | sphinx 16 | sphinx_rtd_theme 17 | sphinxcontrib-bibtex 18 | sphinxcontrib-blockdiag 19 | sphinx-sitemap 20 | # PUBLISHING 21 | twine 22 | setuptools -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [flake8] 5 | max-line-length = 120 6 | ignore = E501 E731 7 | 8 | [nosetests] 9 | verbosity = 3 10 | with-doctest = 1 11 | with-coverage = 1 12 | with-id = 1 13 | cover-erase = 1 14 | cover-html = 1 15 | cover-html-dir = coverage 16 | cover-package = ydot 17 | detailed-errors = 1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.md', 'r') as fh: 4 | long_desc = fh.read() 5 | 6 | setup( 7 | name='ydot', 8 | version='0.0.6', 9 | author='Jee Vang', 10 | author_email='vangjee@gmail.com', 11 | packages=find_packages(exclude=('*.tests', '*.tests.*', 'tests.*', 'tests')), 12 | description='R-like formulas for Spark Dataframes', 13 | long_description=long_desc, 14 | long_description_content_type='text/markdown', 15 | url='https://github.com/oneoffcoder/pyspark-formula', 16 | keywords=' '.join( 17 | ['statistics', 'pyspark', 'formula', 'patsy', 'spark', 18 | 'dataframe', 'regression', 'classification', 'data', 19 | 'machine learning', 'artificial intelligence']), 20 | install_requires=['scipy', 'numpy', 'pandas', 'scikit-learn', 'pyspark', 'patsy'], 21 | classifiers=[ 22 | 'Programming Language :: Python :: 3', 23 | 'License :: OSI Approved :: Apache Software License', 24 | 'Operating System :: OS Independent', 25 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 26 | 'Intended Audience :: Developers', 27 | 'Intended Audience :: Science/Research', 28 | 'Development Status :: 5 - Production/Stable' 29 | ], 30 | include_package_data=True, 31 | test_suite='nose.collector' 32 | ) 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_formula.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from nose import with_setup 5 | 6 | from ydot.formula import TermEnum, InteractionExtractor 7 | 8 | 9 | def setup(): 10 | """ 11 | Setup. 12 | :return: None. 
13 | """ 14 | np.random.seed(37) 15 | random.seed(37) 16 | 17 | 18 | def teardown(): 19 | """ 20 | Teardown. 21 | :return: None. 22 | """ 23 | pass 24 | 25 | 26 | @with_setup(setup, teardown) 27 | def test_get_extractor(): 28 | """ 29 | Tests get extractor. 30 | 31 | :return: None. 32 | """ 33 | record = { 34 | 'x1': 20, 35 | 'x2': 5, 36 | 'a': 'left', 37 | 'b': 'mid' 38 | } 39 | terms = [ 40 | 'Intercept', 41 | "C(a, levels=profile['a'])[T.right]", 42 | "C(b, levels=profile['b'])[T.mid]", 43 | "C(b, levels=profile['b'])[T.high]", 44 | 'x1', 45 | 'x2'] 46 | expected = [ 47 | TermEnum.INT, 48 | TermEnum.LVL, 49 | TermEnum.LVL, 50 | TermEnum.LVL, 51 | TermEnum.CON, 52 | TermEnum.CON 53 | ] 54 | 55 | for i, term in enumerate(terms): 56 | extractor = TermEnum.get_extractor(record, term) 57 | lhs = expected[i] 58 | rhs = extractor._type 59 | # print(extractor) 60 | assert lhs == rhs 61 | 62 | 63 | @with_setup(setup, teardown) 64 | def test_basic_extractions(): 65 | """ 66 | Tests basic extractions. 67 | 68 | :return: None. 69 | """ 70 | record = { 71 | 'x1': 20.0, 72 | 'x2': 5.0, 73 | 'a': 'left', 74 | 'b': 'mid' 75 | } 76 | terms = [ 77 | 'Intercept', 78 | "C(a, levels=profile['a'])[T.left]", 79 | "C(a, levels=profile['a'])[T.right]", 80 | "C(b, levels=profile['b'])[T.low]", 81 | "C(b, levels=profile['b'])[T.mid]", 82 | "C(b, levels=profile['b'])[T.high]", 83 | 'x1', 84 | 'x2', 85 | 'a[left]', 'a[right]', 86 | 'a[T.left]', 'a[T.right]', 87 | 'b[low]', 'b[mid]', 'b[high]', 88 | 'b[T.low]', 'b[T.mid]', 'b[T.high]' 89 | ] 90 | expected = [ 91 | 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 20.0, 5.0, 92 | 1.0, 0.0, 93 | 1.0, 0.0, 94 | 0.0, 1.0, 0.0, 95 | 0.0, 1.0, 0.0 96 | ] 97 | 98 | for i, term in enumerate(terms): 99 | extractor = TermEnum.get_extractor(record, term) 100 | lhs = extractor.value 101 | rhs = expected[i] 102 | # print(f'{extractor._term}: {lhs}') 103 | assert lhs == rhs 104 | 105 | 106 | @with_setup(setup, teardown) 107 | def test_function_extractions(): 108 | """ 109 | Tests extractions of functions on continuous variables. 110 | 111 | :return: None. 112 | """ 113 | record = { 114 | 'x1': 20.0, 115 | 'x2': 5.0 116 | } 117 | terms = [ 118 | 'x1', 119 | 'x2', 120 | 'np.abs(x1)', 121 | 'np.log(x1)', 122 | 'np.sin(x1)', 123 | 'np.log(np.sin(x1))' 124 | ] 125 | expected = [ 126 | 20.0, 5.0, 20.0, 127 | 2.995732273553991, 128 | 0.9129452507276277, 129 | -0.09107936652955065 130 | ] 131 | 132 | for i, term in enumerate(terms): 133 | extractor = TermEnum.get_extractor(record, term) 134 | lhs = extractor.value 135 | rhs = expected[i] 136 | # print(f'{extractor._term}: {lhs}') 137 | assert lhs == rhs 138 | 139 | 140 | @with_setup(setup, teardown) 141 | def test_interaction_extractions(): 142 | """ 143 | Tests extractions of functions on continuous variables. 144 | 145 | :return: None. 
146 | """ 147 | record = { 148 | 'x1': 20.0, 149 | 'x2': 5.0, 150 | 'a': 'left', 151 | 'b': 'mid' 152 | } 153 | terms = [ 154 | 'Intercept', 155 | 'x1', 156 | 'x2', 157 | 'x1:x2:a[left]', 158 | 'x1:x2:a[right]', 159 | 'x1:x2:b[T.low]', 160 | 'x1:x2:b[T.mid]', 161 | 'a[T.right]:b[T.low]', 'a[T.right]:b[T.mid]', 162 | 'a[T.left]:b[T.mid]', 'a[T.left]:b[T.high]', 163 | "x1:x2:C(a, levels=profile['a'])[left]", "x1:x2:C(a, levels=profile['a'])[right]", 164 | "x1:x2:C(b, levels=profile['b'])[T.mid]", "x1:x2:C(b, levels=profile['b'])[T.high]", 165 | "np.abs(x1):a[T.left]" 166 | ] 167 | expected = [ 168 | 1.0, 169 | 20.0, 170 | 5.0, 171 | 100.0, 172 | 0.0, 173 | 0.0, 174 | 100.0, 175 | 0.0, 0.0, 176 | 1.0, 0.0, 177 | 100.0, 178 | 0.0, 179 | 100.0, 180 | 0.0, 181 | 20.0 182 | ] 183 | 184 | for i, term in enumerate(terms): 185 | extractor = InteractionExtractor(record, term) 186 | lhs = extractor.value 187 | rhs = expected[i] 188 | # print(f'{extractor._terms}: {lhs}') 189 | assert lhs == rhs 190 | -------------------------------------------------------------------------------- /tests/test_spark.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import unittest 4 | from itertools import product 5 | 6 | import pandas as pd 7 | from pyspark.sql import SparkSession 8 | 9 | from ydot.spark import get_profile, get_columns, smatrices 10 | 11 | 12 | class PySparkTest(unittest.TestCase): 13 | """ 14 | PySpark test class. 15 | """ 16 | 17 | @classmethod 18 | def supress_py4j_logging(cls): 19 | """ 20 | Supresses p4j logging. 21 | 22 | :return: None. 23 | """ 24 | logger = logging.getLogger('py4j') 25 | logger.setLevel(logging.WARN) 26 | 27 | @classmethod 28 | def create_pyspark_session(cls): 29 | """ 30 | Creates a PySpark session. 31 | 32 | :return: PySpark session. 33 | """ 34 | return (SparkSession.builder 35 | .master('local[4]') 36 | .appName('local-testing-pyspark') 37 | .getOrCreate()) 38 | 39 | @classmethod 40 | def setUpClass(cls): 41 | """ 42 | Sets up the class. 43 | 44 | :return: None. 45 | """ 46 | cls.supress_py4j_logging() 47 | cls.spark = cls.create_pyspark_session() 48 | random.seed(37) 49 | 50 | @classmethod 51 | def tearDownClass(cls): 52 | """ 53 | Tears down the class. 54 | 55 | :return: None. 56 | """ 57 | cls.spark.stop() 58 | 59 | @staticmethod 60 | def _get_profile(): 61 | """ 62 | Gets the profile of a dataset. 63 | 64 | :return: Dictionary. 65 | """ 66 | profile = { 67 | 'a': ['left', 'right'], 68 | 'b': ['high', 'mid', 'low'], 69 | 'x1': [20.0], 70 | 'x2': [3.0], 71 | 'y': [1.0] 72 | } 73 | return profile 74 | 75 | @staticmethod 76 | def _get_pdf(): 77 | """ 78 | Gets a Pandas dataframe based on made-up profile. 79 | 80 | :return: Pandas dataframe. 81 | """ 82 | profile = PySparkTest._get_profile() 83 | data = product(*(v for _, v in profile.items())) 84 | columns = [k for k, _ in profile.items()] 85 | df = pd.DataFrame(data, columns=columns) 86 | 87 | return df 88 | 89 | def _get_sdf(self): 90 | """ 91 | Gets a Spark dataframe based on made-up profile. 92 | 93 | :return: Spark dataframe. 94 | """ 95 | pdf = PySparkTest._get_pdf() 96 | sdf = self.spark.createDataFrame(pdf) 97 | return sdf 98 | 99 | 100 | class SparkTest(PySparkTest): 101 | """ 102 | Tests Spark operations. 103 | """ 104 | 105 | def test_get_profile(self): 106 | """ 107 | Tests getting profile of a Spark dataframe. 108 | 109 | :return: None. 
110 | """ 111 | sdf = self._get_sdf() 112 | sdf.printSchema() 113 | observed = get_profile(sdf) 114 | expected = {'b': ['mid', 'low', 'high'], 'a': ['right', 'left'], 'x1': [1.0], 'x2': [1.0], 'y': [1.0]} 115 | 116 | for k, lhs_vals in expected.items(): 117 | assert k in observed 118 | rhs_vals = observed[k] 119 | 120 | assert len(lhs_vals) == len(rhs_vals) 121 | for v in lhs_vals: 122 | assert v in rhs_vals 123 | 124 | def test_get_columns_simple_formula_with_profile(self): 125 | """ 126 | Tests get columns (simple) with profile specified. 127 | 128 | :return: None. 129 | """ 130 | formula = "y ~ x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b'])" 131 | sdf = self._get_sdf() 132 | profile = {'b': ['mid', 'low', 'high'], 133 | 'a': ['right', 'left'], 134 | 'x1': [1.0], 135 | 'x2': [1.0], 136 | 'y': [1.0]} 137 | 138 | y_observed, X_observed = get_columns(formula, sdf, profile=profile) 139 | 140 | y_expected = ['y'] 141 | X_expected = ['Intercept', 142 | "C(a, levels=profile['a'])[T.left]", 143 | "C(b, levels=profile['b'])[T.low]", 144 | "C(b, levels=profile['b'])[T.high]", 145 | 'x1', 146 | 'x2'] 147 | 148 | assert len(y_observed) == len(y_expected) 149 | assert len(X_observed) == len(X_expected) 150 | 151 | for y in y_observed: 152 | assert y in y_expected 153 | 154 | for x in X_observed: 155 | assert x in X_expected 156 | 157 | def test_get_columns_simple_formula_no_profile(self): 158 | """ 159 | Tests get columns (simple) without a profile specified. 160 | 161 | :return: None. 162 | """ 163 | formula = "y ~ x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b'])" 164 | sdf = self._get_sdf() 165 | 166 | y_observed, X_observed = get_columns(formula, sdf) 167 | 168 | y_expected = ['y'] 169 | X_expected = ['Intercept', 170 | "C(a, levels=profile['a'])[T.left]", 171 | "C(b, levels=profile['b'])[T.low]", 172 | "C(b, levels=profile['b'])[T.high]", 173 | 'x1', 174 | 'x2'] 175 | 176 | assert len(y_observed) == len(y_expected) 177 | assert len(X_observed) == len(X_expected) 178 | 179 | for y in y_observed: 180 | assert y in y_expected 181 | 182 | for x in X_observed: 183 | assert x in X_expected 184 | 185 | def test_get_columns_variety_with_profile(self): 186 | """ 187 | Tests a variety of formulas with profile. 188 | 189 | :return: None. 
190 | """ 191 | f1 = "y ~ x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b'])" 192 | f2 = "y ~ (x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b']))**2" 193 | f3 = "y ~ x1:x2 + C(a,levels=profile['a']):C(b, levels=profile['b'])" 194 | f4 = "y ~ x1*x2 + C(a,levels=profile['a'])*C(b, levels=profile['b'])" 195 | f5 = "y ~ x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b']) - 1" 196 | f6 = "y ~ (x1 + x2) / (C(a,levels=profile['a']) + C(b, levels=profile['b']))" 197 | 198 | formulas = [f1, f2, f3, f4, f5, f6] 199 | 200 | sdf = self._get_sdf() 201 | profile = {'b': ['low', 'mid', 'high'], 202 | 'a': ['left', 'right'], 203 | 'x1': [1.0], 204 | 'x2': [1.0], 205 | 'y': [1.0]} 206 | 207 | yy = [ 208 | ['y'], 209 | ['y'], 210 | ['y'], 211 | ['y'], 212 | ['y'], 213 | ['y'] 214 | ] 215 | XX = [ 216 | ['Intercept', "C(a, levels=profile['a'])[T.right]", "C(b, levels=profile['b'])[T.mid]", 217 | "C(b, levels=profile['b'])[T.high]", 'x1', 'x2'], 218 | ['Intercept', "C(a, levels=profile['a'])[T.right]", "C(b, levels=profile['b'])[T.mid]", 219 | "C(b, levels=profile['b'])[T.high]", "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[T.mid]", 220 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[T.high]", 'x1', 221 | "x1:C(a, levels=profile['a'])[T.right]", "x1:C(b, levels=profile['b'])[T.mid]", 222 | "x1:C(b, levels=profile['b'])[T.high]", 'x2', "x2:C(a, levels=profile['a'])[T.right]", 223 | "x2:C(b, levels=profile['b'])[T.mid]", "x2:C(b, levels=profile['b'])[T.high]", 'x1:x2'], 224 | ['Intercept', "C(b, levels=profile['b'])[T.mid]", "C(b, levels=profile['b'])[T.high]", 225 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[low]", 226 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[mid]", 227 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[high]", 'x1:x2'], 228 | ['Intercept', "C(a, levels=profile['a'])[T.right]", "C(b, levels=profile['b'])[T.mid]", 229 | "C(b, levels=profile['b'])[T.high]", "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[T.mid]", 230 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[T.high]", 'x1', 'x2', 'x1:x2'], 231 | ["C(a, levels=profile['a'])[left]", "C(a, levels=profile['a'])[right]", "C(b, levels=profile['b'])[T.mid]", 232 | "C(b, levels=profile['b'])[T.high]", 'x1', 'x2'], 233 | ['Intercept', 'x1', 'x2', "x1:x2:C(a, levels=profile['a'])[left]", "x1:x2:C(a, levels=profile['a'])[right]", 234 | "x1:x2:C(b, levels=profile['b'])[T.mid]", "x1:x2:C(b, levels=profile['b'])[T.high]"] 235 | ] 236 | 237 | for i, formula in enumerate(formulas): 238 | y_observed, X_observed = get_columns(formula, sdf, profile=profile) 239 | y_expected, X_expected = yy[i], XX[i] 240 | 241 | # print(f'{i}: {formula}') 242 | # print(y_observed) 243 | # print(X_observed) 244 | # print('-' * 15) 245 | 246 | assert len(y_observed) == len(y_expected) 247 | assert len(X_observed) == len(X_expected) 248 | 249 | for y in y_observed: 250 | assert y in y_expected 251 | 252 | for x in X_observed: 253 | assert x in X_expected 254 | 255 | def test_get_columns_variety_no_profile(self): 256 | """ 257 | Tests a variety of formulas without profile. 258 | 259 | :return: None. 
260 | """ 261 | f1 = "y ~ x1 + x2 + a + b" 262 | f2 = "y ~ (x1 + x2 + a + b)**2" 263 | f3 = "y ~ x1:x2 + a:b" 264 | f4 = "y ~ x1*x2 + a*b" 265 | f5 = "y ~ x1 + x2 + a + b - 1" 266 | f6 = "y ~ (x1 + x2) / (a + b)" 267 | 268 | formulas = [f1, f2, f3, f4, f5, f6] 269 | 270 | sdf = self._get_sdf() 271 | 272 | yy = [ 273 | ['y'], 274 | ['y'], 275 | ['y'], 276 | ['y'], 277 | ['y'], 278 | ['y'] 279 | ] 280 | XX = [ 281 | ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'x1', 'x2'], 282 | ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[T.low]', 'a[T.right]:b[T.mid]', 'x1', 283 | 'x1:a[T.right]', 'x1:b[T.low]', 'x1:b[T.mid]', 'x2', 'x2:a[T.right]', 'x2:b[T.low]', 'x2:b[T.mid]', 284 | 'x1:x2'], 285 | ['Intercept', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[high]', 'a[T.right]:b[low]', 'a[T.right]:b[mid]', 286 | 'x1:x2'], 287 | ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[T.low]', 'a[T.right]:b[T.mid]', 'x1', 288 | 'x2', 'x1:x2'], 289 | ['a[left]', 'a[right]', 'b[T.low]', 'b[T.mid]', 'x1', 'x2'], 290 | ['Intercept', 'x1', 'x2', 'x1:x2:a[left]', 'x1:x2:a[right]', 'x1:x2:b[T.low]', 'x1:x2:b[T.mid]'] 291 | ] 292 | 293 | for i, formula in enumerate(formulas): 294 | y_observed, X_observed = get_columns(formula, sdf) 295 | y_expected, X_expected = yy[i], XX[i] 296 | 297 | # print(f'{i}: {formula}') 298 | # print(y_observed) 299 | # print(X_observed) 300 | # print('-' * 15) 301 | 302 | assert len(y_observed) == len(y_expected) 303 | assert len(X_observed) == len(X_expected) 304 | 305 | for y in y_observed: 306 | assert y in y_expected 307 | 308 | for x in X_observed: 309 | assert x in X_expected 310 | 311 | def test_smatrices_simple(self): 312 | """ 313 | Test simple smatrices. 314 | 315 | :return: None. 316 | """ 317 | f = 'y ~ x1 + x2 + a + b' 318 | sdf = self._get_sdf() 319 | e_rows = sdf.count() 320 | 321 | y_expected = ['y'] 322 | X_expected = ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'x1', 'x2'] 323 | 324 | y, X = smatrices(f, sdf) 325 | y, X = y.toPandas(), X.toPandas() 326 | 327 | y_observed = list(y.columns) 328 | X_observed = list(X.columns) 329 | 330 | # print(y_observed) 331 | # print('~' * 15) 332 | # print(y) 333 | # print('=' * 15) 334 | # 335 | # print(X_observed) 336 | # print('~' * 20) 337 | # print(X) 338 | 339 | assert e_rows == y.shape[0] 340 | assert e_rows == X.shape[0] 341 | assert len(y_expected) == len(y_observed) 342 | assert len(X_expected) == len(X_observed) 343 | 344 | for v in y_expected: 345 | assert v in y_observed 346 | for v in X_expected: 347 | assert v in X_observed 348 | 349 | def test_smatrices_simple_drop_intercept(self): 350 | """ 351 | Test simple smatrices dropping intercept. Note that dropping intercept creates a 352 | situation where the one-hot encoded variables are not dropped! Bug with patsy? 353 | 354 | :return: None. 
355 | """ 356 | f = 'y ~ x1 + x2 + a + b - 1' 357 | sdf = self._get_sdf() 358 | e_rows = sdf.count() 359 | 360 | y_expected = ['y'] 361 | X_expected = ['a[left]', 'a[right]', 'b[T.low]', 'b[T.mid]', 'x1', 'x2'] 362 | 363 | y, X = smatrices(f, sdf) 364 | y, X = y.toPandas(), X.toPandas() 365 | 366 | y_observed = list(y.columns) 367 | X_observed = list(X.columns) 368 | 369 | # print(y_observed) 370 | # print('~' * 15) 371 | # print(y) 372 | # print('=' * 15) 373 | # 374 | # print(X_observed) 375 | # print('~' * 20) 376 | # print(X) 377 | 378 | assert e_rows == y.shape[0] 379 | assert e_rows == X.shape[0] 380 | assert len(y_expected) == len(y_observed) 381 | assert len(X_expected) == len(X_observed) 382 | 383 | for v in y_expected: 384 | assert v in y_observed 385 | for v in X_expected: 386 | assert v in X_observed 387 | 388 | def test_smatrices_two_way(self): 389 | """ 390 | Test smatrices with two-way interaction. 391 | 392 | :return: None. 393 | """ 394 | f = 'y ~ (x1 + x2 + a + b)**2' 395 | sdf = self._get_sdf() 396 | e_rows = sdf.count() 397 | 398 | y_expected = ['y'] 399 | X_expected = ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[T.low]', 400 | 'a[T.right]:b[T.mid]', 'x1', 'x1:a[T.right]', 'x1:b[T.low]', 'x1:b[T.mid]', 401 | 'x2', 'x2:a[T.right]', 'x2:b[T.low]', 'x2:b[T.mid]', 'x1:x2'] 402 | 403 | y, X = smatrices(f, sdf) 404 | y, X = y.toPandas(), X.toPandas() 405 | 406 | y_observed = list(y.columns) 407 | X_observed = list(X.columns) 408 | 409 | # print(y_observed) 410 | # print('~' * 15) 411 | # print(y) 412 | # print('=' * 15) 413 | # 414 | # print(X_observed) 415 | # print('~' * 20) 416 | # print(X) 417 | 418 | assert e_rows == y.shape[0] 419 | assert e_rows == X.shape[0] 420 | assert len(y_expected) == len(y_observed) 421 | assert len(X_expected) == len(X_observed) 422 | 423 | for v in y_expected: 424 | assert v in y_observed 425 | for v in X_expected: 426 | assert v in X_observed 427 | 428 | def test_smatrices_three_way(self): 429 | """ 430 | Test smatrices with three-way interaction. 431 | 432 | :return: None. 433 | """ 434 | f = 'y ~ (x1 + x2 + a + b)**3' 435 | sdf = self._get_sdf() 436 | e_rows = sdf.count() 437 | 438 | y_expected = ['y'] 439 | X_expected = ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[T.low]', 440 | 'a[T.right]:b[T.mid]', 'x1', 'x1:a[T.right]', 'x1:b[T.low]', 'x1:b[T.mid]', 441 | 'x1:a[T.right]:b[T.low]', 'x1:a[T.right]:b[T.mid]', 'x2', 'x2:a[T.right]', 442 | 'x2:b[T.low]', 'x2:b[T.mid]', 'x2:a[T.right]:b[T.low]', 'x2:a[T.right]:b[T.mid]', 443 | 'x1:x2', 'x1:x2:a[T.right]', 'x1:x2:b[T.low]', 'x1:x2:b[T.mid]'] 444 | 445 | y, X = smatrices(f, sdf) 446 | y, X = y.toPandas(), X.toPandas() 447 | 448 | y_observed = list(y.columns) 449 | X_observed = list(X.columns) 450 | 451 | # print(y_observed) 452 | # print('~' * 15) 453 | # print(y) 454 | # print('=' * 15) 455 | # 456 | # print(X_observed) 457 | # print('~' * 20) 458 | # print(X) 459 | 460 | assert e_rows == y.shape[0] 461 | assert e_rows == X.shape[0] 462 | assert len(y_expected) == len(y_observed) 463 | assert len(X_expected) == len(X_observed) 464 | 465 | for v in y_expected: 466 | assert v in y_observed 467 | for v in X_expected: 468 | assert v in X_observed 469 | 470 | def test_smatrices_weird(self): 471 | """ 472 | Test smatrices with weird interactions. 473 | 474 | :return: None. 
475 | """ 476 | f = 'np.sin(y) + y ~ np.abs(x1) + (x2 + a)**2 + (np.cos(x2) + b)**2' 477 | sdf = self._get_sdf() 478 | e_rows = sdf.count() 479 | 480 | y_expected = ['np.sin(y)', 'y'] 481 | X_expected = ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'np.abs(x1)', 'x2', 482 | 'x2:a[T.right]', 'np.cos(x2)', 'np.cos(x2):b[T.low]', 'np.cos(x2):b[T.mid]'] 483 | 484 | y, X = smatrices(f, sdf) 485 | y, X = y.toPandas(), X.toPandas() 486 | 487 | y_observed = list(y.columns) 488 | X_observed = list(X.columns) 489 | 490 | # print(y_observed) 491 | # print('~' * 15) 492 | # print(y) 493 | # print('=' * 15) 494 | # 495 | # print(X_observed) 496 | # print('~' * 20) 497 | # print(X) 498 | 499 | assert e_rows == y.shape[0] 500 | assert e_rows == X.shape[0] 501 | assert len(y_expected) == len(y_observed) 502 | assert len(X_expected) == len(X_observed) 503 | 504 | for v in y_expected: 505 | assert v in y_observed 506 | for v in X_expected: 507 | assert v in X_observed 508 | -------------------------------------------------------------------------------- /ydot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/ydot/__init__.py -------------------------------------------------------------------------------- /ydot/formula.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from abc import ABC, abstractmethod 3 | from enum import IntEnum 4 | from functools import reduce 5 | 6 | import pandas as pd 7 | 8 | 9 | class TermEnum(IntEnum): 10 | """ 11 | Term types. 12 | 13 | - CAT: categorical without levels specified 14 | - LVL: categorical with levels specified 15 | - CON: continuous 16 | - FUN: continuous with function transformations 17 | - INT: intercept 18 | """ 19 | CAT = 1 20 | LVL = 2 21 | CON = 3 22 | FUN = 4 23 | INT = 5 24 | 25 | @staticmethod 26 | def get_extractor(record, term): 27 | """ 28 | Gets the associated extractor based on the specified term. 29 | 30 | :param record: Dictionary. 31 | :param term: Model term. 32 | :return: Extractor. 33 | """ 34 | 35 | if term.startswith('C'): 36 | return LvlExtractor(record, term) 37 | elif '[' in term and ']' in term: 38 | return CatExtractor(record, term) 39 | elif 'Intercept' == term: 40 | return IntExtractor(record, term) 41 | elif '(' in term and ')' in term: 42 | return FunExtractor(record, term) 43 | else: 44 | return ConExtractor(record, term) 45 | 46 | 47 | class Extractor(ABC): 48 | """ 49 | Extractor to get value based on model term. 50 | """ 51 | 52 | def __init__(self, record, term, term_type): 53 | """ 54 | ctor. 55 | 56 | :param: Dictionary. 57 | :term: Model term. 58 | :term_type: Type of term. 59 | :return: None 60 | """ 61 | self._record = record 62 | self._term = term 63 | self._type = term_type 64 | 65 | def __repr__(self): 66 | return f'{self.__class__.__name__}[term={self._term}, type={self._type.name}]' 67 | 68 | @property 69 | @abstractmethod 70 | def value(self): 71 | """ 72 | Gets the extracted value. 73 | """ 74 | pass 75 | 76 | 77 | class CatExtractor(Extractor): 78 | """ 79 | Categorical extractor (no levels). 80 | """ 81 | 82 | def __init__(self, record, term): 83 | """ 84 | ctor. 85 | 86 | :param record: Dictionary. 87 | :param term: Model term. 88 | :return: None. 
89 | """ 90 | super().__init__(record, term, TermEnum.CAT) 91 | 92 | @property 93 | def value(self): 94 | idx = self._term.index('[') 95 | x_name = self._term[0:idx] 96 | 97 | if x_name not in self._record or self._record[x_name] is None: 98 | return None 99 | 100 | lhs = self._term.rindex('[') + 1 101 | rhs = self._term.rindex(']') 102 | x_val = self._term[lhs:rhs] 103 | x_val = x_val.replace('T.', '') 104 | 105 | if self._record[x_name] == x_val: 106 | return 1.0 107 | return 0.0 108 | 109 | 110 | class LvlExtractor(Extractor): 111 | """ 112 | Categorical extractor (with levels). 113 | """ 114 | 115 | def __init__(self, record, term): 116 | """ 117 | ctor. 118 | 119 | :param record: Dictionary. 120 | :param term: Model term. 121 | :return: None. 122 | """ 123 | super().__init__(record, term, TermEnum.LVL) 124 | 125 | @property 126 | def value(self): 127 | lhs = self._term.index('(') + 1 128 | rhs = self._term.index(',') 129 | x_name = self._term[lhs:rhs] 130 | 131 | if x_name not in self._record or self._record[x_name] is None: 132 | return None 133 | 134 | lhs = self._term.rindex('[') + 1 135 | rhs = self._term.rindex(']') 136 | x_val = self._term[lhs:rhs] 137 | x_val = x_val.replace('T.', '') 138 | 139 | if self._record[x_name] == x_val: 140 | return 1.0 141 | return 0.0 142 | 143 | 144 | class ConExtractor(Extractor): 145 | """ 146 | Continuous extractor (no functions). 147 | """ 148 | 149 | def __init__(self, record, term): 150 | """ 151 | ctor. 152 | 153 | :param record: Dictionary. 154 | :param term: Model term. 155 | :return: None. 156 | """ 157 | super().__init__(record, term, TermEnum.CON) 158 | 159 | @property 160 | def value(self): 161 | return self._record[self._term] if self._term in self._record else None 162 | 163 | 164 | class IntExtractor(Extractor): 165 | """ 166 | Intercept extractor. Always returns 1.0. 167 | """ 168 | 169 | def __init__(self, record, term): 170 | """ 171 | ctor. 172 | 173 | :param record: Dictionary. 174 | :param term: Model term. 175 | :return: None. 176 | """ 177 | super().__init__(record, term, TermEnum.INT) 178 | 179 | @property 180 | def value(self): 181 | return 1.0 182 | 183 | 184 | class FunExtractor(Extractor): 185 | """ 186 | Continuous extractor (with functions defined). 187 | """ 188 | 189 | def __init__(self, record, term): 190 | """ 191 | ctor. 192 | 193 | :param record: Dictionary. 194 | :param term: Model term. 195 | :return: None. 196 | """ 197 | super().__init__(record, term, TermEnum.FUN) 198 | 199 | # flake8: noqa: F841 200 | @property 201 | def value(self): 202 | lhs = self._term.rindex('(') + 1 203 | rhs = self._term.index(')') 204 | x_name = self._term[lhs:rhs] 205 | expression = f'{self._term[0:lhs]}val{self._term[rhs:]}' 206 | val = self._record[x_name] if x_name in self._record else None 207 | if pd.isna(val): 208 | return None 209 | 210 | if 'np.' in expression: 211 | np = importlib.import_module('numpy') 212 | v = eval(expression) 213 | 214 | if isinstance(v, np.generic): 215 | v = np.asscalar(v) 216 | return v 217 | 218 | 219 | class InteractionExtractor(object): 220 | """ 221 | Interaction extractor for interaction effects. 222 | """ 223 | 224 | def __init__(self, record, terms): 225 | """ 226 | ctor. 227 | 228 | :param record: Dictionary. 229 | :param terms: Model term (possibly with interaction effects). 230 | :return: None. 
231 | """ 232 | self._terms = terms 233 | extractors = [TermEnum.get_extractor(record, term) for term in terms.split(':')] 234 | values = [e.value for e in extractors] 235 | values = [v for v in values if pd.notna(v)] 236 | 237 | if len(values) != len(extractors): 238 | self.__value = None 239 | else: 240 | self.__value = reduce(lambda a, b: a * b, values) 241 | 242 | def __repr__(self): 243 | return f'{self.__class__.__name__}[terms={self._terms}]' 244 | 245 | @property 246 | def value(self): 247 | return self.__value 248 | -------------------------------------------------------------------------------- /ydot/spark.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from itertools import product 3 | 4 | import pandas as pd 5 | from patsy.highlevel import dmatrices 6 | from pyspark import Row 7 | 8 | from ydot.formula import InteractionExtractor 9 | 10 | 11 | def get_profile(sdf): 12 | """ 13 | Gets the field profiles of the specified Spark dataframe. 14 | 15 | :param sdf: Spark dataframe. 16 | :return: Dictionary. 17 | """ 18 | dtypes = {k: v for k, v in sdf.dtypes} 19 | cat_types = sdf.rdd \ 20 | .map(lambda r: r.asDict()) \ 21 | .flatMap(lambda r: [((k, r[k]), 1) for k, v in dtypes.items() if v == 'string']) \ 22 | .reduceByKey(lambda a, b: a + b) \ 23 | .map(lambda tup: (tup[0][0], {tup[0][1]: tup[1]})) \ 24 | .reduceByKey(lambda a, b: {**a, **b}) \ 25 | .map(lambda tup: (tup[0], [(k, v) for k, v in tup[1].items()])) \ 26 | .map(lambda tup: (tup[0], sorted(tup[1], key=lambda t: (t[1], t[0]), reverse=True))) \ 27 | .map(lambda tup: (tup[0], [t[0] for t in tup[1]])) \ 28 | .collect() 29 | cat_types = {tup[0]: tup[1] for tup in cat_types} 30 | con_types = {k: [1.0] for k, v in dtypes.items() if v != 'string'} 31 | all_types = {**cat_types, **con_types} 32 | return all_types 33 | 34 | 35 | # flake8: noqa: F841 36 | def get_columns(formula, sdf, profile=None): 37 | """ 38 | Gets the expanded columns of the specified Spark dataframe using the specified formula. 39 | 40 | :param formula: Formula (R-like, based on patsy). 41 | :param sdf: Spark dataframe. 42 | :param profile: Profile. Default is `None` and profile will be determined empirically. 43 | :return: Tuple of columns for y, X. 44 | """ 45 | if profile is None: 46 | profile = get_profile(sdf) 47 | 48 | data = product(*(v for _, v in profile.items())) 49 | columns = [k for k, _ in profile.items()] 50 | df = pd.DataFrame(data, columns=columns) 51 | 52 | if 'np.' in formula: 53 | np = importlib.import_module('numpy') 54 | y, X = dmatrices(formula, df, return_type='dataframe') 55 | 56 | return list(y), list(X) 57 | 58 | 59 | def __smatrices(columns, sdf): 60 | """ 61 | Constructs new Spark dataframe based on columns. 62 | 63 | :param columns: Columns generated from patsy. 64 | :param sdf: Spark dataframe. 65 | :return: Spark dataframe. 66 | """ 67 | 68 | def to_record(record): 69 | return Row(**{term: InteractionExtractor(record, term).value for term in columns}) 70 | 71 | return sdf.rdd \ 72 | .map(lambda r: to_record(r.asDict())) \ 73 | .toDF() 74 | 75 | 76 | def smatrices(formula, sdf, profile=None): 77 | """ 78 | Gets tuple of design/model matrices. 79 | 80 | :param formula: Formula. 81 | :param sdf: Spark dataframe. 82 | :param profile: Dictionary of data profile. 83 | :return: y, X Spark dataframes. 
84 | """ 85 | y_cols, X_cols = get_columns(formula, sdf, profile=profile) 86 | X = __smatrices(X_cols, sdf) 87 | y = __smatrices(y_cols, sdf) 88 | 89 | return y, X 90 | --------------------------------------------------------------------------------