├── .dockerignore ├── .github └── FUNDING.yml ├── .gitignore ├── .readthedocs.yml ├── Dockerfile ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── Makefile.bat ├── README.md ├── build.sh ├── docs ├── Makefile ├── autobuild.bat ├── autobuild.sh ├── make.bat └── source │ ├── _code │ ├── colon-cat-interaction.csv │ ├── colon-con-interaction.csv │ ├── data.csv │ ├── demo-formulas.py │ ├── demo.py │ ├── divide-interaction.csv │ ├── no-intercept.csv │ ├── star-cat-interaction.csv │ ├── star-con-interaction.csv │ ├── transformed-continuous.csv │ └── two-way-interactions.csv │ ├── _logo │ ├── logo-1000x1000.png │ ├── logo-250x250.png │ └── logo-500x500.png │ ├── _static │ ├── css │ │ └── override.css │ ├── favicon.ico │ └── images │ │ ├── logo-small.png │ │ ├── logo.png │ │ ├── ooc-logo.png │ │ └── ooc-small.png │ ├── _templates │ └── .gitkeep │ ├── conf.py │ ├── index.rst │ ├── modules.rst │ ├── quickstart.rst │ ├── refs.bib │ ├── robots.txt │ ├── ydot.rst │ └── zzz-bib.rst ├── logo.png ├── publish.sh ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_formula.py └── test_spark.py └── ydot ├── __init__.py ├── formula.py └── spark.py /.dockerignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | .idea/ 3 | docs/build/ 4 | .pytest_cache/ 5 | build/ 6 | coverage/ 7 | dist/ 8 | ydot.egg-info/ 9 | docs/build/ 10 | .coverage 11 | .noseids 12 | .ipynb_checkpoints/ 13 | joblib_memmap/ 14 | .DS_store -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: vangj 4 | patreon: vangj 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: https://oneoffcoder.com/ 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | .idea/ 3 | docs/build/ 4 | coverage/ 5 | .coverage 6 | .noseids 7 | dist/ 8 | ydot.egg-info/ 9 | build/ 10 | .ipynb_checkpoints/ 11 | .pypirc 12 | .pypircc 13 | joblib_memmap/ 14 | .pytest_cache/ 15 | .DS_store -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - requirements: requirements.txt 
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM oneoffcoder/python-java:latest 2 | 3 | LABEL author="Jee Vang, Ph.D." 4 | LABEL email="vangjee@gmail.com" 5 | 6 | ARG AAPI_VERSION 7 | ARG APYPI_REPO 8 | 9 | ENV API_VERSION=$AAPI_VERSION 10 | ENV PYPI_REPO=$APYPI_REPO 11 | 12 | RUN apt-get update \ 13 | && apt-get upgrade -y 14 | COPY . /code 15 | RUN pip install -r /code/requirements.txt 16 | RUN /code/publish.sh -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2017 Jee Vang 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt README.md 2 | prune tests* -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init clean lint test 2 | .DEFAULT_GOAL := build 3 | 4 | init: 5 | pip install -r requirements.txt 6 | 7 | lint: 8 | python -m flake8 ./ydot 9 | 10 | test: clean lint 11 | nosetests tests 12 | 13 | build: test 14 | python setup.py bdist_egg 15 | 16 | build-dist: compile 17 | python setup.py bdist_egg sdist bdist_wheel 18 | 19 | install: build 20 | python setup.py install 21 | 22 | publish: build 23 | python setup.py sdist upload -r pypi 24 | 25 | compile: 26 | python -m compileall -f ./ydot 27 | 28 | clean: 29 | find . -type f -name '*.pyc' -delete 30 | find . -type d -name '__pycache__' -delete 31 | rm -fr coverage/ 32 | rm -fr dist/ 33 | rm -fr build/ 34 | rm -fr ydot.egg-info/ 35 | rm -fr jupyter/.ipynb_checkpoints/ 36 | rm -fr joblib_memmap/ 37 | rm -fr docs/build/ 38 | rm -fr .pytest_cache/ 39 | rm -f .coverage 40 | rm -f .noseids 41 | 42 | -------------------------------------------------------------------------------- /Makefile.bat: -------------------------------------------------------------------------------- 1 | @ECHO off 2 | if /I %1 == default goto :default 3 | if /I %1 == init goto :init 4 | if /I %1 == lint goto :lint 5 | if /I %1 == test goto :test 6 | if /I %1 == clean goto :clean 7 | if /I %1 == build goto :build 8 | if /I %1 == install goto :install 9 | 10 | goto :eof ::can be omitted to run the `default` function similarly to makefiles 11 | 12 | :default 13 | goto :test 14 | 15 | :init 16 | pip install -r requirements.txt 17 | goto :eof 18 | 19 | :lint 20 | python -m flake8 ./ydot 21 | goto :eof 22 | 23 | :test 24 | nosetests tests 25 | goto :eof 26 | 27 | :clean 28 | del /S *.pyc 29 | rmdir /S /Q coverage 30 | rmdir /S /Q dist 31 | rmdir /S /Q build 32 | rmdir /S /Q ydot.egg-info 33 | rmdir /S /Q jupyter/.ipynb_checkpoints 34 | rmdir /S /Q docs/build 35 | rmdir /S /Q joblib_memmap 36 | rmdir /S /Q .pytest_cache 37 | del .coverage 38 | del .noseids 39 | goto :eof 40 | 41 | :build 42 | python setup.py bdist_egg sdist bdist_wheel 43 | goto :eof 44 | 45 | :install 46 | python setup.py install 47 | goto :eof -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![ydot logo](https://ydot.readthedocs.io/en/latest/_images/logo.png) 2 | 3 | # ydot 4 | 5 | R-like formulas for Spark Dataframes. 6 | 7 | - [Documentation](https://ydot.readthedocs.io/) 8 | - [PyPi](https://pypi.org/project/ydot/) 9 | - [Gitter](https://gitter.im/dataflava/ydot) 10 | 11 | Now you have the expressive power of R-like formulas to produce design matrices for your experimental needs. This API is based on [patsy](https://patsy.readthedocs.io/en/latest/), but for use with Apache Spark dataframes. Given a Spark dataframe, you can express your design matrices with something that resembles the following. 12 | 13 | `y ~ x1 + x2 + (x3 + a + b)**2` 14 | 15 | Here's a short and sweet example.
16 | 17 | ```python 18 | from ydot.spark import smatrices 19 | 20 | spark_df = get_a_spark_dataframe() 21 | formula = 'y ~ x1 + x2 + (x3 + a + b)**2' 22 | y, X = smatrices(formula, spark_df) 23 | ``` 24 | 25 | # Software Copyright 26 | 27 | ``` 28 | Copyright 2020 One-Off Coder 29 | 30 | Licensed under the Apache License, Version 2.0 (the "License"); 31 | you may not use this file except in compliance with the License. 32 | You may obtain a copy of the License at 33 | 34 | http://www.apache.org/licenses/LICENSE-2.0 35 | 36 | Unless required by applicable law or agreed to in writing, software 37 | distributed under the License is distributed on an "AS IS" BASIS, 38 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 39 | See the License for the specific language governing permissions and 40 | limitations under the License. 41 | ``` 42 | 43 | # Book Copyright 44 | 45 | Copyright 2020 One-Off Coder 46 | 47 | This work is licensed under a [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) by [One-Off Coder](https://www.oneoffcoder.com). 48 | 49 | ![Creative Commons Attribution 4.0 International License](https://i.creativecommons.org/l/by/4.0/88x31.png "Creative Commons Attribution 4.0 International License") 50 | 51 | # Art Copyright 52 | 53 | Copyright 2020 Daytchia Vang 54 | 55 | # Citation 56 | 57 | ``` 58 | @misc{oneoffcoder_ydot_2020, 59 | title={ydot, R-like formulas for Spark Dataframes}, 60 | url={https://github.com/oneoffcoder/pyspark-formula}, 61 | author={Jee Vang}, 62 | year={2020}, 63 | month={Dec}} 64 | ``` 65 | 66 | # Sponsor, Love 67 | 68 | - [Patreon](https://www.patreon.com/vangj) 69 | - [GitHub](https://github.com/sponsors/vangj) -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DOCKER_FILE=Dockerfile 4 | DOCKER_REPO=ydot 5 | DOCKER_TAG=local 6 | AAPI_VERSION=version 7 | APYPI_REPO=repo 8 | 9 | while getopts v:r: option 10 | do 11 | case "${option}" 12 | in 13 | v) AAPI_VERSION=${OPTARG};; 14 | r) APYPI_REPO=${OPTARG};; 15 | esac 16 | done 17 | 18 | if [[ "version" == $AAPI_VERSION || "repo" == $APYPI_REPO ]]; then 19 | echo "Usage: ./build.sh -r [pypi|testpypi] -v [version]" 20 | echo " -r repository, pypi or testpypi" 21 | echo " -v version e.g. 0.2.5" 22 | else 23 | docker build --no-cache \ 24 | -f $DOCKER_FILE \ 25 | --build-arg AAPI_VERSION=$AAPI_VERSION \ 26 | --build-arg APYPI_REPO=$APYPI_REPO \ 27 | -t ${DOCKER_REPO}:${DOCKER_TAG} . 28 | fi -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/autobuild.bat: -------------------------------------------------------------------------------- 1 | python -m sphinx_autobuild ./source ./build -b html --host 0.0.0.0 --port 8000 -------------------------------------------------------------------------------- /docs/autobuild.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m sphinx_autobuild ./source ./build -b html --host 0.0.0.0 --port 8000 4 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_code/colon-cat-interaction.csv: -------------------------------------------------------------------------------- 1 | Intercept,b[T.low],b[T.mid],a[T.right]:b[high],a[T.right]:b[low],a[T.right]:b[mid] 2 | 1.0,1.0,0.0,0.0,0.0,0.0 3 | 1.0,1.0,0.0,0.0,0.0,0.0 4 | 1.0,0.0,0.0,1.0,0.0,0.0 5 | 1.0,0.0,1.0,0.0,0.0,1.0 6 | 1.0,1.0,0.0,0.0,0.0,0.0 7 | -------------------------------------------------------------------------------- /docs/source/_code/colon-con-interaction.csv: -------------------------------------------------------------------------------- 1 | Intercept,x1:x2 2 | 1.0,76.83302248278848 3 | 1.0,84.73542172597531 4 | 1.0,55.154885818557126 5 | 1.0,97.44678088481062 6 | 1.0,52.341422295472896 7 | -------------------------------------------------------------------------------- /docs/source/_code/data.csv: -------------------------------------------------------------------------------- 1 | a,b,x1,x2,y 2 | left,low,19.945536387662504,3.85214120038979,0.0 3 | left,low,20.674308066353493,4.098585619118175,1.0 4 | right,high,20.346647025958433,2.7107604387194626,1.0 5 | right,mid,18.699653829045985,5.2111542692543065,1.0 6 | left,low,21.51851187887476,2.432390426907621,1.0 7 | right,mid,20.989823705535017,3.6774523253171734,1.0 8 | right,high,20.277680897136328,2.4873300559969604,0.0 9 | right,mid,19.551410645704927,2.3549674965407372,0.0 10 | right,low,20.96196624352397,3.1665930443154995,0.0 11 | right,mid,19.172421360793678,3.562224297579924,1.0 -------------------------------------------------------------------------------- /docs/source/_code/demo-formulas.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import 
choice 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pyspark.sql import SparkSession 7 | 8 | from ydot.spark import smatrices 9 | 10 | random.seed(37) 11 | np.random.seed(37) 12 | 13 | 14 | def get_spark_dataframe(spark): 15 | n = 100 16 | data = { 17 | 'a': [choice(['left', 'right']) for _ in range(n)], 18 | 'b': [choice(['high', 'mid', 'low']) for _ in range(n)], 19 | 'x1': np.random.normal(20, 1, n), 20 | 'x2': np.random.normal(3, 1, n), 21 | 'y': [choice([1.0, 0.0]) for _ in range(n)] 22 | } 23 | pdf = pd.DataFrame(data) 24 | 25 | sdf = spark.createDataFrame(pdf) 26 | return sdf 27 | 28 | 29 | if __name__ == '__main__': 30 | try: 31 | spark = (SparkSession.builder 32 | .master('local[4]') 33 | .appName('local-testing-pyspark') 34 | .getOrCreate()) 35 | sdf = get_spark_dataframe(spark) 36 | 37 | formulas = [ 38 | { 39 | 'f': 'y ~ np.sin(x1) + np.cos(x2) + a + b', 40 | 'o': 'transformed-continuous.csv' 41 | }, 42 | { 43 | 'f': 'y ~ x1*x2', 44 | 'o': 'star-con-interaction.csv' 45 | }, 46 | { 47 | 'f': 'y ~ a*b', 48 | 'o': 'star-cat-interaction.csv' 49 | }, 50 | { 51 | 'f': 'y ~ x1:x2', 52 | 'o': 'colon-con-interaction.csv' 53 | }, 54 | { 55 | 'f': 'y ~ a:b', 56 | 'o': 'colon-cat-interaction.csv' 57 | }, 58 | { 59 | 'f': 'y ~ (x1 + x2) / (a + b)', 60 | 'o': 'divide-interaction.csv' 61 | }, 62 | { 63 | 'f': 'y ~ x1 + x2 + a - 1', 64 | 'o': 'no-intercept.csv' 65 | } 66 | ] 67 | 68 | for item in formulas: 69 | f = item['f'] 70 | o = item['o'] 71 | 72 | y, X = smatrices(f, sdf) 73 | y = y.toPandas() 74 | X = X.toPandas() 75 | 76 | X.head(5).to_csv(o, index=False) 77 | 78 | s = f""" 79 | .. csv-table:: {f} 80 | :file: _code/{o} 81 | :header-rows: 1 82 | """ 83 | print(s.strip()) 84 | except Exception as e: 85 | print(e) 86 | finally: 87 | try: 88 | spark.stop() 89 | print('closed spark') 90 | except Exception as e: 91 | print(e) 92 | -------------------------------------------------------------------------------- /docs/source/_code/demo.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import choice 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pyspark.sql import SparkSession 7 | 8 | from ydot.spark import smatrices 9 | 10 | random.seed(37) 11 | np.random.seed(37) 12 | 13 | 14 | def get_spark_dataframe(spark): 15 | n = 100 16 | data = { 17 | 'a': [choice(['left', 'right']) for _ in range(n)], 18 | 'b': [choice(['high', 'mid', 'low']) for _ in range(n)], 19 | 'x1': np.random.normal(20, 1, n), 20 | 'x2': np.random.normal(3, 1, n), 21 | 'y': [choice([1.0, 0.0]) for _ in range(n)] 22 | } 23 | pdf = pd.DataFrame(data) 24 | 25 | sdf = spark.createDataFrame(pdf) 26 | return sdf 27 | 28 | 29 | if __name__ == '__main__': 30 | try: 31 | spark = (SparkSession.builder 32 | .master('local[4]') 33 | .appName('local-testing-pyspark') 34 | .getOrCreate()) 35 | sdf = get_spark_dataframe(spark) 36 | 37 | y, X = smatrices('y ~ (x1 + x2 + a + b)**2', sdf) 38 | y = y.toPandas() 39 | X = X.toPandas() 40 | 41 | print(X.head(10)) 42 | X.head(10).to_csv('two-way-interactions.csv', index=False) 43 | except Exception as e: 44 | print(e) 45 | finally: 46 | try: 47 | spark.stop() 48 | print('closed spark') 49 | except Exception as e: 50 | print(e) 51 | -------------------------------------------------------------------------------- /docs/source/_code/divide-interaction.csv: -------------------------------------------------------------------------------- 1 | 
Intercept,x1,x2,x1:x2:a[left],x1:x2:a[right],x1:x2:b[T.low],x1:x2:b[T.mid] 2 | 1.0,19.945536387662504,3.85214120038979,76.83302248278848,0.0,76.83302248278848,0.0 3 | 1.0,20.674308066353493,4.098585619118175,84.73542172597531,0.0,84.73542172597531,0.0 4 | 1.0,20.346647025958433,2.7107604387194626,0.0,55.154885818557126,0.0,0.0 5 | 1.0,18.699653829045985,5.2111542692543065,0.0,97.44678088481062,0.0,97.44678088481062 6 | 1.0,21.51851187887476,2.432390426907621,52.341422295472896,0.0,52.341422295472896,0.0 7 | -------------------------------------------------------------------------------- /docs/source/_code/no-intercept.csv: -------------------------------------------------------------------------------- 1 | a[left],a[right],x1,x2 2 | 1.0,0.0,19.945536387662504,3.85214120038979 3 | 1.0,0.0,20.674308066353493,4.098585619118175 4 | 0.0,1.0,20.346647025958433,2.7107604387194626 5 | 0.0,1.0,18.699653829045985,5.2111542692543065 6 | 1.0,0.0,21.51851187887476,2.432390426907621 7 | -------------------------------------------------------------------------------- /docs/source/_code/star-cat-interaction.csv: -------------------------------------------------------------------------------- 1 | Intercept,a[T.right],b[T.low],b[T.mid],a[T.right]:b[T.low],a[T.right]:b[T.mid] 2 | 1.0,0.0,1.0,0.0,0.0,0.0 3 | 1.0,0.0,1.0,0.0,0.0,0.0 4 | 1.0,1.0,0.0,0.0,0.0,0.0 5 | 1.0,1.0,0.0,1.0,0.0,1.0 6 | 1.0,0.0,1.0,0.0,0.0,0.0 7 | -------------------------------------------------------------------------------- /docs/source/_code/star-con-interaction.csv: -------------------------------------------------------------------------------- 1 | Intercept,x1,x2,x1:x2 2 | 1.0,19.945536387662504,3.85214120038979,76.83302248278848 3 | 1.0,20.674308066353493,4.098585619118175,84.73542172597531 4 | 1.0,20.346647025958433,2.7107604387194626,55.154885818557126 5 | 1.0,18.699653829045985,5.2111542692543065,97.44678088481062 6 | 1.0,21.51851187887476,2.432390426907621,52.341422295472896 7 | -------------------------------------------------------------------------------- /docs/source/_code/transformed-continuous.csv: -------------------------------------------------------------------------------- 1 | Intercept,a[T.right],b[T.low],b[T.mid],np.sin(x1),np.cos(x2) 2 | 1.0,0.0,1.0,0.0,0.8893769205406579,-0.758004200582313 3 | 1.0,0.0,1.0,0.0,0.9679261582216445,-0.5759807266894401 4 | 1.0,1.0,0.0,0.0,0.9972849995254774,-0.9086185088676886 5 | 1.0,1.0,0.0,1.0,-0.14934132364604816,0.4783416124776783 6 | 1.0,0.0,1.0,0.0,0.45523550315103734,-0.7588816501987654 7 | -------------------------------------------------------------------------------- /docs/source/_code/two-way-interactions.csv: -------------------------------------------------------------------------------- 1 | Intercept,a[T.right],b[T.low],b[T.mid],a[T.right]:b[T.low],a[T.right]:b[T.mid],x1,x1:a[T.right],x1:b[T.low],x1:b[T.mid],x2,x2:a[T.right],x2:b[T.low],x2:b[T.mid],x1:x2 2 | 1.0,0.0,1.0,0.0,0.0,0.0,19.945536387662504,0.0,19.945536387662504,0.0,3.85214120038979,0.0,3.85214120038979,0.0,76.83302248278848 3 | 1.0,0.0,1.0,0.0,0.0,0.0,20.674308066353493,0.0,20.674308066353493,0.0,4.098585619118175,0.0,4.098585619118175,0.0,84.73542172597531 4 | 1.0,1.0,0.0,0.0,0.0,0.0,20.346647025958433,20.346647025958433,0.0,0.0,2.7107604387194626,2.7107604387194626,0.0,0.0,55.154885818557126 5 | 1.0,1.0,0.0,1.0,0.0,1.0,18.699653829045985,18.699653829045985,0.0,18.699653829045985,5.2111542692543065,5.2111542692543065,0.0,5.2111542692543065,97.44678088481062 6 | 
1.0,0.0,1.0,0.0,0.0,0.0,21.51851187887476,0.0,21.51851187887476,0.0,2.432390426907621,0.0,2.432390426907621,0.0,52.341422295472896 7 | 1.0,1.0,0.0,1.0,0.0,1.0,20.989823705535017,20.989823705535017,0.0,20.989823705535017,3.6774523253171734,3.6774523253171734,0.0,3.6774523253171734,77.18907599391727 8 | 1.0,1.0,0.0,0.0,0.0,0.0,20.277680897136328,20.277680897136328,0.0,0.0,2.4873300559969604,2.4873300559969604,0.0,0.0,50.437285161362595 9 | 1.0,1.0,0.0,1.0,0.0,1.0,19.551410645704927,19.551410645704927,0.0,19.551410645704927,2.3549674965407372,2.3549674965407372,0.0,2.3549674965407372,46.04293658215565 10 | 1.0,1.0,1.0,0.0,1.0,0.0,20.96196624352397,20.96196624352397,20.96196624352397,0.0,3.1665930443154995,3.1665930443154995,3.1665930443154995,0.0,66.3780165019193 11 | 1.0,1.0,0.0,1.0,0.0,1.0,19.172421360793678,19.172421360793678,0.0,19.172421360793678,3.562224297579924,3.562224297579924,0.0,3.562224297579924,68.29646521485958 12 | -------------------------------------------------------------------------------- /docs/source/_logo/logo-1000x1000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_logo/logo-1000x1000.png -------------------------------------------------------------------------------- /docs/source/_logo/logo-250x250.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_logo/logo-250x250.png -------------------------------------------------------------------------------- /docs/source/_logo/logo-500x500.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_logo/logo-500x500.png -------------------------------------------------------------------------------- /docs/source/_static/css/override.css: -------------------------------------------------------------------------------- 1 | table.expand { 2 | width: 100%; 3 | } 4 | table.rc-headers, th.rc-headers, td.rc-headers { 5 | border: 1px dashed blue; 6 | border-collapse: collapse; 7 | padding: 5px; 8 | } 9 | th.heading, td.heading { 10 | font-weight: bold; 11 | } -------------------------------------------------------------------------------- /docs/source/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/favicon.ico -------------------------------------------------------------------------------- /docs/source/_static/images/logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/images/logo-small.png -------------------------------------------------------------------------------- /docs/source/_static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/images/logo.png -------------------------------------------------------------------------------- /docs/source/_static/images/ooc-logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/images/ooc-logo.png -------------------------------------------------------------------------------- /docs/source/_static/images/ooc-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_static/images/ooc-small.png -------------------------------------------------------------------------------- /docs/source/_templates/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/docs/source/_templates/.gitkeep -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../../')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ydot' 21 | copyright = '2020, One-Off Coder' 22 | author = 'Jee Vang, Ph.D.' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '0.0.6' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.todo', 37 | 'sphinx.ext.coverage', 38 | 'sphinx.ext.mathjax', 39 | 'sphinx.ext.githubpages', 40 | 'sphinxcontrib.bibtex', 41 | 'sphinxcontrib.blockdiag', 42 | 'sphinx_sitemap' 43 | ] 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ['_templates'] 47 | 48 | # List of patterns, relative to source directory, that match files and 49 | # directories to ignore when looking for source files. 50 | # This pattern also affects html_static_path and html_extra_path. 51 | exclude_patterns = [] 52 | 53 | 54 | # -- Options for HTML output ------------------------------------------------- 55 | 56 | # The theme to use for HTML and HTML Help pages. See the documentation for 57 | # a list of builtin themes. 58 | # 59 | html_theme = 'sphinx_rtd_theme' 60 | 61 | # Add any paths that contain custom static files (such as style sheets) here, 62 | # relative to this directory. They are copied after the builtin static files, 63 | # so a file named "default.css" will overwrite the builtin "default.css". 
64 | html_static_path = ['_static'] 65 | html_css_files = [ 66 | 'css/override.css', 67 | ] 68 | html_extra_path = ['robots.txt'] 69 | html_show_sourcelink = False 70 | html_show_sphinx = False 71 | html_last_updated_fmt = '%b %d, %Y, %X' 72 | html_logo = '_static/images/logo-small.png' 73 | html_favicon = '_static/favicon.ico' 74 | html_theme_options = { 75 | 'canonical_url': 'https://ydot.readthedocs.io/', 76 | 'analytics_id': 'UA-150762273-1', # Provided by Google in your dashboard 77 | 'logo_only': False, 78 | 'display_version': True, 79 | 'prev_next_buttons_location': 'bottom', 80 | 'style_external_links': True, 81 | 'style_nav_header_background': '#0085CA', 82 | # Toc options 83 | 'collapse_navigation': True, 84 | 'sticky_navigation': True, 85 | 'navigation_depth': 4, 86 | 'includehidden': True, 87 | 'titles_only': False 88 | } -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: R-like formulas for Spark Dataframes. 3 | :keywords: statistics, pyspark, formula, patsy, spark, dataframe, regression, classification, data, machine learning, artificial intelligence 4 | :robots: index, follow 5 | :abstract: A Python API to produce PySpark dataframe models from R-like formula expressions. 6 | :author: Jee Vang, Ph.D. 7 | :contact: g@oneoffcoder.com 8 | :copyright: One-Off Coder 9 | :content: global 10 | :generator: Sphinx 11 | :language: English 12 | :rating: general 13 | :reply-to: info@oneoffcoder.com 14 | :web_author: Jee Vang, Ph.D. 15 | :revisit-after: 1 days 16 | 17 | .. ydot documentation master file, created by 18 | sphinx-quickstart on Sun Dec 6 17:42:42 2020. 19 | You can adapt this file completely to your liking, but it should at least 20 | contain the root `toctree` directive. 21 | 22 | ydot 23 | ==== 24 | 25 | .. image:: _static/images/logo.png 26 | :align: center 27 | :alt: ydot logo. 28 | 29 | ``ydot`` is a Python API to produce PySpark dataframe models from R-like formula expressions. This project is based on `patsy <https://patsy.readthedocs.io/en/latest/>`_ :cite:`2020:patsy`. As a quickstart, let's say you have a Spark dataframe with data as follows. 30 | 31 | .. csv-table:: Dummy Data in a Spark Dataframe 32 | :file: _code/data.csv 33 | :header-rows: 1 34 | 35 | Now, let's say you want to model this dataset as follows. 36 | 37 | - ``y ~ x1 + x2 + a + b`` 38 | 39 | Then all you have to do is use the ``smatrices()`` function. 40 | 41 | .. code-block:: python 42 | :linenos: 43 | 44 | from ydot.spark import smatrices 45 | 46 | formula = 'y ~ x1 + x2 + a + b' 47 | y, X = smatrices(formula, sdf) 48 | 49 | Observe that ``y`` and ``X`` will be Spark dataframes as specified by the formula. Here's a more interesting example where you want a model specified up to all two-way interactions. 50 | 51 | - ``y ~ (x1 + x2 + a + b)**2`` 52 | 53 | Then you could issue the code as below. 54 | 55 | .. code-block:: python 56 | :linenos: 57 | 58 | from ydot.spark import smatrices 59 | 60 | formula = 'y ~ (x1 + x2 + a + b)**2' 61 | y, X = smatrices(formula, sdf) 62 | 63 | Your resulting ``X`` Spark dataframe will look like the following. 64 | 65 | .. csv-table:: Dummy Data Transformed by Formula 66 | :file: _code/two-way-interactions.csv 67 | :header-rows: 1 68 | 69 | In general, what you get with ``patsy`` is what you get with ``ydot``; however, there are exceptions.
For example, the built-in functions such as ``standardize()`` and ``center()`` available with ``patsy`` will not work against Spark dataframes. Additionally, patsy allows for custom transforms, but such transforms (or user-defined functions) must be visible. For now, only numpy-based transforms are allowed against continuous variables (or numeric columns). 70 | 71 | .. toctree:: 72 | :maxdepth: 2 73 | :caption: Contents 74 | 75 | quickstart 76 | zzz-bib 77 | 78 | .. toctree:: 79 | :maxdepth: 2 80 | :caption: API Documentation 81 | 82 | modules 83 | 84 | 85 | 86 | Indices and tables 87 | ================== 88 | 89 | * :ref:`genindex` 90 | * :ref:`modindex` 91 | * :ref:`search` 92 | 93 | About 94 | ===== 95 | 96 | .. image:: _static/images/ooc-logo.png 97 | :alt: One-Off Coder logo. 98 | 99 | One-Off Coder is an educational, service and product company. Please visit us online to discover how we may help you achieve life-long success in your personal coding career or with your company's business goals and objectives. 100 | 101 | - |Website_Link| 102 | - |Facebook_Link| 103 | - |Twitter_Link| 104 | - |Instagram_Link| 105 | - |YouTube_Link| 106 | - |LinkedIn_Link| 107 | 108 | .. |Website_Link| raw:: html 109 | 110 | Website 111 | 112 | .. |Facebook_Link| raw:: html 113 | 114 | Facebook 115 | 116 | .. |Twitter_Link| raw:: html 117 | 118 | Twitter 119 | 120 | .. |Instagram_Link| raw:: html 121 | 122 | Instagram 123 | 124 | .. |YouTube_Link| raw:: html 125 | 126 | YouTube 127 | 128 | .. |LinkedIn_Link| raw:: html 129 | 130 | LinkedIn 131 | 132 | Copyright 133 | ========= 134 | 135 | Documentation 136 | ------------- 137 | 138 | .. raw:: html 139 | 140 | 141 | This work is licensed under a Creative Commons Attribution 4.0 International License by One-Off Coder. 142 | 
143 |
144 | 145 | Creative Commons License 146 | 147 |
148 |
149 | 150 | 151 | Software 152 | -------- 153 | 154 | :: 155 | 156 | Copyright 2020 One-Off Coder 157 | 158 | Licensed under the Apache License, Version 2.0 (the "License"); 159 | you may not use this file except in compliance with the License. 160 | You may obtain a copy of the License at 161 | 162 | http://www.apache.org/licenses/LICENSE-2.0 163 | 164 | Unless required by applicable law or agreed to in writing, software 165 | distributed under the License is distributed on an "AS IS" BASIS, 166 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 167 | See the License for the specific language governing permissions and 168 | limitations under the License. 169 | 170 | Art 171 | --- 172 | 173 | :: 174 | 175 | Copyright 2020 Daytchia Vang 176 | 177 | Citation 178 | ======== 179 | 180 | :: 181 | 182 | @misc{oneoffcoder_ydot_2020, 183 | title={ydot, R-like formulas for Spark Dataframes}, 184 | url={https://github.com/oneoffcoder/pyspark-formula}, 185 | author={Jee Vang}, 186 | year={2020}, 187 | month={Dec}} 188 | 189 | Author 190 | ====== 191 | 192 | Jee Vang, Ph.D. 193 | 194 | - |Patreon_Link| 195 | - |Github_Link| 196 | 197 | .. |Patreon_Link| raw:: html 198 | 199 | Patreon: support is appreciated 200 | 201 | .. |Github_Link| raw:: html 202 | 203 | GitHub: sponsorship will help us change the world for the better 204 | 205 | Help 206 | ==== 207 | 208 | - |Source_Link| 209 | - |Gitter_Link| 210 | 211 | .. |Source_Link| raw:: html 212 | 213 | GitHub: source code 214 | 215 | .. |Gitter_Link| raw:: html 216 | 217 | Gitter: chat -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :maxdepth: 4 3 | 4 | ydot 5 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Basic 5 | ----- 6 | 7 | The best way to learn ``R``-style formula syntax with ``ydot`` is to head on over to `patsy `_ :cite:`2020:patsy` and read the documentation. Below, we show very simple code to transform a Spark dataframe into two design matrices (these are also Spark dataframes), ``y`` and ``X``, using a formula that defines a model up to two-way interactions. 8 | 9 | .. literalinclude:: _code/demo.py 10 | :language: python 11 | :linenos: 12 | 13 | More 14 | ---- 15 | 16 | We use the code below to generate the models (data) below. 17 | 18 | .. literalinclude:: _code/demo-formulas.py 19 | :language: python 20 | :linenos: 21 | 22 | You can use ``numpy`` functions against continuous variables. 23 | 24 | .. csv-table:: y ~ np.sin(x1) + np.cos(x2) + a + b 25 | :file: _code/transformed-continuous.csv 26 | :header-rows: 1 27 | 28 | The ``*`` specifies interactions and keeps lower order terms. 29 | 30 | .. csv-table:: y ~ x1*x2 31 | :file: _code/star-con-interaction.csv 32 | :header-rows: 1 33 | 34 | .. csv-table:: y ~ a*b 35 | :file: _code/star-cat-interaction.csv 36 | :header-rows: 1 37 | 38 | The ``:`` specifies interactions and drops lower order terms. 39 | 40 | .. csv-table:: y ~ x1:x2 41 | :file: _code/colon-con-interaction.csv 42 | :header-rows: 1 43 | 44 | .. csv-table:: y ~ a:b 45 | :file: _code/colon-cat-interaction.csv 46 | :header-rows: 1 47 | 48 | The ``/`` is **quirky** according to the patsy documentation, but it is shorthand for ``a / b = a + a:b``. 49 | 50 | .. 
csv-table:: y ~ (x1 + x2) / (a + b) 51 | :file: _code/divide-interaction.csv 52 | :header-rows: 1 53 | 54 | If you need to drop the ``Intercept``, add ``- 1`` at the end. Note that one of the dummy variables for ``a`` is not dropped. This could be a bug with patsy. 55 | 56 | .. csv-table:: y ~ x1 + x2 + a - 1 57 | :file: _code/no-intercept.csv 58 | :header-rows: 1 -------------------------------------------------------------------------------- /docs/source/refs.bib: -------------------------------------------------------------------------------- 1 | @misc{2020:patsy, 2 | author = {patsy}, 3 | title = {patsy - Describing statistical models in Python}, 4 | url = {https://patsy.readthedocs.io/en/latest/index.html}, 5 | addendum = "(accessed: 12.07.2020)" 6 | } -------------------------------------------------------------------------------- /docs/source/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | Sitemap: https://ydot.readthedocs.io/sitemap.xml -------------------------------------------------------------------------------- /docs/source/ydot.rst: -------------------------------------------------------------------------------- 1 | PySpark Formula 2 | =============== 3 | 4 | Formula 5 | ------- 6 | 7 | The ``formula`` module contains code to extract values from a record (e.g., a Spark dataframe record) based on the model definition. 8 | 9 | .. automodule:: ydot.formula 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | :special-members: __init__ 14 | 15 | Spark 16 | ----- 17 | 18 | The ``spark`` module contains code to transform a Spark dataframe into ``design matrices`` as specified by a formula. 19 | 20 | .. automodule:: ydot.spark 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | :special-members: __init__ -------------------------------------------------------------------------------- /docs/source/zzz-bib.rst: -------------------------------------------------------------------------------- 1 | Bibliography 2 | ------------ 3 | 4 | .. bibliography:: refs.bib 5 | :all: -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/logo.png -------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIST=/code/dist/ydot-${API_VERSION}.tar.gz 4 | 5 | buildCode() { 6 | echo "start the build" 7 | cd /code \ 8 | && make clean \ 9 | && make \ 10 | && python setup.py sdist bdist bdist_wheel \ 11 | && twine check dist/* \ 12 | && cd /code/docs \ 13 | && make html 14 | } 15 | 16 | updateVersion() { 17 | echo "replace version of software to ${API_VERSION}" 18 | sed -i "s/version='0.0.6'/version='${API_VERSION}'/g" /code/setup.py 19 | } 20 | 21 | copyCredentials() { 22 | if [[ -f /code/.pypirc ]]; then 23 | echo "copying over .pypirc" 24 | cp /code/.pypirc /root/.pypirc 25 | fi 26 | } 27 | 28 | publish() { 29 | echo "python publish" 30 | 31 | if [[ -f /root/.pypirc ]]; then 32 | if [[ -f ${SOURCE_DIST} ]]; then 33 | echo "uploading source" 34 | cd /code \ 35 | && make clean \ 36 | && python setup.py sdist \ 37 | && twine upload --repository ${PYPI_REPO} ${SOURCE_DIST} 38 | else 39 | echo "no ${SOURCE_DIST} found!" 
40 | fi 41 | else 42 | echo "no .pypirc found!" 43 | fi 44 | } 45 | 46 | cleanUp() { 47 | if [[ -f /root/.pypirc ]]; then 48 | echo "cleaning up" 49 | rm -f /root/.pypirc 50 | fi 51 | } 52 | 53 | build() { 54 | echo "python build" 55 | buildCode 56 | publish 57 | } 58 | 59 | conda init bash 60 | . /root/.bashrc 61 | updateVersion 62 | copyCredentials 63 | build 64 | cleanUp 65 | 66 | echo "done!" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # TEST 2 | nose 3 | coverage 4 | # LINT OR DIE 5 | flake8 6 | pep8 7 | pyflakes 8 | # LIBS 9 | numpy 10 | scipy 11 | pandas 12 | pyspark 13 | patsy 14 | # DOCUMENTATION 15 | sphinx 16 | sphinx_rtd_theme 17 | sphinxcontrib-bibtex 18 | sphinxcontrib-blockdiag 19 | sphinx-sitemap 20 | # PUBLISHING 21 | twine 22 | setuptools -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [flake8] 5 | max-line-length = 120 6 | ignore = E501 E731 7 | 8 | [nosetests] 9 | verbosity = 3 10 | with-doctest = 1 11 | with-coverage = 1 12 | with-id = 1 13 | cover-erase = 1 14 | cover-html = 1 15 | cover-html-dir = coverage 16 | cover-package = ydot 17 | detailed-errors = 1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.md', 'r') as fh: 4 | long_desc = fh.read() 5 | 6 | setup( 7 | name='ydot', 8 | version='0.0.6', 9 | author='Jee Vang', 10 | author_email='vangjee@gmail.com', 11 | packages=find_packages(exclude=('*.tests', '*.tests.*', 'tests.*', 'tests')), 12 | description='R-like formulas for Spark Dataframes', 13 | long_description=long_desc, 14 | long_description_content_type='text/markdown', 15 | url='https://github.com/oneoffcoder/pyspark-formula', 16 | keywords=' '.join( 17 | ['statistics', 'pyspark', 'formula', 'patsy', 'spark', 18 | 'dataframe', 'regression', 'classification', 'data', 19 | 'machine learning', 'artificial intelligence']), 20 | install_requires=['scipy', 'numpy', 'pandas', 'scikit-learn', 'pyspark', 'patsy'], 21 | classifiers=[ 22 | 'Programming Language :: Python :: 3', 23 | 'License :: OSI Approved :: Apache Software License', 24 | 'Operating System :: OS Independent', 25 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 26 | 'Intended Audience :: Developers', 27 | 'Intended Audience :: Science/Research', 28 | 'Development Status :: 5 - Production/Stable' 29 | ], 30 | include_package_data=True, 31 | test_suite='nose.collector' 32 | ) 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_formula.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from nose import with_setup 5 | 6 | from ydot.formula import TermEnum, InteractionExtractor 7 | 8 | 9 | def setup(): 10 | """ 11 | Setup. 12 | :return: None. 
13 | """ 14 | np.random.seed(37) 15 | random.seed(37) 16 | 17 | 18 | def teardown(): 19 | """ 20 | Teardown. 21 | :return: None. 22 | """ 23 | pass 24 | 25 | 26 | @with_setup(setup, teardown) 27 | def test_get_extractor(): 28 | """ 29 | Tests get extractor. 30 | 31 | :return: None. 32 | """ 33 | record = { 34 | 'x1': 20, 35 | 'x2': 5, 36 | 'a': 'left', 37 | 'b': 'mid' 38 | } 39 | terms = [ 40 | 'Intercept', 41 | "C(a, levels=profile['a'])[T.right]", 42 | "C(b, levels=profile['b'])[T.mid]", 43 | "C(b, levels=profile['b'])[T.high]", 44 | 'x1', 45 | 'x2'] 46 | expected = [ 47 | TermEnum.INT, 48 | TermEnum.LVL, 49 | TermEnum.LVL, 50 | TermEnum.LVL, 51 | TermEnum.CON, 52 | TermEnum.CON 53 | ] 54 | 55 | for i, term in enumerate(terms): 56 | extractor = TermEnum.get_extractor(record, term) 57 | lhs = expected[i] 58 | rhs = extractor._type 59 | # print(extractor) 60 | assert lhs == rhs 61 | 62 | 63 | @with_setup(setup, teardown) 64 | def test_basic_extractions(): 65 | """ 66 | Tests basic extractions. 67 | 68 | :return: None. 69 | """ 70 | record = { 71 | 'x1': 20.0, 72 | 'x2': 5.0, 73 | 'a': 'left', 74 | 'b': 'mid' 75 | } 76 | terms = [ 77 | 'Intercept', 78 | "C(a, levels=profile['a'])[T.left]", 79 | "C(a, levels=profile['a'])[T.right]", 80 | "C(b, levels=profile['b'])[T.low]", 81 | "C(b, levels=profile['b'])[T.mid]", 82 | "C(b, levels=profile['b'])[T.high]", 83 | 'x1', 84 | 'x2', 85 | 'a[left]', 'a[right]', 86 | 'a[T.left]', 'a[T.right]', 87 | 'b[low]', 'b[mid]', 'b[high]', 88 | 'b[T.low]', 'b[T.mid]', 'b[T.high]' 89 | ] 90 | expected = [ 91 | 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 20.0, 5.0, 92 | 1.0, 0.0, 93 | 1.0, 0.0, 94 | 0.0, 1.0, 0.0, 95 | 0.0, 1.0, 0.0 96 | ] 97 | 98 | for i, term in enumerate(terms): 99 | extractor = TermEnum.get_extractor(record, term) 100 | lhs = extractor.value 101 | rhs = expected[i] 102 | # print(f'{extractor._term}: {lhs}') 103 | assert lhs == rhs 104 | 105 | 106 | @with_setup(setup, teardown) 107 | def test_function_extractions(): 108 | """ 109 | Tests extractions of functions on continuous variables. 110 | 111 | :return: None. 112 | """ 113 | record = { 114 | 'x1': 20.0, 115 | 'x2': 5.0 116 | } 117 | terms = [ 118 | 'x1', 119 | 'x2', 120 | 'np.abs(x1)', 121 | 'np.log(x1)', 122 | 'np.sin(x1)', 123 | 'np.log(np.sin(x1))' 124 | ] 125 | expected = [ 126 | 20.0, 5.0, 20.0, 127 | 2.995732273553991, 128 | 0.9129452507276277, 129 | -0.09107936652955065 130 | ] 131 | 132 | for i, term in enumerate(terms): 133 | extractor = TermEnum.get_extractor(record, term) 134 | lhs = extractor.value 135 | rhs = expected[i] 136 | # print(f'{extractor._term}: {lhs}') 137 | assert lhs == rhs 138 | 139 | 140 | @with_setup(setup, teardown) 141 | def test_interaction_extractions(): 142 | """ 143 | Tests extractions of functions on continuous variables. 144 | 145 | :return: None. 
146 | """ 147 | record = { 148 | 'x1': 20.0, 149 | 'x2': 5.0, 150 | 'a': 'left', 151 | 'b': 'mid' 152 | } 153 | terms = [ 154 | 'Intercept', 155 | 'x1', 156 | 'x2', 157 | 'x1:x2:a[left]', 158 | 'x1:x2:a[right]', 159 | 'x1:x2:b[T.low]', 160 | 'x1:x2:b[T.mid]', 161 | 'a[T.right]:b[T.low]', 'a[T.right]:b[T.mid]', 162 | 'a[T.left]:b[T.mid]', 'a[T.left]:b[T.high]', 163 | "x1:x2:C(a, levels=profile['a'])[left]", "x1:x2:C(a, levels=profile['a'])[right]", 164 | "x1:x2:C(b, levels=profile['b'])[T.mid]", "x1:x2:C(b, levels=profile['b'])[T.high]", 165 | "np.abs(x1):a[T.left]" 166 | ] 167 | expected = [ 168 | 1.0, 169 | 20.0, 170 | 5.0, 171 | 100.0, 172 | 0.0, 173 | 0.0, 174 | 100.0, 175 | 0.0, 0.0, 176 | 1.0, 0.0, 177 | 100.0, 178 | 0.0, 179 | 100.0, 180 | 0.0, 181 | 20.0 182 | ] 183 | 184 | for i, term in enumerate(terms): 185 | extractor = InteractionExtractor(record, term) 186 | lhs = extractor.value 187 | rhs = expected[i] 188 | # print(f'{extractor._terms}: {lhs}') 189 | assert lhs == rhs 190 | -------------------------------------------------------------------------------- /tests/test_spark.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import unittest 4 | from itertools import product 5 | 6 | import pandas as pd 7 | from pyspark.sql import SparkSession 8 | 9 | from ydot.spark import get_profile, get_columns, smatrices 10 | 11 | 12 | class PySparkTest(unittest.TestCase): 13 | """ 14 | PySpark test class. 15 | """ 16 | 17 | @classmethod 18 | def supress_py4j_logging(cls): 19 | """ 20 | Supresses p4j logging. 21 | 22 | :return: None. 23 | """ 24 | logger = logging.getLogger('py4j') 25 | logger.setLevel(logging.WARN) 26 | 27 | @classmethod 28 | def create_pyspark_session(cls): 29 | """ 30 | Creates a PySpark session. 31 | 32 | :return: PySpark session. 33 | """ 34 | return (SparkSession.builder 35 | .master('local[4]') 36 | .appName('local-testing-pyspark') 37 | .getOrCreate()) 38 | 39 | @classmethod 40 | def setUpClass(cls): 41 | """ 42 | Sets up the class. 43 | 44 | :return: None. 45 | """ 46 | cls.supress_py4j_logging() 47 | cls.spark = cls.create_pyspark_session() 48 | random.seed(37) 49 | 50 | @classmethod 51 | def tearDownClass(cls): 52 | """ 53 | Tears down the class. 54 | 55 | :return: None. 56 | """ 57 | cls.spark.stop() 58 | 59 | @staticmethod 60 | def _get_profile(): 61 | """ 62 | Gets the profile of a dataset. 63 | 64 | :return: Dictionary. 65 | """ 66 | profile = { 67 | 'a': ['left', 'right'], 68 | 'b': ['high', 'mid', 'low'], 69 | 'x1': [20.0], 70 | 'x2': [3.0], 71 | 'y': [1.0] 72 | } 73 | return profile 74 | 75 | @staticmethod 76 | def _get_pdf(): 77 | """ 78 | Gets a Pandas dataframe based on made-up profile. 79 | 80 | :return: Pandas dataframe. 81 | """ 82 | profile = PySparkTest._get_profile() 83 | data = product(*(v for _, v in profile.items())) 84 | columns = [k for k, _ in profile.items()] 85 | df = pd.DataFrame(data, columns=columns) 86 | 87 | return df 88 | 89 | def _get_sdf(self): 90 | """ 91 | Gets a Spark dataframe based on made-up profile. 92 | 93 | :return: Spark dataframe. 94 | """ 95 | pdf = PySparkTest._get_pdf() 96 | sdf = self.spark.createDataFrame(pdf) 97 | return sdf 98 | 99 | 100 | class SparkTest(PySparkTest): 101 | """ 102 | Tests Spark operations. 103 | """ 104 | 105 | def test_get_profile(self): 106 | """ 107 | Tests getting profile of a Spark dataframe. 108 | 109 | :return: None. 
110 | """ 111 | sdf = self._get_sdf() 112 | sdf.printSchema() 113 | observed = get_profile(sdf) 114 | expected = {'b': ['mid', 'low', 'high'], 'a': ['right', 'left'], 'x1': [1.0], 'x2': [1.0], 'y': [1.0]} 115 | 116 | for k, lhs_vals in expected.items(): 117 | assert k in observed 118 | rhs_vals = observed[k] 119 | 120 | assert len(lhs_vals) == len(rhs_vals) 121 | for v in lhs_vals: 122 | assert v in rhs_vals 123 | 124 | def test_get_columns_simple_formula_with_profile(self): 125 | """ 126 | Tests get columns (simple) with profile specified. 127 | 128 | :return: None. 129 | """ 130 | formula = "y ~ x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b'])" 131 | sdf = self._get_sdf() 132 | profile = {'b': ['mid', 'low', 'high'], 133 | 'a': ['right', 'left'], 134 | 'x1': [1.0], 135 | 'x2': [1.0], 136 | 'y': [1.0]} 137 | 138 | y_observed, X_observed = get_columns(formula, sdf, profile=profile) 139 | 140 | y_expected = ['y'] 141 | X_expected = ['Intercept', 142 | "C(a, levels=profile['a'])[T.left]", 143 | "C(b, levels=profile['b'])[T.low]", 144 | "C(b, levels=profile['b'])[T.high]", 145 | 'x1', 146 | 'x2'] 147 | 148 | assert len(y_observed) == len(y_expected) 149 | assert len(X_observed) == len(X_expected) 150 | 151 | for y in y_observed: 152 | assert y in y_expected 153 | 154 | for x in X_observed: 155 | assert x in X_expected 156 | 157 | def test_get_columns_simple_formula_no_profile(self): 158 | """ 159 | Tests get columns (simple) without a profile specified. 160 | 161 | :return: None. 162 | """ 163 | formula = "y ~ x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b'])" 164 | sdf = self._get_sdf() 165 | 166 | y_observed, X_observed = get_columns(formula, sdf) 167 | 168 | y_expected = ['y'] 169 | X_expected = ['Intercept', 170 | "C(a, levels=profile['a'])[T.left]", 171 | "C(b, levels=profile['b'])[T.low]", 172 | "C(b, levels=profile['b'])[T.high]", 173 | 'x1', 174 | 'x2'] 175 | 176 | assert len(y_observed) == len(y_expected) 177 | assert len(X_observed) == len(X_expected) 178 | 179 | for y in y_observed: 180 | assert y in y_expected 181 | 182 | for x in X_observed: 183 | assert x in X_expected 184 | 185 | def test_get_columns_variety_with_profile(self): 186 | """ 187 | Tests a variety of formulas with profile. 188 | 189 | :return: None. 
190 | """ 191 | f1 = "y ~ x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b'])" 192 | f2 = "y ~ (x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b']))**2" 193 | f3 = "y ~ x1:x2 + C(a,levels=profile['a']):C(b, levels=profile['b'])" 194 | f4 = "y ~ x1*x2 + C(a,levels=profile['a'])*C(b, levels=profile['b'])" 195 | f5 = "y ~ x1 + x2 + C(a,levels=profile['a']) + C(b, levels=profile['b']) - 1" 196 | f6 = "y ~ (x1 + x2) / (C(a,levels=profile['a']) + C(b, levels=profile['b']))" 197 | 198 | formulas = [f1, f2, f3, f4, f5, f6] 199 | 200 | sdf = self._get_sdf() 201 | profile = {'b': ['low', 'mid', 'high'], 202 | 'a': ['left', 'right'], 203 | 'x1': [1.0], 204 | 'x2': [1.0], 205 | 'y': [1.0]} 206 | 207 | yy = [ 208 | ['y'], 209 | ['y'], 210 | ['y'], 211 | ['y'], 212 | ['y'], 213 | ['y'] 214 | ] 215 | XX = [ 216 | ['Intercept', "C(a, levels=profile['a'])[T.right]", "C(b, levels=profile['b'])[T.mid]", 217 | "C(b, levels=profile['b'])[T.high]", 'x1', 'x2'], 218 | ['Intercept', "C(a, levels=profile['a'])[T.right]", "C(b, levels=profile['b'])[T.mid]", 219 | "C(b, levels=profile['b'])[T.high]", "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[T.mid]", 220 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[T.high]", 'x1', 221 | "x1:C(a, levels=profile['a'])[T.right]", "x1:C(b, levels=profile['b'])[T.mid]", 222 | "x1:C(b, levels=profile['b'])[T.high]", 'x2', "x2:C(a, levels=profile['a'])[T.right]", 223 | "x2:C(b, levels=profile['b'])[T.mid]", "x2:C(b, levels=profile['b'])[T.high]", 'x1:x2'], 224 | ['Intercept', "C(b, levels=profile['b'])[T.mid]", "C(b, levels=profile['b'])[T.high]", 225 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[low]", 226 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[mid]", 227 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[high]", 'x1:x2'], 228 | ['Intercept', "C(a, levels=profile['a'])[T.right]", "C(b, levels=profile['b'])[T.mid]", 229 | "C(b, levels=profile['b'])[T.high]", "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[T.mid]", 230 | "C(a, levels=profile['a'])[T.right]:C(b, levels=profile['b'])[T.high]", 'x1', 'x2', 'x1:x2'], 231 | ["C(a, levels=profile['a'])[left]", "C(a, levels=profile['a'])[right]", "C(b, levels=profile['b'])[T.mid]", 232 | "C(b, levels=profile['b'])[T.high]", 'x1', 'x2'], 233 | ['Intercept', 'x1', 'x2', "x1:x2:C(a, levels=profile['a'])[left]", "x1:x2:C(a, levels=profile['a'])[right]", 234 | "x1:x2:C(b, levels=profile['b'])[T.mid]", "x1:x2:C(b, levels=profile['b'])[T.high]"] 235 | ] 236 | 237 | for i, formula in enumerate(formulas): 238 | y_observed, X_observed = get_columns(formula, sdf, profile=profile) 239 | y_expected, X_expected = yy[i], XX[i] 240 | 241 | # print(f'{i}: {formula}') 242 | # print(y_observed) 243 | # print(X_observed) 244 | # print('-' * 15) 245 | 246 | assert len(y_observed) == len(y_expected) 247 | assert len(X_observed) == len(X_expected) 248 | 249 | for y in y_observed: 250 | assert y in y_expected 251 | 252 | for x in X_observed: 253 | assert x in X_expected 254 | 255 | def test_get_columns_variety_no_profile(self): 256 | """ 257 | Tests a variety of formulas without profile. 258 | 259 | :return: None. 
260 | """ 261 | f1 = "y ~ x1 + x2 + a + b" 262 | f2 = "y ~ (x1 + x2 + a + b)**2" 263 | f3 = "y ~ x1:x2 + a:b" 264 | f4 = "y ~ x1*x2 + a*b" 265 | f5 = "y ~ x1 + x2 + a + b - 1" 266 | f6 = "y ~ (x1 + x2) / (a + b)" 267 | 268 | formulas = [f1, f2, f3, f4, f5, f6] 269 | 270 | sdf = self._get_sdf() 271 | 272 | yy = [ 273 | ['y'], 274 | ['y'], 275 | ['y'], 276 | ['y'], 277 | ['y'], 278 | ['y'] 279 | ] 280 | XX = [ 281 | ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'x1', 'x2'], 282 | ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[T.low]', 'a[T.right]:b[T.mid]', 'x1', 283 | 'x1:a[T.right]', 'x1:b[T.low]', 'x1:b[T.mid]', 'x2', 'x2:a[T.right]', 'x2:b[T.low]', 'x2:b[T.mid]', 284 | 'x1:x2'], 285 | ['Intercept', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[high]', 'a[T.right]:b[low]', 'a[T.right]:b[mid]', 286 | 'x1:x2'], 287 | ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[T.low]', 'a[T.right]:b[T.mid]', 'x1', 288 | 'x2', 'x1:x2'], 289 | ['a[left]', 'a[right]', 'b[T.low]', 'b[T.mid]', 'x1', 'x2'], 290 | ['Intercept', 'x1', 'x2', 'x1:x2:a[left]', 'x1:x2:a[right]', 'x1:x2:b[T.low]', 'x1:x2:b[T.mid]'] 291 | ] 292 | 293 | for i, formula in enumerate(formulas): 294 | y_observed, X_observed = get_columns(formula, sdf) 295 | y_expected, X_expected = yy[i], XX[i] 296 | 297 | # print(f'{i}: {formula}') 298 | # print(y_observed) 299 | # print(X_observed) 300 | # print('-' * 15) 301 | 302 | assert len(y_observed) == len(y_expected) 303 | assert len(X_observed) == len(X_expected) 304 | 305 | for y in y_observed: 306 | assert y in y_expected 307 | 308 | for x in X_observed: 309 | assert x in X_expected 310 | 311 | def test_smatrices_simple(self): 312 | """ 313 | Test simple smatrices. 314 | 315 | :return: None. 316 | """ 317 | f = 'y ~ x1 + x2 + a + b' 318 | sdf = self._get_sdf() 319 | e_rows = sdf.count() 320 | 321 | y_expected = ['y'] 322 | X_expected = ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'x1', 'x2'] 323 | 324 | y, X = smatrices(f, sdf) 325 | y, X = y.toPandas(), X.toPandas() 326 | 327 | y_observed = list(y.columns) 328 | X_observed = list(X.columns) 329 | 330 | # print(y_observed) 331 | # print('~' * 15) 332 | # print(y) 333 | # print('=' * 15) 334 | # 335 | # print(X_observed) 336 | # print('~' * 20) 337 | # print(X) 338 | 339 | assert e_rows == y.shape[0] 340 | assert e_rows == X.shape[0] 341 | assert len(y_expected) == len(y_observed) 342 | assert len(X_expected) == len(X_observed) 343 | 344 | for v in y_expected: 345 | assert v in y_observed 346 | for v in X_expected: 347 | assert v in X_observed 348 | 349 | def test_smatrices_simple_drop_intercept(self): 350 | """ 351 | Test simple smatrices dropping intercept. Note that dropping intercept creates a 352 | situation where the one-hot encoded variables are not dropped! Bug with patsy? 353 | 354 | :return: None. 
355 | """ 356 | f = 'y ~ x1 + x2 + a + b - 1' 357 | sdf = self._get_sdf() 358 | e_rows = sdf.count() 359 | 360 | y_expected = ['y'] 361 | X_expected = ['a[left]', 'a[right]', 'b[T.low]', 'b[T.mid]', 'x1', 'x2'] 362 | 363 | y, X = smatrices(f, sdf) 364 | y, X = y.toPandas(), X.toPandas() 365 | 366 | y_observed = list(y.columns) 367 | X_observed = list(X.columns) 368 | 369 | # print(y_observed) 370 | # print('~' * 15) 371 | # print(y) 372 | # print('=' * 15) 373 | # 374 | # print(X_observed) 375 | # print('~' * 20) 376 | # print(X) 377 | 378 | assert e_rows == y.shape[0] 379 | assert e_rows == X.shape[0] 380 | assert len(y_expected) == len(y_observed) 381 | assert len(X_expected) == len(X_observed) 382 | 383 | for v in y_expected: 384 | assert v in y_observed 385 | for v in X_expected: 386 | assert v in X_observed 387 | 388 | def test_smatrices_two_way(self): 389 | """ 390 | Test smatrices with two-way interaction. 391 | 392 | :return: None. 393 | """ 394 | f = 'y ~ (x1 + x2 + a + b)**2' 395 | sdf = self._get_sdf() 396 | e_rows = sdf.count() 397 | 398 | y_expected = ['y'] 399 | X_expected = ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[T.low]', 400 | 'a[T.right]:b[T.mid]', 'x1', 'x1:a[T.right]', 'x1:b[T.low]', 'x1:b[T.mid]', 401 | 'x2', 'x2:a[T.right]', 'x2:b[T.low]', 'x2:b[T.mid]', 'x1:x2'] 402 | 403 | y, X = smatrices(f, sdf) 404 | y, X = y.toPandas(), X.toPandas() 405 | 406 | y_observed = list(y.columns) 407 | X_observed = list(X.columns) 408 | 409 | # print(y_observed) 410 | # print('~' * 15) 411 | # print(y) 412 | # print('=' * 15) 413 | # 414 | # print(X_observed) 415 | # print('~' * 20) 416 | # print(X) 417 | 418 | assert e_rows == y.shape[0] 419 | assert e_rows == X.shape[0] 420 | assert len(y_expected) == len(y_observed) 421 | assert len(X_expected) == len(X_observed) 422 | 423 | for v in y_expected: 424 | assert v in y_observed 425 | for v in X_expected: 426 | assert v in X_observed 427 | 428 | def test_smatrices_three_way(self): 429 | """ 430 | Test smatrices with three-way interaction. 431 | 432 | :return: None. 433 | """ 434 | f = 'y ~ (x1 + x2 + a + b)**3' 435 | sdf = self._get_sdf() 436 | e_rows = sdf.count() 437 | 438 | y_expected = ['y'] 439 | X_expected = ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'a[T.right]:b[T.low]', 440 | 'a[T.right]:b[T.mid]', 'x1', 'x1:a[T.right]', 'x1:b[T.low]', 'x1:b[T.mid]', 441 | 'x1:a[T.right]:b[T.low]', 'x1:a[T.right]:b[T.mid]', 'x2', 'x2:a[T.right]', 442 | 'x2:b[T.low]', 'x2:b[T.mid]', 'x2:a[T.right]:b[T.low]', 'x2:a[T.right]:b[T.mid]', 443 | 'x1:x2', 'x1:x2:a[T.right]', 'x1:x2:b[T.low]', 'x1:x2:b[T.mid]'] 444 | 445 | y, X = smatrices(f, sdf) 446 | y, X = y.toPandas(), X.toPandas() 447 | 448 | y_observed = list(y.columns) 449 | X_observed = list(X.columns) 450 | 451 | # print(y_observed) 452 | # print('~' * 15) 453 | # print(y) 454 | # print('=' * 15) 455 | # 456 | # print(X_observed) 457 | # print('~' * 20) 458 | # print(X) 459 | 460 | assert e_rows == y.shape[0] 461 | assert e_rows == X.shape[0] 462 | assert len(y_expected) == len(y_observed) 463 | assert len(X_expected) == len(X_observed) 464 | 465 | for v in y_expected: 466 | assert v in y_observed 467 | for v in X_expected: 468 | assert v in X_observed 469 | 470 | def test_smatrices_weird(self): 471 | """ 472 | Test smatrices with weird interactions. 473 | 474 | :return: None. 
475 | """ 476 | f = 'np.sin(y) + y ~ np.abs(x1) + (x2 + a)**2 + (np.cos(x2) + b)**2' 477 | sdf = self._get_sdf() 478 | e_rows = sdf.count() 479 | 480 | y_expected = ['np.sin(y)', 'y'] 481 | X_expected = ['Intercept', 'a[T.right]', 'b[T.low]', 'b[T.mid]', 'np.abs(x1)', 'x2', 482 | 'x2:a[T.right]', 'np.cos(x2)', 'np.cos(x2):b[T.low]', 'np.cos(x2):b[T.mid]'] 483 | 484 | y, X = smatrices(f, sdf) 485 | y, X = y.toPandas(), X.toPandas() 486 | 487 | y_observed = list(y.columns) 488 | X_observed = list(X.columns) 489 | 490 | # print(y_observed) 491 | # print('~' * 15) 492 | # print(y) 493 | # print('=' * 15) 494 | # 495 | # print(X_observed) 496 | # print('~' * 20) 497 | # print(X) 498 | 499 | assert e_rows == y.shape[0] 500 | assert e_rows == X.shape[0] 501 | assert len(y_expected) == len(y_observed) 502 | assert len(X_expected) == len(X_observed) 503 | 504 | for v in y_expected: 505 | assert v in y_observed 506 | for v in X_expected: 507 | assert v in X_observed 508 | -------------------------------------------------------------------------------- /ydot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oneoffcoder/pyspark-formula/ea7b43cdddcb475063cef478007647323d048f7a/ydot/__init__.py -------------------------------------------------------------------------------- /ydot/formula.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from abc import ABC, abstractmethod 3 | from enum import IntEnum 4 | from functools import reduce 5 | 6 | import pandas as pd 7 | 8 | 9 | class TermEnum(IntEnum): 10 | """ 11 | Term types. 12 | 13 | - CAT: categorical without levels specified 14 | - LVL: categorical with levels specified 15 | - CON: continuous 16 | - FUN: continuous with function transformations 17 | - INT: intercept 18 | """ 19 | CAT = 1 20 | LVL = 2 21 | CON = 3 22 | FUN = 4 23 | INT = 5 24 | 25 | @staticmethod 26 | def get_extractor(record, term): 27 | """ 28 | Gets the associated extractor based on the specified term. 29 | 30 | :param record: Dictionary. 31 | :param term: Model term. 32 | :return: Extractor. 33 | """ 34 | 35 | if term.startswith('C'): 36 | return LvlExtractor(record, term) 37 | elif '[' in term and ']' in term: 38 | return CatExtractor(record, term) 39 | elif 'Intercept' == term: 40 | return IntExtractor(record, term) 41 | elif '(' in term and ')' in term: 42 | return FunExtractor(record, term) 43 | else: 44 | return ConExtractor(record, term) 45 | 46 | 47 | class Extractor(ABC): 48 | """ 49 | Extractor to get value based on model term. 50 | """ 51 | 52 | def __init__(self, record, term, term_type): 53 | """ 54 | ctor. 55 | 56 | :param: Dictionary. 57 | :term: Model term. 58 | :term_type: Type of term. 59 | :return: None 60 | """ 61 | self._record = record 62 | self._term = term 63 | self._type = term_type 64 | 65 | def __repr__(self): 66 | return f'{self.__class__.__name__}[term={self._term}, type={self._type.name}]' 67 | 68 | @property 69 | @abstractmethod 70 | def value(self): 71 | """ 72 | Gets the extracted value. 73 | """ 74 | pass 75 | 76 | 77 | class CatExtractor(Extractor): 78 | """ 79 | Categorical extractor (no levels). 80 | """ 81 | 82 | def __init__(self, record, term): 83 | """ 84 | ctor. 85 | 86 | :param record: Dictionary. 87 | :param term: Model term. 88 | :return: None. 
89 | """ 90 | super().__init__(record, term, TermEnum.CAT) 91 | 92 | @property 93 | def value(self): 94 | idx = self._term.index('[') 95 | x_name = self._term[0:idx] 96 | 97 | if x_name not in self._record or self._record[x_name] is None: 98 | return None 99 | 100 | lhs = self._term.rindex('[') + 1 101 | rhs = self._term.rindex(']') 102 | x_val = self._term[lhs:rhs] 103 | x_val = x_val.replace('T.', '') 104 | 105 | if self._record[x_name] == x_val: 106 | return 1.0 107 | return 0.0 108 | 109 | 110 | class LvlExtractor(Extractor): 111 | """ 112 | Categorical extractor (with levels). 113 | """ 114 | 115 | def __init__(self, record, term): 116 | """ 117 | ctor. 118 | 119 | :param record: Dictionary. 120 | :param term: Model term. 121 | :return: None. 122 | """ 123 | super().__init__(record, term, TermEnum.LVL) 124 | 125 | @property 126 | def value(self): 127 | lhs = self._term.index('(') + 1 128 | rhs = self._term.index(',') 129 | x_name = self._term[lhs:rhs] 130 | 131 | if x_name not in self._record or self._record[x_name] is None: 132 | return None 133 | 134 | lhs = self._term.rindex('[') + 1 135 | rhs = self._term.rindex(']') 136 | x_val = self._term[lhs:rhs] 137 | x_val = x_val.replace('T.', '') 138 | 139 | if self._record[x_name] == x_val: 140 | return 1.0 141 | return 0.0 142 | 143 | 144 | class ConExtractor(Extractor): 145 | """ 146 | Continuous extractor (no functions). 147 | """ 148 | 149 | def __init__(self, record, term): 150 | """ 151 | ctor. 152 | 153 | :param record: Dictionary. 154 | :param term: Model term. 155 | :return: None. 156 | """ 157 | super().__init__(record, term, TermEnum.CON) 158 | 159 | @property 160 | def value(self): 161 | return self._record[self._term] if self._term in self._record else None 162 | 163 | 164 | class IntExtractor(Extractor): 165 | """ 166 | Intercept extractor. Always returns 1.0. 167 | """ 168 | 169 | def __init__(self, record, term): 170 | """ 171 | ctor. 172 | 173 | :param record: Dictionary. 174 | :param term: Model term. 175 | :return: None. 176 | """ 177 | super().__init__(record, term, TermEnum.INT) 178 | 179 | @property 180 | def value(self): 181 | return 1.0 182 | 183 | 184 | class FunExtractor(Extractor): 185 | """ 186 | Continuous extractor (with functions defined). 187 | """ 188 | 189 | def __init__(self, record, term): 190 | """ 191 | ctor. 192 | 193 | :param record: Dictionary. 194 | :param term: Model term. 195 | :return: None. 196 | """ 197 | super().__init__(record, term, TermEnum.FUN) 198 | 199 | # flake8: noqa: F841 200 | @property 201 | def value(self): 202 | lhs = self._term.rindex('(') + 1 203 | rhs = self._term.index(')') 204 | x_name = self._term[lhs:rhs] 205 | expression = f'{self._term[0:lhs]}val{self._term[rhs:]}' 206 | val = self._record[x_name] if x_name in self._record else None 207 | if pd.isna(val): 208 | return None 209 | 210 | if 'np.' in expression: 211 | np = importlib.import_module('numpy') 212 | v = eval(expression) 213 | 214 | if isinstance(v, np.generic): 215 | v = np.asscalar(v) 216 | return v 217 | 218 | 219 | class InteractionExtractor(object): 220 | """ 221 | Interaction extractor for interaction effects. 222 | """ 223 | 224 | def __init__(self, record, terms): 225 | """ 226 | ctor. 227 | 228 | :param record: Dictionary. 229 | :param terms: Model term (possibly with interaction effects). 230 | :return: None. 
231 | """ 232 | self._terms = terms 233 | extractors = [TermEnum.get_extractor(record, term) for term in terms.split(':')] 234 | values = [e.value for e in extractors] 235 | values = [v for v in values if pd.notna(v)] 236 | 237 | if len(values) != len(extractors): 238 | self.__value = None 239 | else: 240 | self.__value = reduce(lambda a, b: a * b, values) 241 | 242 | def __repr__(self): 243 | return f'{self.__class__.__name__}[terms={self._terms}]' 244 | 245 | @property 246 | def value(self): 247 | return self.__value 248 | -------------------------------------------------------------------------------- /ydot/spark.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from itertools import product 3 | 4 | import pandas as pd 5 | from patsy.highlevel import dmatrices 6 | from pyspark import Row 7 | 8 | from ydot.formula import InteractionExtractor 9 | 10 | 11 | def get_profile(sdf): 12 | """ 13 | Gets the field profiles of the specified Spark dataframe. 14 | 15 | :param sdf: Spark dataframe. 16 | :return: Dictionary. 17 | """ 18 | dtypes = {k: v for k, v in sdf.dtypes} 19 | cat_types = sdf.rdd \ 20 | .map(lambda r: r.asDict()) \ 21 | .flatMap(lambda r: [((k, r[k]), 1) for k, v in dtypes.items() if v == 'string']) \ 22 | .reduceByKey(lambda a, b: a + b) \ 23 | .map(lambda tup: (tup[0][0], {tup[0][1]: tup[1]})) \ 24 | .reduceByKey(lambda a, b: {**a, **b}) \ 25 | .map(lambda tup: (tup[0], [(k, v) for k, v in tup[1].items()])) \ 26 | .map(lambda tup: (tup[0], sorted(tup[1], key=lambda t: (t[1], t[0]), reverse=True))) \ 27 | .map(lambda tup: (tup[0], [t[0] for t in tup[1]])) \ 28 | .collect() 29 | cat_types = {tup[0]: tup[1] for tup in cat_types} 30 | con_types = {k: [1.0] for k, v in dtypes.items() if v != 'string'} 31 | all_types = {**cat_types, **con_types} 32 | return all_types 33 | 34 | 35 | # flake8: noqa: F841 36 | def get_columns(formula, sdf, profile=None): 37 | """ 38 | Gets the expanded columns of the specified Spark dataframe using the specified formula. 39 | 40 | :param formula: Formula (R-like, based on patsy). 41 | :param sdf: Spark dataframe. 42 | :param profile: Profile. Default is `None` and profile will be determined empirically. 43 | :return: Tuple of columns for y, X. 44 | """ 45 | if profile is None: 46 | profile = get_profile(sdf) 47 | 48 | data = product(*(v for _, v in profile.items())) 49 | columns = [k for k, _ in profile.items()] 50 | df = pd.DataFrame(data, columns=columns) 51 | 52 | if 'np.' in formula: 53 | np = importlib.import_module('numpy') 54 | y, X = dmatrices(formula, df, return_type='dataframe') 55 | 56 | return list(y), list(X) 57 | 58 | 59 | def __smatrices(columns, sdf): 60 | """ 61 | Constructs new Spark dataframe based on columns. 62 | 63 | :param columns: Columns generated from patsy. 64 | :param sdf: Spark dataframe. 65 | :return: Spark dataframe. 66 | """ 67 | 68 | def to_record(record): 69 | return Row(**{term: InteractionExtractor(record, term).value for term in columns}) 70 | 71 | return sdf.rdd \ 72 | .map(lambda r: to_record(r.asDict())) \ 73 | .toDF() 74 | 75 | 76 | def smatrices(formula, sdf, profile=None): 77 | """ 78 | Gets tuple of design/model matrices. 79 | 80 | :param formula: Formula. 81 | :param sdf: Spark dataframe. 82 | :param profile: Dictionary of data profile. 83 | :return: y, X Spark dataframes. 
84 | """ 85 | y_cols, X_cols = get_columns(formula, sdf, profile=profile) 86 | X = __smatrices(X_cols, sdf) 87 | y = __smatrices(y_cols, sdf) 88 | 89 | return y, X 90 | --------------------------------------------------------------------------------