├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── CONTRIBUTIONS.md ├── LEGAL.txt ├── LICENSE ├── NOTICE ├── README.md ├── cmake ├── LinuxUtils.cmake ├── OSXUtils.cmake └── Utils.cmake ├── deploy ├── CMakeLists.txt ├── PGXN │ ├── CMakeLists.txt │ ├── META.json.in │ ├── generate_package.sh.in │ └── zipignore.in ├── PackageMaker │ ├── CMakeLists.txt │ └── Welcome.html ├── RPM │ └── CMakeLists.txt ├── description.txt ├── gppkg │ ├── CMakeLists.txt │ ├── gppkg_spec.yml.in │ └── pdltools.spec.in ├── hawq_install ├── pdltools.spec.in ├── postflight.sh ├── preflight.sh └── rpm_post.sh ├── doc ├── CMakeLists.txt ├── bin │ ├── CMakeLists.txt │ ├── doxypy.py │ ├── py_filter.sh.in │ └── sql_filter.sh.in ├── changelog.dox.in ├── etc │ ├── CMakeLists.txt │ ├── DoxygenLayout.xml │ ├── SQLCommon.m4_in │ ├── header.html │ ├── pdltools_extra.css │ └── user.doxyfile.in ├── gettingstarted.dox.in ├── imgs │ ├── pdl.png │ ├── pdltools_pull_request_travis_success.png │ ├── pdltools_sample_pull_request_1.png │ ├── pdltools_sample_pull_request_2.png │ └── pdltools_sample_travis.png ├── installpage.dox.in ├── mainpage.dox.in └── src │ ├── sql.ll │ └── sql.yy ├── license ├── PDLTools.txt └── third_party │ └── uriparser-0.7.9.txt ├── src ├── CMakeLists.txt ├── bin │ ├── CMakeLists.txt │ └── pdlpack ├── config │ ├── CMakeLists.txt │ ├── Modules.yml │ ├── Ports.yml │ ├── SUgAR_version.yml │ └── Version.yml ├── modules │ ├── common │ │ └── common.c │ ├── complex │ │ ├── complex_type.c │ │ ├── complex_type.h │ │ └── float_utils.h │ ├── edit_distance │ │ └── edit_distance.c │ ├── nlp │ │ └── PorterStemmer.c │ ├── sampling │ │ └── sampling.c │ └── uri_utils │ │ └── uri_utils.c ├── pdlpack │ ├── CMakeLists.txt │ ├── __init__.py │ ├── argparse.py │ ├── argparse.pyc │ ├── configyml.py │ ├── configyml.pyc │ ├── pdlpack.py │ └── plpy.py ├── ports │ ├── CMakeLists.txt │ ├── greenplum │ │ ├── 4.2 │ │ │ └── CMakeLists.txt │ │ ├── 4.3 │ │ │ └── CMakeLists.txt │ │ ├── 4.3ORCA │ │ │ └── CMakeLists.txt │ │ 
├── CMakeLists.txt │ │ ├── cmake │ │ │ ├── FindGreenplum.cmake │ │ │ ├── FindGreenplum_4_2.cmake │ │ │ ├── FindGreenplum_4_3.cmake │ │ │ ├── FindGreenplum_4_3ORCA.cmake │ │ │ ├── FindPostgreSQL.cmake │ │ │ ├── GreenplumUtils.cmake │ │ │ └── PostgreSQLUtils.cmake │ │ └── modules │ │ │ ├── anonymization │ │ │ ├── anonymization.content │ │ │ ├── anonymization.sql_in │ │ │ ├── anonymization.yml │ │ │ └── test │ │ │ │ └── test_anonymization.sql_in │ │ │ ├── balance_dataset │ │ │ ├── balance_dataset.content │ │ │ ├── balance_dataset.sql_in │ │ │ ├── balance_dataset.yml │ │ │ └── test │ │ │ │ └── test_balance_dataset.sql_in │ │ │ ├── common │ │ │ ├── common.content │ │ │ ├── common.yml │ │ │ └── udfs.sql_in │ │ │ ├── complete_linkage │ │ │ ├── complete_linkage.content │ │ │ ├── complete_linkage.sql_in │ │ │ ├── complete_linkage.yml │ │ │ └── test │ │ │ │ └── test_complete_linkage.sql_in │ │ │ ├── complex │ │ │ ├── complex.content │ │ │ ├── complex.yml │ │ │ ├── complex_type.sql_in │ │ │ └── test │ │ │ │ └── test_complex_type.sql_in │ │ │ ├── edit_distance │ │ │ ├── edit_distance.content │ │ │ ├── edit_distance.sql_in │ │ │ ├── edit_distance.yml │ │ │ └── test │ │ │ │ └── test_edit_distance.sql_in │ │ │ ├── generic_utilities │ │ │ ├── array_utilities.sql_in │ │ │ ├── generic_utilities.content │ │ │ ├── generic_utilities.sql_in │ │ │ ├── generic_utilities.yml │ │ │ └── test │ │ │ │ ├── test_array_utilities.sql_in │ │ │ │ └── test_generic_utilities.sql_in │ │ │ ├── grid_search │ │ │ ├── grid_search.content │ │ │ ├── grid_search.sql_in │ │ │ ├── grid_search.yml │ │ │ └── test │ │ │ │ └── test_grid_search.sql_in │ │ │ ├── hits │ │ │ ├── hits.content │ │ │ ├── hits.sql_in │ │ │ ├── hits.yml │ │ │ └── test │ │ │ │ └── test_hits.sql_in │ │ │ ├── kd_tree │ │ │ ├── kd_tree.content │ │ │ ├── kd_tree.sql_in │ │ │ ├── kd_tree.yml │ │ │ └── test │ │ │ │ └── test_kd_tree.sql_in │ │ │ ├── normalized_cut │ │ │ ├── normalized_cut.content │ │ │ ├── normalized_cut.sql_in │ │ │ ├── 
normalized_cut.yml │ │ │ └── test │ │ │ │ └── test_normalized_cut.sql_in │ │ │ ├── one_vs_rest │ │ │ ├── one_vs_rest.content │ │ │ ├── one_vs_rest.yml │ │ │ ├── one_vs_rest_data_prep.sql_in │ │ │ └── test │ │ │ │ └── test_one_vs_rest_data_prep.sql_in │ │ │ ├── pagerank │ │ │ ├── pagerank.content │ │ │ ├── pagerank.sql_in │ │ │ ├── pagerank.yml │ │ │ └── test │ │ │ │ └── test_pagerank.sql_in │ │ │ ├── plr_placeholder │ │ │ ├── plr_placeholder.yml │ │ │ ├── plr_sample.sql_in │ │ │ └── test │ │ │ │ └── test_plr_sample.sql_in │ │ │ ├── prediction_metrics │ │ │ ├── prediction_metrics.content │ │ │ ├── prediction_metrics.sql_in │ │ │ ├── prediction_metrics.yml │ │ │ └── test │ │ │ │ └── test_prediction_metrics.sql_in │ │ │ ├── sampling │ │ │ ├── sampling.content │ │ │ ├── sampling.sql_in │ │ │ ├── sampling.yml │ │ │ └── test │ │ │ │ └── test_sampling.sql_in │ │ │ ├── session │ │ │ ├── session.content │ │ │ ├── session.sql_in │ │ │ ├── session.yml │ │ │ └── test │ │ │ │ └── test_session.sql_in │ │ │ ├── stemming │ │ │ ├── porter_stemmer.sql_in │ │ │ ├── stemming.content │ │ │ ├── stemming.yml │ │ │ └── test │ │ │ │ └── test_porter_stemmer.sql_in │ │ │ ├── sugar │ │ │ ├── sugar.content │ │ │ ├── sugar.sql_in │ │ │ ├── sugar.yml │ │ │ └── test │ │ │ │ └── test_sugar.sql_in │ │ │ └── uri_utils │ │ │ ├── test │ │ │ └── test_uri_utils.sql_in │ │ │ ├── uri_utils.content │ │ │ ├── uri_utils.sql_in │ │ │ └── uri_utils.yml │ └── hawq │ │ ├── 1.2 │ │ ├── CMakeLists.txt │ │ └── config │ │ │ ├── CMakeLists.txt │ │ │ └── Modules.yml │ │ ├── 1.3 │ │ ├── CMakeLists.txt │ │ └── config │ │ │ ├── CMakeLists.txt │ │ │ └── Modules.yml │ │ ├── 2.0 │ │ ├── CMakeLists.txt │ │ └── config │ │ │ ├── CMakeLists.txt │ │ │ └── Modules.yml │ │ ├── CMakeLists.txt │ │ └── cmake │ │ ├── FindHAWQ.cmake │ │ ├── FindHAWQ_1_2.cmake │ │ ├── FindHAWQ_1_3.cmake │ │ ├── FindHAWQ_2_0.cmake │ │ └── HAWQUtils.cmake └── utils │ ├── __init__.py │ ├── argparse.py │ ├── argparse.pyc │ └── lib_contents.py └── tutorials 
└── PDLtools_tutorial.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore build directory 2 | /build* 3 | 4 | # Ignore generated code files 5 | *.so 6 | *.o 7 | *.pyc 8 | 9 | # Meta data of Mac OS X's Finder.app 10 | .DS_Store 11 | 12 | # LaTeX temporary files 13 | *.aux 14 | *.bbl 15 | *.blg 16 | *.log 17 | *.out 18 | *.synctex.gz 19 | *.toc 20 | *.thm 21 | auto 22 | _region_.tex 23 | auto 24 | *.swp 25 | *.fdb_latexmk 26 | *.swo # vim swap file 27 | 28 | # Biblatex temporary files 29 | *-blx.bib 30 | *.run.xml 31 | 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | sudo: true 3 | git: 4 | submodules: false 5 | compiler: 6 | - gcc 7 | python: 8 | - '2.6' 9 | addons: 10 | artifacts: true 11 | services: 12 | - postgresql 13 | install: 14 | - pip install --user setuptools 15 | - mkdir -p ${TRAVIS_BUILD_DIR}/tools 16 | - cd ${TRAVIS_BUILD_DIR}/tools 17 | #- sudo add-apt-repository -y ppa:texlive-backports/ppa 18 | #- sudo apt-get -y update 19 | #- sudo apt-get -y install texlive-full 20 | - wget http://ftp.stack.nl/pub/users/dimitri/doxygen-1.8.7.linux.bin.tar.gz 21 | - tar -zxvf doxygen-1.8.7.linux.bin.tar.gz -C ${TRAVIS_BUILD_DIR}/tools 22 | - sudo apt-get install -y rpm 23 | before_script: 24 | - export PATH=${TRAVIS_BUILD_DIR}/tools/bin:${TRAVIS_BUILD_DIR}/tools/doxygen-1.8.7/bin:$PATH 25 | script: 26 | - cd ${TRAVIS_BUILD_DIR} 27 | - mkdir build 28 | - cd build 29 | - cmake .. 
30 | - make 31 | - make package 32 | - make gppkg 33 | - make doc 34 | #- cd ${TRAVIS_BUILD_DIR}/doc/user/latex 35 | #- make pdf 36 | notifications: 37 | email: 38 | recipients: 39 | - sramanujam@pivotal.io 40 | on_success: change 41 | on_failure: always 42 | deploy: 43 | provider: releases 44 | api_key: 45 | secure: fYH3dUdIIjAydnbXyVzx6NwdWy7s0ZKiAdP5cBX4QK4bQK4JjHuBVCX8YRnH2lKdqgOeOHfRykvDWOaZyi0QOR62o4/j+5C/w5G883GW+7iN1UE+eWPw8jZU3ZMhRvGkFzvP9CHQfv0S+pRqK5lTF9bEn68vq+jcYG1dkqI+D1i17xt77Q1vbTN/oZfMwVMLaT5Cf4h2jdv9eRoTzx509peClrHo57O+vDRndVGhTDVQBgMHjoKUy86oNcVCp2JHYnvVrprKC6fhdyjr4SaCb6ZDinhGHK0q+UGL1IQw3BsQNHUGwftriT/uR2qBw352YdvpqXRW0jUg027Ov4dJUjcG/Eji4TE+9LbieAXLCEE5JPoxos47JU1nISReLmg2DUWbDGvh+TWiF25PiCL3xkmqAvqQydnIHw3ceS1HaiXC0lYp4uapXeawBGmaRLFCS7k8q50VaWLakwVodJtfhs/jkqO9ei4WV4kzf2MGbCXtXFKtgqo8qrcZCo1VhkaR+3AQsPIySO36otVteW2wXeykMVX+7ya6U5GC4BHWazh7yD6uAzVgkGu/yttuyTzeh5qrSmnM6blERrHjsAssf+Mtra7+mXW728mM8RMDwTMU+whlIUvnFxc16mBEV96cDm6MsmnQ1EByVRQoRxFWp/LOZ4mK+2aAyWkKlIrpLu0= 46 | file_glob: true 47 | file: 48 | - "pdltools-*-Linux.rpm" 49 | on: 50 | repo: pivotalsoftware/PDLTools 51 | tags: true 52 | all_branches: true 53 | skip_cleanup: true 54 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ====================================================================== 2 | # -- CMake setup 3 | # ====================================================================== 4 | 5 | project(PDLTools) 6 | 7 | cmake_minimum_required(VERSION 2.8.4 FATAL_ERROR) 8 | 9 | include(ExternalProject) 10 | 11 | # ====================================================================== 12 | # -- Local definitions (filenames, paths, etc.) 
13 | # ====================================================================== 14 | 15 | # The default PDL Tools root directory should be "/usr/local/pdltools" and not 16 | # "/usr/local" (which is the CMake default) 17 | 18 | if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) 19 | set(CMAKE_INSTALL_PREFIX "/usr/local/pdltools" CACHE PATH 20 | "Install path prefix, prepended onto install directories." FORCE 21 | ) 22 | endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) 23 | 24 | set(PDLTOOLS_VERSION_YML ${CMAKE_CURRENT_SOURCE_DIR}/src/config/Version.yml) 25 | 26 | set(PDLTOOLS_THIRD_PARTY ${CMAKE_BINARY_DIR}/third_party) 27 | 28 | # Set the directory for tools needed during build time 29 | set(PDLTOOLS_BUILD_TOOLS ${CMAKE_CURRENT_SOURCE_DIR}/cmake) 30 | 31 | # Read and parse Version.yml file 32 | message (" ------------------------------------------------------------") 33 | message (" PDLTOOLS VERSION INFO (${PDLTOOLS_VERSION_YML})") 34 | 35 | file(READ "${PDLTOOLS_VERSION_YML}" _PDLTOOLS_VERSION_CONTENTS) 36 | string(REGEX REPLACE "^.*version:[ \t]*([^\n]*)\n.*" "\\1" PDLTOOLS_VERSION_STRING "${_PDLTOOLS_VERSION_CONTENTS}") 37 | string(REGEX REPLACE "([0-9]+).*" "\\1" PDLTOOLS_VERSION_MAJOR "${PDLTOOLS_VERSION_STRING}") 38 | string(REGEX REPLACE "[0-9]+\\.([0-9]+).*" "\\1" PDLTOOLS_VERSION_MINOR "${PDLTOOLS_VERSION_STRING}") 39 | 40 | if("${PDLTOOLS_VERSION_STRING}" MATCHES "[0-9]+\\.[0-9]+\\.([0-9]+).*") 41 | string(REGEX REPLACE "[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" PDLTOOLS_VERSION_PATCH "${PDLTOOLS_VERSION_STRING}") 42 | else() 43 | set(PDLTOOLS_VERSION_PATCH 0) 44 | endif() 45 | 46 | message (" PDLTOOLS_VERSION_STRING .. : ${PDLTOOLS_VERSION_STRING}") 47 | message (" PDLTOOLS_VERSION_MAJOR ... : ${PDLTOOLS_VERSION_MAJOR}") 48 | message (" PDLTOOLS_VERSION_MINOR ... : ${PDLTOOLS_VERSION_MINOR}") 49 | message (" PDLTOOLS_VERSION_PATCH ... 
: ${PDLTOOLS_VERSION_PATCH}") 50 | message (" ------------------------------------------------------------") 51 | 52 | 53 | # ------------------------------------------------------------------ 54 | # Find M4 binary (we need this for doxygen docs pre-processing) 55 | # ------------------------------------------------------------------ 56 | if(SOLARIS) 57 | # Solaris ships GNU m4 as gm4, so we want to use that 58 | find_program(M4_BINARY gm4 59 | PATHS /usr/sfw/bin 60 | DOC "Path to the GNU m4 preprocessor." 61 | ) 62 | else() 63 | find_program(M4_BINARY m4 64 | PATHS /usr/local/bin /usr/bin /bin /opt/local/bin 65 | DOC "Path to the GNU m4 preprocessor." 66 | ) 67 | endif() 68 | 69 | if(NOT M4_BINARY) 70 | message(FATAL_ERROR "Cannot find the m4 preprocessor.") 71 | endif(NOT M4_BINARY) 72 | 73 | 74 | 75 | # =============================================================== 76 | # -- Install Read-me files and license directory 77 | # =============================================================== 78 | 79 | install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/license" 80 | DESTINATION . 
81 | COMPONENT core 82 | PATTERN ".DS_Store" EXCLUDE 83 | ) 84 | install( 85 | FILES 86 | "${CMAKE_CURRENT_SOURCE_DIR}/README.md" 87 | DESTINATION doc 88 | COMPONENT core 89 | ) 90 | 91 | # ====================================================================== 92 | # -- Local includes 93 | # ====================================================================== 94 | 95 | list(APPEND CMAKE_MODULE_PATH 96 | "${PDLTOOLS_BUILD_TOOLS}") 97 | 98 | include(Utils) 99 | include(LinuxUtils) 100 | include(OSXUtils) 101 | 102 | if(CMAKE_COMPILER_IS_GNUCC) 103 | # Let's store the gcc version in a variable 104 | execute_process( 105 | COMMAND ${CMAKE_C_COMPILER} -dumpversion 106 | OUTPUT_VARIABLE GNUCC_VERSION 107 | OUTPUT_STRIP_TRAILING_WHITESPACE) 108 | 109 | # A useful summary of warning options can be found here: 110 | # http://developer.apple.com/tools/xcode/compilercodewarnings.html 111 | # Note: gcc does not implicitly set _POSIX_C_SOURCE or _XOPEN_SOURCE 112 | # when using -std=c99. 113 | # http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_02_01_01 114 | # We specify that we are POSIX.1-2001 compliant and XSI-conforming. We only 115 | # need to specify _XOPEN_SOURCE as _POSIX_C_SOURCE will be set implicitly. 116 | set(CMAKE_C_FLAGS "-std=c99 -pedantic -Wall -Wextra -D_XOPEN_SOURCE=600" 117 | CACHE STRING 118 | "Flags used by the compiler during all build types." 
FORCE) 119 | endif() 120 | 121 | # ====================================================================== 122 | # -- Add subdirectories 123 | # ====================================================================== 124 | 125 | add_subdirectory(src) 126 | add_subdirectory(deploy) 127 | add_subdirectory(doc) 128 | 129 | # ====================================================================== 130 | # -- Install path for specific pdltools version 131 | # ====================================================================== 132 | 133 | set(CMAKE_PDLTOOLS_ROOT "${CMAKE_INSTALL_PREFIX}") 134 | set(CMAKE_INSTALL_PREFIX "${CMAKE_PDLTOOLS_ROOT}/Versions/${PDLTOOLS_VERSION_STRING}") 135 | 136 | # Create symlink doc 137 | install(CODE " 138 | EXECUTE_PROCESS(COMMAND ln -nsf 139 | ${CMAKE_PDLTOOLS_ROOT}/Current/doc 140 | ${CMAKE_PDLTOOLS_ROOT}/doc 141 | ) 142 | ") 143 | 144 | -------------------------------------------------------------------------------- /CONTRIBUTIONS.md: -------------------------------------------------------------------------------- 1 | Contributing to PDLTools 2 | ========================= 3 | If you're a Pivotal employee and would like to contribute to PDLTools, this guide is for you. Following these step-by-step instructions you should be able to easily add your module to PDLTools. 4 | 5 | 1. Since you may not have push access to the master repo, fork the base repo [pivotalsoftware/PDLTools](https://github.com/pivotalsoftware/PDLTools), into your own account on GitHub. 6 | 2. Clone your forked repo into a VM. You can download the GPDB sandbox VM here: [GPDB Sandbox](https://network.pivotal.io/products/pivotal-gpdb#/releases/567/file_groups/337). Make sure you create an account on [PivNet](http://network.pivotal.io). You can get the latest GPDB sandbox VMs by going directly to [greenplum.org](http://greenplum.org) 7 | 3. Create a branch to keep track of your contribution: `git checkout -b my_contribution` 8 | 4. 
Look at one of the more recent contributions such as [kd-tree](https://github.com/pivotalsoftware/PDLTools/pull/11/commits/84dcf00b72c5d4a9f11b299d7fa8b3d3b02010c7) to get an idea of all the files you'll have to touch to include your contribution. You can also look at the [sample_contribution_kl_divergence](https://github.com/pivotalsoftware/PDLTools/commit/9a0980a1b2b64a1a04c7ecfa76b233273779d191) commit to get a high level idea of what a contribution entails. Your contribution should include unit tests to validate the functionalities in your module. Also ensure your contribution is well documented. You can navigate to the `$BUILD/doc/user/html/index.html` or `$BUILD/doc/user/latex/refman.pdf` files in your local repo to check if the documentation for your contribution is appearing as expected on Doxygen docs. 9 | 5. Commit your changes to your branch (ex: `my_contribution`) on your GitHub account. 10 | 6. Submit a pull-request from the branch you created on your head fork (ex: vatsan/PDLTools) to the same branch on the basefork (pivotalsoftware/PDLTools). 11 | ![pdltools_sample_pull_request_1](https://github.com/pivotalsoftware/PDLTools/blob/master/doc/imgs/pdltools_sample_pull_request_1.png) 12 | ![pdltools_sample_pull_request_2](https://github.com/pivotalsoftware/PDLTools/blob/master/doc/imgs/pdltools_sample_pull_request_2.png) 13 | 7. This will automatically trigger a Travis CI build. If your contribution had no errors, you should see something like the following on CI. 14 | ![Travis-CI success](https://github.com/pivotalsoftware/PDLTools/blob/master/doc/imgs/pdltools_sample_travis.png) 15 | The committers to PDLTools will see the following: 16 | ![Travis-CI success committer view](https://github.com/pivotalsoftware/PDLTools/blob/master/doc/imgs/pdltools_pull_request_travis_success.png) 17 | 8. 
The committers to pivotalsoftware/PDLTools will then merge your contribution to the base fork and voila, you should be able to see your contribution on [PDLTools User Docs](http://pivotalsoftware.github.io/PDLTools/). When a release is eventually created off the main branch, the installers for that release will contain your module. 18 | 19 | Creating Releases 20 | ================= 21 | If you're a `committer` on the basefork [pivotalsoftware/PDLTools](https://github.com/pivotalsoftware/PDLTools) and would like to create a new release of PDLTools, these instructions are for you. 22 | 23 | The [.travis.yml](https://github.com/pivotalsoftware/PDLTools/blob/master/.travis.yml) file is already setup to instruct Travis-CI to automatically upload the relevant artifacts that were generated (ex: a tarball of the source, rpms/gppkgs, HTML docs, PDF docs etc.) to a releases branch on the basefork on GitHub. The set of all available releases can be viewed at [PDLTools Releases](https://github.com/pivotalsoftware/PDLTools/releases). Currently (as of Mar-2016), Travis CI will only generate a tarball of the source. PDF doc generation, rpm generation & gppkg generation are TBD. While the generation of those artifacts is under construction, we encourage you to manually create them on your VM and attach them to the generated release. 24 | 25 | When you are ready to create a new release of PDLTools, do the following: 26 | 27 | 1. Create a branch for your release: `git checkout -b 1.6` 28 | 2. Update the relevant files like `$PDLTOOLS_HOME/src/config/Version.yml` and `$PDLTOOLS_HOME/doc/changelog.dox.in` to indicate the latest release, commit your changes and push the new branch to the basefork. 29 | 3. Create a tag for the latest release. Travis-CI will use this to automatically upload the generated artifacts against this release: `git tag -a v1.6 -m '1.6 release'`, `git push origin —tags`. 30 | 4. 
You should be able to see your release automatically at [pivotalsoftware/PDLTools: releases](https://github.com/pivotalsoftware/PDLTools/releases). Feel free to edit it, annotate it, uploaded any other binaries which weren't generated automatically by Travis (ex: gppkg, PDF docs etc.) and let users know that the latest release is available for use. 31 | -------------------------------------------------------------------------------- /LEGAL.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013-2016 Pivotal Software, Inc. All rights reserved. 2 | 3 | Unauthorized use, copying or distribution of this source code via any 4 | medium is strictly prohibited without the express written consent of 5 | Pivotal Software, Inc. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 8 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 9 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 10 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 11 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 12 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 13 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Redistribution and use in source and binary forms, with or without 2 | modification, are permitted provided that the following conditions are 3 | met: 4 | 5 | 1. Redistributions of source code must retain the above copyright 6 | notice, this list of conditions and the following disclaimer. 7 | 8 | 2. Redistributions in binary form must reproduce the above 9 | copyright notice, this list of conditions and the following 10 | disclaimer in the documentation and/or other materials provided 11 | with the distribution. 
12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 14 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 15 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 16 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 17 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 18 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 19 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2022 VMware, Inc. 2 | 3 | This product is licensed to you under the BSD 2 clause (the "License"). You may not use this product except in compliance with the License. 4 | 5 | This product may include a number of subcomponents with separate copyright notices and license terms. Your use of these subcomponents is subject to the terms and conditions of the subcomponent's license, as noted in the LICENSE file. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PDL Tools 2 | ========= 3 | 4 | PDL Tools is a library of reusable tools used and developed by the Pivotal Data Science and Data Engineering teams. 
5 | 6 | [![Build Status](https://travis-ci.org/pivotalsoftware/PDLTools.svg?branch=master)](https://travis-ci.org/pivotalsoftware/PDLTools) 7 | 8 | Usage docs 9 | ============ 10 | 11 | http://pivotalsoftware.github.io/PDLTools/ 12 | 13 | Binaries (Pivotal internal) 14 | ============================ 15 | 16 | [PDLTools binaries](https://drive.google.com/a/pivotal.io/folderview?id=0B43lMs8oQk7xcGJqdlN6SElWOTQ&usp=sharing) 17 | 18 | 19 | Pre-requisites 20 | =============== 21 | 22 | The following are the pre-requisites for building PDLTools: 23 | 24 | Required: 25 | * Pivotal Greenplum or Apache HAWQ ([GPDB sandbox](https://network.pivotal.io/products/pivotal-gpdb), [HAWQ sandbox](https://network.pivotal.io/products/pivotal-hdb)) 26 | * Apache MADlib ([Download](http://madlib.incubator.apache.org/download.html)) 27 | * cmake (3.5 recommended) 28 | * GNU C and C++ compilers (gcc, g++) 29 | * Flex (>= 2.5.33) 30 | * Bison (>= 2.4) 31 | * rpmbuild 32 | 33 | Optional: 34 | * Doxygen (1.8.7 recommended, if generating HTML docs), 35 | * LaTeX (if generating PDF docs) 36 | 37 | 38 | Building 39 | ========= 40 | 41 | For CentOS or Red Hat Enterprise Linux, install the pre-requisite tools: 42 | 43 | sudo yum install cmake gcc gcc-c++ flex bison rpm-build 44 | 45 | From either the Greenplum or HAWQ master node, follow these steps as the `gpadmin` user: 46 | 47 | curl -L -o pdltools-1.7.zip https://github.com/pivotalsoftware/PDLTools/archive/v1.7.zip 48 | unzip pdltools-1.7.zip 49 | cd PDLTools-1.7 50 | source /usr/local/hawq/greenplum_path.sh 51 | mkdir build 52 | cd build 53 | cmake .. 
-DRPM_INSTALL_PREFIX=$GPHOME 54 | curl -L -o third_party/downloads/uriparser-0.7.9.tar.bz2 https://sourceforge.net/projects/uriparser/files/uriparser-0.7.9.tar.bz2 55 | curl -L -o third_party/downloads/cpptest-1.1.2.tar.gz https://sourceforge.net/projects/cpptest/files/cpptest-1.1.2.tar.gz 56 | make 2> /dev/null 57 | 58 | Generating Doxygen User Docs 59 | ============================= 60 | 61 | You can generate Doxygen docs for the project as follows: 62 | 63 | make doc 64 | 65 | This will create the user docs under $BUILD/doc/user/html. 66 | You can also generate a PDF of the user doc by running 67 | 68 | cd build/doc/user/latex && make pdf 69 | 70 | This will generate a PDF titled `refman.pdf` in $PDLTOOLS/build/doc/user/latex 71 | 72 | 73 | Packaging 74 | ========== 75 | 76 | To create an rpm package which you can ship for installation into other machines, run the following (from the build directory): 77 | 78 | make package 79 | 80 | When installing on a cluster, it is best to create a gppkg installer. Run the following (from the build directory): 81 | 82 | make gppkg 83 | 84 | Installation 85 | ============= 86 | 87 | Installation is a two-step process. First, you will have to install MADlib _and_ PDL Tools on either the Greenplum or HAWQ master node. 88 | To do this, you will run the following: 89 | 90 | gppkg -i 91 | 92 | For example, run the following (from the build directory): 93 | 94 | gppkg -i deploy/gppkg/2.0/pdltools-1.7-hawq2.0-rhel5-x86_64.gppkg 95 | 96 | This will place all the relevant binaries & SQL files at the appropriate location (usually `$GPHOME/pdltools`). 97 | Next, you will have to install the SQL UDFs in the target database. 
98 | 99 | To install pdltools into a database of your choice, run the following (consider adding `pdlpack` in your PATH): 100 | 101 | $GPHOME/pdltools/bin/pdlpack install [-s ] [-S ] [-M ] -c @:/ 102 | 103 | For example: 104 | 105 | $GPHOME/pdltools/bin/pdlpack install -s pdltools -c gpadmin@mdw:5432/testdb 106 | 107 | The default schemas are `pdltools` for the main schema, `sugarlib` for SUgAR and `madlib` to search for MADlib objects. 108 | 109 | Running Install Check Tests 110 | ============================= 111 | 112 | Post installation, you can run the unit tests in PDL Tools with the install-check command like so: 113 | 114 | $GPHOME/pdltools/bin/pdlpack install-check -s pdltools -c gpadmin@mdw:5432/testdb 115 | 116 | Parameters for `install-check` are the same as parameters for `install`. 117 | If any of the tests fail, you will see an error message displayed on your console. 118 | 119 | Contributing to PDLTools 120 | ======================== 121 | 122 | If you're interested in contributing to PDLTools, please refer to the instructions at [Guidelines for contributing to PDLTools](https://github.com/pivotalsoftware/PDLTools/blob/master/CONTRIBUTIONS.md) 123 | 124 | Legal 125 | ====== 126 | Copyright (c) 2013-2016 Pivotal Software, Inc. All rights reserved. 127 | 128 | Unauthorized use, copying or distribution of this source code via any 129 | medium is strictly prohibited without the express written consent of 130 | Pivotal Software, Inc. 131 | 132 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 133 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 134 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 135 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 136 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 137 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 138 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
139 | -------------------------------------------------------------------------------- /cmake/LinuxUtils.cmake: -------------------------------------------------------------------------------- 1 | # Get the RedHat/CentOS version 2 | macro(rh_version OUT_VERSION) 3 | if(EXISTS "/etc/redhat-release") 4 | file(READ "/etc/redhat-release" _REDHAT_RELEASE_CONTENT) 5 | string(REGEX REPLACE "[^0-9.]*([0-9.]+)[^0-9.]*\$" "\\1" ${OUT_VERSION} 6 | "${_REDHAT_RELEASE_CONTENT}" 7 | ) 8 | else(EXISTS "/etc/redhat-release") 9 | set(${OUT_VERSION} "${OUT_VERSION}-NOTFOUND") 10 | endif(EXISTS "/etc/redhat-release") 11 | endmacro(rh_version) 12 | -------------------------------------------------------------------------------- /cmake/OSXUtils.cmake: -------------------------------------------------------------------------------- 1 | # Get the architectures in a Mac OS X binary 2 | macro(osx_archs FILENAME OUT_ARCHS) 3 | execute_process( 4 | COMMAND /usr/bin/lipo -info ${FILENAME} 5 | OUTPUT_VARIABLE _LIPO_OUTPUT) 6 | string(REPLACE "\n" "" _LIPO_OUTPUT ${_LIPO_OUTPUT}) 7 | string(REGEX REPLACE ".*:[ ]*([^ ].*[^ ])[ ]*\$" "\\1" ${OUT_ARCHS} "${_LIPO_OUTPUT}") 8 | string(REPLACE " " ";" ${OUT_ARCHS} ${${OUT_ARCHS}}) 9 | endmacro(osx_archs) 10 | -------------------------------------------------------------------------------- /deploy/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Packaging 3 | # ------------------------------------------------------------------------------ 4 | 5 | # -- Define which package generators to use, depending on the current 6 | # platform ------------------------------------------------------------------ 7 | 8 | if(APPLE) 9 | list(APPEND CPACK_GENERATOR 10 | PackageMaker 11 | ) 12 | elseif(UNIX) 13 | list(APPEND CPACK_GENERATOR 14 | RPM 15 | ) 16 | endif() 17 | 18 | 19 | # -- General settings for all/multiple packages 
generators --------------------- 20 | 21 | if(PACKAGE_SUFFIX) 22 | set(_PACKAGE_SUFFIX "-${PACKAGE_SUFFIX}") 23 | else(PACKAGE_SUFFIX) 24 | set(_PACKAGE_SUFFIX "") 25 | endif(PACKAGE_SUFFIX) 26 | 27 | set(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/description.txt") 28 | set(CPACK_PACKAGE_DESCRIPTION_SUMMARY 29 | "PDL Tools Library.") 30 | set(CPACK_PACKAGE_FILE_NAME 31 | "pdltools${_PACKAGE_SUFFIX}-${PDLTOOLS_VERSION_STRING}-${CMAKE_SYSTEM_NAME}") 32 | set(CPACK_PACKAGE_INSTALL_DIRECTORY "pdltools") 33 | set(CPACK_PACKAGE_NAME "PDLTools${_PACKAGE_SUFFIX}") 34 | set(CPACK_PACKAGE_VENDOR "PDLTools") 35 | set(CPACK_PACKAGE_VERSION ${PDLTOOLS_VERSION_STRING}) 36 | set(CPACK_PACKAGE_VERSION_MAJOR ${PDLTOOLS_VERSION_MAJOR}) 37 | set(CPACK_PACKAGE_VERSION_MINOR ${PDLTOOLS_VERSION_MINOR}) 38 | set(CPACK_PACKAGE_VERSION_PATCH ${PDLTOOLS_VERSION_PATCH}) 39 | 40 | # CPACK_PACKAGING_INSTALL_PREFIX has to be set in the generator-specific 41 | # section! 42 | 43 | set(CPACK_RPM_USER_BINARY_SPECFILE ${CMAKE_CURRENT_SOURCE_DIR}/pdltools.spec.in) 44 | 45 | # -- Set settings for specific package generators ------------------------------ 46 | 47 | add_subdirectory(PackageMaker) 48 | add_subdirectory(PGXN) 49 | add_subdirectory(RPM) 50 | # gppkg depends on macros from RPM! 51 | add_subdirectory(gppkg) 52 | 53 | 54 | # -- Finally do the packaging! ------------------------------------------------- 55 | 56 | set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/rpm_post.sh") 57 | set(CPACK_PREFLIGHT_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/preflight.sh) 58 | set(CPACK_POSTFLIGHT_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/postflight.sh) 59 | set(CPACK_MONOLITHIC_INSTALL 1) 60 | include(CPack) 61 | 62 | 63 | # -- We can now use CPack commands to do customization ------------------------- 64 | 65 | cpack_add_component(doc 66 | DISPLAY_NAME Documentation 67 | DESCRIPTION "API reference and documentation (generated with Doxygen)." 
68 | ) 69 | 70 | cpack_add_component(core 71 | DISPLAY_NAME "PDL Tools Core" 72 | DESCRIPTION "DBMS-independent files installed with every PDL Tools installation." 73 | REQUIRED 74 | ) 75 | 76 | cpack_add_component_group(ports 77 | DISPLAY_NAME "DBMS-Specific Components" 78 | DESCRIPTION "DBMS-specific files and libraries." 79 | EXPANDED 80 | ) 81 | 82 | file(GLOB PORT_COMPONENTS "${CMAKE_CURRENT_BINARY_DIR}/Component_*.cmake") 83 | foreach(PORT_COMPONENT ${PORT_COMPONENTS}) 84 | include("${PORT_COMPONENT}") 85 | endforeach(PORT_COMPONENT) 86 | -------------------------------------------------------------------------------- /deploy/PGXN/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Packaging for the PostgreSQL Extension Network (PGXN), http://pgxn.org 3 | # ------------------------------------------------------------------------------ 4 | 5 | set(PDLTOOLS_PGXN_RELEASE_NUMBER 1) 6 | set(PDLTOOLS_PGXN_VERSION_STR 7 | "${PDLTOOLS_VERSION_MAJOR}.${PDLTOOLS_VERSION_MINOR}.${PDLTOOLS_VERSION_PATCH}release${PDLTOOLS_PGXN_RELEASE_NUMBER}") 8 | set(PDLTOOLS_PGXN_NAME "pdltools-pgxn-${PDLTOOLS_PGXN_VERSION_STR}") 9 | 10 | configure_file(META.json.in META.json) 11 | configure_file(generate_package.sh.in generate_package.sh @ONLY) 12 | configure_file(zipignore.in zipignore) 13 | add_custom_command( 14 | OUTPUT pdltools.zip 15 | COMMAND "${CMAKE_COMMAND}" -E create_symlink 16 | "${CMAKE_CURRENT_BINARY_DIR}/META.json" 17 | "${CMAKE_SOURCE_DIR}/META.json" 18 | COMMAND "${CMAKE_CURRENT_BINARY_DIR}/generate_package.sh" 19 | COMMAND "${CMAKE_COMMAND}" -E remove 20 | "${CMAKE_SOURCE_DIR}/META.json" 21 | COMMENT "Creating PGXN zip file." 
22 | VERBATIM 23 | ) 24 | add_custom_target(pgxn DEPENDS pdltools.zip) 25 | -------------------------------------------------------------------------------- /deploy/PGXN/META.json.in: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pdltools", 3 | "abstract": "A library of tools for Data Scientists over a SQL interface", 4 | "description": "A library of tools for Data Scientists over a SQL interface", 5 | "version": "@PDLTOOLS_PGXN_VERSION_STR@", 6 | "maintainer": "PDL Tools development team", 7 | "license": "proprietary", 8 | "provides": { 9 | "pdltools": { 10 | "file": "pdltools--@PDLTOOLS_VERSION_MAJOR@.@PDLTOOLS_VERSION_MINOR@.@PDLTOOLS_VERSION_PATCH@.sql", 11 | "docfile": "ReadMe.txt", 12 | "version": "@PDLTOOLS_VERSION_MAJOR@.@PDLTOOLS_VERSION_MINOR@.@PDLTOOLS_VERSION_PATCH@" 13 | } 14 | }, 15 | "resources": { 16 | "homepage": "https://sites.google.com/a/pivotal.io/global-data-science/pdl-tools", 17 | "repository": { 18 | "url": "https://stash.greenplum.com/scm/ds/dstools.git", 19 | "web": "https://stash.greenplum.com/scm/ds/dstools", 20 | "type": "git" 21 | } 22 | }, 23 | "release_status": "stable", 24 | "tags": [ 25 | "machine learning", 26 | "analytics", 27 | "data wrangling" 28 | ], 29 | 30 | 31 | "meta-spec": { 32 | "version": "1.0.0", 33 | "url": "http://pgxn.org/meta/spec.txt" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /deploy/PGXN/generate_package.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | TEMPDIR=`mktemp -d -t pdltools` 4 | "@CMAKE_COMMAND@" -E create_symlink \ 5 | "@CMAKE_SOURCE_DIR@" \ 6 | "${TEMPDIR}/@PDLTOOLS_PGXN_NAME@" 7 | "@CMAKE_COMMAND@" -E create_symlink \ 8 | "@CMAKE_CURRENT_BINARY_DIR@/zipignore" \ 9 | "${TEMPDIR}/zipignore" 10 | cd "${TEMPDIR}" 11 | zip --exclude @zipignore \ 12 | -r "@CMAKE_CURRENT_BINARY_DIR@/@PDLTOOLS_PGXN_NAME@.zip" \ 13 | "@PDLTOOLS_PGXN_NAME@" 
14 | -------------------------------------------------------------------------------- /deploy/PGXN/zipignore.in: -------------------------------------------------------------------------------- 1 | */.git/* 2 | */build* 3 | *.so 4 | *.o 5 | *.pyc 6 | */.DS_Store 7 | *.aux 8 | *.bbl 9 | *.blg 10 | *.log 11 | *.out 12 | *.synctex.gz 13 | *.toc 14 | *.thm 15 | *-blx.bib 16 | *.run.xml 17 | -------------------------------------------------------------------------------- /deploy/PackageMaker/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Packaging with MacOS X PackageMaker 3 | # ------------------------------------------------------------------------------ 4 | # 5 | # Important: Set variables using set(... PARENT_SCOPE), so that the scope of the 6 | # definition extends to the parent scope 7 | 8 | 9 | # -- Set PackageMaker-specific variables --------------------------------------- 10 | 11 | set(CPACK_RESOURCE_FILE_README 12 | "${CPACK_PACKAGE_DESCRIPTION_FILE}" PARENT_SCOPE) 13 | set(CPACK_RESOURCE_FILE_LICENSE 14 | "${CMAKE_SOURCE_DIR}/license/PDLTools.txt" PARENT_SCOPE) 15 | set(CPACK_RESOURCE_FILE_WELCOME 16 | "${CMAKE_CURRENT_SOURCE_DIR}/Welcome.html" PARENT_SCOPE) 17 | set(CPACK_OSX_PACKAGE_VERSION "10.5" PARENT_SCOPE) 18 | #set(CPACK_PACKAGE_DEFAULT_LOCATION "/usr/local/pdltools/Versions/${PDLTOOLS_VERSION_STRING}" PARENT_SCOPE) 19 | #set(CPACK_PACKAGING_INSTALL_PREFIX "/" PARENT_SCOPE) 20 | -------------------------------------------------------------------------------- /deploy/PackageMaker/Welcome.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | Welcome to PDL Tools 7 | 8 |

Welcome to PDL Tools!

9 |

This installer will guide you through the process of installing PDL Tools onto 10 | your computer.

11 | 12 | 13 | -------------------------------------------------------------------------------- /deploy/RPM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Packaging with RPM 3 | # ------------------------------------------------------------------------------ 4 | # 5 | # Important: Set variables using set(... PARENT_SCOPE), so that the scope of the 6 | # definition extends to the parent scope 7 | 8 | 9 | # Get information about the environment 10 | rh_version(RH_VERSION) 11 | 12 | # -- Set RPM-specific variables ------------------------------------------------ 13 | 14 | set(CPACK_RPM_PACKAGE_ARCHITECTURE x86_64 PARENT_SCOPE) 15 | set(CPACK_RPM_PACKAGE_LICENSE "Proprietary License" PARENT_SCOPE) 16 | set(CPACK_RPM_PACKAGE_GROUP "Development/Libraries" PARENT_SCOPE) 17 | set(CPACK_PACKAGING_INSTALL_PREFIX "/usr/local/pdltools/Versions/${PDLTOOLS_VERSION_STRING}" PARENT_SCOPE) 18 | 19 | set(_PACKAGE_REQUIRES "m4 >= 1.4") 20 | if(RH_VERSION AND RH_VERSION VERSION_LESS "6.0") 21 | # on RH/CentOS 5, there is no Python 2.6 or higher in the default 22 | # repositories. 23 | set(CPACK_RPM_PACKAGE_REQUIRES "python, ${_PACKAGE_REQUIRES}" PARENT_SCOPE) 24 | else() 25 | set(CPACK_RPM_PACKAGE_REQUIRES "python >= 2.6, ${_PACKAGE_REQUIRES}" PARENT_SCOPE) 26 | endif() 27 | 28 | # We do all deployment preparation with our CMake build script, so we do 29 | # not want to do any post-processing as part of the RPM generation. 30 | set(CPACK_RPM_SPEC_MORE_DEFINE "%undefine __os_install_post" PARENT_SCOPE) 31 | 32 | -------------------------------------------------------------------------------- /deploy/description.txt: -------------------------------------------------------------------------------- 1 | A library of tools for use by Data Scientists over a SQL interface. 
2 | -------------------------------------------------------------------------------- /deploy/gppkg/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Packaging for Greenplum's gppkg 3 | # ------------------------------------------------------------------------------ 4 | 5 | set(PDLTOOLS_GPPKG_VERSION "1.8") 6 | set(PDLTOOLS_GPPKG_RELEASE_NUMBER 1) 7 | set(PDLTOOLS_GPPKG_RPM_SOURCE_DIR 8 | "${CMAKE_BINARY_DIR}/_CPack_Packages/Linux/RPM/${CPACK_PACKAGE_FILE_NAME}" 9 | ) 10 | # gppkg expects that the file name for the RPM from which the gppkg is generated 11 | # follows the pattern "--..rpm". Otherwise, 12 | # uninstallation will not work (MPP-18078). Note that has to be 13 | # consistent with the version in pdltools.spec.in. gppkg deduces the 14 | # uninstallation command line options from the filename! 15 | set(PDLTOOLS_GPPKG_RPM_FILE_NAME 16 | "pdltools-${PDLTOOLS_VERSION_STRING}-${PDLTOOLS_GPPKG_RELEASE_NUMBER}.${CPACK_RPM_PACKAGE_ARCHITECTURE}.rpm") 17 | 18 | find_program( 19 | GPPKG_BINARY 20 | gppkg 21 | PATH /usr/local/greenplum-db/bin 22 | DOC "Path to Greenplum gppkg" 23 | ) 24 | find_program( 25 | RPMBUILD_BINARY 26 | rpmbuild 27 | DOC "Path to rpmbuild" 28 | ) 29 | file(READ "${CPACK_PACKAGE_DESCRIPTION_FILE}" CPACK_RPM_PACKAGE_DESCRIPTION) 30 | 31 | # The target gppkg is a meta target that depends on all version-specific 32 | # gppkg targets 33 | add_custom_target(gppkg 34 | COMMENT "Generating all Greenplum gppkg installers" 35 | ) 36 | 37 | # We now run generated files, one for each Greenplum version 38 | file(GLOB GPPKG_VERSIONS "${CMAKE_CURRENT_BINARY_DIR}/Version_*.cmake") 39 | foreach(GPPKG_VERSION ${GPPKG_VERSIONS}) 40 | include("${GPPKG_VERSION}") 41 | endforeach(GPPKG_VERSION) 42 | 43 | set(CPACK_RPM_SPEC_MORE_DEFINE "%undefine __os_install_post" PARENT_SCOPE) 44 | 
-------------------------------------------------------------------------------- /deploy/gppkg/gppkg_spec.yml.in: -------------------------------------------------------------------------------- 1 | Pkgname: pdltools 2 | Architecture: @CPACK_RPM_PACKAGE_ARCHITECTURE@ 3 | Version: @PDLTOOLS_GPPKG_VERSION@-@GPDB_VARIANT_SHORT@@GPDB_VERSION_LC@ 4 | OS: rhel5 5 | GPDBVersion: @GPDB_VERSION_LC@ 6 | Description: PDL Tools is a library of reusable tools used and developed by Pivotal Data Labs. 7 | PostInstall: 8 | - Master: "echo 'Please run the following command to deploy PDL Tools'; 9 | echo 'usage: pdlpack install [-s schema_name] -p @PORT_NAME@ [-S sugar_schema_name]'; 10 | echo ' [-M madlib_schema_name] -c user@host:port/database'; 11 | echo 'Example:'; 12 | echo ' $ $GPHOME/pdltools/bin/pdlpack install -s pdltools -p @PORT_NAME@ -c gpadmin@mdw:5432/testdb'; 13 | echo ' This will install PDL Tools objects into a @GPDB_VARIANT@ database named \"testdb\"'; 14 | echo ' running on server \"mdw\" on port 5432. Installer will try to login as'; 15 | echo ' \"gpadmin\" and will prompt for password. The target schema for the main'; 16 | echo ' library will be \"pdltools\" (specified here explicitly) and the target'; 17 | echo ' schema for the SUgAR library, packaged with it, will be \"sugarlib\"'; 18 | echo ' (specified here implicitly). 
MADlib objects will be searched for in the'; 19 | echo ' \"madlib\" schema (also specified here implicitly).'; 20 | echo ' In all cases, these schema choices are the default options.'; 21 | echo ''; 22 | echo 'To uninstall the package, use \"gppkg -r\".'; 23 | echo 'For additional options run:'; 24 | echo '$ pdlpack --help'; 25 | echo 'Release notes and additional documentation can be found at'; 26 | echo 'https://sites.google.com/a/pivotal.io/global-data-science/pdl-tools,'; 27 | echo 'as well as on http://pdl-tools.pa.pivotal.io/'; 28 | echo 'accessible through the Pivotal VPN.';" 29 | -------------------------------------------------------------------------------- /deploy/gppkg/pdltools.spec.in: -------------------------------------------------------------------------------- 1 | %define _topdir @CMAKE_CURRENT_BINARY_DIR@/@GPDB_VERSION@ 2 | %define __os_install_post %{nil} 3 | %define _rpmfilename @PDLTOOLS_GPPKG_RPM_FILE_NAME@ 4 | %define _unpackaged_files_terminate_build 0 5 | %define _pdltools_version @PDLTOOLS_VERSION_STRING@ 6 | 7 | BuildRoot: @PDLTOOLS_GPPKG_RPM_SOURCE_DIR@ 8 | Summary: PDL Tools for Greenplum Database 9 | License: @CPACK_RPM_PACKAGE_LICENSE@ 10 | Name: pdltools 11 | Version: @PDLTOOLS_VERSION_STRING@ 12 | Release: @PDLTOOLS_GPPKG_RELEASE_NUMBER@ 13 | Group: @CPACK_RPM_PACKAGE_GROUP@ 14 | Prefix: /usr/local 15 | AutoReq: no 16 | AutoProv: no 17 | BuildArch: @CPACK_RPM_PACKAGE_ARCHITECTURE@ 18 | Provides: /bin/sh 19 | 20 | %description 21 | @CPACK_RPM_PACKAGE_DESCRIPTION@ 22 | 23 | %prep 24 | : 25 | 26 | %install 27 | if [ ! 
@PDLTOOLS_GPPKG_RPM_SOURCE_DIR@ -ef $RPM_BUILD_ROOT ] 28 | then 29 | mkdir -p $RPM_BUILD_ROOT 30 | ln -s @PDLTOOLS_GPPKG_RPM_SOURCE_DIR@/* $RPM_BUILD_ROOT/ 31 | fi 32 | 33 | 34 | %post 35 | echo $RPM_INSTALL_PREFIX 36 | echo "DONE" 37 | ln -nsf $RPM_INSTALL_PREFIX/pdltools/Versions/%{_pdltools_version} $RPM_INSTALL_PREFIX/pdltools/Current 38 | ln -nsf $RPM_INSTALL_PREFIX/pdltools/Current/bin $RPM_INSTALL_PREFIX/pdltools/bin 39 | ln -nsf $RPM_INSTALL_PREFIX/pdltools/Current/doc $RPM_INSTALL_PREFIX/pdltools/doc 40 | 41 | %files 42 | %((cd "@PDLTOOLS_GPPKG_RPM_SOURCE_DIR@@CPACK_PACKAGING_INSTALL_PREFIX@" && find . \( -type f -or -type l \) | grep -E -v "^\./ports/.*" && find ./ports/@PORT_NAME@ \( -type f -or -type l \) | grep -E -v "^\./ports/@PORT_NAME@/[[:digit:]]+\.[[:digit:]]+/.*" && find ./ports/@PORT_NAME@/@GPDB_VERSION@ \( -type f -or -type l \)) | cut -c 2- | awk '{ print "\"@CPACK_PACKAGING_INSTALL_PREFIX@" $0 "\""}') 43 | -------------------------------------------------------------------------------- /deploy/hawq_install: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Python wrapper for installing the PDLTools rpm file for HAWQ 1.2. 4 | Since gppkgs are supported in HAWQ only from version 1.3 and above, we use this work-around. 5 | This script has been adapted from the bash version used by MADlib Eng team 6 | Srivatsan Ramanujam , 2 Oct 2014 7 | """ 8 | import os 9 | 10 | def install_pdltools(pdltools_rpm, hostfile): 11 | """ 12 | Simple python wrapper that installs the RPM 13 | """ 14 | gphome = os.environ['GPHOME'] if os.environ.has_key('GPHOME') else None 15 | if(not gphome): 16 | print '$GPHOME not found. 
Please source $GPHOME/greenplum_path.sh and retry' 17 | return 18 | 19 | rpm_db = """{gphome}/share/packages/database""".format(gphome=gphome) 20 | rpm_pkg_name = os.path.basename(pdltools_rpm).replace('-Linux.rpm','') 21 | package_name = os.path.basename(pdltools_rpm) 22 | 23 | hosts = [h for h in open(hostfile).read().split('\n') if h] 24 | ssh_rpmq = """ssh {host} rpm -q {rpm_pkg_name} --dbpath {rpm_db}""" 25 | ssh_uninstall = """ssh {host} rpm -ev --allmatches {rpm_pkg_name} --dbpath {rpm_db}""" 26 | 27 | #1) Check and remove any existing installations of the same version of the package 28 | for host in hosts: 29 | print 'Querying {host} for any existing installation of {package_name}'.format(host=host, package_name=package_name) 30 | rc = os.system(ssh_rpmq.format(host=host, rpm_pkg_name=rpm_pkg_name, rpm_db=rpm_db)) 31 | if(not rc): 32 | print 'Removing existing installation of {package_name} on {host}'.format(host=host, package_name=package_name) 33 | rc = os.system(ssh_uninstall.format(host=host, rpm_db=rpm_db, rpm_pkg_name=rpm_pkg_name)) 34 | if(rc !=0): 35 | print 'Error uninstalling the rpm on {host}'.format(host=host) 36 | return 37 | 38 | #2) Copy the new rpm to all nodes 39 | gpscp = """gpscp -f {hostfile} {pdltools_rpm} =:{gphome}""".format(hostfile=hostfile, pdltools_rpm=pdltools_rpm, gphome=gphome) 40 | print 'Copying {package_name} to all hosts'.format(package_name=package_name) 41 | print gpscp 42 | rc = os.system(gpscp) 43 | if(rc!=0): 44 | print 'Error copying {package_name} to one or more hosts'.format(package_name=package_name) 45 | return 46 | 47 | #3) Install the rpm on all nodes 48 | gpssh_install = """gpssh -f {hostfile} rpm -v -i {rpmfile} --nodeps --dbpath {rpm_db} --prefix {gphome} 49 | """.format(hostfile=hostfile, rpmfile=os.path.join(gphome,package_name), rpm_db=rpm_db, gphome=gphome) 50 | print 'Installing {package_name} on all hosts'.format(package_name=package_name) 51 | print gpssh_install 52 | rc = os.system(gpssh_install) 53 | 
if(rc != 0): 54 | print 'Error installing {package_name} on one or more hosts'.format(package_name=package_name) 55 | return 56 | print 'Successfully installed PDLTools on all nodes under {gphome}/pdltools'.format(gphome=gphome) 57 | 58 | if(__name__ == '__main__'): 59 | from sys import argv 60 | if(len(argv) !=3): 61 | print 'Usage: ./hawq_install ' 62 | else: 63 | if(not argv[1].strip().endswith('.rpm')): 64 | print "Error: {rpm_file} doesn't appear to be an rpm file. Please try again".format(rpm_file=argv[1]) 65 | else: 66 | install_pdltools(argv[1],argv[2]) 67 | -------------------------------------------------------------------------------- /deploy/pdltools.spec.in: -------------------------------------------------------------------------------- 1 | # -*- rpm-spec -*- 2 | %define _rpmdir @CPACK_RPM_DIRECTORY@ 3 | %define _rpmfilename @CPACK_RPM_FILE_NAME@ 4 | %define _unpackaged_files_terminate_build 0 5 | %define _topdir @CPACK_RPM_DIRECTORY@ 6 | %define _pdltools_version @CPACK_PACKAGE_VERSION@ 7 | 8 | BuildRoot: @CPACK_RPM_DIRECTORY@/@CPACK_PACKAGE_FILE_NAME@@CPACK_RPM_PACKAGE_COMPONENT_PART_PATH@ 9 | Summary: @CPACK_RPM_PACKAGE_SUMMARY@ 10 | Name: @CPACK_RPM_PACKAGE_NAME@ 11 | Version: @CPACK_RPM_PACKAGE_VERSION@ 12 | Release: @CPACK_RPM_PACKAGE_RELEASE@ 13 | License: @CPACK_RPM_PACKAGE_LICENSE@ 14 | Group: @CPACK_RPM_PACKAGE_GROUP@ 15 | Prefix: /usr/local 16 | Vendor: @CPACK_RPM_PACKAGE_VENDOR@ 17 | 18 | @TMP_RPM_URL@ 19 | @TMP_RPM_REQUIRES@ 20 | @TMP_RPM_PROVIDES@ 21 | @TMP_RPM_OBSOLETES@ 22 | @TMP_RPM_BUILDARCH@ 23 | 24 | @TMP_RPM_SPEC_INSTALL_POST@ 25 | @CPACK_RPM_SPEC_MORE_DEFINE@ 26 | @CPACK_RPM_COMPRESSION_TYPE_TMP@ 27 | 28 | %description 29 | @CPACK_RPM_PACKAGE_DESCRIPTION@ 30 | 31 | %prep 32 | mv $RPM_BUILD_ROOT "@CPACK_TOPLEVEL_DIRECTORY@/tmpBBroot" 33 | 34 | #p build 35 | 36 | %install 37 | if [ -e $RPM_BUILD_ROOT ]; 38 | then 39 | rm -rf $RPM_BUILD_ROOT 40 | fi 41 | mv "@CPACK_TOPLEVEL_DIRECTORY@/tmpBBroot" $RPM_BUILD_ROOT 42 | 43 | %clean 44 | 
45 | %post 46 | @CPACK_RPM_SPEC_POSTINSTALL@ 47 | 48 | %postun 49 | @CPACK_RPM_SPEC_POSTUNINSTALL@ 50 | 51 | %pre 52 | @CPACK_RPM_SPEC_PREINSTALL@ 53 | 54 | %preun 55 | @CPACK_RPM_SPEC_PREUNINSTALL@ 56 | 57 | %files 58 | %defattr(-,root,root,-) 59 | @CPACK_RPM_INSTALL_FILES@ 60 | 61 | %changelog 62 | * Fri May 3 2013 Rahul Iyer 63 | Replaced Cpack autogenerated spec file with this file. 64 | -------------------------------------------------------------------------------- /deploy/postflight.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # $0 - Script Path, $1 - Package Path, $2 - Target Location, and $3 - Target Volumn 4 | 5 | PDLTOOLS_VERSION=1.8 6 | 7 | find /usr/local/pdltools/bin -type d -exec cp -RPf {} /usr/local/pdltools/old_bin \; 2>/dev/null 8 | find /usr/local/pdltools/bin -depth -type d -exec rm -r {} \; 2>/dev/null 9 | 10 | find /usr/local/pdltools/doc -type d -exec cp -RPf {} /usr/local/pdltools/old_doc \; 2>/dev/null 11 | find /usr/local/pdltools/doc -depth -type d -exec rm -r {} \; 2>/dev/null 12 | 13 | #ln -sf $2 /usr/local/pdltools/Current 14 | if [ -d "/usr/local/pdltools" ] 15 | then 16 | ln -nsf /usr/local/pdltools/Versions/$PDLTOOLS_VERSION /usr/local/pdltools/Current 17 | ln -nsf /usr/local/pdltools/Current/bin /usr/local/pdltools/bin 18 | ln -nsf /usr/local/pdltools/Current/doc /usr/local/pdltools/doc 19 | fi 20 | 21 | if [ -d "/usr/local/pdltools/Versions.bak" ] 22 | then 23 | mv -f /usr/local/pdltools/Versions.bak/* /usr/local/pdltools/Versions/ 24 | rm -rf /usr/local/pdltools/Versions.bak 25 | fi 26 | -------------------------------------------------------------------------------- /deploy/preflight.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # $0 - Script Path, $1 - Package Path, $2 - Target Location, and $3 - Target Volumn 4 | 5 | if [ -d "/usr/local/pdltools/Versions" ] 6 | then 7 | mv /usr/local/pdltools/Versions 
/usr/local/pdltools/Versions.bak 8 | fi 9 | -------------------------------------------------------------------------------- /deploy/rpm_post.sh: -------------------------------------------------------------------------------- 1 | find $RPM_INSTALL_PREFIX/pdltools/bin -type d -exec cp -RPf {} $RPM_INSTALL_PREFIX/pdltools/old_bin \; 2>/dev/null 2 | find $RPM_INSTALL_PREFIX/pdltools/bin -depth -type d -exec rm -r {} \; 2>/dev/null 3 | 4 | find $RPM_INSTALL_PREFIX/pdltools/doc -type d -exec cp -RPf {} $RPM_INSTALL_PREFIX/pdltools/old_doc \; 2>/dev/null 5 | find $RPM_INSTALL_PREFIX/pdltools/doc -depth -type d -exec rm -r {} \; 2>/dev/null 6 | 7 | ln -nsf $RPM_INSTALL_PREFIX/pdltools/Versions/%{_pdltools_version} $RPM_INSTALL_PREFIX/pdltools/Current 8 | ln -nsf $RPM_INSTALL_PREFIX/pdltools/Current/bin $RPM_INSTALL_PREFIX/pdltools/bin 9 | ln -nsf $RPM_INSTALL_PREFIX/pdltools/Current/doc $RPM_INSTALL_PREFIX/pdltools/doc 10 | -------------------------------------------------------------------------------- /doc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # PDL Tools Documentation 3 | # ------------------------------------------------------------------------------ 4 | 5 | set(DOXYGEN_README_FILE "../README.md" CACHE STRING 6 | "Path to ReadMe file relative to the doc directory after installation") 7 | set(DOXYGEN_LICENSE_DIR "../../license" CACHE STRING 8 | "Path to license directory relative to the doc directory after installation") 9 | 10 | set(DOXYGEN_PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}") 11 | 12 | configure_file( 13 | mainpage.dox.in 14 | "${CMAKE_CURRENT_BINARY_DIR}/mainpage.dox" 15 | @ONLY 16 | ) 17 | 18 | #Getting started 19 | configure_file( 20 | gettingstarted.dox.in 21 | "${CMAKE_CURRENT_BINARY_DIR}/gettingstarted.dox" 22 | @ONLY 23 | ) 24 | 25 | #Installation instructions 26 | configure_file( 27 | 
installpage.dox.in 28 | "${CMAKE_CURRENT_BINARY_DIR}/installpage.dox" 29 | @ONLY 30 | ) 31 | 32 | #Change Log 33 | configure_file( 34 | changelog.dox.in 35 | "${CMAKE_CURRENT_BINARY_DIR}/changelog.dox" 36 | @ONLY 37 | ) 38 | 39 | configure_file( 40 | etc/pdltools_extra.css 41 | "${CMAKE_CURRENT_BINARY_DIR}/etc/pdltools_extra.css" 42 | @ONLY 43 | ) 44 | 45 | configure_file( 46 | etc/DoxygenLayout.xml 47 | "${CMAKE_CURRENT_BINARY_DIR}/etc/DoxygenLayout.xml" 48 | @ONLY 49 | ) 50 | 51 | configure_file( 52 | etc/header.html 53 | "${CMAKE_CURRENT_BINARY_DIR}/etc/header.html" 54 | @ONLY 55 | ) 56 | 57 | 58 | 59 | file( 60 | COPY 61 | imgs/pdl.png 62 | DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/imgs" 63 | ) 64 | 65 | set(_DOXYGEN_INPUT_USER 66 | "\"${CMAKE_CURRENT_BINARY_DIR}/mainpage.dox\"" 67 | "\"${CMAKE_CURRENT_BINARY_DIR}/gettingstarted.dox\"" 68 | "\"${CMAKE_CURRENT_BINARY_DIR}/installpage.dox\"" 69 | "\"${CMAKE_CURRENT_BINARY_DIR}/changelog.dox\"" 70 | "\"${CMAKE_SOURCE_DIR}/src/modules\"" 71 | "\"${CMAKE_SOURCE_DIR}/src/ports\"" 72 | ) 73 | join_strings(DOXYGEN_INPUT_USER " " "${_DOXYGEN_INPUT_USER}") 74 | 75 | 76 | set(DOXYGEN_INCLUDE_PATH "\"${CMAKE_SOURCE_DIR}/src\" \"${CMAKE_SOURCE_DIR}/src/ports/greenplum\"") 77 | 78 | set(DOXYGEN_OUTPUT_USER "${CMAKE_CURRENT_BINARY_DIR}/user" CACHE PATH 79 | "Base path where the documentation generated by Doxygen will be put (abolsute or relative to \${CMAKE_BINARY_DIR}/doc/etc)" 80 | ) 81 | set(DOXYGEN_HTML_OUTPUT html CACHE STRING 82 | "Path (relative to \${DOXYGEN_OUTPUT_} where HTML docs will be put." 
83 | ) 84 | 85 | # -- Set macros for SQL/Python files ------------------------------------------- 86 | 87 | set(PORT_UC "DOXYGEN") 88 | set(DBMS "doxygen") 89 | set(DBMS_UC "${PORT_UC}") 90 | set(${DBMS_UC}_VERSION_STRING "0.0.0") 91 | set(${DBMS_UC}_VERSION_MAJOR "0") 92 | set(${DBMS_UC}_VERSION_MINOR "0") 93 | set(${DBMS_UC}_VERSION_PATCH "0") 94 | set(${DBMS_UC}_ARCHITECTURE "all") 95 | set(DBMS_FEATURES "__HAS_ORDERED_AGGREGATES__") 96 | define_m4_macros(M4_DEFINES_CMD_LINE M4_DEFINES_CODE ${DBMS_FEATURES}) 97 | 98 | 99 | # -- Build doxysql (the SQL parser) using flex and bison ----------------------- 100 | 101 | find_package(FLEX 2.5.33) 102 | find_package(BISON 2.4) 103 | find_package(Doxygen) 104 | 105 | if(FLEX_FOUND AND BISON_FOUND AND DOXYGEN_FOUND) 106 | # The FindFLEX module by cmake unfortunately does not cover the 107 | # case when FlexLexer.h is in a non-standard location. 108 | get_dir_name(_FLEX_BASE "${FLEX_EXECUTABLE}") 109 | get_dir_name(_FLEX_BASE "${_FLEX_BASE}") 110 | find_path(_FLEX_INCLUDE_DIR 111 | NAMES "FlexLexer.h" 112 | HINTS "${_FLEX_BASE}/include" 113 | ) 114 | if(_FLEX_INCLUDE_DIR) 115 | include_directories(BEFORE "${_FLEX_INCLUDE_DIR}") 116 | endif(_FLEX_INCLUDE_DIR) 117 | 118 | BISON_TARGET(doxysqlParser src/sql.yy ${CMAKE_CURRENT_BINARY_DIR}/sql.parser.cc) 119 | FLEX_TARGET(doxysqlScanner src/sql.ll ${CMAKE_CURRENT_BINARY_DIR}/sql.scanner.cc) 120 | ADD_FLEX_BISON_DEPENDENCY(doxysqlScanner doxysqlParser) 121 | 122 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 123 | add_executable(doxysql ${BISON_doxysqlParser_OUTPUTS} 124 | ${FLEX_doxysqlScanner_OUTPUTS}) 125 | set_target_properties(doxysql PROPERTIES 126 | RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin") 127 | 128 | 129 | # -- Copy executable and configuration files ----------------------------------- 130 | 131 | add_subdirectory(bin) 132 | add_subdirectory(etc) 133 | 134 | # -- Run doxygen --------------------------------------------------------------- 135 | 136 
| set(_DOXYGEN_UNNEEDED_WARNINGS_FILTER egrep -v 137 | "warning:.*\\(@param is not found in the argument list.*kwargs\\)\$|The following parameters.*kwargs\\) are not documented\\)|parameter 'kwargs'\$") 138 | 139 | 140 | add_custom_target(doc 141 | COMMAND ${DOXYGEN_EXECUTABLE} user.doxyfile | 142 | ${_DOXYGEN_UNNEEDED_WARNINGS_FILTER} 143 | WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/etc" 144 | DEPENDS doxysql doxyBinFiles 145 | COMMENT "Generating user-level documentation..." 146 | VERBATIM 147 | ) 148 | 149 | 150 | # -- Install doc/user/html output directory to doc/html ------------------------ 151 | 152 | # We specify OPTIONAL, which means it will not be an error if the user 153 | # documentation does not exist 154 | install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/user/html 155 | DESTINATION doc 156 | OPTIONAL 157 | COMPONENT doc 158 | PATTERN ".DS_Store" EXCLUDE 159 | ) 160 | 161 | 162 | # -- Notify user if we could not run doxygen 163 | 164 | else(FLEX_FOUND AND BISON_FOUND AND DOXYGEN_FOUND) 165 | message(STATUS "Could not find recent versions of at least one of flex, " 166 | "bison, doxygen, or dot (part of graphviz, needed for doxygen). " 167 | "Documentation will not be built.") 168 | endif(FLEX_FOUND AND BISON_FOUND AND DOXYGEN_FOUND) 169 | 170 | # -- Add subdirectories -------------------------------------------------------- 171 | 172 | -------------------------------------------------------------------------------- /doc/bin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # PDL Tools Documentation Executable Files 3 | # ------------------------------------------------------------------------------ 4 | 5 | set(BIN_FILES 6 | doxypy.py 7 | ) 8 | 9 | add_files(BIN_TARGET_FILES . 
"${CMAKE_CURRENT_BINARY_DIR}" ${BIN_FILES}) 10 | add_custom_target(doxyBinFiles ALL DEPENDS ${BIN_TARGET_FILES}) 11 | 12 | list(APPEND M4_ARGUMENTS 13 | "\"-DPDLTOOLS_SCHEMA=pdltools\"" 14 | "\"-I${CMAKE_BINARY_DIR}/doc/etc\"" 15 | ) 16 | join_strings(_M4_ARGUMENTS " " "${M4_ARGUMENTS}") 17 | set(_M4_ARGUMENTS "${_M4_ARGUMENTS}") 18 | 19 | configure_file(py_filter.sh.in py_filter.sh @ONLY) 20 | configure_file(sql_filter.sh.in sql_filter.sh @ONLY) 21 | -------------------------------------------------------------------------------- /doc/bin/py_filter.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | FILTER="../bin/doxypy.py /dev/stdin" 3 | 4 | @M4_BINARY@ @_M4_ARGUMENTS@ $1 | ${FILTER} 5 | -------------------------------------------------------------------------------- /doc/bin/sql_filter.sh.in: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | FILTER="../bin/doxysql" 3 | 4 | @M4_BINARY@ @_M4_ARGUMENTS@ $1 | ${FILTER} -f '$(@M4_BINARY@ @_M4_ARGUMENTS@'"$1)" 5 | -------------------------------------------------------------------------------- /doc/changelog.dox.in: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | @page changelog 4 | 5 | ## PDL Tools Revision History 6 | ### PDL Tools 1.8 (packaged with SUgAR 1.0) -- Current version 7 | - Add function for splitting data into train/test sets. 8 | - Add module for normalized cuts. 9 | - Add module for HITS algorithm. 10 | - Modified complete_linkage to support HAWQ. 11 | - Fixed auc function to handle division by zero errors. 12 | 13 | ### PDL Tools 1.7 (packaged with SUgAR 1.0) 14 | - Add module kd_tree for KD-tree and its application to k-nearest neighbour search. 15 | - Add module pagerank. 16 | - Add grid search function for elastic net in grid_search module. 17 | - Add PDL Tools tutorials in Jupyter notebook. 
18 | 19 | ### PDL Tools 1.6 (packaged with SUgAR 1.0) 20 | - Add module one_vs_rest for multi-class classification. 21 | - Add module grid_search for grid searching functions. 22 | - Add module generic_utilities (includes multi-table summary and array 23 | utilities). 24 | 25 | ### PDL Tools 1.5 (packaged with SUgAR 1.0) 26 | - Modifications in preparation to make PDLTools public. Removing module with pending IP (connected components). 27 | 28 | ### PDL Tools 1.4 (packaged with SUgAR 1.0) 29 | - Added module for prediction metrics (regression+binary+multi-class). 30 | - Added module to balance datasets for classification. 31 | - Added SUgAR "format_glob" (globbing with user-defined transformation). 32 | - Added SUgAR support for user-extension tables. 33 | - Installer rewritten for database-safe upgrades. 34 | - More modules (e.g. SUgAR) now supported on HAWQ. 35 | 36 | ### PDL Tools 1.3 (packaged with SUgAR 0.4.4) 37 | - Ported PDL Tools to work on HAWQ (1.2 and 1.3) 38 | - Modified pdlpack to accommodate installation on HAWQ 39 | - Added Complete Linkage algorithm. 40 | - Added multi-graph support for Connected Components algorithm. 41 | 42 | ### PDL Tools 1.2.2 (packaged with SUgAR 0.4.4) 43 | - Added Stratified Sampling. 44 | - Added installation instructions for non-gpadmin install. 45 | 46 | ### PDL Tools 1.2.1 (packaged with SUgAR 0.4.4) 47 | - Added Porter Stemming. 48 | - Path changes to pivotal.io domain. 49 | - Added 'version()' function. 50 | - Name change to PDL Tools. 51 | 52 | ### DS Tools 1.2 (packaged with SUgAR 0.4.4) 53 | 54 | - Switched to Doxygen-based documentation (HTML+PDF) 55 | - Added Complex number support 56 | - MADlib schema can now be specified explicitly in install. 57 | 58 | ### DS Tools 1.1.2 (packaged with SUgAR 0.4.3) 59 | 60 | - Added column globbing in SUgAR (pseudofunction 'glob'). 61 | - Added ability to generate all-columns-but-one in pivoting. 62 | - Dos2unix on all files, to avoid '\r' in 'usage' printouts. 
63 | 64 | ### DS Tools 1.1.1 (packaged with SUgAR 0.4.2) 65 | 66 | - Fixed bugs in Connected Components to do with schema name scoping. 67 | - Installation now GRANTs USAGE on the schemas created. 68 | - Documentation improvements in SUgAR and in the DS Tools installer. 69 | - DS Tools now packaged for both GPDB 4.2 and GPDB 4.3. 70 | 71 | ### DS Tools 1.1 (packaged with SUgAR 0.4.1) 72 | 73 | - Upgrade support added. 74 | 75 | ### DS Tools 1.0 (packaged with SUgAR 0.4) 76 | 77 | - Initial packaged version of DS Tools. 78 | - All SUgAR documentation has been 79 | - Completely revamped and rewritten from 80 | - SUgAR 0.3 (initial release). 81 | 82 | 83 | 84 | 85 | */ 86 | -------------------------------------------------------------------------------- /doc/etc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # PDL Tools Documentation Configuration Files 3 | # ------------------------------------------------------------------------------ 4 | 5 | configure_file(user.doxyfile.in user.doxyfile) 6 | configure_file(SQLCommon.m4_in SQLCommon.m4) 7 | -------------------------------------------------------------------------------- /doc/etc/DoxygenLayout.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 
121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /doc/etc/SQLCommon.m4_in: -------------------------------------------------------------------------------- 1 | /* 2 | * During build time, macro definitions will be inserted here. 3 | */ 4 | @M4_DEFINES_CODE@ 5 | -------------------------------------------------------------------------------- /doc/etc/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | $treeview 12 | $search 13 | $mathjax 14 | 15 | 16 | $extrastylesheet 17 | 18 | 19 | 29 | 30 | 31 | 32 |
33 | 34 | 35 |
36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 49 | 50 | 51 | 52 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 |
44 |
$projectname 45 |  $projectnumber 46 |
47 |
$projectbrief
48 |
53 |
$projectbrief
54 |
$searchbox
65 |
66 | 67 | 68 | -------------------------------------------------------------------------------- /doc/etc/pdltools_extra.css: -------------------------------------------------------------------------------- 1 | /* PDL Tools CSS customizations (Copied from MADlib CSS + adding minor edits where required) */ 2 | 3 | /* Indent paragraphs in the main text, but not in framed boxes */ 4 | div.contents > p, div.contents > pre, div.contents > ul, div.contents > div.fragment, dd { 5 | margin-left: 20px; 6 | } 7 | 8 | /* Resize logo */ 9 | #projectlogo img{ 10 | width: 250px; 11 | height: 80px; 12 | } 13 | 14 | /* Increase spacing between titled paragraphs in the main text, but not in 15 | framed boxes */ 16 | div.contents > dl { 17 | margin-top: 2em; 18 | } 19 | 20 | /* Increase spacing between list items in the main text */ 21 | div.contents li { 22 | margin-top: 1em; 23 | } 24 | 25 | /* No automtic line wrapping at white spaces in
 or \verbatim
 26 |    environments. */
 27 | pre.fragment {
 28 |     word-wrap: normal;
 29 | }
 30 | 
 31 | /* Distinguish backgrounds for syntax, example commands, and results */
 32 | pre.syntax {
 33 |         border: 1px solid #999999;
 34 |         color: #00004D;
 35 |         background-color: #CBD4E7;
 36 |         padding: 4px 6px;
 37 |         margin: 4px 8px 4px 2px;
 38 |         overflow: auto;
 39 |         word-wrap: break-word;
 40 |         font-size:  9pt;
 41 |         line-height: 125%;
 42 |         font-family: monospace, fixed;
 43 |         font-size: 105%;
 44 | }
 45 | 
 46 | pre.example {
 47 |         border: 1px solid #999999;
 48 |         background-color: #E8E8E8;
 49 |         padding: 4px 6px;
 50 |         margin: 4px 8px 4px 2px;
 51 |         overflow: auto;
 52 |         word-wrap: break-word;
 53 |         font-size:  9pt;
 54 |         line-height: 125%;
 55 |         font-family: monospace, fixed;
 56 |         font-size: 105%;
 57 | }
 58 | 
 59 | pre.result {
 60 |         border: 1px solid #999999;
 61 |         background-color: #FFFFCC;
 62 |         padding: 4px 6px;
 63 |         margin: 4px 8px 4px 2px;
 64 |         overflow: auto;
 65 |         word-wrap: break-word;
 66 |         font-size:  9pt;
 67 |         line-height: 125%;
 68 |         font-family: monospace, fixed;
 69 |         font-size: 105%;
 70 | }
 71 | 
 72 | 
 73 | /* No padding for paragraph headers (in its infinite wisdom, doxygen uses 
74 | environments for that) */ 75 | dl { 76 | padding-left: 0; 77 | margin-top: 1em; 78 | } 79 | 80 | /* Increase the font size for paragraph headers */ 81 | dt { 82 | font-size: 120%; 83 | margin-bottom: 1em; 84 | } 85 | 86 | /* The first column should align with normal text. So we cannot use 87 | border-spacing. */ 88 | table.params { 89 | border-spacing: 0; 90 | } 91 | 92 | /* Add some padding instead of border-spacing */ 93 | td.paramname { 94 | padding: 1px 1em 1px 0; 95 | } 96 | 97 | /* We move the bar a out of the text frame, so that the text aligns well with 98 | the rest. Note: margin-left + border-width-left + padding-left = 0 */ 99 | /* dl.note, dl.warning, dl.attention, dl.pre, dl.post, dl.invariant, dl.deprecated, dl.todo, dl.test, dl.bug 100 | { 101 | margin-left: -6px; 102 | padding-left: 2px; 103 | } 104 | */ 105 | 106 | /* Style parameter lists formatted with definition lists. */ 107 | dl.arglist { 108 | margin-left: 20px; 109 | margin-top: 0px; 110 | } 111 | 112 | dl.arglist dt { 113 | font-size: 100%; 114 | font-weight: bold; 115 | color: #00004D; 116 | margin-bottom: 0px; 117 | } 118 | 119 | div.toc { 120 | background-color: #FFF5B8; 121 | } 122 | 123 | /* Increase font size for toc.li from 10px to 14px */ 124 | div.toc li { 125 | font: 14px/1.4 Verdana,DejaVu Sans,Geneva,sans-serif; 126 | margin-top: 5px; 127 | padding-left: 10px; 128 | padding-top: 2px; 129 | } 130 | 131 | 132 | div.versionlist li.head { 133 | font: 12px/1.2 Verdana, DejaVu Sans, Geneva, sans-serif; 134 | display: inline; 135 | margin-right: 10px; 136 | } 137 | 138 | div.versionlist ul { 139 | display: inline; 140 | align: right; 141 | } 142 | 143 | div.versionlist li { 144 | font: 10px/1.2 Verdana,DejaVu Sans,Geneva,sans-serif; 145 | display: inline; 146 | margin-right: 10px; 147 | } 148 | 149 | /* Table style for output table columns and descriptions */ 150 | table.output { 151 | border: 0; 152 | margin-left: 20px; 153 | } 154 | 155 | .output th { 156 | text-align: right; 
157 | vertical-align: top; 158 | padding-right: 15px; 159 | font-weight: normal; 160 | color: #354C7B; 161 | } 162 | 163 | -------------------------------------------------------------------------------- /doc/imgs/pdl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vmware-archive/PDLTools/24ce5033407766e432fa00cc171039de2611d140/doc/imgs/pdl.png -------------------------------------------------------------------------------- /doc/imgs/pdltools_pull_request_travis_success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vmware-archive/PDLTools/24ce5033407766e432fa00cc171039de2611d140/doc/imgs/pdltools_pull_request_travis_success.png -------------------------------------------------------------------------------- /doc/imgs/pdltools_sample_pull_request_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vmware-archive/PDLTools/24ce5033407766e432fa00cc171039de2611d140/doc/imgs/pdltools_sample_pull_request_1.png -------------------------------------------------------------------------------- /doc/imgs/pdltools_sample_pull_request_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vmware-archive/PDLTools/24ce5033407766e432fa00cc171039de2611d140/doc/imgs/pdltools_sample_pull_request_2.png -------------------------------------------------------------------------------- /doc/imgs/pdltools_sample_travis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vmware-archive/PDLTools/24ce5033407766e432fa00cc171039de2611d140/doc/imgs/pdltools_sample_travis.png -------------------------------------------------------------------------------- /license/PDLTools.txt: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2017 by Pivotal, Inc. All rights reserved. 2 | -------------------------------------------------------------------------------- /license/third_party/uriparser-0.7.9.txt: -------------------------------------------------------------------------------- 1 | uriparser - RFC 3986 URI parsing library 2 | 3 | Copyright (C) 2007, Weijia Song 4 | Copyright (C) 2007, Sebastian Pipping 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions 9 | are met: 10 | 11 | * Redistributions of source code must retain the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer. 14 | 15 | * Redistributions in binary form must reproduce the above 16 | copyright notice, this list of conditions and the following 17 | disclaimer in the documentation and/or other materials 18 | provided with the distribution. 19 | 20 | * Neither the name of the nor the names of its 21 | contributors may be used to endorse or promote products 22 | derived from this software without specific prior written 23 | permission. 24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 28 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 29 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 30 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 31 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 32 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 34 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 36 | OF THE POSSIBILITY OF SUCH DAMAGE. 37 | -------------------------------------------------------------------------------- /src/bin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # PDL Tools binary files 3 | # ------------------------------------------------------------------------------ 4 | 5 | 6 | # -- 1. Copy files ------------------------------------------------------------- 7 | 8 | set(PDLPACK_BINARIES 9 | pdlpack 10 | ) 11 | add_files(BINARY_TARGET_FILES . "${CMAKE_CURRENT_BINARY_DIR}" ${PDLPACK_BINARIES}) 12 | add_custom_target(binaryFiles ALL DEPENDS ${BINARY_TARGET_FILES}) 13 | add_dependencies(binaryFiles pdlpackFiles) 14 | 15 | 16 | # -- 2. Install binary files to $PDLTOOLS_ROOT/bin ------------------------------- 17 | 18 | install(PROGRAMS ${BINARY_TARGET_FILES} 19 | DESTINATION bin 20 | COMPONENT core 21 | ) 22 | -------------------------------------------------------------------------------- /src/bin/pdlpack: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script does the following: 4 | # 0. If indicated by environent variables, look for DBMS-supplied Python 5 | # installation. 
E.g., Greenplum supplies its own Python, and its currently 6 | # the first choice to use because it guarantees that python, pygresql 7 | # (currently needed to connect to a GP database), and libpg all have the same 8 | # architecture. 9 | # 1. Of step 0 failed, find python interpreter by 10 | # - first looking for "python${VERSION}" in $PATH where 11 | # ${VERSION} in {2.7, 2.6} 12 | # - Only if that fails, look for "python" in $PATH 13 | # 2. Pass all arguments to ../pdlpack/pdlpack.py 14 | 15 | PYTHON_PREFIX="python" 16 | PYTHON_VERSIONS="2.7 2.6" 17 | 18 | # create absolute path to pdlpack.py 19 | pushd `dirname $0` > /dev/null 20 | PDLPACK_PATH="$(pwd -P)/../pdlpack/pdlpack.py" 21 | popd > /dev/null 22 | 23 | # Initialization 24 | DID_NOT_FIND_INTERPRETER=1 25 | 26 | # Platform-specific overrides 27 | if test "$GPHOME" && test "$PYTHONHOME" && \ 28 | test "${PYTHONHOME:0:${#GPHOME}}" = "$GPHOME"; then 29 | 30 | DID_NOT_FIND_INTERPRETER=0 31 | PYTHON_EXE_NAME="${PYTHONHOME}/bin/python" 32 | fi 33 | 34 | errorNoPythonFound() { 35 | echo "No Python interpreter found. Please install Python 2.6 or higher to" \ 36 | "run pdlpack." 37 | exit 1 38 | } 39 | 40 | setAndTestPythonVesion() { 41 | PYTHON_EXE_NAME="${PYTHON_PREFIX}$1" 42 | command -v "${PYTHON_EXE_NAME}" > /dev/null 43 | DID_NOT_FIND_INTERPRETER=$? 
44 | } 45 | 46 | 47 | # function main() 48 | if test $DID_NOT_FIND_INTERPRETER -ne 0; then 49 | for VERSION in $PYTHON_VERSIONS; do 50 | setAndTestPythonVesion "${VERSION}" 51 | if test $DID_NOT_FIND_INTERPRETER -eq 0; then 52 | break 53 | fi 54 | done 55 | fi 56 | 57 | if test $DID_NOT_FIND_INTERPRETER -ne 0; then 58 | setAndTestPythonVesion "" 59 | fi 60 | 61 | if test $DID_NOT_FIND_INTERPRETER -ne 0; then 62 | errorNoPythonFound 63 | fi 64 | 65 | "$PYTHON_EXE_NAME" "${PDLPACK_PATH}" "$@" 66 | -------------------------------------------------------------------------------- /src/config/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # PDL Tools configuration files (These are configuration files used by the 3 | # installer). End users are not supposed to make modifications. 4 | # ------------------------------------------------------------------------------ 5 | 6 | 7 | # -- 1. Copy all *.yml files --------------------------------------------------- 8 | 9 | file(GLOB_RECURSE CONFIG_FILES 10 | RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" 11 | "*.yml" 12 | ) 13 | add_files(CONFIG_TARGET_FILES . "${CMAKE_CURRENT_BINARY_DIR}" ${CONFIG_FILES}) 14 | add_custom_target(configFiles ALL DEPENDS ${CONFIG_TARGET_FILES}) 15 | 16 | 17 | 18 | # -- 2. 
Install config files to $PDLTOOLS_ROOT/config ---------------------------- 19 | 20 | install(FILES ${CONFIG_TARGET_FILES} 21 | DESTINATION config 22 | COMPONENT core 23 | ) 24 | -------------------------------------------------------------------------------- /src/config/Modules.yml: -------------------------------------------------------------------------------- 1 | ### 2 | # List of methods/modules and their dependencies: 3 | ### 4 | modules: 5 | - name: common 6 | - name: uri_utils 7 | depends: ['common'] 8 | - name: edit_distance 9 | depends: ['common'] 10 | - name: anonymization 11 | depends: ['common'] 12 | - name: session 13 | depends: ['common'] 14 | - name: sugar 15 | depends: ['common'] 16 | - name: complex 17 | depends: ['common'] 18 | - name: stemming 19 | depends: ['common'] 20 | - name: sampling 21 | depends: ['common'] 22 | - name: complete_linkage 23 | depends: ['common'] 24 | - name: prediction_metrics 25 | depends: ['common'] 26 | - name: balance_dataset 27 | depends: ['common'] 28 | - name: one_vs_rest 29 | depends: ['common'] 30 | - name: grid_search 31 | depends: ['common'] 32 | - name: generic_utilities 33 | depends: ['common'] 34 | - name: kd_tree 35 | depends: ['common'] 36 | - name: pagerank 37 | depends: ['common'] 38 | - name: hits 39 | depends: ['common'] 40 | - name: normalized_cut 41 | depends: ['common'] 42 | # - name: plr_placeholder 43 | # depends: ['common'] 44 | # PL/R package disabled until PL/R will be part of Continuous Integration tool. 45 | -------------------------------------------------------------------------------- /src/config/Ports.yml: -------------------------------------------------------------------------------- 1 | # List of DB Ports to compile PDL Tools for. 
2 | # 3 | # Synopsis: 4 | # - name : descriptive name of the new DB port 5 | # 6 | 7 | greenplum: 8 | name: Greenplum DB 9 | hawq: 10 | name: HAWQ 11 | 12 | -------------------------------------------------------------------------------- /src/config/SUgAR_version.yml: -------------------------------------------------------------------------------- 1 | sugar_version: 1.0 2 | -------------------------------------------------------------------------------- /src/config/Version.yml: -------------------------------------------------------------------------------- 1 | version: 1.8 2 | -------------------------------------------------------------------------------- /src/modules/common/common.c: -------------------------------------------------------------------------------- 1 | /** 2 | * File: common.c 3 | * Magic block invocation 4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #ifdef PG_MODULE_MAGIC 11 | PG_MODULE_MAGIC; 12 | #endif 13 | 14 | -------------------------------------------------------------------------------- /src/modules/complex/complex_type.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * complex.h - Declarations for complex data type 4 | * 5 | * 6 | * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group 7 | * Portions Copyright (c) 1994, Regents of the University of California 8 | * 9 | * 10 | * NOTE 11 | * These routines do *not* use the float types from adt/. 12 | * 13 | * XXX These routines were not written by a numerical analyst. 14 | * 15 | *------------------------------------------------------------------------- 16 | */ 17 | #ifndef COMPLEX_H 18 | #define COMPLEX_H 19 | #include "postgres.h" 20 | #include "fmgr.h" 21 | #include 22 | /* 23 | * GCC has problems with header files on e.g. Solaris. 24 | * That OS defines the imaginary type, but GCC does not. 25 | * Probably needed elsewhere, e.g. AIX. 
26 | * And use on Win32/64 suppresses warnings. 27 | * The warning is also seen on Mac OS 10.5. 28 | */ 29 | #if defined(__GNUC__) && (defined(__sun__) || defined(Win32)) 30 | #undef I 31 | #define I (__extension__ 1.0iF) 32 | #endif 33 | 34 | /* 35 | * represents the complex number x + yi 36 | * 37 | * We don't choose to use double complex defined in c99 because the 38 | * reasons list below: 39 | * 40 | * 1. Functions defined in /usr/include/complex.h is not portable enough. 41 | * 2. We can't use functions provided in /usr/include/complex.h directly, 42 | * because we need to check double overflow/underflow during calculation. 43 | * 3. For most portability, we store a complex number with two double numbers, 44 | * one for the real part, another one for the imaginary part. And all these 45 | * functions implemented by ourselves operate directly on the two double numbers, 46 | * so there is no need to do extra transformation between double complex and 47 | * internal representation. 48 | * 49 | * Note: When related with Infinity and NaN, the behaviours may not conform 50 | * the standard C99. But postgres' float8 doesn't conform C standard, because 51 | * we need more strict check. E.x. 5.0/0.0 in C will get Infinity, but in postgres 52 | * it will get a divided by zero error. Likely, (5.0 + 3.0i)/(0 + 0i) in C will get 53 | * (Infinity + Infinityi), but in our implementatioin it will get a divided by zero 54 | * error. 
55 | */ 56 | typedef struct 57 | { 58 | double x, 59 | y; 60 | } Complex; 61 | 62 | #define INIT_COMPLEX(_c, _re, _im)\ 63 | do\ 64 | {\ 65 | (_c)->x = (_re);\ 66 | (_c)->y = (_im);\ 67 | } while (0) 68 | 69 | #define re(z) ((z)->x) 70 | #define im(z) ((z)->y) 71 | 72 | /* 73 | * fmgr interface macros 74 | * 75 | * Complex is a fixed-size pass-by-reference type 76 | */ 77 | #define DatumGetComplexP(X) ((Complex*) DatumGetPointer(X)) 78 | #define ComplexPGetDatum(X) PointerGetDatum(X) 79 | #define PG_GETARG_COMPLEX_P(n) DatumGetComplexP(PG_GETARG_DATUM(n)) 80 | #define PG_RETURN_COMPLEX_P(X) return ComplexPGetDatum(X) 81 | 82 | /* public complex routines */ 83 | extern Datum complex_in(PG_FUNCTION_ARGS); 84 | extern Datum complex_out(PG_FUNCTION_ARGS); 85 | extern Datum complex_recv(PG_FUNCTION_ARGS); 86 | extern Datum complex_send(PG_FUNCTION_ARGS); 87 | 88 | /* 89 | * constructs a complex number with two real numbers, 90 | * arg1 as the real part, arg2 as the imaginary part 91 | */ 92 | extern Datum construct_complex(PG_FUNCTION_ARGS); 93 | 94 | /* 95 | * returns arg1*cos(arg2) + arg1*sin(arg2)i 96 | */ 97 | extern Datum construct_complex_trig(PG_FUNCTION_ARGS); 98 | 99 | /* returns real part of arg1 */ 100 | extern Datum complex_re(PG_FUNCTION_ARGS); 101 | 102 | /* returns imaginary part of arg1 */ 103 | extern Datum complex_im(PG_FUNCTION_ARGS); 104 | 105 | /* returns phase of arg1 */ 106 | extern Datum complex_arg(PG_FUNCTION_ARGS); 107 | 108 | /* returns magnitude of arg1 */ 109 | extern Datum complex_mag(PG_FUNCTION_ARGS); 110 | 111 | /* returns conjunction of arg1 */ 112 | extern Datum complex_conj(PG_FUNCTION_ARGS); 113 | 114 | /* checks whether arg1 equals arg2 */ 115 | extern Datum complex_eq(PG_FUNCTION_ARGS); 116 | 117 | /* checks whether arg1 not equals arg2 */ 118 | extern Datum complex_ne(PG_FUNCTION_ARGS); 119 | 120 | /* returns arg1 + arg2 */ 121 | extern Datum complex_pl(PG_FUNCTION_ARGS); 122 | 123 | /* returns +arg1 */ 124 | extern Datum 
complex_up(PG_FUNCTION_ARGS); 125 | 126 | /* returns arg1 - arg2 */ 127 | extern Datum complex_mi(PG_FUNCTION_ARGS); 128 | 129 | /* returns -arg1 */ 130 | extern Datum complex_um(PG_FUNCTION_ARGS); 131 | 132 | /* returns arg1 * arg2 */ 133 | extern Datum complex_mul(PG_FUNCTION_ARGS); 134 | 135 | /* returns arg1 / arg2 */ 136 | extern Datum complex_div(PG_FUNCTION_ARGS); 137 | 138 | /* returns arg1 ^ arg2, where arg2 is not an integer */ 139 | extern Datum complex_pow(PG_FUNCTION_ARGS); 140 | 141 | /* returns sqrt(arg1) */ 142 | extern Datum complex_sqrt(PG_FUNCTION_ARGS); 143 | 144 | /* returns cbrt(arg1) */ 145 | extern Datum complex_cbrt(PG_FUNCTION_ARGS); 146 | 147 | /* return degrees(arg(arg1)) */ 148 | extern Datum complex_degrees(PG_FUNCTION_ARGS); 149 | 150 | /* returns exp(arg1) */ 151 | extern Datum complex_exp(PG_FUNCTION_ARGS); 152 | 153 | /* returns ln(arg1) */ 154 | extern Datum complex_ln(PG_FUNCTION_ARGS); 155 | 156 | /* returns log10(arg1) */ 157 | extern Datum complex_log10(PG_FUNCTION_ARGS); 158 | 159 | /* returns log(arg1,arg2), arg1 based logarithm */ 160 | extern Datum complex_log(PG_FUNCTION_ARGS); 161 | 162 | /* returns acos(arg1) */ 163 | extern Datum complex_acos(PG_FUNCTION_ARGS); 164 | 165 | /* returns asin(arg1) */ 166 | extern Datum complex_asin(PG_FUNCTION_ARGS); 167 | 168 | /* returns atan(arg1) */ 169 | extern Datum complex_atan(PG_FUNCTION_ARGS); 170 | 171 | /* returns cos(arg1) */ 172 | extern Datum complex_cos(PG_FUNCTION_ARGS); 173 | 174 | /* returns cot(arg1) */ 175 | extern Datum complex_cot(PG_FUNCTION_ARGS); 176 | 177 | /* returns sin(arg1) */ 178 | extern Datum complex_sin(PG_FUNCTION_ARGS); 179 | 180 | /* returns tan(arg1) */ 181 | extern Datum complex_tan(PG_FUNCTION_ARGS); 182 | 183 | /* returns dot product of two 1-dim array with the same number of elements */ 184 | extern Datum complex_dot_product(PG_FUNCTION_ARGS); 185 | 186 | /* returns arg1 + 0i */ 187 | extern Datum float82complex(PG_FUNCTION_ARGS); 188 | extern 
Datum float42complex(PG_FUNCTION_ARGS); 189 | extern Datum int82complex(PG_FUNCTION_ARGS); 190 | extern Datum int42complex(PG_FUNCTION_ARGS); 191 | extern Datum int22complex(PG_FUNCTION_ARGS); 192 | extern Datum numeric2complex(PG_FUNCTION_ARGS); 193 | #endif /* COMPLEX_H */ 194 | -------------------------------------------------------------------------------- /src/modules/complex/float_utils.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------- 2 | * 3 | * complex.h - Declarations for complex data type 4 | * 5 | * 6 | * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group 7 | * Portions Copyright (c) 1994, Regents of the University of California 8 | * 9 | * 10 | * NOTE 11 | * These routines do *not* use the float types from adt/. 12 | * 13 | * XXX These routines were not written by a numerical analyst. 14 | * 15 | *------------------------------------------------------------------------- 16 | */ 17 | #ifndef FLOAT_UTILS_H 18 | #define FLOAT_UTILS_H 19 | /* 20 | * check to see if a float4/8 val has underflowed or overflowed 21 | */ 22 | #define CHECKFLOATVAL(val, inf_is_valid, zero_is_valid) \ 23 | do { \ 24 | if (isinf(val) && !(inf_is_valid)) \ 25 | ereport(ERROR, \ 26 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), \ 27 | errmsg("value out of range: overflow"),errOmitLocation(true))); \ 28 | \ 29 | if ((val) == 0.0 && !(zero_is_valid)) \ 30 | ereport(ERROR, \ 31 | (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), \ 32 | errmsg("value out of range: underflow"),errOmitLocation(true))); \ 33 | } while(0) 34 | 35 | #endif /* FLOAT_UTILS_H */ 36 | -------------------------------------------------------------------------------- /src/modules/sampling/sampling.c: -------------------------------------------------------------------------------- 1 | /** 2 | * File: sampling.c 3 | * C backend for "sampling.sql" 4 | * 5 | * Functions in this file are 
internal to the Sampling module. They assume 6 | * that all checking of input values has already been performed by the 7 | * functions calling them. They are therefore unsafe to use directly. 8 | * 9 | * Function __sampling_prep_grp takes an array and stores it in a format 10 | * more efficiently accessible. Function __sampling_samp_grp uses this array 11 | * to perform the necessary per-row procesing, with the idea being that 12 | * __sampling_samp_grp is written to be as efficient as possible. 13 | */ 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | 24 | 25 | PG_FUNCTION_INFO_V1(__sampling_prep_grp); 26 | 27 | Datum __sampling_prep_grp(PG_FUNCTION_ARGS); 28 | 29 | Datum __sampling_prep_grp(PG_FUNCTION_ARGS) 30 | { 31 | ArrayType *arr; 32 | Datum* elements; 33 | Oid arr_element_type; 34 | int16 arr_element_type_width; 35 | bool arr_element_type_byvalue; 36 | char arr_element_type_alignment_code; 37 | bool* nulls; 38 | 39 | int32 i,n; 40 | float8 *rc; 41 | bytea *buffer; 42 | if (PG_ARGISNULL(0)) { 43 | PG_RETURN_NULL(); 44 | } 45 | arr=PG_GETARG_ARRAYTYPE_P(0); 46 | // CHECKARRVALID(arr); // Checks if any elements are NULL. 
47 | if (ARR_NDIM(arr) != 1) { 48 | PG_RETURN_NULL(); 49 | } 50 | // n=(ARR_DIMS(arr))[0]; 51 | 52 | arr_element_type = ARR_ELEMTYPE(arr); 53 | if (arr_element_type != FLOAT8OID) { 54 | PG_RETURN_NULL(); 55 | } 56 | get_typlenbyvalalign(arr_element_type, &arr_element_type_width, 57 | &arr_element_type_byvalue, &arr_element_type_alignment_code); 58 | deconstruct_array(arr,arr_element_type,arr_element_type_width, 59 | arr_element_type_byvalue, arr_element_type_alignment_code, 60 | &elements,&nulls,&n); 61 | 62 | buffer=(bytea*) palloc(VARHDRSZ+n*sizeof(float8)); 63 | SET_VARSIZE(buffer,VARHDRSZ+n*sizeof(float8)); 64 | rc=(float8*) VARDATA(buffer); 65 | 66 | // p = ARRPTR(arr); 67 | for(i=0;i 0: 132 | found = 0 # flag to check if we find anything new this iteration 133 | newdepdict = dict() 134 | # find the keys with no values 135 | keynoval = filter(lambda t: t[1] == [], depdict.iteritems()) 136 | # find the values that are not keys 137 | valnotkey = set(flatten(depdict.itervalues())) - set(depdict.iterkeys()) 138 | 139 | candidates = set([k[0] for k in keynoval]) | valnotkey 140 | for c in candidates: 141 | if c not in out: 142 | found += 1 143 | out[c] = curlevel 144 | 145 | for k in depdict.iterkeys(): 146 | if depdict[k] != []: 147 | newdepdict[k] = filter(lambda v: v not in valnotkey, depdict[k]) 148 | # newdepdict = dict(newdepdict) 149 | if newdepdict == depdict: 150 | raise PDLPackConfigError(str(depdict)) 151 | else: 152 | depdict = newdepdict 153 | if found > 0: 154 | curlevel += 1 155 | 156 | return out 157 | 158 | ## 159 | # Top-sort the modules in conf 160 | # @param conf a pdlpack configuration 161 | ## 162 | def topsort_modules(conf): 163 | 164 | depdict = dict() 165 | for m in conf['modules']: 166 | try: 167 | depdict[m['name']] = m['depends'] 168 | except: 169 | depdict[m['name']] = [] 170 | try: 171 | module_dict = topsort(depdict) 172 | except PDLPackConfigError as e: 173 | raise PDLPackConfigError("invalid cyclic dependency between modules: " + 
e.value + "; check Modules.yml files") 174 | missing = set(module_dict.keys()) - set(depdict.keys()) 175 | inverted = dict() 176 | if len(missing) > 0: 177 | for k in depdict.iterkeys(): 178 | for v in depdict[k]: 179 | if v not in inverted: 180 | inverted[v] = set() 181 | inverted[v].add(k) 182 | print "configyml : ERROR : required modules missing from Modules.yml: " 183 | for m in missing: 184 | print " " + m + " (required by " + str(list(inverted[m])) + ")" 185 | exit(2) 186 | conf['modules'] = sorted(conf['modules'], key=lambda m:module_dict[m['name']]) 187 | return conf 188 | -------------------------------------------------------------------------------- /src/pdlpack/configyml.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vmware-archive/PDLTools/24ce5033407766e432fa00cc171039de2611d140/src/pdlpack/configyml.pyc -------------------------------------------------------------------------------- /src/pdlpack/plpy.py: -------------------------------------------------------------------------------- 1 | # ============================================================================== 2 | # plpy.py 3 | # 4 | # This module provides the abstraction level between the database 5 | # and the user python code (e.g. kmeans.py). 6 | # 7 | # ============================================================================== 8 | 9 | 10 | import sys 11 | from types import * 12 | 13 | 14 | try: 15 | from pygresql import pg 16 | except Exception, e: 17 | try: 18 | import pg 19 | except Exception, e: 20 | errorMsg = "unable to import The PyGreSQL Python module (pg.py) - %s\n" % str(e) 21 | sys.stderr.write(str(errorMsg)) 22 | sys.exit(2) 23 | 24 | # This method establishes the connection to a database. 25 | # 26 | # Example: 27 | # ----- my_run_kmeans.py ----- 28 | # import plpy 29 | # from kmeans 30 | # ... 31 | # plpy.setcon( 'kmeans', 'localhost', 5432, 'myuser', 'mypass') 32 | # ... 
33 | # print kmeans.kmeans_run( 50, 1); 34 | # ---------- 35 | 36 | 37 | def connect(dbname, host, port, user, passwd): 38 | global db 39 | db = pg.DB(dbname=dbname, 40 | host=host, 41 | port=port, 42 | user=user, 43 | passwd=passwd) 44 | 45 | 46 | def close(): 47 | db.close() 48 | 49 | # The following functions should be used inside the user modules 50 | # in order to make their code uniform for both external python scripts 51 | # or from in-database pl/python functions. 52 | # 53 | # Example: 54 | # ----- kmeans.py ----- 55 | # import plpy 56 | # ... 57 | # def kmeans_run(): 58 | # ... 59 | # plpy.execute( 'CREATE TEMP TABLE a (a INT)'); 60 | # plpy.info( 'Created table a.'); 61 | # ---------- 62 | 63 | 64 | def execute(sql): 65 | rv = db.query(sql.encode('utf-8')) 66 | if type(rv) is NoneType: 67 | return 0 68 | elif type(rv) is StringType: 69 | return rv 70 | else: 71 | return rv.dictresult() 72 | 73 | 74 | def info(msg): 75 | print 'INFO: ' + msg 76 | 77 | 78 | def error(msg): 79 | print 'ERROR: ' + msg 80 | exit( 1) 81 | -------------------------------------------------------------------------------- /src/ports/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ====================================================================== 2 | # ====================================================================== 3 | add_subdirectory(greenplum) 4 | add_subdirectory(hawq) 5 | -------------------------------------------------------------------------------- /src/ports/greenplum/4.2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_current_greenplum_version() 2 | -------------------------------------------------------------------------------- /src/ports/greenplum/4.3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_current_greenplum_version() 2 | 
-------------------------------------------------------------------------------- /src/ports/greenplum/4.3ORCA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_current_greenplum_version() 2 | -------------------------------------------------------------------------------- /src/ports/greenplum/cmake/FindGreenplum.cmake: -------------------------------------------------------------------------------- 1 | # Set defaults that can be overridden by files that include this file: 2 | if(NOT DEFINED _FIND_PACKAGE_FILE) 3 | set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}") 4 | endif(NOT DEFINED _FIND_PACKAGE_FILE) 5 | 6 | # Set parameters for calling FindPostgreSQL.cmake 7 | set(_NEEDED_PG_CONFIG_PACKAGE_NAME "Greenplum Database") 8 | set(_PG_CONFIG_VERSION_NUM_MACRO "GP_VERSION_NUM") 9 | set(_PG_CONFIG_VERSION_MACRO "GP_VERSION") 10 | set(_SEARCH_PATH_HINTS 11 | "/usr/local/greenplum-db/bin" 12 | "$ENV{GPHOME}/bin" 13 | ) 14 | 15 | include("${CMAKE_CURRENT_LIST_DIR}/FindPostgreSQL.cmake") 16 | 17 | if(${PKG_NAME}_FOUND) 18 | # server/funcapi.h ultimately includes server/access/xact.h, from which 19 | # cdb/cdbpathlocus.h is included 20 | execute_process(COMMAND ${${PKG_NAME}_PG_CONFIG} --pkgincludedir 21 | OUTPUT_VARIABLE ${PKG_NAME}_ADDITIONAL_INCLUDE_DIRS 22 | OUTPUT_STRIP_TRAILING_WHITESPACE 23 | ) 24 | set(${PKG_NAME}_ADDITIONAL_INCLUDE_DIRS 25 | "${${PKG_NAME}_ADDITIONAL_INCLUDE_DIRS}/internal") 26 | endif(${PKG_NAME}_FOUND) 27 | -------------------------------------------------------------------------------- /src/ports/greenplum/cmake/FindGreenplum_4_2.cmake: -------------------------------------------------------------------------------- 1 | set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}") 2 | include("${CMAKE_CURRENT_LIST_DIR}/FindGreenplum.cmake") 3 | -------------------------------------------------------------------------------- /src/ports/greenplum/cmake/FindGreenplum_4_3.cmake: 
-------------------------------------------------------------------------------- 1 | set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}") 2 | include("${CMAKE_CURRENT_LIST_DIR}/FindGreenplum.cmake") 3 | -------------------------------------------------------------------------------- /src/ports/greenplum/cmake/FindGreenplum_4_3ORCA.cmake: -------------------------------------------------------------------------------- 1 | set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}") 2 | include("${CMAKE_CURRENT_LIST_DIR}/FindGreenplum.cmake") 3 | -------------------------------------------------------------------------------- /src/ports/greenplum/cmake/GreenplumUtils.cmake: -------------------------------------------------------------------------------- 1 | # Define Greenplum feature macros 2 | # 3 | function(define_greenplum_features IN_VERSION OUT_FEATURES) 4 | if(NOT ${IN_VERSION} VERSION_LESS "4.1") 5 | list(APPEND ${OUT_FEATURES} __HAS_ORDERED_AGGREGATES__) 6 | endif() 7 | 8 | if(NOT ${IN_VERSION} VERSION_LESS "4.3") 9 | list(APPEND ${OUT_FEATURES} __HAS_FUNCTION_PROPERTIES__) 10 | endif() 11 | 12 | # Pass values to caller 13 | set(${OUT_FEATURES} "${${OUT_FEATURES}}" PARENT_SCOPE) 14 | endfunction(define_greenplum_features) 15 | 16 | function(add_gppkg GPDB_VERSION GPDB_VARIANT GPDB_VARIANT_SHORT) 17 | string(TOLOWER ${GPDB_VERSION} GPDB_VERSION_LC) 18 | string(REPLACE "." 
"_" VERSION_ "${GPDB_VERSION}") 19 | 20 | file(WRITE "${CMAKE_BINARY_DIR}/deploy/gppkg/Version_${IN_PORT_VERSION}.cmake" " 21 | 22 | file(MAKE_DIRECTORY 23 | \"\${CMAKE_CURRENT_BINARY_DIR}/${IN_PORT_VERSION}/BUILD\" 24 | \"\${CMAKE_CURRENT_BINARY_DIR}/${IN_PORT_VERSION}/SPECS\" 25 | \"\${CMAKE_CURRENT_BINARY_DIR}/${IN_PORT_VERSION}/RPMS\" 26 | \"\${CMAKE_CURRENT_BINARY_DIR}/${IN_PORT_VERSION}/gppkg\" 27 | ) 28 | set(GPDB_VERSION \"${GPDB_VERSION}\") 29 | set(GPDB_VERSION_LC \"${GPDB_VERSION_LC}\") 30 | set(GPDB_VARIANT \"${GPDB_VARIANT}\") 31 | set(GPDB_VARIANT_SHORT \"${GPDB_VARIANT_SHORT}\") 32 | string(TOLOWER \"${GPDB_VARIANT}\" PORT_NAME) 33 | 34 | configure_file( 35 | pdltools.spec.in 36 | \"\${CMAKE_CURRENT_BINARY_DIR}/${IN_PORT_VERSION}/SPECS/pdltools.spec\" 37 | ) 38 | configure_file( 39 | gppkg_spec.yml.in 40 | \"\${CMAKE_CURRENT_BINARY_DIR}/${IN_PORT_VERSION}/gppkg/gppkg_spec.yml\" 41 | ) 42 | if(GPPKG_BINARY AND RPMBUILD_BINARY) 43 | add_custom_target(gppkg_${PORT_VERSION_UNDERSCORE} 44 | COMMAND cmake -E create_symlink \"\${PDLTOOLS_GPPKG_RPM_SOURCE_DIR}\" 45 | \"\${CPACK_PACKAGE_FILE_NAME}-gppkg\" 46 | COMMAND \"\${RPMBUILD_BINARY}\" -bb SPECS/pdltools.spec 47 | COMMAND cmake -E rename "RPMS/\${PDLTOOLS_GPPKG_RPM_FILE_NAME}" 48 | "gppkg/\${PDLTOOLS_GPPKG_RPM_FILE_NAME}" 49 | COMMAND \"\${GPPKG_BINARY}\" --build gppkg 50 | DEPENDS \"${CMAKE_BINARY_DIR}/\${CPACK_PACKAGE_FILE_NAME}.rpm\" 51 | WORKING_DIRECTORY \"\${CMAKE_CURRENT_BINARY_DIR}/${IN_PORT_VERSION}\" 52 | COMMENT \"Generating Greenplum ${IN_PORT_VERSION} gppkg installer...\" 53 | VERBATIM 54 | ) 55 | else(GPPKG_BINARY AND RPMBUILD_BINARY) 56 | add_custom_target(gppkg_${PORT_VERSION_UNDERSCORE} 57 | COMMAND cmake -E echo \"Could not find gppkg and/or rpmbuild.\" 58 | \"Please rerun cmake.\" 59 | ) 60 | endif(GPPKG_BINARY AND RPMBUILD_BINARY) 61 | 62 | # Unfortunately, we cannot set a dependency to the built-in package target, 63 | # i.e., the following does not work: 64 | # 
add_dependencies(gppkg package) 65 | 66 | add_dependencies(gppkg gppkg_${PORT_VERSION_UNDERSCORE}) 67 | ") 68 | endfunction(add_gppkg) 69 | 70 | -------------------------------------------------------------------------------- /src/ports/greenplum/cmake/PostgreSQLUtils.cmake: -------------------------------------------------------------------------------- 1 | # Define PostgreSQL feature macros 2 | # 3 | function(define_postgresql_features IN_VERSION OUT_FEATURES) 4 | if(NOT ${IN_VERSION} VERSION_LESS "9.0") 5 | list(APPEND ${OUT_FEATURES} __HAS_ORDERED_AGGREGATES__) 6 | endif() 7 | 8 | # Pass values to caller 9 | set(${OUT_FEATURES} "${${OUT_FEATURES}}" PARENT_SCOPE) 10 | endfunction(define_postgresql_features) 11 | 12 | # Add the installer group for this port and a component for all files that are 13 | # not version-specific 14 | # 15 | # We dynamically generate a CMake input file here. This is because the effects 16 | # of cpack_add_component_group() are not globally visible. Hence, we generate 17 | # a file in the deploy directory that CMake will execute only at the very end. 18 | # The motivation is that that way we want to have a clean separation between 19 | # port-specific source code and general code. 
20 | # 21 | function(cpack_add_port_group_and_component_for_all_versions) 22 | file(WRITE "${PORT_DEPLOY_SCRIPT}" " 23 | cpack_add_component_group(${PORT} 24 | DISPLAY_NAME \"${PORT} Support\" 25 | DESCRIPTION \"PDL Tools support for ${PORT}.\" 26 | PARENT_GROUP ports 27 | ) 28 | cpack_add_component(${PORT}_any 29 | DISPLAY_NAME \"All Versions\" 30 | DESCRIPTION \"PDL Tools files shared by all ${PORT} versions.\" 31 | GROUP ${PORT} 32 | )") 33 | endfunction(cpack_add_port_group_and_component_for_all_versions) 34 | 35 | 36 | # Add the installer component for version-specific files 37 | # 38 | function(cpack_add_version_component) 39 | file(APPEND "${PORT_DEPLOY_SCRIPT}" " 40 | cpack_add_component(${DBMS} 41 | DISPLAY_NAME \"${IN_PORT_VERSION}\" 42 | DESCRIPTION \"PDL Tools files specific to ${PORT} ${IN_PORT_VERSION}.\" 43 | GROUP ${PORT} 44 | )") 45 | endfunction(cpack_add_version_component) 46 | 47 | 48 | # Determine the versions of this port that we need to build for. 49 | # 50 | # If the user specifies at least one ${PORT_UC}_X_Y_PG_CONFIG, we only build 51 | # for that specific version. If no such variable is defined, we look for any 52 | # version of this port. This function will have a *side effect* in that case: 53 | # It sets one ${PORT_UC}_X_Y_PG_CONFIG to the path to pg_config that was found. 54 | # 55 | function(determine_target_versions OUT_VERSIONS) 56 | get_subdirectories("${CMAKE_CURRENT_SOURCE_DIR}" SUPPORTED_VERSIONS) 57 | get_filtered_list(SUPPORTED_VERSIONS "^[0-9]+.[0-9]+.*$" ${SUPPORTED_VERSIONS}) 58 | 59 | foreach(VERSION ${SUPPORTED_VERSIONS}) 60 | string(REPLACE "." 
"_" VERSION_UNDERSCORE "${VERSION}") 61 | if(DEFINED ${PORT_UC}_${VERSION_UNDERSCORE}_PG_CONFIG) 62 | list(APPEND ${OUT_VERSIONS} ${VERSION}) 63 | endif() 64 | endforeach(VERSION) 65 | if(NOT DEFINED ${OUT_VERSIONS}) 66 | find_package(${PORT}) 67 | 68 | if(${PORT_UC}_FOUND) 69 | # Due to the ABI incompatibility between 4.3.4 and 4.3.5, 70 | # PDLTools treats 4.3.5+ as DB version that is different from 4.3 71 | if(${PORT_UC} STREQUAL "GREENPLUM" AND 72 | ${${PORT_UC}_VERSION_MAJOR} EQUAL 4 AND 73 | ${${PORT_UC}_VERSION_MINOR} EQUAL 3 AND 74 | ${${PORT_UC}_VERSION_PATCH} GREATER 4) 75 | set(VERSION "4.3ORCA") 76 | else() 77 | set(VERSION "${${PORT_UC}_VERSION_MAJOR}.${${PORT_UC}_VERSION_MINOR}") 78 | endif() 79 | list(FIND SUPPORTED_VERSIONS "${VERSION}" _POS) 80 | if(_POS EQUAL -1) 81 | string(REPLACE ";" ", " _SUPPORTED_VERSIONS_STR "${SUPPORTED_VERSIONS}") 82 | message(STATUS "Found pg_config at " 83 | "\"${${PORT_UC}_PG_CONFIG}\", but it points to ${PORT} version " 84 | "${${PORT_UC}_VERSION_STRING}, which is not one of the supported " 85 | "versions (${_SUPPORTED_VERSIONS_STR}). You may try to " 86 | "copy a similar version folder in \"src/ports/${PORT_DIR_NAME}\" and " 87 | "rename it to \"${VERSION}\". This may or may not work, and is " 88 | "unsupported in any case.") 89 | else(_POS EQUAL -1) 90 | string(REPLACE "." "_" VERSION_UNDERSCORE "${VERSION}") 91 | 92 | # Side effect: 93 | set(${PORT_UC}_${VERSION_UNDERSCORE}_PG_CONFIG 94 | "${${PORT_UC}_PG_CONFIG}" PARENT_SCOPE) 95 | set(${OUT_VERSIONS} "${VERSION}") 96 | endif(_POS EQUAL -1) 97 | endif(${PORT_UC}_FOUND) 98 | endif(NOT DEFINED ${OUT_VERSIONS}) 99 | 100 | # Pass values to caller 101 | set(${OUT_VERSIONS} "${${OUT_VERSIONS}}" PARENT_SCOPE) 102 | # ${PORT_UC}_${_VERSION_UNDERSCORE}_PG_CONFIG might have been set earlier! 
103 | # (the side effect) 104 | endfunction(determine_target_versions) 105 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/anonymization/anonymization.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.create_anonymization_table(pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar) 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.anonymize() 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.create_anonymization_table(pg_catalog.varchar) 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.create_anonymization_table() 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.anonymize(pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog._varchar, pg_catalog.varchar) 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.anonymize(pg_catalog.varchar) 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.__anonymize_imp(pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog._varchar, pg_catalog.varchar, pg_catalog.bool) 8 | UDF: FUNCTION PDLTOOLS_SCHEMA.deanonymize() 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.deanonymize(pg_catalog.varchar) 10 | UDF: FUNCTION PDLTOOLS_SCHEMA.deanonymize(pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog._varchar, pg_catalog.varchar) 11 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/anonymization/anonymization.yml: -------------------------------------------------------------------------------- 1 | identical: 1.2.1 2 | compatible: 1.2.1 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/anonymization/test/test_anonymization.sql_in: -------------------------------------------------------------------------------- 1 | -- File: test_anonymization.sql 2 | -- Unit test for anonymization utility. 
3 | 4 | CREATE TABLE clickstream( 5 | userid VARCHAR, 6 | time_stamp TIMESTAMP, 7 | affected_user VARCHAR 8 | ) DISTRIBUTED RANDOMLY; 9 | 10 | INSERT INTO clickstream VALUES 11 | ('Max', TIMESTAMP '2013-Aug-23 04:57:02.15', 'Max'), 12 | ('Tori', TIMESTAMP '2013-Aug-23 04:59:17.83', 'Max'), 13 | ('Max', TIMESTAMP '2013-Aug-23 05:03:01.42', 'Tori'), 14 | ('Tori', TIMESTAMP '2013-Aug-23 17:32:37.08', 'Tori'); 15 | 16 | SELECT create_anonymization_table('clickstream', 17 | 'anonymization_table', 18 | 'userid'); 19 | 20 | SELECT assert((SELECT count(*) 21 | FROM anonymization_table)::TEXT,'2'); 22 | 23 | SELECT assert((SELECT count(DISTINCT id) 24 | FROM anonymization_table)::TEXT,'2'); 25 | 26 | SELECT assert((SELECT count(DISTINCT anon_id) 27 | FROM anonymization_table)::TEXT,'2'); 28 | 29 | SELECT assert((SELECT CASE WHEN min(length(anon_id))>25 THEN 't' ELSE 'f' END 30 | FROM anonymization_table),'t'); 31 | 32 | SELECT anonymize(current_schema()::TEXT,'clickstream', 33 | 'anonymized_clickstream', 34 | array['userid','affected_user'], 35 | 'anonymization_table'); 36 | 37 | SELECT assert((SELECT sum((au=anon)::INTEGER) FROM 38 | (SELECT a.userid au,t.anon_id anon 39 | FROM clickstream o, 40 | anonymized_clickstream a, 41 | anonymization_table t 42 | WHERE o.time_stamp=a.time_stamp AND o.userid=t.id 43 | UNION 44 | SELECT a.affected_user,t.anon_id 45 | FROM clickstream o, 46 | anonymized_clickstream a, 47 | anonymization_table t 48 | WHERE o.time_stamp=a.time_stamp AND o.affected_user=t.id) x)::TEXT,'2'); 49 | 50 | SELECT deanonymize(current_schema()::TEXT,'anonymized_clickstream', 51 | 'deanonymized_clickstream', 52 | array['userid','affected_user'], 53 | 'anonymization_table'); 54 | 55 | SELECT assert((SELECT sum((ou=du)::INTEGER) FROM 56 | (SELECT o.userid ou,d.userid du 57 | FROM clickstream o, 58 | deanonymized_clickstream d 59 | WHERE o.time_stamp=d.time_stamp 60 | UNION 61 | SELECT o.affected_user,d.affected_user 62 | FROM clickstream o, 63 | 
deanonymized_clickstream d 64 | WHERE o.time_stamp=d.time_stamp) x)::TEXT,'2'); 65 | 66 | DROP TABLE clickstream; 67 | 68 | DROP TABLE anonymization_table; 69 | 70 | DROP TABLE anonymized_clickstream; 71 | 72 | DROP TABLE deanonymized_clickstream; 73 | 74 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/balance_dataset/balance_dataset.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.balance_dataset(pg_catalog.varchar) 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.balance_dataset() 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.balance_dataset(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/balance_dataset/balance_dataset.yml: -------------------------------------------------------------------------------- 1 | identical: 1.4 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/balance_dataset/test/test_balance_dataset.sql_in: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------ 2 | -- File: test_balance_dataset.sql_in 3 | -- Unit test for balancing datasets module 4 | ------------------------------------------------------------------------------------------------- 5 | create table testing_unbalanced 6 | ( 7 | id int, 8 | f1 float, 9 | f2 float, 10 | f3 float, 11 | label text 12 | ) distributed by (id); 13 | 14 | insert into testing_unbalanced 15 | values 16 | (1, 0.35, 0.52, 0.61, 'a'), 17 | (2, 0.24, 0.51, 0.63, 'a'), 18 | (3, 0.31, 0.49, 0.67, 'a'), 19 | (5, 0.20, 0.35, 0.26, 'b'), 20 | (6, 0.21, 0.36, 0.28, 'b'), 21 | (7, 0.49, 0.70, 0.90, 'c'); 22 | 23 | SELECT assert( 24 | 
PDLTOOLS_SCHEMA.balance_dataset( 25 | 'testing_unbalanced', 26 | 'id', 27 | 'label', 28 | 'testing_balanced_dataset' 29 | ), 30 | 'Balanced dataset created: testing_balanced_dataset' 31 | ); 32 | 33 | -- Count the number of records for each class in the balanced dataset 34 | create table balanced_label_counts 35 | as 36 | ( 37 | select label, 38 | count(distinct id_balanced) 39 | from testing_balanced_dataset 40 | group by 1 41 | ) distributed randomly; 42 | 43 | -- If the balancing is correct, every class should have atleast 44 | -- (max_class/2)+1 samples 45 | select assert(sum(ratio),3, 'Assertion test') 46 | from 47 | ( 48 | select label, 49 | max_class_rec_count/count as ratio 50 | from balanced_label_counts t1, 51 | ( 52 | select max(count) as max_class_rec_count 53 | from balanced_label_counts 54 | ) t2 55 | ) bal; 56 | 57 | drop table if exists balanced_label_counts; 58 | drop table if exists testing_unbalanced; 59 | drop table if exists testing_balanced_dataset; 60 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/common/common.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.assert(pg_catalog.text, pg_catalog.text) 2 | REL: TABLE SUGAR_SCHEMA.migrationhistory 3 | REL: TABLE PDLTOOLS_SCHEMA.migrationhistory 4 | REL: SEQUENCE PDLTOOLS_SCHEMA.migrationhistory_id_seq 5 | REL: SEQUENCE SUGAR_SCHEMA.migrationhistory_id_seq 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.assert(pg_catalog.bool, pg_catalog.bool) 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.pdltools_version() 8 | UDF: FUNCTION PDLTOOLS_SCHEMA.assert(pg_catalog.text, pg_catalog.text, pg_catalog.text) 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.version() 10 | UDF: FUNCTION PDLTOOLS_SCHEMA.__random_str(pg_catalog.int4) 11 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/common/common.yml: 
-------------------------------------------------------------------------------- 1 | identical: 1.2.1 2 | compatible: 1.2.1 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/common/udfs.sql_in: -------------------------------------------------------------------------------- 1 | -- A collection of UDFs essential for PDL Tools 2 | -- These may be required for testing scripts or may be a common folder for UDFs required for PDL Tools in general 3 | 4 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.assert(actual text, expected text, msg text) 5 | RETURNS boolean AS 6 | $BODY$ 7 | BEGIN 8 | IF NOT actual=expected THEN 9 | RAISE EXCEPTION 'Failed assertion: %',msg; 10 | END IF; 11 | RETURN TRUE; 12 | END 13 | $BODY$ LANGUAGE plpgsql IMMUTABLE; 14 | 15 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.assert(actual text, expected text) 16 | RETURNS boolean AS 17 | $BODY$ 18 | BEGIN 19 | PERFORM PDLTOOLS_SCHEMA.assert(actual, expected, ''); 20 | RETURN TRUE; 21 | END 22 | $BODY$ LANGUAGE plpgsql IMMUTABLE; 23 | 24 | 25 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.assert(actual boolean, expected boolean) 26 | RETURNS boolean AS 27 | $BODY$ 28 | BEGIN 29 | PERFORM PDLTOOLS_SCHEMA.assert( 30 | case when actual=True then 'True' else 'False' end, 31 | case when expected=True then 'True' else 'False' end, 32 | '' 33 | ); 34 | RETURN TRUE; 35 | END 36 | $BODY$ LANGUAGE plpgsql IMMUTABLE; 37 | 38 | 39 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.pdltools_version() 40 | RETURNS VARCHAR AS 41 | $BODY$ 42 | SELECT 'PDLTOOLS_VERSION'::VARCHAR; 43 | $BODY$ LANGUAGE SQL IMMUTABLE; 44 | 45 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.version() 46 | RETURNS VARCHAR AS 47 | $BODY$ 48 | SELECT E'PDL Tools: PDLTOOLS_VERSION\nSUgAR: SUGAR_VERSION'::VARCHAR; 49 | $BODY$ LANGUAGE SQL IMMUTABLE; 50 | 51 | 52 | /* Helper function to avoid internal table clashes */ 53 | CREATE OR REPLACE FUNCTION 
PDLTOOLS_SCHEMA.__random_str(len integer) 54 | RETURNS text AS 55 | $BODY$ 56 | SELECT string_agg(a) 57 | FROM ( 58 | SELECT chr(ascii('a') + (random() * 25)::integer) a 59 | FROM generate_series(1,$1) 60 | ) foo 61 | $BODY$ 62 | LANGUAGE sql VOLATILE; 63 | 64 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/complete_linkage/complete_linkage.content: -------------------------------------------------------------------------------- 1 | REL: TYPE PDLTOOLS_SCHEMA.__clink_cluster_activeness 2 | REL: TYPE PDLTOOLS_SCHEMA.__clink_set_of_nearest_clust_complete 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.cut_hclust_tree() 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.__clink_find_nn_complete(pg_catalog.text, pg_catalog.text, pg_catalog._int4, PDLTOOLS_SCHEMA.__clink_cluster_activeness) 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.complete_linkage(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.numeric) 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.complete_linkage() 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.__clink_expand_nn_table_complete(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.int4, PDLTOOLS_SCHEMA.__clink_cluster_activeness) 8 | UDF: FUNCTION PDLTOOLS_SCHEMA.cut_hclust_tree(pg_catalog.text) 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.cut_hclust_tree(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.float8, pg_catalog.text) 10 | UDF: FUNCTION PDLTOOLS_SCHEMA.complete_linkage(pg_catalog.text) 11 | UDF: FUNCTION PDLTOOLS_SCHEMA.__clink_update_nn_table_complete(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.int4, PDLTOOLS_SCHEMA.__clink_cluster_activeness) 12 | UDF: FUNCTION PDLTOOLS_SCHEMA.__clink_init_nn_table_complete(pg_catalog.text, pg_catalog.text, pg_catalog.text, PDLTOOLS_SCHEMA.__clink_cluster_activeness) 13 | -------------------------------------------------------------------------------- 
/src/ports/greenplum/modules/complete_linkage/complete_linkage.yml: -------------------------------------------------------------------------------- 1 | identical: 1.4 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/complete_linkage/test/test_complete_linkage.sql_in: -------------------------------------------------------------------------------- 1 | 2 | -- File: test_complete_linkage.sql_in 3 | -- Unit test for complete_linkage.sql_in 4 | 5 | -- TEST: Null case - build entire tree 6 | -- 1. Create a test pairwise distance matrix 7 | CREATE TEMP TABLE complete_linkage_test_dist (id1 INT, id2 INT, dist FLOAT) DISTRIBUTED RANDOMLY; 8 | 9 | INSERT INTO complete_linkage_test_dist VALUES 10 | (1,2,0), 11 | (1,3,1), 12 | (1,4,5), 13 | (2,3,2), 14 | (2,4,4), 15 | (3,4,3); 16 | 17 | -- 2. Build a hierarchical clustering tree 18 | SELECT PDLTOOLS_SCHEMA.complete_linkage('complete_linkage_test_dist','id1','id2','dist','complete_linkage_test_output',NULL,NULL); 19 | 20 | -- 3. Cut the clustering tree at a certain height (2 in the example) to obtain flat clusters. 21 | SELECT PDLTOOLS_SCHEMA.cut_hclust_tree('complete_linkage_test_dist','id1','id2','dist','complete_linkage_test_output',2,'complete_linkage_test_cluster'); 22 | 23 | -- Check if the number of clusters is as expected. 24 | SELECT PDLTOOLS_SCHEMA.assert((SELECT count(*) FROM complete_linkage_test_cluster)::TEXT, '2'::TEXT); 25 | 26 | -- Check if the clustering result is as expected. 27 | SELECT PDLTOOLS_SCHEMA.assert(string_agg(cluster_id::TEXT || ' ' || array_to_string(array(SELECT * FROM unnest(member) ORDER BY 1),';') || ' ' || height::TEXT || ' ' || exemplar::TEXT || ',' ORDER BY cluster_id),'1 4 0 4,2 1;2;3 2 1,'::TEXT) FROM complete_linkage_test_cluster; 28 | 29 | -- Clean up temp tables. 
30 | DROP TABLE complete_linkage_test_dist; 31 | DROP TABLE complete_linkage_test_cluster; 32 | DROP TABLE complete_linkage_test_output; 33 | 34 | -- TEST: Stop clustering at threshold=1.5 35 | -- 1. Create a test pairwise distance matrix 36 | CREATE TEMP TABLE complete_linkage_test_dist (id1 INT, id2 INT, dist FLOAT) DISTRIBUTED RANDOMLY; 37 | 38 | INSERT INTO complete_linkage_test_dist VALUES 39 | (1,2,0), 40 | (1,3,1), 41 | (1,4,5), 42 | (2,3,2), 43 | (2,4,4), 44 | (3,4,3); 45 | 46 | -- 2. Build a hierarchical clustering tree 47 | SELECT PDLTOOLS_SCHEMA.complete_linkage('complete_linkage_test_dist','id1','id2','dist','complete_linkage_test_output','threshold',1.5); 48 | 49 | -- Check if the number of clusters is as expected. 50 | SELECT PDLTOOLS_SCHEMA.assert((SELECT count(*) FROM complete_linkage_test_output where height >=1.5)::TEXT, '0'::TEXT); 51 | 52 | -- Clean up temp tables. 53 | DROP TABLE complete_linkage_test_dist; 54 | DROP TABLE complete_linkage_test_output; 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/complex/complex.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.re(PDLTOOLS_SCHEMA.complex) 2 | UDO: OPERATOR PDLTOOLS_SCHEMA.+(double precision,PDLTOOLS_SCHEMA.complex) 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_pl(PDLTOOLS_SCHEMA.complex, PDLTOOLS_SCHEMA.complex) 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.complexabs(PDLTOOLS_SCHEMA.complex) 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.mul(PDLTOOLS_SCHEMA.complex, pg_catalog.float8) 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.numeric2point(pg_catalog.numeric) 7 | UDO: OPERATOR PDLTOOLS_SCHEMA.*(PDLTOOLS_SCHEMA.complex,PDLTOOLS_SCHEMA.complex) 8 | UDF: FUNCTION PDLTOOLS_SCHEMA.pl(PDLTOOLS_SCHEMA.complex, pg_catalog.float8) 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_trig(pg_catalog.float8, pg_catalog.float8) 10 | UDO: OPERATOR 
PDLTOOLS_SCHEMA.+(PDLTOOLS_SCHEMA.complex,double precision) 11 | UDF: FUNCTION PDLTOOLS_SCHEMA.abs(PDLTOOLS_SCHEMA.complex) 12 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_mul(PDLTOOLS_SCHEMA.complex, PDLTOOLS_SCHEMA.complex) 13 | UDF: FUNCTION PDLTOOLS_SCHEMA.float82complex(pg_catalog.float8) 14 | UDO: OPERATOR PDLTOOLS_SCHEMA./(double precision,PDLTOOLS_SCHEMA.complex) 15 | UDF: FUNCTION PDLTOOLS_SCHEMA.radians(PDLTOOLS_SCHEMA.complex) 16 | UDF: FUNCTION PDLTOOLS_SCHEMA.asin(PDLTOOLS_SCHEMA.complex) 17 | UDO: OPERATOR PDLTOOLS_SCHEMA.-(PDLTOOLS_SCHEMA.complex,PDLTOOLS_SCHEMA.complex) 18 | UDF: FUNCTION PDLTOOLS_SCHEMA.dotproduct(PDLTOOLS_SCHEMA._complex, PDLTOOLS_SCHEMA._complex) 19 | UDO: OPERATOR PDLTOOLS_SCHEMA.-(double precision,PDLTOOLS_SCHEMA.complex) 20 | UDF: FUNCTION PDLTOOLS_SCHEMA.mul(pg_catalog.float8, PDLTOOLS_SCHEMA.complex) 21 | UDF: FUNCTION PDLTOOLS_SCHEMA.cbrt(PDLTOOLS_SCHEMA.complex) 22 | UDF: FUNCTION PDLTOOLS_SCHEMA.tan(PDLTOOLS_SCHEMA.complex) 23 | UDF: FUNCTION PDLTOOLS_SCHEMA.float42complex(pg_catalog.float4) 24 | UDF: FUNCTION PDLTOOLS_SCHEMA.degrees(PDLTOOLS_SCHEMA.complex) 25 | UDO: OPERATOR PDLTOOLS_SCHEMA.-(NONE,PDLTOOLS_SCHEMA.complex) 26 | UDO: OPERATOR PDLTOOLS_SCHEMA.=(PDLTOOLS_SCHEMA.complex,PDLTOOLS_SCHEMA.complex) 27 | UDO: OPERATOR PDLTOOLS_SCHEMA.@(NONE,PDLTOOLS_SCHEMA.complex) 28 | UDT: TYPE PDLTOOLS_SCHEMA._complex 29 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_in(pg_catalog.cstring) 30 | UDF: FUNCTION PDLTOOLS_SCHEMA.cot(PDLTOOLS_SCHEMA.complex) 31 | UDO: OPERATOR PDLTOOLS_SCHEMA.+(NONE,PDLTOOLS_SCHEMA.complex) 32 | UDO: OPERATOR PDLTOOLS_SCHEMA.-(PDLTOOLS_SCHEMA.complex,double precision) 33 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_send(PDLTOOLS_SCHEMA.complex) 34 | UDF: FUNCTION PDLTOOLS_SCHEMA.int22complex(pg_catalog.int2) 35 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_cbrt(PDLTOOLS_SCHEMA.complex) 36 | UDT: TYPE PDLTOOLS_SCHEMA.complex 37 | UDO: OPERATOR PDLTOOLS_SCHEMA.|/(NONE,PDLTOOLS_SCHEMA.complex) 38 | UDF: FUNCTION 
PDLTOOLS_SCHEMA.acos(PDLTOOLS_SCHEMA.complex) 39 | UDF: FUNCTION PDLTOOLS_SCHEMA.exp(PDLTOOLS_SCHEMA.complex) 40 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_sqrt(PDLTOOLS_SCHEMA.complex) 41 | UDO: OPERATOR PDLTOOLS_SCHEMA.^(PDLTOOLS_SCHEMA.complex,double precision) 42 | UDF: FUNCTION PDLTOOLS_SCHEMA.ln(PDLTOOLS_SCHEMA.complex) 43 | UDO: OPERATOR PDLTOOLS_SCHEMA.*(double precision,PDLTOOLS_SCHEMA.complex) 44 | UDF: FUNCTION PDLTOOLS_SCHEMA.sqrt(PDLTOOLS_SCHEMA.complex) 45 | UDF: FUNCTION PDLTOOLS_SCHEMA.cos(PDLTOOLS_SCHEMA.complex) 46 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_eq(PDLTOOLS_SCHEMA.complex, PDLTOOLS_SCHEMA.complex) 47 | UDO: OPERATOR PDLTOOLS_SCHEMA.||/(NONE,PDLTOOLS_SCHEMA.complex) 48 | UDO: OPERATOR PDLTOOLS_SCHEMA.*(PDLTOOLS_SCHEMA.complex,double precision) 49 | UDF: FUNCTION PDLTOOLS_SCHEMA.sin(PDLTOOLS_SCHEMA.complex) 50 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_mi(PDLTOOLS_SCHEMA.complex, PDLTOOLS_SCHEMA.complex) 51 | UDF: FUNCTION PDLTOOLS_SCHEMA.im(PDLTOOLS_SCHEMA.complex) 52 | UDF: FUNCTION PDLTOOLS_SCHEMA.int82complex(pg_catalog.int8) 53 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_ne(PDLTOOLS_SCHEMA.complex, PDLTOOLS_SCHEMA.complex) 54 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_um(PDLTOOLS_SCHEMA.complex) 55 | UDF: FUNCTION PDLTOOLS_SCHEMA.mi(PDLTOOLS_SCHEMA.complex, pg_catalog.float8) 56 | UDF: FUNCTION PDLTOOLS_SCHEMA.log(PDLTOOLS_SCHEMA.complex) 57 | UDF: FUNCTION PDLTOOLS_SCHEMA.conj(PDLTOOLS_SCHEMA.complex) 58 | UDO: OPERATOR PDLTOOLS_SCHEMA.<>(PDLTOOLS_SCHEMA.complex,PDLTOOLS_SCHEMA.complex) 59 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_recv(pg_catalog.internal) 60 | UDF: FUNCTION PDLTOOLS_SCHEMA.div(PDLTOOLS_SCHEMA.complex, pg_catalog.float8) 61 | UDF: FUNCTION PDLTOOLS_SCHEMA.mi(pg_catalog.float8, PDLTOOLS_SCHEMA.complex) 62 | UDF: FUNCTION PDLTOOLS_SCHEMA.atan(PDLTOOLS_SCHEMA.complex) 63 | UDF: FUNCTION PDLTOOLS_SCHEMA.div(pg_catalog.float8, PDLTOOLS_SCHEMA.complex) 64 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_div(PDLTOOLS_SCHEMA.complex, 
PDLTOOLS_SCHEMA.complex) 65 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_up(PDLTOOLS_SCHEMA.complex) 66 | UDO: OPERATOR PDLTOOLS_SCHEMA.+(PDLTOOLS_SCHEMA.complex,PDLTOOLS_SCHEMA.complex) 67 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_out(PDLTOOLS_SCHEMA.complex) 68 | UDO: OPERATOR PDLTOOLS_SCHEMA./(PDLTOOLS_SCHEMA.complex,PDLTOOLS_SCHEMA.complex) 69 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex_power(PDLTOOLS_SCHEMA.complex, PDLTOOLS_SCHEMA.complex) 70 | UDF: FUNCTION PDLTOOLS_SCHEMA.int42complex(pg_catalog.int4) 71 | UDO: OPERATOR PDLTOOLS_SCHEMA./(PDLTOOLS_SCHEMA.complex,double precision) 72 | UDO: OPERATOR PDLTOOLS_SCHEMA.^(PDLTOOLS_SCHEMA.complex,PDLTOOLS_SCHEMA.complex) 73 | UDF: FUNCTION PDLTOOLS_SCHEMA.power(PDLTOOLS_SCHEMA.complex, pg_catalog.float8) 74 | UDF: FUNCTION PDLTOOLS_SCHEMA.log(PDLTOOLS_SCHEMA.complex, PDLTOOLS_SCHEMA.complex) 75 | UDF: FUNCTION PDLTOOLS_SCHEMA.complex(pg_catalog.float8, pg_catalog.float8) 76 | UDF: FUNCTION PDLTOOLS_SCHEMA.pl(pg_catalog.float8, PDLTOOLS_SCHEMA.complex) 77 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/complex/complex.yml: -------------------------------------------------------------------------------- 1 | identical: 1.4 2 | compatible: 1.4 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/complex/test/test_complex_type.sql_in: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------- 2 | -- Test cases for the Complex number module 3 | ---------------------------------------------------------------------- 4 | 5 | CREATE SEQUENCE complex_seq; 6 | CREATE TABLE complex_ttbl (id INTEGER NOT NULL DEFAULT 1, orderid INTEGER NOT NULL DEFAULT NEXTVAL('complex_seq'), c COMPLEX) DISTRIBUTED BY (id); 7 | -- regular input 8 | INSERT INTO complex_ttbl(c) VALUES (' 5'); 9 | INSERT INTO 
complex_ttbl(c) VALUES ('3 '); 10 | INSERT INTO complex_ttbl(c) VALUES (' 6 '); 11 | INSERT INTO complex_ttbl(c) VALUES (' -6 '); 12 | INSERT INTO complex_ttbl(c) VALUES (' 5i'); 13 | INSERT INTO complex_ttbl(c) VALUES ('3i '); 14 | INSERT INTO complex_ttbl(c) VALUES (' 6i '); 15 | INSERT INTO complex_ttbl(c) VALUES (' -6i '); 16 | INSERT INTO complex_ttbl(c) VALUES (' 5 + 3i '); 17 | INSERT INTO complex_ttbl(c) VALUES (' 5 + -3i '); 18 | INSERT INTO complex_ttbl(c) VALUES (' 6 - -7i '); 19 | INSERT INTO complex_ttbl(c) VALUES (' -6 - -7i '); 20 | INSERT INTO complex_ttbl(c) VALUES (' 1.2345678901234e+200 - -1.2345678901234e+200i '); 21 | INSERT INTO complex_ttbl(c) VALUES (' -0 + -0i '); 22 | INSERT INTO complex_ttbl(c) VALUES (' -0 - -0i '); 23 | INSERT INTO complex_ttbl(c) VALUES (' -0 - 0i '); 24 | INSERT INTO complex_ttbl(c) VALUES ('9+10i '); 25 | INSERT INTO complex_ttbl(c) VALUES (' 9+10i'); 26 | INSERT INTO complex_ttbl(c) VALUES ('9-10i'); 27 | 28 | -- re 29 | SELECT assert(re(COMPLEX(5, 3)),5); 30 | SELECT assert(re(COMPLEX(-5, 3)),-5); 31 | SELECT assert(re(COMPLEX(5.1, 3)),5.1); 32 | SELECT assert(re(COMPLEX(-5.1, 3)),-5.1); 33 | 34 | -- im 35 | SELECT assert(im(COMPLEX(5, 3)),3); 36 | SELECT assert(im(COMPLEX(-5, -3)),-3); 37 | SELECT assert(im(COMPLEX(5.1, 3.1)),3.1); 38 | SELECT assert(im(COMPLEX(-5.1, -3.1)),-3.1); 39 | 40 | -- abs 41 | SELECT assert(abs(COMPLEX(4, 3)),5); 42 | SELECT assert(abs(COMPLEX(4, -3)),5); 43 | SELECT assert(abs(COMPLEX('infinity', 3)),'Infinity'); 44 | SELECT assert(abs(COMPLEX('nan', 3)), 'NaN'); 45 | 46 | -- radians 47 | SELECT assert(abs(radians(COMPLEX(1, sqrt(3))) - pi()/3) < 1e-6, True); 48 | SELECT assert(abs(radians(COMPLEX(1, -sqrt(3))) + pi()/3) < 1e-6, True); 49 | SELECT assert(abs(radians(COMPLEX(-1, sqrt(3))) - pi()*2/3) < 1e-6, True); 50 | SELECT assert(abs(radians(COMPLEX(-1, -sqrt(3))) + pi()*2/3) < 1e-6, True); 51 | 52 | SELECT assert(radians(COMPLEX('infinity', 3)), 0); 53 | SELECT 
assert(abs(radians(COMPLEX('-infinity', 3)) - pi()) < 1e-6, True); 54 | SELECT assert(abs(radians(COMPLEX(5, 'infinity')) - pi()/2) < 1e-6, True); 55 | SELECT assert(abs(radians(COMPLEX(5, '-infinity')) + pi()/2) < 1e-6, True); 56 | SELECT assert(abs(radians(COMPLEX('infinity', 'infinity')) - pi()*45/180) < 1e-6, True); 57 | SELECT assert(radians(COMPLEX('nan', 3)), 'NaN'); 58 | SELECT assert(radians(COMPLEX('-nan', 3)), 'NaN'); 59 | SELECT assert(radians(COMPLEX(5, 'nan')), 'NaN'); 60 | SELECT assert(radians(COMPLEX(5, '-nan')), 'NaN'); 61 | SELECT assert(radians(COMPLEX('nan', 'nan')), 'NaN'); 62 | SELECT assert(radians(COMPLEX('infinity', 'nan')), 'NaN'); 63 | SELECT assert(radians(COMPLEX('nan', 'infinity')),'NaN'); 64 | 65 | -- conj 66 | SELECT assert(conj(COMPLEX(5,3)) = COMPLEX(5,-3),True); 67 | SELECT assert(conj(COMPLEX(5,-3)) = COMPLEX(5,3),True); 68 | 69 | -- not equal 70 | SELECT assert(NOT c != c, True) FROM complex_ttbl; 71 | 72 | -- unary plus 73 | SELECT assert(+(COMPLEX(5, 3)) = COMPLEX(5, 3), True); 74 | SELECT assert(+(COMPLEX(-5, 3)) = COMPLEX(-5, 3), True); 75 | 76 | -- unary minus 77 | SELECT assert(-(COMPLEX(5, 3)) = COMPLEX(-5, -3), True); 78 | SELECT assert(-(COMPLEX(-5, 3)) = COMPLEX(5, -3), True); 79 | SELECT assert(-(COMPLEX(5.1, 3)) = COMPLEX(-5.1, -3), True); 80 | SELECT assert(-(COMPLEX(-5.1, 3)) = COMPLEX(5.1, -3), True); 81 | 82 | -- plus 83 | SELECT assert(COMPLEX(3, 5) + COMPLEX(6, 7) = COMPLEX(9,12), True); 84 | 85 | -- minus 86 | SELECT assert(COMPLEX(3, 5) - COMPLEX(6, 7) = COMPLEX(-3, -2), True); 87 | 88 | -- multiply 89 | SELECT assert(COMPLEX(3, 5) * COMPLEX(6, 7) = COMPLEX(-17, 51), True); 90 | 91 | 92 | -- divide 93 | CREATE OR REPLACE FUNCTION complex_dp_eq(a COMPLEX, b COMPLEX, diff FLOAT8) RETURNS BOOLEAN AS $$ 94 | BEGIN 95 | RETURN (abs(re(a) - re(b)) < diff) AND (abs(im(a) - im(b)) < diff); 96 | END; 97 | $$ LANGUAGE PLPGSQL IMMUTABLE STRICT; 98 | 99 | SELECT assert(COMPLEX(2,2)/COMPLEX(1,1) = COMPLEX(2,0), True); 
100 | SELECT assert(COMPLEX(3, 'infinity') / COMPLEX(6, 7) = COMPLEX('infinity', 'infinity'), True); 101 | SELECT assert(COMPLEX(3, 'nan') / COMPLEX(6, 7) = COMPLEX('nan', 'nan'), True); 102 | 103 | -- @ 104 | SELECT assert(@(COMPLEX(5,3)) = abs(COMPLEX(5,3)), True); 105 | 106 | -- pow and ^ 107 | SELECT assert(complex_dp_eq(COMPLEX(1,sqrt(3))^3 , COMPLEX(-1*2^3, 0), 1e-6), True); 108 | SELECT assert(complex_dp_eq(COMPLEX(0.5, 0.5*sqrt(3))^0.5, COMPLEX(0.5*sqrt(3), 0.5), 1e-6), True); 109 | SELECT assert(COMPLEX(5,3)^0 = COMPLEX(1,0), True); 110 | 111 | -- sqrt 112 | SELECT assert(sqrt(COMPLEX(5,3)) = COMPLEX(5,3)^0.5, True); 113 | 114 | -- cbrt 115 | SELECT assert(cbrt(COMPLEX(5,3)) = COMPLEX(5,3)^(1.0/3), True); 116 | 117 | -- degrees 118 | SELECT assert(degrees(COMPLEX(5,3)) = degrees(radians(COMPLEX(5,3))), True); 119 | 120 | -- exp 121 | SELECT assert(complex_dp_eq(exp(COMPLEX(5,3)), COMPLEX(-146.927913908319 , 20.944066208746), 1e-6), True); 122 | 123 | -- ln 124 | SELECT assert(complex_dp_eq(ln(COMPLEX(5,3)), COMPLEX(1.76318026230808 , 0.540419500270584), 1e-6), True); 125 | SELECT assert(complex_dp_eq(exp(ln(COMPLEX(5,3))), COMPLEX(5,3), 1e-6), True); 126 | SELECT assert(ln('0'::COMPLEX)=COMPLEX('-infinity', 0),True); 127 | 128 | -- log10 129 | SELECT assert(complex_dp_eq(log(COMPLEX(5,3)) , ln(COMPLEX(5,3))/ln(10), 1e-6), True); 130 | 131 | -- log 132 | SELECT assert(complex_dp_eq(log(COMPLEX(3,5),COMPLEX(5,3)) , ln(COMPLEX(5,3))/ln(COMPLEX(3,5)), 1e-6), True); 133 | 134 | -- acos 135 | SELECT assert(complex_dp_eq(acos(COMPLEX(5,3)), COMPLEX(0, -1)*ln(COMPLEX(5,3) + COMPLEX(0,1)*sqrt(1 - COMPLEX(5,3)^2)), 1e-6), True); 136 | 137 | -- asin 138 | SELECT assert(complex_dp_eq(asin(COMPLEX(5,3)), COMPLEX(0, -1)*ln(COMPLEX(5,3)*COMPLEX(0,1) + sqrt(1 - COMPLEX(5,3)^2)), 1e-6), True); 139 | 140 | -- atan 141 | SELECT assert(complex_dp_eq(atan(COMPLEX(5,3)), 0.5*COMPLEX(0,1)*(ln(1 - COMPLEX(5,3)*COMPLEX(0,1)) - ln(1 + COMPLEX(5,3)*COMPLEX(0,1))), 1e-6), True); 142 
| 143 | -- cos 144 | SELECT assert(complex_dp_eq(cos(COMPLEX(5,3)), COMPLEX( 2.85581500422739 , 9.60638344843258), 1e-6), True); 145 | 146 | -- sin 147 | SELECT assert(complex_dp_eq(sin(COMPLEX(5,3)), COMPLEX(-9.65412547685484 , 2.84169229560635), 1e-6), True); 148 | 149 | -- cot 150 | SELECT assert(complex_dp_eq(cot(COMPLEX(5,3)), cos(COMPLEX(5,3))/sin(COMPLEX(5,3)), 1e-6), True); 151 | 152 | -- tan 153 | SELECT assert(complex_dp_eq(tan(COMPLEX(5,3)), sin(COMPLEX(5,3))/cos(COMPLEX(5,3)), 1e-6), True); 154 | 155 | -- type cast 156 | SELECT assert(COMPLEX '5+3i' ^ '4'::int2 = COMPLEX(5,3)^('4'::COMPLEX), TRUE); 157 | SELECT assert(COMPLEX '5+3i' ^ '4'::INT4 = COMPLEX(5,3)^('4'::COMPLEX), TRUE); 158 | SELECT assert(COMPLEX '5+3i' ^ '4'::INT8 = COMPLEX(5,3)^('4'::COMPLEX), TRUE); 159 | SELECT assert(COMPLEX '5+3i' ^ '4' = COMPLEX(5,3)^('4'::COMPLEX), TRUE); 160 | SELECT assert(COMPLEX '5+3i' ^ '4.5'::FLOAT4 = COMPLEX(5,3)^COMPLEX(4.5,0), TRUE); 161 | SELECT assert(COMPLEX '5+3i' ^ '4.5'::FLOAT8 = COMPLEX(5,3)^COMPLEX(4.5,0), TRUE); 162 | 163 | -- dot product 164 | SELECT assert(dotproduct(ARRAY[COMPLEX(1,3),COMPLEX(5,7)], ARRAY[COMPLEX(2,4),COMPLEX(6,8)]) = COMPLEX(1,3)*COMPLEX(2,4) + COMPLEX(5,7)*COMPLEX(6,8), True); 165 | 166 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/edit_distance/edit_distance.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance(pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4) 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance(pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.int4) 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance_unsafe(pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.text, pg_catalog.text) 4 | UDF: 
FUNCTION PDLTOOLS_SCHEMA.edit_distance_unsafe(pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4) 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance(pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4) 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.levenshtein_distance(pg_catalog.varchar) 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.demerau_levenshtein_distance(pg_catalog.text, pg_catalog.text) 8 | UDF: FUNCTION PDLTOOLS_SCHEMA.optimal_alignment_distance(pg_catalog.text, pg_catalog.text) 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.demerau_levenshtein_distance(pg_catalog.varchar) 10 | UDF: FUNCTION PDLTOOLS_SCHEMA.levenshtein_distance(pg_catalog.text, pg_catalog.text) 11 | UDF: FUNCTION PDLTOOLS_SCHEMA.optimal_alignment_distance() 12 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance() 13 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance(pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.text, pg_catalog.text) 14 | UDF: FUNCTION PDLTOOLS_SCHEMA.levenshtein_distance() 15 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance_unsafe() 16 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance_unsafe(pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4) 17 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance(pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4, pg_catalog.int4) 18 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance_unsafe(pg_catalog.varchar) 19 | UDF: FUNCTION PDLTOOLS_SCHEMA.demerau_levenshtein_distance() 20 | UDF: FUNCTION PDLTOOLS_SCHEMA.optimal_alignment_distance(pg_catalog.varchar) 21 | UDF: FUNCTION PDLTOOLS_SCHEMA.edit_distance(pg_catalog.varchar) 22 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/edit_distance/edit_distance.yml: 
-------------------------------------------------------------------------------- 1 | identical: 1.2.1 2 | compatible: 1.2.1 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/edit_distance/test/test_edit_distance.sql_in: -------------------------------------------------------------------------------- 1 | -- File: test_edit_distance.sql 2 | -- Unit test for edit_distance.sql 3 | 4 | SELECT assert(edit_distance_unsafe('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 5 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN', 6 | 1048576+512,1048576+64,1048576+8, 7 | 1048576+32768,1048576+4096,1048576+1, 8 | '0O','O0')::TEXT, 9 | '11605194'); 10 | 11 | SELECT assert(edit_distance('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 12 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN', 13 | 1048576+512,1048576+64,1048576+8, 14 | 1048576+32768,1048576+4096,1048576+1, 15 | '0O','O0')::TEXT, 16 | '11605194'); 17 | 18 | SELECT assert(edit_distance_unsafe('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 19 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN', 20 | 1048576+512,1048576+64,1048576+8, 21 | 1048576+32768,1048576+4096 22 | )::TEXT, 23 | '11605208'); 24 | 25 | SELECT assert(edit_distance('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 26 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN', 27 | 1048576+512,1048576+64,1048576+8, 28 | 1048576+32768,1048576+4096 29 | )::TEXT, 30 | '11605208'); 31 | 32 | SELECT assert(edit_distance_unsafe('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 33 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN', 34 | 1048576+512,1048576+64,1048576+8, 35 | 1048576+32768 36 | )::TEXT, 37 | '11633880'); 38 | 39 | SELECT assert(edit_distance('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 40 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN', 41 | 1048576+512,1048576+64,1048576+8, 42 | 1048576+32768 43 | )::TEXT, 44 | '11633880'); 45 | 46 | SELECT assert(edit_distance('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 47 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN', 48 | 1048576+512,1048576+64,1048576+8 49 | )::TEXT, 50 | '14681352'); 51 | 52 | SELECT 
assert(edit_distance('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 53 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN', 54 | 1048576+512,1048576+64 55 | )::TEXT, 56 | '17830464'); 57 | 58 | SELECT assert(levenshtein_distance('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 59 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN')::TEXT, 60 | '14'); 61 | 62 | SELECT assert(demerau_levenshtein_distance('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 63 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN')::TEXT, 64 | '11'); 65 | 66 | SELECT assert(optimal_alignment_distance('ONE_TWO_THREE_F0UR_FIVE_SIX_END', 67 | 'TONE_TW0_TREE_TOUR_EF_ISX_EDEN')::TEXT, 68 | '13'); 69 | 70 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/generic_utilities/generic_utilities.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.multitbl_summary(pg_catalog._text,pg_catalog.text,pg_catalog.bool,pg_catalog.bool,pg_catalog._float8,pg_catalog.int,pg_catalog.bool) 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.multitbl_summary(pg_catalog.text) 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.multitbl_summary() 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.cols2vec(pg_catalog.text, pg_catalog.text[], pg_catalog.text, pg_catalog.text) 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.cols2vec() 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.vec2cols(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.vec2cols(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 8 | UDF: FUNCTION PDLTOOLS_SCHEMA.vec2cols() 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.dropcols(pg_catalog.text, pg_catalog.text[], pg_catalog.text, pg_catalog.text) 10 | UDF: FUNCTION PDLTOOLS_SCHEMA.dropcols() 11 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/generic_utilities/generic_utilities.yml: -------------------------------------------------------------------------------- 1 | identical: 1.5 2 | compatible: 0.0 3
| libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/generic_utilities/test/test_array_utilities.sql_in: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------ 2 | -- File: test_array_utilities.sql_in 3 | -- Unit test for array_utilities 4 | ------------------------------------------------------------------------------------------------- 5 | DROP TABLE IF EXISTS vec2cols; 6 | CREATE TABLE vec2cols ( 7 | id bigint, 8 | feat_vector integer[], 9 | feat_names text[], 10 | label int, 11 | other_col float 12 | ) distributed by (id); 13 | 14 | 15 | INSERT INTO vec2cols 16 | VALUES 17 | (1, ARRAY[1,2,3,4,5], ARRAY['feat1', 'feat2','feat3', 'feat4', 'feat5'],0, 0.5), 18 | (2, ARRAY[2,2,2,2,2], ARRAY['feat1', 'feat2','feat3', 'feat4', 'feat5'],1, 0.3), 19 | (3, ARRAY[1,2,2,4,6], ARRAY['feat1', 'feat2','feat3', 'feat4', 'feat5'],0, 1.1), 20 | (4, ARRAY[1,2,2,1,1], ARRAY['feat1', 'feat2','feat3', 'feat4', 'feat5'],1, 0.4); 21 | 22 | DROP TABLE IF EXISTS cols2vec; 23 | CREATE TABLE cols2vec ( 24 | id bigint, 25 | label int, 26 | feat1 int, 27 | feat2 int, 28 | feat3 float, 29 | other_col float 30 | ) distributed by (id); 31 | 32 | INSERT INTO cols2vec 33 | VALUES 34 | (1, 0, 1, 1, 0.5, 0.9), 35 | (2, 1, 0, 1, 0.3, 0.3), 36 | (3, 0, 0, 0, 0.1, 1.1), 37 | (4, 1, 1, 0, 0.9, 0.4); 38 | 39 | drop table if exists output_cols2vec; 40 | SELECT assert( 41 | PDLTOOLS_SCHEMA.cols2vec( 42 | 'pdltools_installcheck_generic_utilities.cols2vec', -- input table 43 | array['id','label','other_col'],-- exclude columns on feature vector 44 | 'pdltools_installcheck_generic_utilities.output_cols2vec', -- output table 45 | 'randomly' 46 | ), 47 | 'pdltools_installcheck_generic_utilities.output_cols2vec created successfully' 48 | ); 49 | 50 | drop table if exists output_vec2cols_v1; 51 | SELECT 
assert( 52 | PDLTOOLS_SCHEMA.vec2cols( 53 | 'pdltools_installcheck_generic_utilities.vec2cols', -- input table 54 | 'feat_vector', -- vector column 55 | 'pdltools_installcheck_generic_utilities.output_vec2cols_v1', -- output table 56 | 'randomly' 57 | ), 58 | 'pdltools_installcheck_generic_utilities.output_vec2cols_v1 created successfully' 59 | ); 60 | 61 | drop table if exists output_vec2cols_v2; 62 | SELECT assert( 63 | PDLTOOLS_SCHEMA.vec2cols( 64 | 'pdltools_installcheck_generic_utilities.vec2cols', -- input table 65 | 'feat_vector', -- vector column 66 | 'feat_names', -- vector names column 67 | 'pdltools_installcheck_generic_utilities.output_vec2cols_v2', -- output table 68 | 'randomly' 69 | ), 70 | 'pdltools_installcheck_generic_utilities.output_vec2cols_v2 created successfully' 71 | ); 72 | 73 | select 74 | assert(array_upper(feature_vector,1),3, 'Assertion test') 75 | from 76 | output_cols2vec 77 | where 78 | id = 1; 79 | 80 | select 81 | assert(array_upper(feature_names,1),3, 'Assertion test') 82 | from 83 | output_cols2vec 84 | where 85 | id = 2; 86 | 87 | select 88 | assert(sum(feat1),5, 'Assertion test') 89 | from 90 | output_vec2cols_v2; 91 | 92 | select 93 | assert(sum(f2),8, 'Assertion test') 94 | from 95 | output_vec2cols_v1; 96 | 97 | 98 | DROP TABLE output_vec2cols_v2, output_vec2cols_v1, output_cols2vec, 99 | cols2vec, vec2cols; 100 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/generic_utilities/test/test_generic_utilities.sql_in: -------------------------------------------------------------------------------- 1 | -- File: generic_utilities.sql_in 2 | -- Unit tests for generic utilities module 3 | 4 | ------------------------------------------------------------------------------------------------- 5 | 6 | -- create test tables 7 | -- test table #1 8 | CREATE TABLE PDLTOOLS_SCHEMA.multitbl_summary_test1 ( 9 | col1 float 10 | ,col2 float 11 | ,col3 float 12 | ); 13 | INSERT INTO 
PDLTOOLS_SCHEMA.multitbl_summary_test1 ( 14 | SELECT random() 15 | ,random() 16 | ,random() 17 | FROM generate_series(1,100) 18 | ); 19 | 20 | -- test table #2 21 | CREATE TABLE PDLTOOLS_SCHEMA.multitbl_summary_test2 ( 22 | col1 float 23 | ,col2 float 24 | ,col3 float 25 | ); 26 | INSERT INTO PDLTOOLS_SCHEMA.multitbl_summary_test2 ( 27 | SELECT random() 28 | ,random() 29 | ,random() 30 | FROM generate_series(1,100) 31 | ); 32 | 33 | -- Create summary table "sum_output" 34 | DROP TABLE IF EXISTS PDLTOOLS_SCHEMA.sum_output; 35 | SELECT PDLTOOLS_SCHEMA.multitbl_summary( 36 | array[ 37 | 'PDLTOOLS_SCHEMA.multitbl_summary_test1' 38 | ,'PDLTOOLS_SCHEMA.multitbl_summary_test2' 39 | ,'PDLTOOLS_SCHEMA.multitbl_summary_missing' 40 | ] 41 | ,'PDLTOOLS_SCHEMA.sum_output' 42 | ,TRUE 43 | ,TRUE 44 | ,NULL 45 | ,10 46 | ,TRUE 47 | ); 48 | 49 | -- test that 2 tables were profiled 50 | SELECT assert( 51 | ( 52 | SELECT count(distinct table_schema||table_name) 53 | FROM PDLTOOLS_SCHEMA.sum_output 54 | )::integer, 55 | 2::integer 56 | ); 57 | 58 | -- test that 6 columns were profiled 59 | SELECT assert( 60 | ( 61 | SELECT count(distinct table_schema||table_name||target_column) 62 | FROM PDLTOOLS_SCHEMA.sum_output 63 | )::integer, 64 | 6::integer 65 | ); 66 | 67 | -- clean up to test again with defaults approach 68 | DROP TABLE IF EXISTS PDLTOOLS_SCHEMA.sum_output; 69 | 70 | -- Create summary table "sum_output" (using defaults) 71 | DROP TABLE IF EXISTS PDLTOOLS_SCHEMA.sum_output; 72 | SELECT PDLTOOLS_SCHEMA.multitbl_summary( 73 | array[ 74 | 'PDLTOOLS_SCHEMA.multitbl_summary_test1' 75 | ,'PDLTOOLS_SCHEMA.multitbl_summary_test2' 76 | ,'PDLTOOLS_SCHEMA.multitbl_summary_missing' 77 | ] 78 | ,'PDLTOOLS_SCHEMA.sum_output' 79 | ); 80 | 81 | -- test that 2 tables were profiled 82 | SELECT assert( 83 | ( 84 | SELECT count(distinct table_schema||table_name) 85 | FROM PDLTOOLS_SCHEMA.sum_output 86 | )::integer, 87 | 2::integer 88 | ); 89 | 90 | -- test that 6 columns were profiled 91 | SELECT 
assert( 92 | ( 93 | SELECT count(distinct table_schema||table_name||target_column) 94 | FROM PDLTOOLS_SCHEMA.sum_output 95 | )::integer, 96 | 6::integer 97 | ); 98 | 99 | -- clean up 100 | DROP TABLE IF EXISTS PDLTOOLS_SCHEMA.multitbl_summary_test1; 101 | DROP TABLE IF EXISTS PDLTOOLS_SCHEMA.multitbl_summary_test2; 102 | DROP TABLE IF EXISTS PDLTOOLS_SCHEMA.sum_output; 103 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/grid_search/grid_search.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.grid_search_kmeans_all(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text,pg_catalog.text, pg_catalog.integer,pg_catalog.double precision,pg_catalog.double precision 2 | ) 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeanspp(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text,pg_catalog.integer,pg_catalog.double precision,pg_catalog.double precision 4 | ) 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeanspp(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text,pg_catalog.integer,pg_catalog.double precision 6 | ) 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeanspp(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text,pg_catalog.integer 8 | ) 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeanspp(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text, pg_catalog.text 10 | ) 11 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeanspp(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text 12 | ) 13 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeanspp(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text 14 | ) 15 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeans_random(pg_catalog.text,
pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text,pg_catalog.integer,pg_catalog.double precision 16 | ) 17 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeans_random(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text,pg_catalog.integer 18 | ) 19 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeans_random(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text, pg_catalog.text 20 | ) 21 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeans_random(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text, pg_catalog.text 22 | ) 23 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_kmeans_random(pg_catalog.text, pg_catalog.text, pg_catalog.int[],pg_catalog.text 24 | ) 25 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_elasticnet_cv(pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.float8[], pg_catalog.float8[],pg_catalog.text,pg_catalog.text,pg_catalog.int,pg_catalog.boolean,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.integer,pg_catalog.float8 26 | ) 27 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_elasticnet_cv(pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.float8[], pg_catalog.float8[],pg_catalog.text,pg_catalog.text,pg_catalog.int,pg_catalog.boolean,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.integer 28 | ) 29 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_elasticnet_cv(pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.float8[], pg_catalog.float8[],pg_catalog.text,pg_catalog.text,pg_catalog.int,pg_catalog.boolean,pg_catalog.text,pg_catalog.text,pg_catalog.text 30 | ) 31 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_elasticnet_cv(pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.float8[], pg_catalog.float8[],pg_catalog.text,pg_catalog.text,pg_catalog.int,pg_catalog.boolean,pg_catalog.text,pg_catalog.text 32 | ) 33 
| UDF: FUNCTION PDLTOOLS_SCHEMA.gs_elasticnet_cv(pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.float8[], pg_catalog.float8[],pg_catalog.text,pg_catalog.text,pg_catalog.int,pg_catalog.boolean,pg_catalog.text 34 | ) 35 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_elasticnet_cv(pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.float8[], pg_catalog.float8[],pg_catalog.text,pg_catalog.text,pg_catalog.int,pg_catalog.boolean 36 | ) 37 | UDF: FUNCTION PDLTOOLS_SCHEMA.gs_elasticnet_cv(pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.text,pg_catalog.float8[], pg_catalog.float8[],pg_catalog.text,pg_catalog.text,pg_catalog.int 38 | ) -------------------------------------------------------------------------------- /src/ports/greenplum/modules/grid_search/grid_search.yml: -------------------------------------------------------------------------------- 1 | identical: 1.5 2 | compatible: 0.0 3 | libpart: pdltools -------------------------------------------------------------------------------- /src/ports/greenplum/modules/grid_search/test/test_grid_search.sql_in: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------ 2 | -- File: test_grid_search.sql_in 3 | -- Unit test for grid_search module 4 | ------------------------------------------------------------------------------------------------- 5 | -- set up test data 6 | CREATE TEMP TABLE test_grid_search(pid int, points double precision[]); 7 | 8 | COPY test_grid_search (pid, points) FROM stdin DELIMITER '|'; 9 | 1 | {14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065} 10 | 2 | {13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050} 11 | 3 | {13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185} 12 | 4 | {14.37, 1.95, 2.5, 16.8, 113, 
3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480} 13 | 5 | {13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735} 14 | 6 | {14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450} 15 | 7 | {14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290} 16 | 8 | {14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295} 17 | 9 | {14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045} 18 | 10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045} 19 | \. 20 | 21 | -- set k values to try 22 | \set input_k_values array[1,2,3] 23 | 24 | 25 | /*================================================================= 26 | * test grid search for kmeans++ seeding 27 | ==================================================================*/ 28 | 29 | -- case 1: all parameters given 30 | drop table if exists test_output; 31 | 32 | select assert( 33 | PDLTOOLS_SCHEMA.gs_kmeanspp( 34 | 'pid' 35 | ,'test_output' 36 | ,:input_k_values 37 | ,'test_grid_search' 38 | ,'points' 39 | ,'madlib.squared_dist_norm2' 40 | ,'madlib.avg' 41 | ,30 42 | ,0.001 43 | ,1.0 44 | ), 45 | 'Model parameters written to test_output table' 46 | ); 47 | 48 | -- k values in resulting table should be equal to input_k_values 49 | select assert(array_to_string(k_values,','), array_to_string(:input_k_values,',')) 50 | from ( 51 | select array_agg(num_clusters order by num_clusters) as k_values 52 | from test_output 53 | ) t1; 54 | 55 | -- number of rows in resulting table should be equal to length of input k array 56 | select assert(rowcnt,array_upper(:input_k_values,1)) 57 | from ( 58 | select count(*) as rowcnt 59 | from test_output 60 | ) t1; 61 | 62 | -- case 2: all parameters but seeding_sample_ratio, min_frax_reassigned, and max_num_iterations given 63 | drop table if exists test_output; 64 | 65 | select assert( 66 | PDLTOOLS_SCHEMA.gs_kmeanspp( 67 | 'pid' 68 | ,'test_output' 69 | ,ARRAY[1,2,3] 70 | 
,'test_grid_search' 71 | ,'points' 72 | ,'madlib.squared_dist_norm2' 73 | ), 74 | 'Model parameters written to test_output table' 75 | ); 76 | 77 | -- k values in resulting table should be equal to input_k_values 78 | select assert(array_to_string(k_values,','), array_to_string(:input_k_values,',')) 79 | from ( 80 | select array_agg(num_clusters order by num_clusters) as k_values 81 | from test_output 82 | ) t1; 83 | 84 | -- number of rows in resulting table should be equal to length of input k array 85 | select assert(rowcnt,array_upper(:input_k_values,1)) 86 | from ( 87 | select count(*) as rowcnt 88 | from test_output 89 | ) t1; 90 | 91 | 92 | /*================================================================= 93 | * test grid search for kmeans random seeding 94 | ==================================================================*/ 95 | -- set k values to try 96 | \set input_k_values array[3,4,5] 97 | 98 | -- case 1: all parameters given 99 | drop table if exists test_output; 100 | 101 | select assert( 102 | PDLTOOLS_SCHEMA.gs_kmeans_random( 103 | 'pid' 104 | ,'test_output' 105 | ,:input_k_values 106 | ,'test_grid_search' 107 | ,'points' 108 | ,'madlib.squared_dist_norm2' 109 | ,'madlib.avg' 110 | ,30 111 | ,0.001 112 | ), 113 | 'Model parameters written to test_output table' 114 | ); 115 | 116 | -- k values in resulting table should be equal to input_k_values 117 | select assert(array_to_string(k_values,','), array_to_string(:input_k_values,',')) 118 | from ( 119 | select array_agg(num_clusters order by num_clusters) as k_values 120 | from test_output 121 | ) t1; 122 | 123 | -- number of rows in resulting table should be equal to length of input k array 124 | select assert(rowcnt,array_upper(:input_k_values,1)) 125 | from ( 126 | select count(*) as rowcnt 127 | from test_output 128 | ) t1; 129 | 130 | -- case 2: all parameters but seeding_sample_ratio, min_frax_reassigned, and max_num_iterations given 131 | drop table if exists test_output; 132 | 133 | select 
assert( 134 | PDLTOOLS_SCHEMA.gs_kmeans_random( 135 | 'pid' 136 | ,'test_output' 137 | ,:input_k_values 138 | ,'test_grid_search' 139 | ,'points' 140 | ,'madlib.squared_dist_norm2' 141 | ), 142 | 'Model parameters written to test_output table' 143 | ); 144 | 145 | -- k values in resulting table should be equal to input_k_values 146 | select assert(array_to_string(k_values,','), array_to_string(:input_k_values,',')) 147 | from ( 148 | select array_agg(num_clusters order by num_clusters) as k_values 149 | from test_output 150 | ) t1; 151 | 152 | -- number of rows in resulting table should be equal to length of input k array 153 | select assert(rowcnt,array_upper(:input_k_values,1)) 154 | from ( 155 | select count(*) as rowcnt 156 | from test_output 157 | ) t1; 158 | 159 | 160 | /*================================================================= 161 | * test grid search for kmeans++ seeding 162 | ==================================================================*/ 163 | -- set up test data 164 | DROP TABLE IF EXISTS houses_sample; 165 | CREATE TABLE houses_sample ( id INT, 166 | tax INT, 167 | bedroom INT, 168 | bath FLOAT, 169 | price INT, 170 | size INT, 171 | lot INT 172 | ); 173 | COPY houses_sample FROM STDIN WITH DELIMITER '|'; 174 | 1 | 590 | 2 | 1 | 50000 | 770 | 22100 175 | 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 176 | 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 177 | 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 178 | 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 179 | 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 180 | 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 181 | 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 182 | 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 183 | 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 184 | 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 185 | 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 186 | 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 187 | 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 188 | 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 189 | \. 
190 | 191 | 192 | -- test function 193 | select assert( 194 | PDLTOOLS_SCHEMA.gs_elasticnet_cv( 195 | 'houses_sample', 196 | 'houses_en', 197 | 'price', 198 | 'array[tax, bath, size]', 199 | 'gaussian', 200 | ARRAY[0.05, 0.20], -- alphas 201 | ARRAY[0.05, 0.20], -- lambdas 202 | 'id', 203 | 'elastic_net_mdls', 204 | 2, 205 | TRUE, 206 | 'fista' 207 | ), 208 | 'Grid search results written to houses_en' 209 | ); 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/hits/hits.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.hits(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.float) 2 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/hits/hits.yml: -------------------------------------------------------------------------------- 1 | identical: 1.6 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/hits/test/test_hits.sql_in: -------------------------------------------------------------------------------- 1 | -- File: test_hits.sql_in 2 | -- Unit test for hits.sql_in 3 | 4 | 5 | -- Create a test dataset. 
6 | 7 | 8 | CREATE TEMP TABLE hits_test_data (source TEXT, dest TEXT, groups TEXT) DISTRIBUTED RANDOMLY; 9 | INSERT INTO hits_test_data VALUES 10 | ('02', '12', '1'), 11 | ('03', '07', '1'), 12 | ('01', '07', '1'), 13 | ('01', '10', '1'), 14 | ('02', '12', '1'), 15 | ('05', '07', '1'), 16 | ('08', '05', '1'), 17 | ('05', '05', '1'), 18 | ('06', '09', '1'), 19 | ('04', '12', '1'), 20 | ('02', '06', '1'), 21 | ('05', '12', '1'), 22 | ('07', '12', '1'), 23 | ('02', '12', '1'), 24 | ('02', '07', '1'), 25 | ('01', '10', '1'), 26 | ('02', '06', '1'), 27 | ('02', '12', '1'), 28 | ('00', '09', '1'), 29 | ('00', '07', '1'), 30 | ('A', 'B', '2'), 31 | ('A', 'C', '2'), 32 | ('A', 'D', '2'), 33 | ('A', 'E', '2'), 34 | ('B', 'A', '3'), 35 | ('C', 'A', '3'), 36 | ('D', 'A', '3'), 37 | ('E', 'A', '3'); 38 | 39 | 40 | -- Calculate authority and hub scores for each node. 41 | SELECT PDLTOOLS_SCHEMA.hits('hits_test_data', 'source', 'dest', 'groups', 'hits_test_output', 50, 1e-4); 42 | 43 | 44 | -- Check if the resulting authority score is is correct. 45 | SELECT PDLTOOLS_SCHEMA.assert( 46 | array_to_string(array_agg(round(auth_score::NUMERIC,3) ORDER BY graph_id, node),','), 47 | '0.000,0.000,0.000,0.000,0.000,0.120,0.101,0.372,0.000,0.068,0.058,0.281,0.000,0.250,0.250,0.250,0.250,1.000,0.000,0.000,0.000,0.000') 48 | FROM hits_test_output; 49 | 50 | -- Check if the resulting hub score is correct 51 | SELECT PDLTOOLS_SCHEMA.assert( 52 | array_to_string(array_agg(round(hub_score::NUMERIC,3) ORDER BY graph_id, node),','), 53 | '0.125,0.122,0.214,0.106,0.080,0.220,0.019,0.080,0.034,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.000,0.000,0.250,0.250,0.250,0.250') 54 | FROM hits_test_output; 55 | 56 | 57 | 58 | 59 | 60 | -- Clean up temp tables. 
61 | DROP TABLE hits_test_data; 62 | DROP TABLE hits_test_output; 63 | 64 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/kd_tree/kd_tree.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.kd_tree(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.text) 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.__kdtree_kd_tree_max_variance(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.int8, pg_catalog.int4, pg_catalog.int4[], pg_catalog.int4[], pg_catalog.text) 3 | REL: TYPE PDLTOOLS_SCHEMA.__kdtree_kdtree_knn_result 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.kdtree_knn(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.float8[], pg_catalog.int4) 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.kdtree_knn(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.text) 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.__kdtree_kdtree_knn_0(pg_catalog.text, pg_catalog.text, pg_catalog.text, vpg_catalog.text, pg_catalog.text, pg_catalog.int4[], pg_catalog.float8[], pg_catalog.int4, PDLTOOLS_SCHEMA.__kdtree_kdtree_knn_result) 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.kd_tree() 8 | UDF: FUNCTION PDLTOOLS_SCHEMA.kd_tree(pg_catalog.text) 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.kdtree_knn() 10 | UDF: FUNCTION PDLTOOLS_SCHEMA.kdtree_knn(pg_catalog.text) 11 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/kd_tree/kd_tree.yml: -------------------------------------------------------------------------------- 1 | identical: 1.6dev 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/kd_tree/test/test_kd_tree.sql_in: 
-------------------------------------------------------------------------------- 1 | -- File: test_kd_tree.sql 2 | -- Unit test for kd_tree.sql_in 3 | 4 | 5 | -- Create a test dataset. 6 | CREATE TEMP TABLE kdtree_test_data (id INT, dimension INT, val FLOAT8) DISTRIBUTED RANDOMLY; 7 | INSERT INTO kdtree_test_data VALUES 8 | (1, 1, 0), 9 | (1, 2, 1), 10 | (2, 1, 2), 11 | (2, 2, 2), 12 | (3, 1, 3), 13 | (3, 2, 6), 14 | (4, 1, 4), 15 | (4, 2, 9), 16 | (5, 1, 5), 17 | (5, 2, 7), 18 | (6, 1, 6), 19 | (6, 2, 2), 20 | (7, 1, 7), 21 | (7, 2, 3), 22 | (8, 1, 8), 23 | (8, 2, 4), 24 | (9, 1, 9), 25 | (9, 2, 7), 26 | (10, 1, 10), 27 | (10, 2, 8); 28 | 29 | 30 | -- Construct a KD-tree with each leaf node containing no more than 3 data points. 31 | SELECT PDLTOOLS_SCHEMA.kd_tree('kdtree_test_data', 'id', 'dimension', 'val', 3, 'kdtree_test_output'); 32 | 33 | 34 | -- Check if the KD-tree is correctly constructed. 35 | CREATE TEMP TABLE correct_kdtree_test_output (node_location INT[], split_dimension INT, split_value FLOAT8, pop_variance FLOAT8, leaf_member INT[]) DISTRIBUTED RANDOMLY; 36 | INSERT INTO correct_kdtree_test_output VALUES 37 | (ARRAY[0],2,6,9.2,NULL), 38 | (ARRAY[0,0],NULL,NULL,NULL,ARRAY[1,2,3]), 39 | (ARRAY[0,1],NULL,NULL,NULL,ARRAY[4,5]), 40 | (ARRAY[1],2,4,5.36,NULL), 41 | (ARRAY[1,0],NULL,NULL,NULL,ARRAY[6,7,8]), 42 | (ARRAY[1,1],NULL,NULL,NULL,ARRAY[9,10]), 43 | (NULL,1,5, 9.24, NULL); 44 | 45 | SELECT PDLTOOLS_SCHEMA.assert(count::TEXT, '0'::TEXT) 46 | FROM ( 47 | SELECT count(*) FROM ( 48 | SELECT * FROM kdtree_test_output 49 | EXCEPT 50 | SELECT * FROM correct_kdtree_test_output 51 | ) foo 52 | ) foo; 53 | 54 | 55 | 56 | -- Find the 2-nearest neighbours for a test point (3.5,4). 
57 | 58 | SELECT PDLTOOLS_SCHEMA.assert(array_to_string(result,','), '3,2,2.06,2.50'::TEXT) 59 | FROM ( 60 | SELECT array_agg(knn_id ORDER BY knn_dist)::NUMERIC[]|| array_agg(knn_dist ORDER BY knn_dist) result 61 | FROM ( 62 | SELECT unnest(knn_id) knn_id, round(unnest(knn_dist)::NUMERIC,2) knn_dist 63 | FROM PDLTOOLS_SCHEMA.kdtree_knn('kdtree_test_data', 'id', 'dimension', 'val', 'kdtree_test_output', array[3.5,4], 2) 64 | ) foo 65 | ) foo; 66 | 67 | 68 | 69 | -- Find the 2-nearest neighbours for multiple test points that are stored in a table. 70 | 71 | CREATE TEMP TABLE kdtree_knn_query_data (query_id INT, query_point FLOAT8[]) DISTRIBUTED RANDOMLY; 72 | INSERT INTO kdtree_knn_query_data VALUES 73 | (1, array[8,1]), 74 | (2, array[3,4]), 75 | (3, array[1,2.5]); 76 | 77 | SELECT PDLTOOLS_SCHEMA.kdtree_knn('kdtree_test_data', 'id', 'dimension', 'val', 'kdtree_test_output', 78 | 'kdtree_knn_query_data', 'query_id', 'query_point', 2, 'query_result'); 79 | 80 | 81 | SELECT PDLTOOLS_SCHEMA.assert(array_to_string(all_result,','), '1,6,7,2.24,2.24,2,3,2,2.00,2.24,3,2,1,1.12,1.80'::TEXT) 82 | FROM ( 83 | SELECT array_agg(result ORDER BY result) all_result 84 | FROM ( 85 | SELECT array_to_string(ARRAY[query_id]::NUMERIC[] || array_agg(knn_id ORDER BY knn_dist,knn_id)::NUMERIC[] || array_agg(knn_dist ORDER BY knn_dist), ',') result 86 | FROM ( 87 | SELECT query_id, unnest(knn_id) knn_id, round(unnest(knn_dist)::NUMERIC,2) knn_dist 88 | FROM query_result 89 | ) foo 90 | GROUP BY query_id 91 | ) foo 92 | ) foo; 93 | 94 | 95 | -- Clean up temp tables. 
96 | DROP TABLE kdtree_test_data; 97 | DROP TABLE kdtree_test_output; 98 | DROP TABLE correct_kdtree_test_output; 99 | DROP TABLE kdtree_knn_query_data; 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/normalized_cut/normalized_cut.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.normalized_cut(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.int4, pg_catalog.text) 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.__i_plus_negsqrt_d_w_negsqrt_d(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text,pg_catalog.text, pg_catalog.text) 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.normalized_cut() 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.normalized_cut(pg_catalog.text) 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.__irlba_cut(pg_catalog.int4[], pg_catalog.int4[], pg_catalog.float8[]) 6 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/normalized_cut/normalized_cut.yml: -------------------------------------------------------------------------------- 1 | identical: 1.7 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/normalized_cut/test/test_normalized_cut.sql_in: -------------------------------------------------------------------------------- 1 | -- File: test_normalized_cut.sql_in 2 | -- Unit test for normalized_cut.sql_in 3 | 4 | 5 | -- 1. Create test points/nodes. 
6 | CREATE TEMP TABLE ncut_test_data (node INT, feature FLOAT[]) DISTRIBUTED RANDOMLY; 7 | 8 | INSERT INTO ncut_test_data VALUES 9 | (1,ARRAY[0.00513649,0.167166]), 10 | (2,ARRAY[0.44311,0.127502]), 11 | (3,ARRAY[0.312811,0.421223]), 12 | (4,ARRAY[0.373044,0.280389]), 13 | (5,ARRAY[0.166491,0.123695]), 14 | (6,ARRAY[1.04216,1.21863]), 15 | (7,ARRAY[1.06422,1.42166]), 16 | (8,ARRAY[1.19542,1.44384]), 17 | (9,ARRAY[1.01775,1.25931]), 18 | (10,ARRAY[2.81904,3.3038]), 19 | (11,ARRAY[3.08253,3.2119]), 20 | (12,ARRAY[3.232,3.11088]), 21 | (13,ARRAY[3.00462,3.19002]), 22 | (14,ARRAY[2.95855,2.83753]), 23 | (15,ARRAY[3.24319,2.85697]); 24 | 25 | 26 | -- 2. Calculate pairwise similarity, i.e., edge weights. Zero weight is given to edges connecting two 27 | -- nodes with their similarity < 2e-3. The similarity matrix is therefore generally sparse. 28 | 29 | CREATE TEMP TABLE ncut_test_sim AS 30 | ( 31 | WITH a AS ( 32 | SELECT a.node row_id, 33 | b.node col_id, 34 | exp(-madlib.dist_norm2(a.feature, b.feature)/0.5) similarity 35 | FROM ncut_test_data a, ncut_test_data b 36 | ) 37 | SELECT * FROM a WHERE similarity >= 2e-3 38 | ) DISTRIBUTED RANDOMLY; 39 | 40 | 41 | -- 3. Use normalized cut to iteratively partition the graph until each subgraph has no more than 6 nodes. 42 | SELECT pdltools.normalized_cut( 43 | 'ncut_test_sim', -- Sparse pairwise similarity table 44 | 'row_id', -- Row index column 45 | 'col_id', -- Column index column 46 | 'similarity', -- Similarity column 47 | 6, -- Maximum subgraph size 48 | 'ncut_test_output' -- Output table 49 | ); 50 | 51 | 52 | -- 4. Check if the result is correct. 
53 | SELECT pdltools.assert( 54 | string_agg(partition_label || ',' || node || ';' ORDER BY partition_label, node), 55 | '0,10;0,11;0,12;0,13;0,14;0,15;10,6;10,7;10,8;10,9;11,1;11,2;11,3;11,4;11,5;'::TEXT 56 | ) 57 | FROM ncut_test_output; 58 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/one_vs_rest/one_vs_rest.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.one_vs_rest_data_prep(pg_catalog.varchar) 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.one_vs_rest_data_prep() 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.one_vs_rest_data_prep(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/one_vs_rest/one_vs_rest.yml: -------------------------------------------------------------------------------- 1 | identical: 1.5 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/one_vs_rest/test/test_one_vs_rest_data_prep.sql_in: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------ 2 | -- File: test_one_vs_rest_data_prep.sql_in 3 | -- Unit test for one_vs_rest_data_prep module 4 | ------------------------------------------------------------------------------------------------- 5 | create table testing_one_vs_rest_input ( 6 | id int, 7 | label text, 8 | feature1 float8, 9 | feature2 float8, 10 | feature3 float8 11 | ) distributed by (id); 12 | 13 | insert into testing_one_vs_rest_input 14 | values 15 | (1,'A',-1,2,0), 16 | (2,'A',-2,3,0.4), 17 | (3,'A',-1.7,2.3,-0.01), 18 | (4,'B',5.7,1.3,0.1), 19 | (5,'B',4.9,1.9,2.1), 20 | (6,'B',5.4,1.23,1.1), 21 | (7,'B',6.1,2.4,-0.11), 22 | (8,'C',2.1,-1.76,3.1), 23 | 
(9,'C',1.8,-1.21,4.1), 24 | (10,'D',1.1,-3.68,5.11), 25 | (11,'D',0.8,-6.11,8.71), 26 | (12,'D',0.9,-5.01,3.19); 27 | 28 | drop table if exists testing_one_vs_rest_output; 29 | 30 | select assert( 31 | PDLTOOLS_SCHEMA.one_vs_rest_data_prep ( 32 | 'testing_one_vs_rest_input', 33 | 'id', 34 | 'label', 35 | 'testing_one_vs_rest_output' 36 | ), 37 | 'One-vs-Rest data table created: testing_one_vs_rest_output' 38 | ); 39 | 40 | -- count number of records in the output table 41 | select assert(total_count,48) from ( 42 | select 43 | count(*) as total_count 44 | from 45 | testing_one_vs_rest_output 46 | )q; 47 | 48 | -- count number of wrong records, should be 0. 49 | select assert(total_wrong,0) from ( 50 | select 51 | count(case when (src_label = 'A' and classifier_number = '1' and new_class_label <> '1') or 52 | (src_label = 'B' and classifier_number = '2' and new_class_label <> '1') or 53 | (src_label = 'C' and classifier_number = '3' and new_class_label <> '1') or 54 | (src_label = 'D' and classifier_number = '4' and new_class_label <>'1') 55 | then 1 else NULL end 56 | ) as total_wrong 57 | from 58 | testing_one_vs_rest_output 59 | )q; 60 | 61 | -- count number of wrong records, should be 0. 
62 | select assert(total_wrong,0) from ( 63 | select 64 | count(case when (src_label <> 'A' and classifier_number = '1' and new_class_label = '1') or 65 | (src_label <> 'B' and classifier_number = '2' and new_class_label = '1') or 66 | (src_label <> 'C' and classifier_number = '3' and new_class_label = '1') or 67 | (src_label <> 'D' and classifier_number = '4' and new_class_label = '1') 68 | then 1 else NULL end 69 | ) as total_wrong 70 | from 71 | testing_one_vs_rest_output 72 | )q; 73 | 74 | -- count number of distinct classifiers, should be 4 75 | select assert(num_classifiers,4) from ( 76 | select 77 | count(distinct classifier_number) as num_classifiers 78 | from 79 | testing_one_vs_rest_output 80 | )q; 81 | 82 | drop table if exists testing_one_vs_rest_output; 83 | drop table if exists testing_one_vs_rest_input; -------------------------------------------------------------------------------- /src/ports/greenplum/modules/pagerank/pagerank.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.pagerank(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.float, pg_catalog.int4, pg_catalog.float) 2 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/pagerank/pagerank.yml: -------------------------------------------------------------------------------- 1 | identical: 1.6 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/pagerank/test/test_pagerank.sql_in: -------------------------------------------------------------------------------- 1 | -- File: test_pagerank.sql_in 2 | -- Unit test for pagerank.sql_in 3 | 4 | 5 | -- Create a test dataset. 
6 | CREATE TEMP TABLE pagerank_test_data (source TEXT, dest TEXT) DISTRIBUTED RANDOMLY; 7 | INSERT INTO pagerank_test_data VALUES 8 | ('B','C'), 9 | ('C','B'), 10 | ('D','A'), 11 | ('D','B'), 12 | ('E','B'), 13 | ('E','D'), 14 | ('E','F'), 15 | ('F','B'), 16 | ('F','E'), 17 | ('G','B'), 18 | ('G','E'), 19 | ('H','B'), 20 | ('H','E'), 21 | ('I','B'), 22 | ('I','E'), 23 | ('J','E'), 24 | ('K','E'); 25 | 26 | 27 | -- Calculate PageRank value for each node. 28 | SELECT PDLTOOLS_SCHEMA.pagerank('pagerank_test_data', 'source', 'dest', 'pagerank_test_output', 0.85, 50, 1e-3); 29 | 30 | 31 | -- Check if the result is correct. 32 | SELECT PDLTOOLS_SCHEMA.assert( 33 | array_to_string(array_agg(round(pagerank::NUMERIC,2) ORDER BY node),','), 34 | '0.03,0.38,0.34,0.04,0.08,0.04,0.02,0.02,0.02,0.02,0.02') 35 | FROM pagerank_test_output; 36 | 37 | 38 | -- Clean up temp tables. 39 | DROP TABLE pagerank_test_data; 40 | DROP TABLE pagerank_test_output; 41 | 42 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/plr_placeholder/plr_placeholder.yml: -------------------------------------------------------------------------------- 1 | identical: 1.3 2 | compatible: 1.3 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/plr_placeholder/plr_sample.sql_in: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------- *//** 2 | 3 | @file plr_sample.sql_in 4 | 5 | @brief Placeholder for functions written in PL/R. 
6 | 7 | *//* ----------------------------------------------------------------------- */ 8 | 9 | 10 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.plr_cor(x float8[], y float8[]) 11 | RETURNS float8 12 | AS 13 | $$ 14 | return (cor(x,y)); 15 | $$ language plr; 16 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/plr_placeholder/test/test_plr_sample.sql_in: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------ 2 | -- Test cases for PL/R placeholder 3 | ------------------------------------------------------------------ 4 | 5 | select assert( 6 | ( 7 | select plr_cor(ARRAY[1,2,3],ARRAY[1,2,3]) 8 | ), 9 | 1 10 | ); 11 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/prediction_metrics/prediction_metrics.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_rmse_final(PDLTOOLS_SCHEMA.__pm_rmse_sum_float8_and_num) 2 | UDF: AGGREGATE PDLTOOLS_SCHEMA.mf_rmse(pg_catalog.float8, pg_catalog.float8) 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_confusion_matrix(pg_catalog.text) 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_mpe_transition(PDLTOOLS_SCHEMA.__pm_mpe_sum_float8_and_num, pg_catalog.float8, pg_catalog.float8) 5 | REL: TYPE PDLTOOLS_SCHEMA.__pm_r2_float8_float8 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_confusion_matrix() 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_binary_classifier(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 8 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_r2_combine(PDLTOOLS_SCHEMA.__pm_r2_float8_float8, PDLTOOLS_SCHEMA.__pm_r2_float8_float8) 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_rmse() 10 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_mape_final(PDLTOOLS_SCHEMA.__pm_mape_sum_float8_and_num) 11 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_mae() 12 | UDF: FUNCTION 
PDLTOOLS_SCHEMA.mf_r2() 13 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_auc() 14 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_binary_classifier() 15 | UDF: AGGREGATE PDLTOOLS_SCHEMA.mf_adjusted_r2(pg_catalog.float8, pg_catalog.float8, pg_catalog.int4, pg_catalog.int4) 16 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_mpe() 17 | REL: TYPE PDLTOOLS_SCHEMA.__pm_mape_sum_float8_and_num 18 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_r2_transition(PDLTOOLS_SCHEMA.__pm_r2_float8_float8, pg_catalog.float8, pg_catalog.float8) 19 | REL: TYPE PDLTOOLS_SCHEMA.__pm_mpe_sum_float8_and_num 20 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_mae_transition(PDLTOOLS_SCHEMA.__pm_arg_sum_float8_and_num, pg_catalog.float8, pg_catalog.float8) 21 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_binary_classifier(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 22 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_mpe_combine(PDLTOOLS_SCHEMA.__pm_mpe_sum_float8_and_num, PDLTOOLS_SCHEMA.__pm_mpe_sum_float8_and_num) 23 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_auc(pg_catalog.text, pg_catalog.text, pg_catalog.text) 24 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_mae_final(PDLTOOLS_SCHEMA.__pm_arg_sum_float8_and_num) 25 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_mae_combine(PDLTOOLS_SCHEMA.__pm_arg_sum_float8_and_num, PDLTOOLS_SCHEMA.__pm_arg_sum_float8_and_num) 26 | REL: TYPE PDLTOOLS_SCHEMA.__pm_rmse_sum_float8_and_num 27 | REL: TYPE PDLTOOLS_SCHEMA.__pm_adjusted_r2_float8_float8 28 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_rmse_combine(PDLTOOLS_SCHEMA.__pm_rmse_sum_float8_and_num, PDLTOOLS_SCHEMA.__pm_rmse_sum_float8_and_num) 29 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_adjusted_r2_transition(PDLTOOLS_SCHEMA.__pm_adjusted_r2_float8_float8, pg_catalog.float8, pg_catalog.float8, pg_catalog.int4, pg_catalog.int4) 30 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_adjusted_r2(pg_catalog.text) 31 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_auc(pg_catalog.text) 32 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_rmse(pg_catalog.text) 33 | UDF: FUNCTION 
PDLTOOLS_SCHEMA.mf_r2(pg_catalog.text) 34 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_mpe_final(PDLTOOLS_SCHEMA.__pm_mpe_sum_float8_and_num) 35 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_adjusted_r2() 36 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_mae(pg_catalog.text) 37 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_adjusted_r2_final(PDLTOOLS_SCHEMA.__pm_adjusted_r2_float8_float8) 38 | UDF: AGGREGATE PDLTOOLS_SCHEMA.mf_mpe(pg_catalog.float8, pg_catalog.float8) 39 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_mpe(pg_catalog.text) 40 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_adjusted_r2_combine(PDLTOOLS_SCHEMA.__pm_adjusted_r2_float8_float8, PDLTOOLS_SCHEMA.__pm_adjusted_r2_float8_float8) 41 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_rmse_transition(PDLTOOLS_SCHEMA.__pm_rmse_sum_float8_and_num, pg_catalog.float8, pg_catalog.float8) 42 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_binary_classifier(pg_catalog.text) 43 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_auc(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 44 | UDF: AGGREGATE PDLTOOLS_SCHEMA.mf_mape(pg_catalog.float8, pg_catalog.float8) 45 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_mape(pg_catalog.text) 46 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_mape_transition(PDLTOOLS_SCHEMA.__pm_mape_sum_float8_and_num, pg_catalog.float8, pg_catalog.float8) 47 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_r2_final(PDLTOOLS_SCHEMA.__pm_r2_float8_float8) 48 | REL: TYPE PDLTOOLS_SCHEMA.__pm_arg_sum_float8_and_num 49 | UDF: AGGREGATE PDLTOOLS_SCHEMA.mf_r2(pg_catalog.float8, pg_catalog.float8) 50 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_confusion_matrix(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 51 | UDF: FUNCTION PDLTOOLS_SCHEMA.__pm_mf_mape_combine(PDLTOOLS_SCHEMA.__pm_mape_sum_float8_and_num, PDLTOOLS_SCHEMA.__pm_mape_sum_float8_and_num) 52 | UDF: AGGREGATE PDLTOOLS_SCHEMA.mf_mae(pg_catalog.float8, pg_catalog.float8) 53 | UDF: FUNCTION PDLTOOLS_SCHEMA.mf_mape() 54 | -------------------------------------------------------------------------------- 
/src/ports/greenplum/modules/prediction_metrics/prediction_metrics.yml: -------------------------------------------------------------------------------- 1 | identical: 1.4 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/sampling/sampling.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.strat_sampling(pg_catalog.varchar) 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.strat_partition(pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog._float8) 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.__sampling_prep_grp(pg_catalog._float8) 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.__sampling_samp_grp(pg_catalog.float8, pg_catalog.bytea) 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.strat_sampling() 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.strat_sampling(pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog.float8) 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.strat_partition() 8 | UDF: FUNCTION PDLTOOLS_SCHEMA.strat_partition(pg_catalog.varchar) 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.train_test_split(pg_catalog.text, pg_catalog.text, pg_catalog.double precision) 10 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/sampling/sampling.yml: -------------------------------------------------------------------------------- 1 | identical: 1.2.2 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/sampling/test/test_sampling.sql_in: -------------------------------------------------------------------------------- 1 | -- File: test_sampling.sql 2 | -- Unit test for stratified sampling utility. 
3 | 4 | CREATE TABLE base_table AS SELECT x val1, 2*x val2, (x*x)%3 stratum 5 | FROM generate_series(1,20) x 6 | DISTRIBUTED BY (val1); 7 | 8 | SELECT strat_partition('base_table','labeled_table', 9 | 'val1,val2', 'stratum','label',ARRAY[0.1,0.4,0.01]); 10 | 11 | SELECT strat_sampling('base_table','sampled_table', 12 | 'val1,val2', 'stratum',0.5); 13 | 14 | SELECT strat_partition('base_table','labeled_nonstrat_table', 15 | 'val1,val2', NULL,'label',ARRAY[0.1,0.4,0.01]); 16 | 17 | SELECT strat_sampling('base_table','sampled_nonstrat_table', 18 | 'val1,val2', NULL,0.5); 19 | 20 | CREATE TABLE labeled_count AS 21 | SELECT stratum,label, count(*) cnt FROM labeled_table GROUP BY stratum,label 22 | DISTRIBUTED RANDOMLY; 23 | 24 | CREATE TABLE sampled_count AS 25 | SELECT stratum, count(*) cnt FROM sampled_table GROUP BY stratum 26 | DISTRIBUTED RANDOMLY; 27 | 28 | CREATE TABLE labeled_nonstrat_count AS 29 | SELECT label, count(*) cnt FROM labeled_nonstrat_table GROUP BY label 30 | DISTRIBUTED RANDOMLY; 31 | 32 | CREATE TABLE sampled_nonstrat_count AS 33 | SELECT count(*) cnt FROM sampled_nonstrat_table 34 | DISTRIBUTED RANDOMLY; 35 | 36 | SELECT assert(CASE 37 | WHEN (stratum=0 AND label=0) THEN '1' 38 | WHEN (stratum=0 AND label=1) THEN '2' 39 | WHEN (stratum=0 AND label=2) THEN '1' 40 | WHEN (stratum=0 AND label=3) THEN '2' 41 | WHEN (stratum=1 AND label=0) THEN '2' 42 | WHEN (stratum=1 AND label=1) THEN '5' 43 | WHEN (stratum=1 AND label=2) THEN '1' 44 | WHEN (stratum=1 AND label=3) THEN '6' 45 | ELSE '0' END,cnt::TEXT) FROM labeled_count; 46 | 47 | SELECT assert(CASE 48 | WHEN stratum=0 THEN '3' 49 | WHEN stratum=1 THEN '7' 50 | ELSE '0' END,cnt::TEXT) FROM sampled_count; 51 | 52 | SELECT assert(CASE 53 | WHEN label=0 THEN '2' 54 | WHEN label=1 THEN '8' 55 | WHEN label=2 THEN '1' 56 | WHEN label=3 THEN '9' 57 | ELSE '0' END,cnt::TEXT) FROM labeled_nonstrat_count; 58 | 59 | SELECT assert('10',cnt::TEXT) FROM sampled_nonstrat_count; 60 | 61 | DROP TABLE base_table, 
labeled_table, sampled_table, labeled_count, 62 | sampled_count, labeled_nonstrat_count, sampled_nonstrat_count; 63 | 64 | 65 | --Unit Test for the test_train_split function 66 | --Create sample data 67 | drop table if exists sample_data; 68 | CREATE TEMP TABLE sample_data(pid int, points double precision[]); 69 | 70 | COPY sample_data (pid, points) FROM stdin DELIMITER '|'; 71 | 1 | {14.23, 1.71, 2.43, 15.6, 127, 2.8, 3.0600, 0.2800, 2.29, 5.64, 1.04, 3.92, 1065} 72 | 2 | {13.2, 1.78, 2.14, 11.2, 1, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.49, 1050} 73 | 3 | {13.16, 2.36, 2.67, 18.6, 101, 2.8, 3.24, 0.3, 2.81, 5.6799, 1.03, 3.17, 1185} 74 | 4 | {14.37, 1.95, 2.5, 16.8, 113, 3.85, 3.49, 0.24, 2.18, 7.8, 0.86, 3.45, 1480} 75 | 5 | {13.24, 2.59, 2.87, 21, 118, 2.8, 2.69, 0.39, 1.82, 4.32, 1.04, 2.93, 735} 76 | 6 | {14.2, 1.76, 2.45, 15.2, 112, 3.27, 3.39, 0.34, 1.97, 6.75, 1.05, 2.85, 1450} 77 | 7 | {14.39, 1.87, 2.45, 14.6, 96, 2.5, 2.52, 0.3, 1.98, 5.25, 1.02, 3.58, 1290} 78 | 8 | {14.06, 2.15, 2.61, 17.6, 121, 2.6, 2.51, 0.31, 1.25, 5.05, 1.06, 3.58, 1295} 79 | 9 | {14.83, 1.64, 2.17, 14, 97, 2.8, 2.98, 0.29, 1.98, 5.2, 1.08, 2.85, 1045} 80 | 10 | {13.86, 1.35, 2.27, 16, 98, 2.98, 3.15, 0.22, 1.8500, 7.2199, 1.01, 3.55, 1045} 81 | \. 
82 | 83 | --use the function to create test and train tables 84 | select train_test_split('sample_data', 'pid' , 0.7); 85 | 86 | --Check count in train table 87 | select assert(count_train, 7) 88 | from ( 89 | select count(*) as count_train 90 | from sample_data_train 91 | )t1 92 | ; 93 | 94 | --Check count in test table 95 | select assert(count_test, 3) 96 | from ( 97 | select count(*) as count_test 98 | from sample_data_test 99 | )t1 100 | ; 101 | 102 | --Check whether the test and train tables are disjoint 103 | select assert(cnt, 0) 104 | from ( 105 | select count(*) cnt 106 | from sample_data_train 107 | inner join sample_data_test 108 | using(pid) 109 | )t1 110 | ; 111 | 112 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/session/session.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.__session_sessionization_imp(pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar) 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.session_split(pg_catalog.varchar) 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.session_split(pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar, pg_catalog.interval) 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.session_split() 5 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/session/session.yml: -------------------------------------------------------------------------------- 1 | identical: 1.2.1 2 | compatible: 1.2.1 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/session/test/test_session.sql_in: -------------------------------------------------------------------------------- 1 | -- File: test_session.sql 2 | -- Unit test for sessionization utility. 
3 | 4 | CREATE TABLE __temp_clickstream( 5 | userid VARCHAR, 6 | time_stamp TIMESTAMP, 7 | action_type VARCHAR 8 | ) DISTRIBUTED RANDOMLY; 9 | 10 | INSERT INTO __temp_clickstream VALUES 11 | ('Max', TIMESTAMP '2013-Aug-23 04:57:02.15', 'LINK'), 12 | ('Tori', TIMESTAMP '2013-Aug-23 04:59:17.83', 'BID'), 13 | ('Max', TIMESTAMP '2013-Aug-23 05:03:01.42', 'BID'), 14 | ('Max', TIMESTAMP '2013-Aug-23 17:32:37.08', 'BUY'); 15 | 16 | SELECT session_split('__temp_clickstream','__temp_tagged_clickstream', 17 | 'userid','time_stamp','10 MINUTES'::INTERVAL); 18 | 19 | SELECT assert((SELECT sum(a.x) FROM 20 | (SELECT CASE 21 | WHEN (userid='Max' and action_type='LINK' 22 | and is_session_start=1 and session_no=0) 23 | THEN 1001 24 | WHEN (userid='Tori' and action_type='BID' 25 | and is_session_start=1 and session_no=0) 26 | THEN 1002 27 | WHEN (userid='Max' and action_type='BID' 28 | and is_session_start=0 and session_no=0) 29 | THEN 1004 30 | WHEN (userid='Max' and action_type='BUY' 31 | and is_session_start=1 and session_no=1) 32 | THEN 1008 33 | ELSE 1000 34 | END as x 35 | FROM __temp_tagged_clickstream) a)::TEXT,'4015'); 36 | 37 | DROP TABLE __temp_clickstream; 38 | 39 | DROP TABLE __temp_tagged_clickstream; 40 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/stemming/porter_stemmer.sql_in: -------------------------------------------------------------------------------- 1 | /* ----------------------------------------------------------------------- *//** 2 | 3 | @file porter_stemmer.sql_in 4 | 5 | @brief The Porter Stemmer. Pre-processing step in NLP pipelines. 
6 | 7 | @author PL/C Wrapper written by Srivatsan Ramanujam 8 | porting original inventor Martin Porter's code 9 | from http://tartarus.org/martin/PorterStemmer/c_thread_safe.txt 10 | @date 12 Aug 2014 11 | 12 | *//* ----------------------------------------------------------------------- */ 13 | 14 | 15 | /** 16 | @addtogroup grp_stem_token 17 | 18 | @brief Apply Porter Stemmer on a token and return the root word 19 | 20 |
Contents 21 | 26 |
27 | 28 | @about 29 | A row function, that stems a token and returns the root word according to 30 | Porter's algorithm 31 | 32 | @anchor stem_token_syntax 33 | @par Syntax 34 |
 35 | FUNCTION stem_token(token text)
 36 | RETURNS text;
 37 | 
38 | 39 | @param token A word to be stemmed 40 | 41 | @returns stemmed/root form of the input token 42 | 43 | @anchor stem_token_usage 44 | @usage 45 | Identifying a root form of a token is a common step in many natural language 46 | processing tasks. Porter Stemmer is an algorithm that identifies a root form 47 | of any given word in English 48 | 49 | @anchor stem_token_example 50 | @examp 51 | @verbatim 52 | user=# SELECT stem_token('pencils'); 53 | stem_token 54 | ---------------- 55 | pencil 56 | (1 row) 57 | 58 | user=# SELECT stem_token('running'); 59 | stem_token 60 | ---------------- 61 | run 62 | (1 row) 63 | @endverbatim 64 | 65 | @sa grp_stem_token_arr 66 | 67 | */ 68 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.stem_token(token text) 69 | RETURNS text 70 | AS 'MODULE_PATHNAME','plc_stem_token' 71 | LANGUAGE C IMMUTABLE STRICT; 72 | 73 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.stem_token() 74 | RETURNS VARCHAR 75 | IMMUTABLE 76 | LANGUAGE SQL 77 | AS 78 | $$ 79 | SELECT $ABC$ 80 | stem_token: Apply Porter Stemmer on a token and return the root word. 81 | 82 | A row function, that stems a token and returns the root word according to 83 | Porter's algorithm 84 | 85 | Synopsis 86 | ======== 87 | PDLTOOLS_SCHEMA.stem_token(token text) 88 | RETURNS text 89 | 90 | token - A word to be stemmed 91 | 92 | Usage 93 | ===== 94 | Returns the stemmed/root form of the token in the input. 95 | 96 | Identifying the root form of a token is a common step in many natural language 97 | processing tasks. The Porter Stemmer is an algorithm that identifies a root 98 | form of any given word in English. 
99 | 100 | Example 101 | ======= 102 | SELECT PDLTOOLS_SCHEMA.stem_token('running'); 103 | stem_token 104 | ---------------- 105 | run 106 | (1 row) 107 | 108 | See also: stem_token_arr 109 | $ABC$::VARCHAR; 110 | $$; 111 | 112 | ----------------------------------------------------------------------------------- 113 | 114 | /** 115 | @addtogroup grp_stem_token_arr 116 | 117 | @brief Apply Porter Stemmer on an array of tokens and return an array of root 118 | words 119 | 120 |
Contents 121 | 126 |
127 | 128 | @about 129 | A row function, that stems each word from an array of tokens, returning an 130 | array of the root words according to Porter's algorithm. 131 | 132 | @anchor stem_token_arr_syntax 133 | @par Syntax 134 |
135 | FUNCTION stem_token_arr(token text[])
136 | RETURNS text[];
137 | 
138 | 139 | @param token An array of words to be stemmed 140 | 141 | @returns An array with stemmed/root forms of each of the tokens in the input. 142 | 143 | @anchor stem_token_arr_usage 144 | @usage 145 | Identifying the root form of a token is a common step in many natural language 146 | processing tasks. Porter Stemmer is an algorithm that identifies a root form 147 | of any given word in English. In this function it acts separately on each of 148 | the tokens given to it in an input array. 149 | 150 | @anchor stem_token_arr_example 151 | @examp 152 | @verbatim 153 | select stem_token_arr(ARRAY['pencils','running','walking']); 154 | stem_token_arr 155 | -------------------- 156 | {pencil,run,walk} 157 | (1 row) 158 | @endverbatim 159 | 160 | @sa grp_stem_token 161 | 162 | */ 163 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.stem_token_arr(token_arr text[]) 164 | RETURNS text[] 165 | AS 'MODULE_PATHNAME','plc_stem_token_arr' 166 | LANGUAGE C IMMUTABLE STRICT; 167 | 168 | ----------------------------------------------------------------------------------- 169 | 170 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.stem_token_arr() 171 | RETURNS VARCHAR 172 | IMMUTABLE 173 | LANGUAGE SQL 174 | AS 175 | $$ 176 | SELECT $ABC$ 177 | stem_token_arr: Apply Porter Stemmer on an array of tokens and return an 178 | array of root words. 179 | 180 | A row function, that stems each word from an array of tokens, returning an 181 | array of the root words according to Porter's algorithm. 182 | 183 | For full usage instructions, run "PDLTOOLS_SCHEMA.stem_token_arr('usage')". 184 | $ABC$::VARCHAR; 185 | $$; 186 | 187 | CREATE OR REPLACE FUNCTION PDLTOOLS_SCHEMA.stem_token_arr(option VARCHAR) 188 | RETURNS VARCHAR 189 | IMMUTABLE 190 | LANGUAGE SQL 191 | AS 192 | $$ 193 | SELECT CASE WHEN $1!='usage' THEN PDLTOOLS_SCHEMA.stem_token_arr() ELSE 194 | $ABC$ 195 | stem_token_arr: Apply Porter Stemmer on an array of tokens and return an 196 | array of root words. 
197 | 198 | A row function, that stems each word from an array of tokens, returning an 199 | array of the root words according to Porter's algorithm. 200 | 201 | Synopsis 202 | ======== 203 | PDLTOOLS_SCHEMA.stem_token_arr(token text[]) 204 | RETURNS text[] 205 | 206 | token - An array of words to be stemmed 207 | 208 | Usage 209 | ===== 210 | Returns An array with stemmed/root forms of each of the tokens in the input. 211 | 212 | Identifying the root form of a token is a common step in many natural language 213 | processing tasks. The Porter Stemmer is an algorithm that identifies a root 214 | form of any given word in English. In this function it acts separately on each 215 | of the tokens given to it in an input array. 216 | 217 | Example 218 | ======= 219 | SELECT PDLTOOLS_SCHEMA.stem_token_arr(ARRAY['pencils','running','walking']); 220 | stem_token_arr 221 | -------------------- 222 | {pencil,run,walk} 223 | (1 row) 224 | 225 | See also: stem_token 226 | $ABC$ 227 | END; 228 | $$; 229 | 230 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/stemming/stemming.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.stem_token_arr() 2 | UDF: FUNCTION PDLTOOLS_SCHEMA.stem_token_arr(pg_catalog.varchar) 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.stem_token_arr(pg_catalog._text) 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.stem_token(pg_catalog.text) 5 | UDF: FUNCTION PDLTOOLS_SCHEMA.stem_token() 6 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/stemming/stemming.yml: -------------------------------------------------------------------------------- 1 | identical: 1.2.1 2 | compatible: 0.0 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/stemming/test/test_porter_stemmer.sql_in: 
-------------------------------------------------------------------------------- 1 | -- File: test_porter_stemmer.sql 2 | -- Unit test for porter_stemmer.sql 3 | 4 | SELECT assert(stem_token('pencils'),'pencil'); 5 | SELECT assert(stem_token('running'),'run'); 6 | SELECT assert(stem_token('run'),'run'); 7 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/sugar/sugar.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION SUGAR_SCHEMA.__sugar_imp(pg_catalog.varchar, pg_catalog.varchar) 2 | UDF: FUNCTION SUGAR_SCHEMA.clamp() 3 | UDF: FUNCTION SUGAR_SCHEMA.__sugar_pivot_expand_compress(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 4 | UDF: FUNCTION SUGAR_SCHEMA.clamp(pg_catalog.numeric, pg_catalog.numeric, pg_catalog.numeric) 5 | UDF: AGGREGATE SUGAR_SCHEMA.unique_element(pg_catalog.anyelement) 6 | UDF: FUNCTION SUGAR_SCHEMA.__sugar_unique_add(pg_catalog.anyelement, pg_catalog.anyelement) 7 | UDF: FUNCTION SUGAR_SCHEMA.freq_vals(pg_catalog.varchar) 8 | UDF: FUNCTION SUGAR_SCHEMA.all_but_one() 9 | UDF: FUNCTION SUGAR_SCHEMA.proportional_trans(pg_catalog.numeric, pg_catalog.numeric, pg_catalog.numeric) 10 | UDF: FUNCTION SUGAR_SCHEMA.freq_vals() 11 | UDF: FUNCTION SUGAR_SCHEMA.sugar(pg_catalog.varchar) 12 | UDF: FUNCTION SUGAR_SCHEMA.all_vals(pg_catalog.varchar) 13 | UDF: FUNCTION SUGAR_SCHEMA.__sugar_pivot_expand(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 14 | UDF: FUNCTION SUGAR_SCHEMA.all_vals() 15 | REL: TABLE SUGAR_SCHEMA.sugar_help_db 16 | UDF: FUNCTION SUGAR_SCHEMA.clamp(pg_catalog.varchar) 17 | UDF: FUNCTION SUGAR_SCHEMA.sugar(pg_catalog.varchar, pg_catalog.varchar, pg_catalog.varchar) 18 | UDF: FUNCTION SUGAR_SCHEMA.all_but_one(pg_catalog.varchar) 19 | UDF: AGGREGATE SUGAR_SCHEMA.choose_any(pg_catalog.anyelement) 20 | 
REL: TABLE SUGAR_SCHEMA.sugar_db 21 | UDF: FUNCTION SUGAR_SCHEMA.freq_vals(pg_catalog.int4) 22 | UDF: FUNCTION SUGAR_SCHEMA.sugar() 23 | UDF: FUNCTION SUGAR_SCHEMA.invprop_trans(pg_catalog.numeric, pg_catalog.numeric, pg_catalog.numeric) 24 | UDF: FUNCTION SUGAR_SCHEMA.sugar(pg_catalog.varchar, pg_catalog.varchar) 25 | UDF: FUNCTION SUGAR_SCHEMA.__sugar_any_add(pg_catalog.anyelement, pg_catalog.anyelement) 26 | UDF: FUNCTION SUGAR_SCHEMA.invprop_trans(pg_catalog.varchar) 27 | UDF: AGGREGATE SUGAR_SCHEMA.__sugar_fast_agg(pg_catalog.anyelement) 28 | UDF: FUNCTION SUGAR_SCHEMA.__sugar_agg_merge(pg_catalog.anyarray, pg_catalog.anyarray) 29 | UDF: FUNCTION SUGAR_SCHEMA.invprop_trans() 30 | UDF: FUNCTION SUGAR_SCHEMA.__sugar_get_col_names(pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text, pg_catalog.text) 31 | UDF: FUNCTION SUGAR_SCHEMA.proportional_trans() 32 | UDF: FUNCTION SUGAR_SCHEMA.sugar_version() 33 | UDF: FUNCTION SUGAR_SCHEMA.proportional_trans(pg_catalog.varchar) 34 | UDF: FUNCTION SUGAR_SCHEMA.__sugar_agg_add(pg_catalog.anyarray, pg_catalog.anyelement) 35 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/sugar/sugar.yml: -------------------------------------------------------------------------------- 1 | identical: 1.0 2 | compatible: 0.4.4 3 | libpart: sugar 4 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/uri_utils/test/test_uri_utils.sql_in: -------------------------------------------------------------------------------- 1 | -- File: test_uri_utils.sql 2 | -- Unit test for URI utility. 
3 | 4 | ------------------------------------------------------------------------------------------------- 5 | 6 | with testcase 7 | as 8 | ( 9 | select (t).* 10 | from 11 | ( 12 | select parse_uri($BODY$http://myself:password@www.Pivotal.io:80/%7ehello/to/you/index.html?who=I&whom=me&more=a%20%22''%5E%5e%41#here$BODY$,false,false) as t 13 | )test 14 | ) 15 | select ( 16 | --Test query string for correctness 17 | (select assert((select query from testcase), $BODY$who=I&whom=me&more=a%20%22''%5E%5e%41$BODY$)) AND 18 | --Test scheme 19 | (select assert((select scheme from testcase), $BODY$http$BODY$)) AND 20 | --Test userinfo 21 | (select assert((select userinfo from testcase), $BODY$myself:password$BODY$)) AND 22 | --Test hosttext 23 | (select assert((select hosttext from testcase), $BODY$www.Pivotal.io$BODY$)) AND 24 | --Test porttext 25 | (select assert((select porttext from testcase), $BODY$80$BODY$)) AND 26 | --Test fragment 27 | (select assert((select fragment from testcase), $BODY$here$BODY$)) AND 28 | --Test path 29 | (select assert((select array_to_string(path,',') from testcase), $BODY$%7ehello,to,you,index.html$BODY$)) AND 30 | --Test absolutepath 31 | (select assert((select case when absolutepath=FALSE then 'f' else 't' end as absolutepath from testcase), $BODY$f$BODY$)) 32 | ); 33 | 34 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/uri_utils/uri_utils.content: -------------------------------------------------------------------------------- 1 | UDF: FUNCTION PDLTOOLS_SCHEMA.extract_uri(pg_catalog.varchar) 2 | REL: TYPE PDLTOOLS_SCHEMA.uri_type 3 | UDF: FUNCTION PDLTOOLS_SCHEMA.parse_uri(pg_catalog.varchar) 4 | UDF: FUNCTION PDLTOOLS_SCHEMA.extract_uri(pg_catalog.text, pg_catalog.bool) 5 | REL: TYPE PDLTOOLS_SCHEMA.uri_array_type 6 | UDF: FUNCTION PDLTOOLS_SCHEMA.parse_uri() 7 | UDF: FUNCTION PDLTOOLS_SCHEMA.parse_uri(pg_catalog.text, pg_catalog.bool, pg_catalog.bool) 8 | UDF: FUNCTION 
PDLTOOLS_SCHEMA.parse_domain(pg_catalog.text) 9 | UDF: FUNCTION PDLTOOLS_SCHEMA.parse_domain() 10 | UDF: FUNCTION PDLTOOLS_SCHEMA.extract_uri() 11 | -------------------------------------------------------------------------------- /src/ports/greenplum/modules/uri_utils/uri_utils.yml: -------------------------------------------------------------------------------- 1 | identical: 1.4 2 | compatible: 1.2.1 3 | libpart: pdltools 4 | -------------------------------------------------------------------------------- /src/ports/hawq/1.2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_current_hawq_version() 2 | 3 | if(HAWQ_1_2_FOUND) 4 | add_subdirectory(config) 5 | endif(HAWQ_1_2_FOUND) 6 | -------------------------------------------------------------------------------- /src/ports/hawq/1.2/config/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # PDLTools configuration files (these are configuration files used by the 3 | # installer). End users are not supposed to make modifications. 4 | # ------------------------------------------------------------------------------ 5 | 6 | 7 | # -- 1. Copy Modules.yml files --------------------------------------------------- 8 | 9 | add_files(CONFIG_TARGET_FILES_HAWQ_1_2 . "${CMAKE_CURRENT_BINARY_DIR}" "Modules.yml") 10 | add_custom_target(configFilesHAWQ12 ALL DEPENDS ${CONFIG_TARGET_FILES_HAWQ_1_2}) 11 | 12 | 13 | 14 | # -- 2. 
Install config files to $PDLTOOLS_ROOT/config ---------------------------- 15 | 16 | install(FILES ${CONFIG_TARGET_FILES_HAWQ_1_2} 17 | DESTINATION ports/hawq/1.2/config 18 | COMPONENT hawq_1_2 19 | ) 20 | -------------------------------------------------------------------------------- /src/ports/hawq/1.2/config/Modules.yml: -------------------------------------------------------------------------------- 1 | ### 2 | # List of methods/modules and their dependencies: 3 | ### 4 | modules: 5 | - name: common 6 | - name: uri_utils 7 | depends: ['common'] 8 | - name: edit_distance 9 | depends: ['common'] 10 | - name: anonymization 11 | depends: ['common'] 12 | - name: sugar 13 | depends: ['common'] 14 | - name: stemming 15 | depends: ['common'] 16 | - name: prediction_metrics 17 | depends: ['common'] 18 | - name: balance_dataset 19 | depends: ['common'] 20 | - name: one_vs_rest 21 | depends: ['common'] 22 | - name: generic_utilities 23 | depends: ['common'] 24 | - name: grid_search 25 | depends: ['common'] 26 | - name: kd_tree 27 | depends: ['common'] 28 | - name: pagerank 29 | depends: ['common'] 30 | - name: complete_linkage 31 | depends: ['common'] 32 | - name: normalized_cut 33 | depends: ['common'] 34 | -------------------------------------------------------------------------------- /src/ports/hawq/1.3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_current_hawq_version() 2 | 3 | if(HAWQ_1_3_FOUND) 4 | add_subdirectory(config) 5 | endif(HAWQ_1_3_FOUND) 6 | -------------------------------------------------------------------------------- /src/ports/hawq/1.3/config/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # PDLTools configuration files (these are configuration files used by the 3 | # installer). End users are not supposed to make modifications. 
4 | # ------------------------------------------------------------------------------ 5 | 6 | 7 | # -- 1. Copy Modules.yml files --------------------------------------------------- 8 | 9 | add_files(CONFIG_TARGET_FILES_HAWQ_1_3 . "${CMAKE_CURRENT_BINARY_DIR}" "Modules.yml") 10 | add_custom_target(configFilesHAWQ13 ALL DEPENDS ${CONFIG_TARGET_FILES_HAWQ_1_3}) 11 | 12 | 13 | 14 | # -- 2. Install config files to $PDLTOOLS_ROOT/config ---------------------------- 15 | 16 | install(FILES ${CONFIG_TARGET_FILES_HAWQ_1_3} 17 | DESTINATION ports/hawq/1.3/config 18 | COMPONENT hawq_1_3 19 | ) 20 | -------------------------------------------------------------------------------- /src/ports/hawq/1.3/config/Modules.yml: -------------------------------------------------------------------------------- 1 | ### 2 | # List of methods/modules and their dependencies: 3 | ### 4 | modules: 5 | - name: common 6 | - name: uri_utils 7 | depends: ['common'] 8 | - name: edit_distance 9 | depends: ['common'] 10 | - name: anonymization 11 | depends: ['common'] 12 | - name: sugar 13 | depends: ['common'] 14 | - name: stemming 15 | depends: ['common'] 16 | - name: prediction_metrics 17 | depends: ['common'] 18 | - name: balance_dataset 19 | depends: ['common'] 20 | - name: one_vs_rest 21 | depends: ['common'] 22 | - name: generic_utilities 23 | depends: ['common'] 24 | - name: grid_search 25 | depends: ['common'] 26 | - name: kd_tree 27 | depends: ['common'] 28 | - name: pagerank 29 | depends: ['common'] 30 | - name: complete_linkage 31 | depends: ['common'] 32 | - name: normalized_cut 33 | depends: ['common'] 34 | -------------------------------------------------------------------------------- /src/ports/hawq/2.0/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_current_hawq_version() 2 | 3 | if(HAWQ_2_0_FOUND) 4 | add_subdirectory(config) 5 | endif(HAWQ_2_0_FOUND) 6 | 
-------------------------------------------------------------------------------- /src/ports/hawq/2.0/config/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # PDLTools configuration files (these are configuration files used by the 3 | # installer). End users are not supposed to make modifications. 4 | # ------------------------------------------------------------------------------ 5 | 6 | 7 | # -- 1. Copy Modules.yml files --------------------------------------------------- 8 | 9 | add_files(CONFIG_TARGET_FILES_HAWQ_2_0 . "${CMAKE_CURRENT_BINARY_DIR}" "Modules.yml") 10 | add_custom_target(configFilesHAWQ20 ALL DEPENDS ${CONFIG_TARGET_FILES_HAWQ_2_0}) 11 | 12 | 13 | 14 | # -- 2. Install config files to $PDLTOOLS_ROOT/config ---------------------------- 15 | 16 | install(FILES ${CONFIG_TARGET_FILES_HAWQ_2_0} 17 | DESTINATION ports/hawq/2.0/config 18 | COMPONENT hawq_2_0 19 | ) 20 | -------------------------------------------------------------------------------- /src/ports/hawq/2.0/config/Modules.yml: -------------------------------------------------------------------------------- 1 | ### 2 | # List of methods/modules and their dependencies: 3 | ### 4 | modules: 5 | - name: common 6 | - name: uri_utils 7 | depends: ['common'] 8 | - name: edit_distance 9 | depends: ['common'] 10 | - name: anonymization 11 | depends: ['common'] 12 | - name: sugar 13 | depends: ['common'] 14 | - name: stemming 15 | depends: ['common'] 16 | - name: prediction_metrics 17 | depends: ['common'] 18 | - name: balance_dataset 19 | depends: ['common'] 20 | - name: one_vs_rest 21 | depends: ['common'] 22 | - name: generic_utilities 23 | depends: ['common'] 24 | - name: grid_search 25 | depends: ['common'] 26 | - name: kd_tree 27 | depends: ['common'] 28 | - name: pagerank 29 | depends: ['common'] 30 | - name: complete_linkage 31 | depends: ['common'] 32 | - 
name: normalized_cut 33 | depends: ['common'] 34 | 35 | -------------------------------------------------------------------------------- /src/ports/hawq/cmake/FindHAWQ.cmake: -------------------------------------------------------------------------------- 1 | # Set defaults that can be overridden by files that include this file: 2 | if(NOT DEFINED _FIND_PACKAGE_FILE) 3 | set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}") 4 | endif(NOT DEFINED _FIND_PACKAGE_FILE) 5 | 6 | # Set parameters for calling FindPostgreSQL.cmake 7 | set(_NEEDED_PG_CONFIG_PACKAGE_NAME "HAWQ") 8 | set(_PG_CONFIG_VERSION_NUM_MACRO "HQ_VERSION_NUM") 9 | set(_PG_CONFIG_VERSION_MACRO "HQ_VERSION") 10 | set(_SEARCH_PATH_HINTS 11 | "/usr/local/hawq/bin" 12 | "$ENV{GPHOME}/bin" 13 | ) 14 | 15 | include("${CMAKE_CURRENT_LIST_DIR}/../../greenplum/cmake/FindPostgreSQL.cmake") 16 | 17 | if(${PKG_NAME}_FOUND) 18 | # server/funcapi.h ultimately includes server/access/xact.h, from which 19 | # cdb/cdbpathlocus.h is included 20 | execute_process(COMMAND ${${PKG_NAME}_PG_CONFIG} --pkgincludedir 21 | OUTPUT_VARIABLE ${PKG_NAME}_ADDITIONAL_INCLUDE_DIRS 22 | OUTPUT_STRIP_TRAILING_WHITESPACE 23 | ) 24 | set(${PKG_NAME}_ADDITIONAL_INCLUDE_DIRS 25 | "${${PKG_NAME}_ADDITIONAL_INCLUDE_DIRS}/internal") 26 | endif(${PKG_NAME}_FOUND) 27 | -------------------------------------------------------------------------------- /src/ports/hawq/cmake/FindHAWQ_1_2.cmake: -------------------------------------------------------------------------------- 1 | set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}") 2 | include("${CMAKE_CURRENT_LIST_DIR}/FindHAWQ.cmake") 3 | -------------------------------------------------------------------------------- /src/ports/hawq/cmake/FindHAWQ_1_3.cmake: -------------------------------------------------------------------------------- 1 | set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}") 2 | include("${CMAKE_CURRENT_LIST_DIR}/FindHAWQ.cmake") 3 | 
-------------------------------------------------------------------------------- /src/ports/hawq/cmake/FindHAWQ_2_0.cmake: -------------------------------------------------------------------------------- 1 | set(_FIND_PACKAGE_FILE "${CMAKE_CURRENT_LIST_FILE}") 2 | include("${CMAKE_CURRENT_LIST_DIR}/FindHAWQ.cmake") 3 | -------------------------------------------------------------------------------- /src/ports/hawq/cmake/HAWQUtils.cmake: -------------------------------------------------------------------------------- 1 | # Define HAWQ feature macros 2 | # 3 | function(define_hawq_features IN_VERSION OUT_FEATURES) 4 | list(APPEND ${OUT_FEATURES} __HAS_ORDERED_AGGREGATES__) 5 | list(APPEND ${OUT_FEATURES} __HAS_FUNCTION_PROPERTIES__) 6 | 7 | # Pass values to caller 8 | set(${OUT_FEATURES} "${${OUT_FEATURES}}" PARENT_SCOPE) 9 | endfunction(define_hawq_features) 10 | 11 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vmware-archive/PDLTools/24ce5033407766e432fa00cc171039de2611d140/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/argparse.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vmware-archive/PDLTools/24ce5033407766e432fa00cc171039de2611d140/src/utils/argparse.pyc --------------------------------------------------------------------------------